Testes/Sentiment%20Analysis.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## BIBLIOTECA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import numpy as np\n",
    "from matplotlib import pyplot as plt\n",
    "import statistics as st\n",
    "from statistics import mode, mean\n",
    "from scipy.stats import spearmanr, pearsonr, skew, kendalltau, norm\n",
    "from collections import Counter\n",
    "from datetime import datetime\n",
    "from keras.models import load_model\n",
    "import warnings\n",
    "#!pip install wordcloud\n",
    "import string\n",
    "from wordcloud import WordCloud\n",
    "from xgboost import XGBClassifier\n",
    "import nltk\n",
    "import torch\n",
    "from torch import nn\n",
    "from torch.utils.data import DataLoader, Dataset\n",
    "#nltk.download('all')\n",
    "import spacy\n",
    "import pickle \n",
    "from sklearn.datasets import load_files, make_classification\n",
    "from docx import Document\n",
    "import os\n",
    "import re\n",
    "import pdfplumber\n",
    "import gensim\n",
    "#!pip install gensim\n",
    "from gensim.models import FastText\n",
    "from nltk.corpus import stopwords\n",
    "from joblib import Parallel, delayed\n",
    "from nltk.tokenize import word_tokenize\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "from textblob import TextBlob\n",
    "from deep_translator import GoogleTranslator\n",
    "from pyannote.audio import Pipeline\n",
    "from transformers import pipeline\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from reportlab.lib.pagesizes import A4\n",
    "from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Image\n",
    "from reportlab.lib import colors\n",
    "from PIL import Image, ImageOps\n",
    "from tkinter import filedialog\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, ParameterGrid\n",
    "import tabula\n",
    "import PyPDF2\n",
    "import pdfplumber\n",
    "from datetime import datetime\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.svm import SVC\n",
    "from imblearn.combine import SMOTEENN\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "import joblib\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
    "import fitz\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay, classification_report, f1_score, precision_score, recall_score, roc_auc_score\n",
    "from tqdm.notebook import tqdm\n",
    "from tensorflow.keras.models import Sequential, load_model\n",
    "from tensorflow.keras.optimizers import Adam\n",
    "from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, Flatten, Dropout, BatchNormalization\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from keras.utils import to_categorical\n",
    "from itertools import product\n",
    "#!python -m spacy download pt_core_news_sm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## FUNÇÕES"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "def grafico(a):\n",
    "    if a == 1:\n",
    "        custom_params = {\"axes.spines.right\": False, \"axes.spines.top\": False}\n",
    "        sns.set_theme(style=\"ticks\", rc=custom_params, palette = \"pastel\")\n",
    "        palette = \"pastel\"\n",
    "    elif a == 2:\n",
    "        sns.set_theme(style=\"white\", palette = \"Set2\")\n",
    "        palette = \"Set2\"\n",
    "    elif a == 3:\n",
    "        sns.set_theme(style=\"whitegrid\", palette = \"pastel\")\n",
    "        palette = \"pastel\"\n",
    "    elif a == 4:\n",
    "        sns.set_theme()\n",
    "        palette = \"husl\"\n",
    "    return palette"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tratamento para portugues"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<>:8: SyntaxWarning: invalid escape sequence '\\s'\n",
      "<>:8: SyntaxWarning: invalid escape sequence '\\s'\n",
      "C:\\Users\\garci\\AppData\\Local\\Temp\\ipykernel_40992\\3345901888.py:8: SyntaxWarning: invalid escape sequence '\\s'\n",
      "  df_copy[column_name] = df_copy[column_name].str.replace('[^a-z\\s]', '')\n"
     ]
    }
   ],
   "source": [
    "portuguesstopwords = set(stopwords.words('portuguese'))\n",
    "nlp = spacy.load(\"pt_core_news_sm\")\n",
    "def preprocess_text_column_pt(df, column_name):\n",
    "    if column_name not in df.columns:\n",
    "        raise ValueError(f\"A coluna '{column_name}' não existe no dataset.\")\n",
    "    df_copy = df.copy()\n",
    "    df_copy[column_name] = df_copy[column_name].str.lower() \n",
    "    df_copy[column_name] = df_copy[column_name].str.replace('[^a-z\\s]', '') \n",
    "    df_copy[column_name] = df_copy[column_name].str.strip() \n",
    "    df_copy[column_name] = df_copy[column_name].str.replace(r'\\s+', ' ', regex=True) \n",
    "    df_copy[column_name] = df_copy[column_name].str.replace(r'\\d+', '', regex=True)\n",
    "    df_copy[column_name] = df_copy[column_name].str.translate(str.maketrans('', '', string.punctuation))\n",
    "    df_copy[column_name] = df_copy[column_name].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in portuguesstopwords]))\n",
    "    df_copy[column_name] = df_copy[column_name].str.replace(r'http\\S+|https\\S+|www\\S+', '', regex=True)  \n",
    "    df_copy[column_name] = df_copy[column_name].str.replace(r'\\S+@\\S+', '', regex=True)  \n",
    "    df_copy[column_name] = df_copy[column_name].apply(lambda x: ' '.join([word for word in x.split() if len(word) > 2]))\n",
    "    df_copy['TextoLema'] = df_copy[column_name].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)])) \n",
    "\n",
    "    return df_copy['TextoLema']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tratamento para ingles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<>:8: SyntaxWarning: invalid escape sequence '\\s'\n",
      "<>:8: SyntaxWarning: invalid escape sequence '\\s'\n",
      "C:\\Users\\garci\\AppData\\Local\\Temp\\ipykernel_40992\\4039191732.py:8: SyntaxWarning: invalid escape sequence '\\s'\n",
      "  df_copy[column_name] = df_copy[column_name].str.replace('[^a-z\\s]', '', regex=True)\n"
     ]
    }
   ],
   "source": [
    "english_stopwords = set(stopwords.words('english'))\n",
    "nlp = spacy.load(\"en_core_web_sm\")\n",
    "def preprocess_text_column(df, column_name):\n",
    "    if column_name not in df.columns:\n",
    "        raise ValueError(f\"The column '{column_name}' does not exist in the dataset.\")\n",
    "    df_copy = df.copy()\n",
    "    df_copy[column_name] = df_copy[column_name].str.lower()\n",
    "    df_copy[column_name] = df_copy[column_name].str.replace('[^a-z\\s]', '', regex=True)\n",
    "    df_copy[column_name] = df_copy[column_name].str.replace(r'\\s+', ' ', regex=True).str.strip()\n",
    "    df_copy[column_name] = df_copy[column_name].str.replace(r'\\d+', '', regex=True)\n",
    "    df_copy[column_name] = df_copy[column_name].str.translate(str.maketrans('', '', string.punctuation))\n",
    "    df_copy[column_name] = df_copy[column_name].str.replace(r'http\\S+|https\\S+|www\\S+', '', regex=True)\n",
    "    df_copy[column_name] = df_copy[column_name].str.replace(r'\\S+@\\S+', '', regex=True)\n",
    "    df_copy[column_name] = df_copy[column_name].apply(lambda x: ' '.join([word for word in x.split() if word not in english_stopwords and len(word) > 2]))\n",
    "    df_copy['LemmatizedText'] = df_copy[column_name].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x) if not token.is_punct and not token.is_stop]))\n",
    "    return df_copy['LemmatizedText']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                Text    emotion\n",
      "0                                    That game hurt.    Sadness\n",
      "1                                 Man I love reddit.    Disgust\n",
      "2  So happy for [NAME]. So sad he's not here. Ima...    Sadness\n",
      "3  I just came home, what the fuck is this lineup...    Disgust\n",
      "4  By far the coolest thing I've seen on this thr...  Happiness\n"
     ]
    }
   ],
   "source": [
    "colunas = [\"Text\", \"emotion\"]\n",
    "dataset3 = pd.read_excel(\"C:\\\\Users\\\\garci\\\\OneDrive\\\\Área de Trabalho\\\\Programa PJM\\\\Programa final\\\\Dataset\\\\GoEmotions_parateste.xlsx\",names=colunas)\n",
    "print(dataset3.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Sadness', 'Disgust', 'Happiness', 'Surprise', 'Anger', 'Fear'],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset3[\"emotion\"].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>V. Absolutos</th>\n",
       "      <th>V. Relativos (%)</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>emotion</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Happiness</th>\n",
       "      <td>7907</td>\n",
       "      <td>20.67</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Anger</th>\n",
       "      <td>7844</td>\n",
       "      <td>20.50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Disgust</th>\n",
       "      <td>7678</td>\n",
       "      <td>20.07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Sadness</th>\n",
       "      <td>6758</td>\n",
       "      <td>17.66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Surprise</th>\n",
       "      <td>5144</td>\n",
       "      <td>13.45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Fear</th>\n",
       "      <td>2927</td>\n",
       "      <td>7.65</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           V. Absolutos  V. Relativos (%)\n",
       "emotion                                  \n",
       "Happiness          7907             20.67\n",
       "Anger              7844             20.50\n",
       "Disgust            7678             20.07\n",
       "Sadness            6758             17.66\n",
       "Surprise           5144             13.45\n",
       "Fear               2927              7.65"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame({'V. Absolutos':dataset3['emotion'].value_counts(), 'V. Relativos (%)':(dataset3['emotion'].value_counts()*100/dataset3.shape[0]).round(2)})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\garci\\AppData\\Local\\Temp\\ipykernel_40992\\1100090595.py:3: FutureWarning: \n",
      "\n",
      "Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.\n",
      "\n",
      "  sns.countplot(x=\"emotion\", data=dataset3,order= dataset3['emotion'].value_counts().index,palette=\"husl\")\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA9gAAAJICAYAAACaO0yGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACEgElEQVR4nOzdd1yV5f/H8fdhgwoKqLhw4EDNmTgqR66y1MRRZo4cZaWpmZqluVeKo1Ib7kpz5EgcaWnO1NScqZnmVkRBxcE8h98f/DzfTjgQbzwcfD0fDx/BdV/3dX8OXAHvc9/XfZuSk5OTBQAAAAAAHoqTvQsAAAAAACArIGADAAAAAGAAAjYAAAAAAAYgYAMAAAAAYAACNgAAAAAABiBgAwAAAABgAAI2AAAAAAAGIGADAAAAAGAAAjYAAHeRnJxs7xIAAIADIWADABxSnTrzZTKFWf85OYUpR45PVaXKt/rssz+UlGSx6V+kyNd6/fXVaR5/+fJj6tDh/v1ff321ihT5Ot3HuZvZsw/KZArTyZPX0rzPDz/8JX//KXJyCpOLy3g5O4/Xm2+ufeha7GHDhtM23987/fvppxP2LlOSFBubqGefXWD9mru4jJePz2c6ceKqvUsDADxiLvYuAACA9KpUKY+mTq0vSTKbkxUdHafVq//Re+/9qs2bz2rBgiZycjJJkpYufUne3u5pHnvChF1p6vfxxzXUs2flBy/+Pl58sZi2bWujfPmypXmf6tXz65dfWikx0Sw3N2d5e7upaNGchtf2KE2ZUk+VK+e947bSpf0ecTV35uRk0uef19WtW0lycXGSu7uzChf2VvbsbvYuDQDwiBGwAQAOy9vbTdWr57dpa9IkSMHBfurZc72+//6wXnutjCSpUqU7h7SHFRSUM0PGzZ3bS7lzez3QPgUL5lDBgjkypB57KVPGL9X3OLNxd3fRE0/ktncZAIBMgEvEAQBZTvfulVSgQHZ9+eU+a9t/L93+/vvDqlBhjjw9Jyl37ilq23alzp+/ISnl8vONG89q48azMpnCtGHDaesly199tU+FC38lb+/P9PPPJ1NdIi5JiYkW9eixTrlyfa6cOT9Xhw6rdOnSLev2OnXmq06d+Tb73B5/w4bTku58ifiqVf/o6afnKVu2Scqf/wu9/fbPunYt3rp906Yzeu65H5Qr1+dyc5ugokW/1pAhW2Wx/G8t+bVr8erd+1cFBU2Th8dEPfHELM2ceeCeX8/bta1de1K1as2Xp+cklSgxXV98sdemX1xckoYP36bg4Jny8JioEiWm65NPdtgcv06d+WrbdqVatvxR2bJNUoMGi+557LSYPfugPDwmasuWswoJ+VYeHhNVqtQMhYcf119/RatevYXy8pqk4sWna/78Izb7/v33FbVs+aMCAqYqW7ZJevbZBdq69ZxNn5iYeL377joVKPClsmWbpJCQb7Vy5XHrdrPZoqlT96hcudny9JykwMCv1L//JsXFJdmMs3nzWdWuPV9eXpPk6zs51bywWJI1cOAWFS36tdzdJ6po0a/14YeblJhofuivEQDg0SBgAwCyHCcnk+rVC9SOHRdSrcWWpK1bz6ldu1Vq0aKEVq9urokTn9W6daf16qsrJElTp9ZXpUp5VKlSHm3b1sbmEuWhQ3/T+PF1NGVKPT311J3PrC5YcER//BGpOXMaKSystlauPKEXXlgsszl1LWm1YsVxNW68RHnzZtOiRU31ySe1tGTJ39aa9+2LVL16i+Tv76kFC5ooPDxUNWsW1NCh27Rw4V+SUtYKP/PM95o797D69auqH39sppo1C6pz5zUaNWr7fWt45ZVwVa6cR8uWvaQGDQrrnXd+sYbs5ORkNWmyVGPH/q4uXcopPDxUrVqV0oABW/TWWz//5+vzl3LkcNPy5aHq1y/knsc0m5OVlGRJ9e+/X8vERItefXWlunatoOXLQ+Xl5arXXlupxo2X6MUXiyk8PFT582dXhw6rdfbsdUnSoUOX9eST3+rkyRh9/nk9zZvXWCaT9OyzC7Rx45n/P75FDRv+oLlzD+ujj6pp+fJQBQf7qVmzH/XbbylBvGvXn9Wr168KDS2u5cubqXv3Svr88z/00kvLrDfK27TpzP8HfRctXNhEkyY9qw0bzujZZxcoNjZRkvTJJ79r6tS9GjSohtaubam3366oceN2asSI+39vAACZA5eIAwCypICAbEpMtCgqKlZ589quY968+ay8vFz1wQdV5e6e8qvQz89DO3dGKDk5WWXK+MvbO2X97H8vT37nnYpq2bLUPY/t7++pNWtaKFu2lDFy5/ZSs2bLtHr1CTVuHJSu1zNkyG+qWDGPFi9uKpMpZV25m5uzBg3aqsuXb2n//ktq0KCwvv32Beu68wYNimj58uPasOGMWrcO1uzZf+rgwcv67bc2qlEj5XU991xRJSZaNHz4dr31VgX5+nretYbmzUto0qS61v3On7+p4cO36a23Kuinn07ol19O6fvvG6t162Dr8b28XPTxx1vVs2dllS3r//91O+nLLxtYv/b3Ur/+nc9wly3rp4MHO1o/t1iSNWBANXXpUl6SdOVKnFq3XqFevSqrd+8qkqScOd1Vpcp32rUrQgUL5tDQodvk7u6sX399RTlypHyvXnyxmJ54Yrb69t2o339vq9WrT2jHjgtatqyZXnqpuCTp2WcDdfz4Va1ff1o5c7prxowDGj26pvr3r2Z93fnzZ1e7dqu0evUJvfBCMX344WaVKuWrFSuay9k55fxG9er5VKbMLM2ceVDdulXSxo1nVKVKXnXsWE6SVLt2IXl5uShnTo/7fp0AAJkDZ7ABAFnS7Sds/X8WtVG7diHdvJmoJ56YrQ8/3KTNm8+qYcMiGjToKWt4vZuKFfPc99gvvljMGq6llHXhLi5O2rTp7AO9httiYxP1xx8XFRpawqa+V14J1l9/dZa/v5fatSurVataKCHBrP37L2nx4qMaPHirkpIsio9PuVR5w4YzKlLE2xqub2vbtrTi4pK0ffuFe9bRoUNZm89btCihCxdu6ujRK9qw4YxcXJzUqlXJ/4ydsgb+9hlhKeXmZGkJ15L05ZcNtHNn21T/FixokqrvU08VsH6cN2/K+vVq1fJZ2/z8Ut48uHo15bL6DRvOqHHjYtZwLUkuLk5q3bqUdu2K0I0bCdqy5ZxcXZ3UpMn/3hhxcjLpt9/aaODAGtq4MeV7+uqrwTa1tG4dLGdnkzZsOKNbtxK1ffsFvfhiMSUny3oWvlixnCpd2k8//3xKkvTss4X088+nVLPm9xo37ncdOnRZ3btXtn4NAQCZH2ewAQBZ0tmz1+Xp6WINVf9Wo0Z+rVrVXBMm7NaECbs1ZszvypvXSwMGVNe77977juDZs7ve99gBAbZnzJ2cTPL399SVK3EP9iL+X3R0nJKTpTx57n7Ts9jYRL377np9++0hJSaaVbSoj556qoBcXZ2sbzZER8elqu3f9d4OnndToIDtDdRu1xMdHafo6Dj5+3taz87ea+y0fA1vK1Uql6pUCUhT39tXHfxbtmx3P9a9vh7JyVJMTIKiomLl5+dpvSrgTmPc3uffXFyc5O/vqatX43XlSpwslmR98snv+uST31ON4emZ8udY375VlT27m2bOPKgPPtikfv02qWxZP33+eT09+2zg3V84ACDTIGADALKcpCSLNmw4o6efLpAq8N323HNF9dxzRXXrVqLWrz+tTz/9Qz16rFf16vkUEpLvjvuk1e3QdZvZbNHly7HWQGoymVKtDb9xI/Gu4/n4uMtkks0NsSTp1q1Ebdp0VtWq5dMHH2zSDz8c1cKFTVS/fqD1DHqePFOs/X19PXTs2JVU41+4cFNSyqXt93L5cqzNXdMvXrz1/8fwkq+vhy5fjpXZbLH5ml+4cCNNY9uDr6+HIiJupmq//fXw8/NQzpzuioqKVXJyss3VA3/8cVEmU8oYkhQRcVOFC/tYtycmmnX5cqz8/T3l7Z3y/XvvvSf16qulUx3PyyvlzzEnJ5O6daukbt0qKTLyplatOqGRI7erefMfdfHiO3Jzczb09QMAjMcl4gCALOerr/bpwoW
      "text/plain": [
       "<Figure size 1000x600 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "grafico(4)\n",
    "plt.figure(figsize=(10, 6), dpi=100)\n",
    "sns.countplot(x=\"emotion\", data=dataset3,order= dataset3['emotion'].value_counts().index,palette=\"husl\")\n",
    "plt.xlabel('Emoçoes', color=\"Darkblue\")\n",
    "plt.xticks(rotation=90)\n",
    "plt.ylabel('Frequência', color='Darkblue')\n",
    "plt.title('Distribuição por Emoções', color='Darkblue')\n",
    "for i, value in enumerate(dataset3['emotion'].value_counts()):\n",
    "    plt.text(i, value+250, str(value),ha='center',color='Darkblue')\n",
    "plt.tight_layout()\n",
    "plt.show() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Text</th>\n",
       "      <th>emotion</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>That game hurt.</td>\n",
       "      <td>Sadness</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Man I love reddit.</td>\n",
       "      <td>Disgust</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>So happy for [NAME]. So sad he's not here. Ima...</td>\n",
       "      <td>Sadness</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>I just came home, what the fuck is this lineup...</td>\n",
       "      <td>Disgust</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>By far the coolest thing I've seen on this thr...</td>\n",
       "      <td>Happiness</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38253</th>\n",
       "      <td>I just called the Capitol Police. They are not...</td>\n",
       "      <td>Anger</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38254</th>\n",
       "      <td>What a great photo and you two look so happy. 😍</td>\n",
       "      <td>Happiness</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38255</th>\n",
       "      <td>Well, I'm glad you're out of all that now. How...</td>\n",
       "      <td>Happiness</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38256</th>\n",
       "      <td>Everyone likes [NAME].</td>\n",
       "      <td>Disgust</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38257</th>\n",
       "      <td>The FDA has plenty to criticize. But like here...</td>\n",
       "      <td>Anger</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>38258 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    Text    emotion\n",
       "0                                        That game hurt.    Sadness\n",
       "1                                     Man I love reddit.    Disgust\n",
       "2      So happy for [NAME]. So sad he's not here. Ima...    Sadness\n",
       "3      I just came home, what the fuck is this lineup...    Disgust\n",
       "4      By far the coolest thing I've seen on this thr...  Happiness\n",
       "...                                                  ...        ...\n",
       "38253  I just called the Capitol Police. They are not...      Anger\n",
       "38254    What a great photo and you two look so happy. 😍  Happiness\n",
       "38255  Well, I'm glad you're out of all that now. How...  Happiness\n",
       "38256                             Everyone likes [NAME].    Disgust\n",
       "38257  The FDA has plenty to criticize. But like here...      Anger\n",
       "\n",
       "[38258 rows x 2 columns]"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset3 = dataset3.dropna()\n",
    "dataset3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Text</th>\n",
       "      <th>emotion</th>\n",
       "      <th>emotion_label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>That game hurt.</td>\n",
       "      <td>Sadness</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Man I love reddit.</td>\n",
       "      <td>Disgust</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>So happy for [NAME]. So sad he's not here. Ima...</td>\n",
       "      <td>Sadness</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>I just came home, what the fuck is this lineup...</td>\n",
       "      <td>Disgust</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>By far the coolest thing I've seen on this thr...</td>\n",
       "      <td>Happiness</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38253</th>\n",
       "      <td>I just called the Capitol Police. They are not...</td>\n",
       "      <td>Anger</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38254</th>\n",
       "      <td>What a great photo and you two look so happy. 😍</td>\n",
       "      <td>Happiness</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38255</th>\n",
       "      <td>Well, I'm glad you're out of all that now. How...</td>\n",
       "      <td>Happiness</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38256</th>\n",
       "      <td>Everyone likes [NAME].</td>\n",
       "      <td>Disgust</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38257</th>\n",
       "      <td>The FDA has plenty to criticize. But like here...</td>\n",
       "      <td>Anger</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>38258 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    Text    emotion  \\\n",
       "0                                        That game hurt.    Sadness   \n",
       "1                                     Man I love reddit.    Disgust   \n",
       "2      So happy for [NAME]. So sad he's not here. Ima...    Sadness   \n",
       "3      I just came home, what the fuck is this lineup...    Disgust   \n",
       "4      By far the coolest thing I've seen on this thr...  Happiness   \n",
       "...                                                  ...        ...   \n",
       "38253  I just called the Capitol Police. They are not...      Anger   \n",
       "38254    What a great photo and you two look so happy. 😍  Happiness   \n",
       "38255  Well, I'm glad you're out of all that now. How...  Happiness   \n",
       "38256                             Everyone likes [NAME].    Disgust   \n",
       "38257  The FDA has plenty to criticize. But like here...      Anger   \n",
       "\n",
       "       emotion_label  \n",
       "0                  4  \n",
       "1                  2  \n",
       "2                  4  \n",
       "3                  2  \n",
       "4                  1  \n",
       "...              ...  \n",
       "38253              0  \n",
       "38254              1  \n",
       "38255              1  \n",
       "38256              2  \n",
       "38257              0  \n",
       "\n",
       "[38258 rows x 3 columns]"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "emotion_mapping = {\"Anger\":0,\"Happiness\":1,\"Disgust\":2,\"Fear\":3, \"Sadness\":4,\"Surprise\":5}\n",
    "dataset3[\"emotion_label\"] = dataset3[\"emotion\"].map(emotion_mapping)\n",
    "dataset3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset3['TextoLema']=preprocess_text_column(dataset3, 'Text')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Text</th>\n",
       "      <th>emotion</th>\n",
       "      <th>emotion_label</th>\n",
       "      <th>TextoLema</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>That game hurt.</td>\n",
       "      <td>Sadness</td>\n",
       "      <td>4</td>\n",
       "      <td>game hurt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Man I love reddit.</td>\n",
       "      <td>Disgust</td>\n",
       "      <td>2</td>\n",
       "      <td>man love reddit</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>So happy for [NAME]. So sad he's not here. Ima...</td>\n",
       "      <td>Sadness</td>\n",
       "      <td>4</td>\n",
       "      <td>happy sad s imagine team instead ugh</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>I just came home, what the fuck is this lineup...</td>\n",
       "      <td>Disgust</td>\n",
       "      <td>2</td>\n",
       "      <td>come home fuck lineup love mad bastard</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>By far the coolest thing I've seen on this thr...</td>\n",
       "      <td>Happiness</td>\n",
       "      <td>1</td>\n",
       "      <td>far cool thing ve see thread</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38253</th>\n",
       "      <td>I just called the Capitol Police. They are not...</td>\n",
       "      <td>Anger</td>\n",
       "      <td>0</td>\n",
       "      <td>call capitol police affect shutdown fuck shit</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38254</th>\n",
       "      <td>What a great photo and you two look so happy. 😍</td>\n",
       "      <td>Happiness</td>\n",
       "      <td>1</td>\n",
       "      <td>great photo look happy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38255</th>\n",
       "      <td>Well, I'm glad you're out of all that now. How...</td>\n",
       "      <td>Happiness</td>\n",
       "      <td>1</td>\n",
       "      <td>glad awful way act think healthy boundary hostile</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38256</th>\n",
       "      <td>Everyone likes [NAME].</td>\n",
       "      <td>Disgust</td>\n",
       "      <td>2</td>\n",
       "      <td>like</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38257</th>\n",
       "      <td>The FDA has plenty to criticize. But like here...</td>\n",
       "      <td>Anger</td>\n",
       "      <td>0</td>\n",
       "      <td>fda plenty criticize like usually criticize ho...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>38258 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    Text    emotion  \\\n",
       "0                                        That game hurt.    Sadness   \n",
       "1                                     Man I love reddit.    Disgust   \n",
       "2      So happy for [NAME]. So sad he's not here. Ima...    Sadness   \n",
       "3      I just came home, what the fuck is this lineup...    Disgust   \n",
       "4      By far the coolest thing I've seen on this thr...  Happiness   \n",
       "...                                                  ...        ...   \n",
       "38253  I just called the Capitol Police. They are not...      Anger   \n",
       "38254    What a great photo and you two look so happy. 😍  Happiness   \n",
       "38255  Well, I'm glad you're out of all that now. How...  Happiness   \n",
       "38256                             Everyone likes [NAME].    Disgust   \n",
       "38257  The FDA has plenty to criticize. But like here...      Anger   \n",
       "\n",
       "       emotion_label                                          TextoLema  \n",
       "0                  4                                          game hurt  \n",
       "1                  2                                    man love reddit  \n",
       "2                  4               happy sad s imagine team instead ugh  \n",
       "3                  2             come home fuck lineup love mad bastard  \n",
       "4                  1                       far cool thing ve see thread  \n",
       "...              ...                                                ...  \n",
       "38253              0      call capitol police affect shutdown fuck shit  \n",
       "38254              1                             great photo look happy  \n",
       "38255              1  glad awful way act think healthy boundary hostile  \n",
       "38256              2                                               like  \n",
       "38257              0  fda plenty criticize like usually criticize ho...  \n",
       "\n",
       "[38258 rows x 4 columns]"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset3 = dataset3.dropna(subset=['TextoLema'])\n",
    "#dataset3 = dataset3[dataset3['TextoLema'].str.split().str.len() > 2]\n",
    "dataset3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset3.to_excel(\"C:\\\\Users\\\\garci\\\\OneDrive\\\\Área de Trabalho\\\\Programa PJM\\\\Programa final\\\\Dataset\\\\GoemotionsEN.xlsx\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## PREPARAÇÃO DOS DADOS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Text</th>\n",
       "      <th>emotion</th>\n",
       "      <th>emotion_label</th>\n",
       "      <th>TextoLema</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>That game hurt.</td>\n",
       "      <td>Sadness</td>\n",
       "      <td>4</td>\n",
       "      <td>game hurt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Man I love reddit.</td>\n",
       "      <td>Disgust</td>\n",
       "      <td>2</td>\n",
       "      <td>man love reddit</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>So happy for [NAME]. So sad he's not here. Ima...</td>\n",
       "      <td>Sadness</td>\n",
       "      <td>4</td>\n",
       "      <td>happy sad s imagine team instead ugh</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>I just came home, what the fuck is this lineup...</td>\n",
       "      <td>Disgust</td>\n",
       "      <td>2</td>\n",
       "      <td>come home fuck lineup love mad bastard</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>By far the coolest thing I've seen on this thr...</td>\n",
       "      <td>Happiness</td>\n",
       "      <td>1</td>\n",
       "      <td>far cool thing ve see thread</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38253</th>\n",
       "      <td>I just called the Capitol Police. They are not...</td>\n",
       "      <td>Anger</td>\n",
       "      <td>0</td>\n",
       "      <td>call capitol police affect shutdown fuck shit</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38254</th>\n",
       "      <td>What a great photo and you two look so happy. 😍</td>\n",
       "      <td>Happiness</td>\n",
       "      <td>1</td>\n",
       "      <td>great photo look happy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38255</th>\n",
       "      <td>Well, I'm glad you're out of all that now. How...</td>\n",
       "      <td>Happiness</td>\n",
       "      <td>1</td>\n",
       "      <td>glad awful way act think healthy boundary hostile</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38256</th>\n",
       "      <td>Everyone likes [NAME].</td>\n",
       "      <td>Disgust</td>\n",
       "      <td>2</td>\n",
       "      <td>like</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38257</th>\n",
       "      <td>The FDA has plenty to criticize. But like here...</td>\n",
       "      <td>Anger</td>\n",
       "      <td>0</td>\n",
       "      <td>fda plenty criticize like usually criticize ho...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>38258 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    Text    emotion  \\\n",
       "0                                        That game hurt.    Sadness   \n",
       "1                                     Man I love reddit.    Disgust   \n",
       "2      So happy for [NAME]. So sad he's not here. Ima...    Sadness   \n",
       "3      I just came home, what the fuck is this lineup...    Disgust   \n",
       "4      By far the coolest thing I've seen on this thr...  Happiness   \n",
       "...                                                  ...        ...   \n",
       "38253  I just called the Capitol Police. They are not...      Anger   \n",
       "38254    What a great photo and you two look so happy. 😍  Happiness   \n",
       "38255  Well, I'm glad you're out of all that now. How...  Happiness   \n",
       "38256                             Everyone likes [NAME].    Disgust   \n",
       "38257  The FDA has plenty to criticize. But like here...      Anger   \n",
       "\n",
       "       emotion_label                                          TextoLema  \n",
       "0                  4                                          game hurt  \n",
       "1                  2                                    man love reddit  \n",
       "2                  4               happy sad s imagine team instead ugh  \n",
       "3                  2             come home fuck lineup love mad bastard  \n",
       "4                  1                       far cool thing ve see thread  \n",
       "...              ...                                                ...  \n",
       "38253              0      call capitol police affect shutdown fuck shit  \n",
       "38254              1                             great photo look happy  \n",
       "38255              1  glad awful way act think healthy boundary hostile  \n",
       "38256              2                                               like  \n",
       "38257              0  fda plenty criticize like usually criticize ho...  \n",
       "\n",
       "[38258 rows x 4 columns]"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset3=pd.read_excel(\"C:\\\\Users\\\\garci\\\\OneDrive\\\\Área de Trabalho\\\\Programa PJM\\\\Programa final\\\\Dataset\\\\GoemotionsEN.xlsx\")\n",
    "dataset3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "135"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset3['TextoLema'].isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset3 = dataset3.dropna(subset=['TextoLema'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "X=dataset3['TextoLema']\n",
    "y=dataset3['emotion_label']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "vectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ddafd85f3262424b9f8c4e242a293b68",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/192 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Melhores parâmetros do CountVectorizer (menor esparsidade): (500, 1, 0.5, (1, 3))\n",
      "Esparsidade: 98.23%\n"
     ]
    }
   ],
   "source": [
    "vectorizer_params = {'max_features': [500, 1000, 2000, 5000],'min_df': [1, 2, 3, 5],'max_df': [0.5, 0.6, 0.7, 0.8],'ngram_range': [(1, 1), (1, 2), (1, 3)]}\n",
    "combinations = list(product(vectorizer_params['max_features'], vectorizer_params['min_df'], vectorizer_params['max_df'], vectorizer_params['ngram_range']))\n",
    "best_sparsity = float('inf')\n",
    "best_params = {}\n",
    "with tqdm(total=len(combinations)) as pbar:\n",
    "    for combo in combinations:\n",
    "        max_features, min_df, max_df, ngram_range = combo\n",
    "        vectorizer = CountVectorizer(max_features=max_features,min_df=min_df, max_df=max_df,ngram_range=ngram_range)\n",
    "        X_vec = vectorizer.fit_transform(dataset3['Text'])\n",
    "        sparsity = 100 * (1 - X_vec.nnz / (X_vec.shape[0] * X_vec.shape[1]))\n",
    "        if sparsity < best_sparsity:\n",
    "            best_sparsity = sparsity\n",
    "            best_params = combo\n",
    "        pbar.update(1)\n",
    "print(f\"Melhores parâmetros do CountVectorizer (menor esparsidade): {best_params}\")\n",
    "print(f\"Esparsidade: {best_sparsity:.2f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Modelos//vectorizerVF.joblib']"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vectorizer = CountVectorizer(max_features = 500, min_df = 1, max_df = 0.5,ngram_range=(1,3))\n",
    "X = vectorizer.fit_transform(dataset3['TextoLema']).toarray()\n",
    "joblib.dump(vectorizer, 'Modelos//vectorizerVF.joblib')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "treino 60%, val=20% e teste=20%"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)\n",
    "X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## REGRESSÃO LOGISTICA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "\n",
    "undersample = RandomUnderSampler(random_state=42)\n",
    "X_train_balanced, y_train_balanced = undersample.fit_resample(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f1578f07fc5643c9b02b67e18e2006f2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/84 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Melhores Parâmetros: {'C': 0.001, 'class_weight': None, 'max_iter': 500, 'solver': 'liblinear'}\n",
      "Melhor accuracy: 0.6463089802130898\n"
     ]
    }
   ],
   "source": [
    "param_grid = {\n",
    "    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],\n",
    "    'solver': ['lbfgs', 'liblinear', 'saga'],\n",
    "    'class_weight': [None, 'balanced'],\n",
    "    'max_iter': [500, 1000]\n",
    "}\n",
    "scaler = StandardScaler()\n",
    "X_train_scaled = scaler.fit_transform(X_train_balanced)\n",
    "grid_search = GridSearchCV(estimator=LogisticRegression(random_state=42), param_grid=param_grid,scoring='accuracy',cv=3, n_jobs=-1)\n",
    "n_combinations = len(list(product(*param_grid.values()))) \n",
    "with tqdm(total=n_combinations) as pbar:   \n",
    "    grid_search.fit(X_train_scaled, y_train_balanced)\n",
    "    pbar.update(n_combinations) \n",
    "print(\"Melhores Parâmetros:\", grid_search.best_params_)\n",
    "print(\"Melhor accuracy:\", grid_search.best_score_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Melhores Parâmetros: {'C': 0.001, 'class_weight': None, 'max_iter': 500, 'solver': 'liblinear'}\n",
    "\n",
    "Melhor accuracy: 0.6601487361454299"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier = LogisticRegression(C=0.001,solver='liblinear',max_iter=500, class_weight=None, random_state = 42)\n",
    "classifier.fit(X_train_balanced,y_train_balanced)\n",
    "y_pred = classifier.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Modelos//logistic_regression_model.joblib']"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "joblib.dump(classifier, 'Modelos//logistic_regression_model.joblib')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAgoAAAHJCAYAAADkVRHSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACx5klEQVR4nOzdd1yV1R/A8c+97I1sEBEEcc8cuXKV5irN0jQt907L1DTNbZp75fq5co+0TM2VI/fe4kJRUECG7A33+f1xE7sBhXIBje/79bov9HnOPc/33Pl9zjnPuSpFURSEEEIIIbKhLuwAhBBCCPHqkkRBCCGEEDmSREEIIYQQOZJEQQghhBA5kkRBCCGEEDmSREEIIYQQOZJEQQghhBA5kkRBCCGEEDmSROE/RNbOEq8ieV0K8XqTRKEANWq0CZVqJnXrbsixzMcf70Slmkm3bnteqO4TJx7TqtX2fy03fvwJVKqZL1R3bhw5EohKNZMjRwL1Ul90dDITJ56kcuXVWFnNw8npB5o23cLOnff0Un92rl0Lp1q1NZiYzKF8+ZV6qzclJZ2mTbdgZDQbA4NZGBrOwspqHnfuPM1z3d267cHTc5keoswdlWom48efyHX55cuvMmzYkcz/r159HZVqJg8exLx0DN267UGlmpnlZmk5j0qVVjN79vmXrvtV9qKPvRD6YljYARQ1arWK06eDefQoDnd3K519CQmpL/1F+L//XcXPL/Jfy/XqVZl33/V6qWMUlFu3ImnRYhsZGQpDhlSnShUnEhLSWL/ej/fe+5lJk+oxZkwdvR93woSTPHwYy88/v4+Tk7ne6lWpVMyb15jExHQMDdWYmBjg4WGNlZWx3o5RUE6d6pzldftPJk8+TaNGJTL/36pVKU6d6oyrq0We4nBxseDnn9/P/L+iQGhoAkuXXuGrr45gZmZI//5V83SMV82LPvZC6IskCgWsenUnbtyIZOvW23z5ZQ2dfTt33sfCwohixUzz7fju7lav9IdNWloGHTrsxMjIgDNnPsbJ6fkXyvvv+9Cnz36+/fYEbdp4U6WKk16PHRmZTKVKDrRsWUqv9RobG1CxoqNe6ywsb77plqf7Ozqa4+iY9yTMxMQg21haty5FqVLLWbXq+n8uUcjrYy/Ey5KhhwJmYWFEq1al2Lr1TpZ9mzff4sMPfTE01H1aIiISGTjwd0qWXIqx8Wzs7BbSrt0vmd233brt4ccfb/DwYSwq1UxWr77OgwcxqFQzmT37PGXLrsTcfC6rVl3TGXp4Via72791Zy9degVf3xWYmc2lYcNNPHwYm6VMYGAsnTrtws5uIebmc2nadAuXLj35x3p3777PtWsRTJ5cTydJeGbixHoMGlSN9HRN5rbz50N5992fsLdfiLX1fNq02c6NGxGZ+58Nixw8+JBmzbZibj4XF5dFfP31H2RkaOvRDpsEcfToo8zHMKdhmr93AW/ceJMqVX7EzGwujo4/0KXLboKD4zP3JyWlMWrUUUqXXo6JyRysrefzzjtbuXw5TKfeAwce0KDBRmxs5mNvv5DOnXcRFJT1cX0Z//YYgbYnp2XLbVhbz8fZeRGjRx+jR4+9NGq0Kce2z5t3gbJlV2JqOofixZcwYMABYmNTAPD0XMbDh7H8+OONzOGG7IYefvvtPvXqbcDCYi5ubovp1+8A0dHJL9VOIyMDLCyMUKl0ty9ffpUKFVZhYjIHD4+ljB9/IvO5f+bHH69Tvry2LVWq/MjBgw8xNJzF6tXXAe2wiaHhLJYvv4qLyyLs7Bbi56d9DHfs8KdGjbWYms7BxWURQ4YcIiEhNbPupKQ0Bgw4gLv7EkxM5lC27Epmzjync/x/eiyze+xDQuLp0WMvJUosxcxsLrVqrePXX/116lSpZrJo0SV69dqHnd1CrKzm0aHDrzx5kvBSj68omiRRKAQdO5bJHH54JjY2hT17AujUqZxOWUVRaNVqO/v3P+D77xuyf/+HjB9fl4MHA+nX7wAA335bh5YtvXBxseDUqc60avX8jHj8+JN8/XVN1q5tyTvveOrU7eqqLf/X29SpDQDo1atSjvEvXHiRfv0O0KpVKXbsaMubb7rSp88BnTIREYnUrbuBCxeesHBhUzZubI1Go/DWW5u4eTPnIZK9ex9gYKDK8azexcWCBQua8sYbLgAcPhxI3bobUBRYtepdli9vRlBQHHXrbuDWLd3jfPLJbho0cGfXrg/o3Lkc06efY/nya4C2W7daNSeqVXPK8hj+kxMnHtO162+0b1+aPXs+YM6cxhw8GEinTrsyy3z66R5WrrzOqFG12b//Q2bPbsSNGxF07rwrc6Lf2rU3aNbsJ0qUsGLjxtbMmdOYU6eCqVNnA2FheftQz81jFBGRyFtvbSIwMJZVq95l/vwm/PTTHTZsuJljvRs33mTEiKMMHFiVffs+ZOzYOqxd68fnnx8E4Oef38fFxYKWLb1yHG7YteserVtvx8nJnC1b2vD992/x88936dhxV5ayf5eersm8paSk8+BBDEOHHub27ad8+mmFzHJTp56hT5/9vP12SXbubMegQdX4/vtz9OmzP7PMmjU36NZtL/XqFWfHjrZ8+KEvbdv+QkaG7kTMjAyFWbPOs2JFc+bMaUS5cvZs2HCTtm1/oWxZO375pS3jx9dl7Vo/3n//l8zn94svDrNnTwAzZzZi3772vP++N8OH/8GqVddy9Vj+3ZMnCdSsuY6jRx/x3Xf12bbtPTw9rWnb9hfWr/fTKfvNN8fJyNCwaVNrZsxoyM6d9/nii8P/+vgK8YwMPRSCVq1KYWFhpDP88PPPd3FyMqd+/eI6ZUNCErCwMGLWrEbUr+8OQKNGHvj7R7Fs2VUAvL1tcXQ01+mOTUjQnrF16FCG7t2z/9I3MTHU6c68dy+amTPP88EHpRk9+s1s76MoCpMmnaZjxzLMmdMYgGbNPImNTWXJkiuZ5ebMuUBkZDInTnSiZEkbAFq08KJcuZWMHXuCrVvfy7b+oKBYHBzMsLTM3fj9yJFHKV26GL/99gEGBurMeLy9lzN27Am2bHl+nN69K/Ptt9q5DU2aePDLL/7s2nWPvn2r8Oabblhba4/5Il28x449wtzciK+/roWJifbtZG9vyrlzoSiKQlqahri4VBYsaEKHDmUBaNiwBLGxqXz11RGePEnEycmcESOO0ry5Jxs2tM6su1694pQvv4qZM88zfXrDXMf0Mo/R/PkXiYtL4/Llj3Bzs/zzcXDF1zfnSZ1//PEILy8bBg6shlqtomHDElhaGvH0qbY3oFo1Z0xMDHB0NM/xMR0//iRVqzqxffv7qP7sBjA2NmDs2BM8eZKAs3P2cxkePozFyGh2lu2lSxdj0aK36devCgAxMSlMmnSKvn2rMG9ek8y229ub0avXPoYOrUGFCg58++1x2rTx5n//aw5A8+ZeGBmpGTXqWJZjjB79Jq1aeQPa98PXXx/l3Xc9WbeulU4cb7+9ld9+u0+rVt788ccj3nnHk48/1r4GGjXywNLSOHMuzL89ln83e/Z5wsOTuHOnR+b7q2XLUrz99haGDfuDTp3KoVZrH89KlRxYtapF5n3Png1l69bb2dYrRHakR6EQmJkZ0aaNt87ww6ZNt+jYsUzmh+Uzbm6WHDrUkXr1ivPgQQwHDjxgwYKLnDgRTEpKxr8eq2rV3I2Nx8am8N57P+PiYs6PP7bIEsczt28/JSwskTZtvHW2d+hQRuf/Bw8GUrWqI8WLW2We9anVKlq08OLAgYc5xmFoqM5yFpeThIRUzp0LpUOHMplfgAC2tqa0aePNkSNBOuXr1NH9snJ3tyIhIS1Xx8pJw4YlSEhIo2LF1YwadZRjxx7RrJknY8fWRaVSYWxswN69H9KhQ1keP47j8OFAli69wq5d2kmrKSnp3L79lNDQBDp1KqtTt7e3LXXquGZpx4vI7WN06FAQdeu6ZSYJACVL2lC3bs5JU+PGJbh9+ylvvLGWiRNPcv58KJ07l+Pzz6vnKrakpDQuXnxCu3aldV5vHTuW5fbtnjkmCaDtDTt3rgvnznVh//4PadDAHTc3S378sQX9+1fNrO/UqWC
      "text/plain": [
       "<Figure size 640x480 with 2 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "cm = confusion_matrix(y_test, y_pred)\n",
    "CMatrix = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classifier.classes_)\n",
    "CMatrix.plot()\n",
    "plt.title('Matriz de Confusão Logistic Regression', color=\"darkblue\")\n",
    "plt.xlabel('Previsões', color=\"darkblue\")\n",
    "plt.ylabel('Real', color=\"darkblue\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.68      0.49      0.57      1565\n",
      "           1       0.73      0.55      0.63      1578\n",
      "           2       0.77      0.78      0.77      1534\n",
      "           3       0.26      0.59      0.36       584\n",
      "           4       0.65      0.55      0.60      1349\n",
      "           5       0.46      0.58      0.51      1015\n",
      "\n",
      "    accuracy                           0.59      7625\n",
      "   macro avg       0.59      0.59      0.57      7625\n",
      "weighted avg       0.64      0.59      0.60      7625\n",
      "\n"
     ]
    }
   ],
   "source": [
    "classes = np.unique(y)\n",
    "print(classification_report(y_test.tolist(), y_pred.tolist(), labels=classes.tolist()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "75d04b694d4247f2a8297581f2b64cc9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/13122 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[101], line 17\u001b[0m\n\u001b[0;32m     15\u001b[0m n_combinations \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mlist\u001b[39m(product(\u001b[38;5;241m*\u001b[39mparam_grid\u001b[38;5;241m.\u001b[39mvalues()))) \n\u001b[0;32m     16\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m tqdm(total\u001b[38;5;241m=\u001b[39mn_combinations) \u001b[38;5;28;01mas\u001b[39;00m pbar:   \n\u001b[1;32m---> 17\u001b[0m     \u001b[43mgrid_search\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train_scaled\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train_balanced\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     18\u001b[0m     pbar\u001b[38;5;241m.\u001b[39mupdate(n_combinations)\n\u001b[0;32m     19\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMelhores Parâmetros:\u001b[39m\u001b[38;5;124m\"\u001b[39m, grid_search\u001b[38;5;241m.\u001b[39mbest_params_)\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m   1466\u001b[0m     estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m   1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m   1469\u001b[0m     skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m   1470\u001b[0m         prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m   1471\u001b[0m     )\n\u001b[0;32m   1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\model_selection\\_search.py:1019\u001b[0m, in \u001b[0;36mBaseSearchCV.fit\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m   1013\u001b[0m     results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_format_results(\n\u001b[0;32m   1014\u001b[0m         all_candidate_params, n_splits, all_out, all_more_results\n\u001b[0;32m   1015\u001b[0m     )\n\u001b[0;32m   1017\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m results\n\u001b[1;32m-> 1019\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_search\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevaluate_candidates\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1021\u001b[0m \u001b[38;5;66;03m# multimetric is determined here because in the case of a callable\u001b[39;00m\n\u001b[0;32m   1022\u001b[0m \u001b[38;5;66;03m# self.scoring the return type is only known after calling\u001b[39;00m\n\u001b[0;32m   1023\u001b[0m first_test_score \u001b[38;5;241m=\u001b[39m all_out[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_scores\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\model_selection\\_search.py:1573\u001b[0m, in \u001b[0;36mGridSearchCV._run_search\u001b[1;34m(self, evaluate_candidates)\u001b[0m\n\u001b[0;32m   1571\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_run_search\u001b[39m(\u001b[38;5;28mself\u001b[39m, evaluate_candidates):\n\u001b[0;32m   1572\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"Search all candidates in param_grid\"\"\"\u001b[39;00m\n\u001b[1;32m-> 1573\u001b[0m     \u001b[43mevaluate_candidates\u001b[49m\u001b[43m(\u001b[49m\u001b[43mParameterGrid\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparam_grid\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\model_selection\\_search.py:965\u001b[0m, in \u001b[0;36mBaseSearchCV.fit.<locals>.evaluate_candidates\u001b[1;34m(candidate_params, cv, more_results)\u001b[0m\n\u001b[0;32m    957\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mverbose \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m    958\u001b[0m     \u001b[38;5;28mprint\u001b[39m(\n\u001b[0;32m    959\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFitting \u001b[39m\u001b[38;5;132;01m{0}\u001b[39;00m\u001b[38;5;124m folds for each of \u001b[39m\u001b[38;5;132;01m{1}\u001b[39;00m\u001b[38;5;124m candidates,\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m    960\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m totalling \u001b[39m\u001b[38;5;132;01m{2}\u001b[39;00m\u001b[38;5;124m fits\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[0;32m    961\u001b[0m             n_splits, n_candidates, n_candidates \u001b[38;5;241m*\u001b[39m n_splits\n\u001b[0;32m    962\u001b[0m         )\n\u001b[0;32m    963\u001b[0m     )\n\u001b[1;32m--> 965\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mparallel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    966\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_fit_and_score\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    967\u001b[0m \u001b[43m        \u001b[49m\u001b[43mclone\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbase_estimator\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    968\u001b[0m \u001b[43m        \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    969\u001b[0m \u001b[43m        \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    970\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtrain\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    971\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtest\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtest\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    972\u001b[0m \u001b[43m        \u001b[49m\u001b[43mparameters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparameters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    973\u001b[0m \u001b[43m        \u001b[49m\u001b[43msplit_progress\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msplit_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_splits\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    974\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcandidate_progress\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcand_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_candidates\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    975\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfit_and_score_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    976\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    977\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mcand_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43msplit_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mproduct\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    978\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;28;43menumerate\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcandidate_params\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\utils\\parallel.py:74\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m     69\u001b[0m config \u001b[38;5;241m=\u001b[39m get_config()\n\u001b[0;32m     70\u001b[0m iterable_with_config \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m     71\u001b[0m     (_with_config(delayed_func, config), args, kwargs)\n\u001b[0;32m     72\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m delayed_func, args, kwargs \u001b[38;5;129;01min\u001b[39;00m iterable\n\u001b[0;32m     73\u001b[0m )\n\u001b[1;32m---> 74\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43miterable_with_config\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\joblib\\parallel.py:2007\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m   2001\u001b[0m \u001b[38;5;66;03m# The first item from the output is blank, but it makes the interpreter\u001b[39;00m\n\u001b[0;32m   2002\u001b[0m \u001b[38;5;66;03m# progress until it enters the Try/Except block of the generator and\u001b[39;00m\n\u001b[0;32m   2003\u001b[0m \u001b[38;5;66;03m# reaches the first `yield` statement. This starts the asynchronous\u001b[39;00m\n\u001b[0;32m   2004\u001b[0m \u001b[38;5;66;03m# dispatch of the tasks to the workers.\u001b[39;00m\n\u001b[0;32m   2005\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 2007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\joblib\\parallel.py:1650\u001b[0m, in \u001b[0;36mParallel._get_outputs\u001b[1;34m(self, iterator, pre_dispatch)\u001b[0m\n\u001b[0;32m   1647\u001b[0m     \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[0;32m   1649\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backend\u001b[38;5;241m.\u001b[39mretrieval_context():\n\u001b[1;32m-> 1650\u001b[0m         \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_retrieve()\n\u001b[0;32m   1652\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mGeneratorExit\u001b[39;00m:\n\u001b[0;32m   1653\u001b[0m     \u001b[38;5;66;03m# The generator has been garbage collected before being fully\u001b[39;00m\n\u001b[0;32m   1654\u001b[0m     \u001b[38;5;66;03m# consumed. This aborts the remaining tasks if possible and warn\u001b[39;00m\n\u001b[0;32m   1655\u001b[0m     \u001b[38;5;66;03m# the user if necessary.\u001b[39;00m\n\u001b[0;32m   1656\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\joblib\\parallel.py:1762\u001b[0m, in \u001b[0;36mParallel._retrieve\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m   1757\u001b[0m \u001b[38;5;66;03m# If the next job is not ready for retrieval yet, we just wait for\u001b[39;00m\n\u001b[0;32m   1758\u001b[0m \u001b[38;5;66;03m# async callbacks to progress.\u001b[39;00m\n\u001b[0;32m   1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ((\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jobs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m\n\u001b[0;32m   1760\u001b[0m     (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jobs[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mget_status(\n\u001b[0;32m   1761\u001b[0m         timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtimeout) \u001b[38;5;241m==\u001b[39m TASK_PENDING)):\n\u001b[1;32m-> 1762\u001b[0m     \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0.01\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1763\u001b[0m     \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[0;32m   1765\u001b[0m \u001b[38;5;66;03m# We need to be careful: the job list can be filling up as\u001b[39;00m\n\u001b[0;32m   1766\u001b[0m \u001b[38;5;66;03m# we empty it and Python list are not thread-safe by\u001b[39;00m\n\u001b[0;32m   1767\u001b[0m \u001b[38;5;66;03m# default hence the use of the lock\u001b[39;00m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "param_grid = {\n",
    "    'n_estimators': [50, 100, 200],\n",
    "    'learning_rate': [0.01, 0.1, 0.2],\n",
    "    'max_depth': [3, 5, 10],\n",
    "    'subsample': [0.8, 1.0],\n",
    "    'colsample_bytree': [0.6, 0.8, 1.0],\n",
    "    'gamma': [0, 0.1, 0.5],\n",
    "    'min_child_weight': [1, 3, 5],\n",
    "    'reg_alpha': [0, 0.01, 0.1],\n",
    "    'reg_lambda': [1, 1.5, 2]\n",
    "}\n",
    "scaler = StandardScaler()\n",
    "X_train_scaled = scaler.fit_transform(X_train_balanced)\n",
    "grid_search = GridSearchCV(estimator=XGBClassifier(random_state=42, tree_method='gpu_hist'),param_grid=param_grid,scoring='accuracy',cv=2, n_jobs=-1)\n",
    "n_combinations = len(list(product(*param_grid.values()))) \n",
    "with tqdm(total=n_combinations) as pbar:   \n",
    "    grid_search.fit(X_train_scaled, y_train_balanced)\n",
    "    pbar.update(n_combinations)\n",
    "print(\"Melhores Parâmetros:\", grid_search.best_params_)\n",
    "print(\"Melhor accuracy:\", grid_search.best_score_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Melhores Parâmetros: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.8}\n",
    "\n",
    "Melhor accuracy: 0.6592488954270308"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\garci\\anaconda32\\Lib\\site-packages\\xgboost\\core.py:158: UserWarning: [15:50:34] WARNING: C:\\buildkite-agent\\builds\\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\\xgboost\\xgboost-ci-windows\\src\\learner.cc:740: \n",
      "Parameters: { \"gama\" } are not used.\n",
      "\n",
      "  warnings.warn(smsg, UserWarning)\n"
     ]
    }
   ],
   "source": [
    "xgb = XGBClassifier(colsample_bytree=0.8, gama=0.1, learning_rate= 0.1, max_depth=5,min_child_weight=1, n_estimators=100, subsample=0.8, reg_alpha=0, reg_lambda=1)\n",
    "xgb.fit(X_train_scaled,y_train_balanced)\n",
    "previsoesxgb = xgb.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Modelos//logistic_regression_model.joblib']"
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "joblib.dump(xgb, 'Modelos//logistic_regression_model.joblib')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAgoAAAHJCAYAAADkVRHSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACx5klEQVR4nOzdd1yV1R/A8c+97I1sEBEEcc8cuXKV5irN0jQt907L1DTNbZp75fq5co+0TM2VI/fe4kJRUECG7A33+f1xE7sBhXIBje/79bov9HnOPc/33Pl9zjnPuSpFURSEEEIIIbKhLuwAhBBCCPHqkkRBCCGEEDmSREEIIYQQOZJEQQghhBA5kkRBCCGEEDmSREEIIYQQOZJEQQghhBA5kkRBCCGEEDmSROE/RNbOEq8ieV0K8XqTRKEANWq0CZVqJnXrbsixzMcf70Slmkm3bnteqO4TJx7TqtX2fy03fvwJVKqZL1R3bhw5EohKNZMjRwL1Ul90dDITJ56kcuXVWFnNw8npB5o23cLOnff0Un92rl0Lp1q1NZiYzKF8+ZV6qzclJZ2mTbdgZDQbA4NZGBrOwspqHnfuPM1z3d267cHTc5keoswdlWom48efyHX55cuvMmzYkcz/r159HZVqJg8exLx0DN267UGlmpnlZmk5j0qVVjN79vmXrvtV9qKPvRD6YljYARQ1arWK06eDefQoDnd3K519CQmpL/1F+L//XcXPL/Jfy/XqVZl33/V6qWMUlFu3ImnRYhsZGQpDhlSnShUnEhLSWL/ej/fe+5lJk+oxZkwdvR93woSTPHwYy88/v4+Tk7ne6lWpVMyb15jExHQMDdWYmBjg4WGNlZWx3o5RUE6d6pzldftPJk8+TaNGJTL/36pVKU6d6oyrq0We4nBxseDnn9/P/L+iQGhoAkuXXuGrr45gZmZI//5V83SMV82LPvZC6IskCgWsenUnbtyIZOvW23z5ZQ2dfTt33sfCwohixUzz7fju7lav9IdNWloGHTrsxMjIgDNnPsbJ6fkXyvvv+9Cnz36+/fYEbdp4U6WKk16PHRmZTKVKDrRsWUqv9RobG1CxoqNe6ywsb77plqf7Ozqa4+iY9yTMxMQg21haty5FqVLLWbXq+n8uUcjrYy/Ey5KhhwJmYWFEq1al2Lr1TpZ9mzff4sMPfTE01H1aIiISGTjwd0qWXIqx8Wzs7BbSrt0vmd233brt4ccfb/DwYSwq1UxWr77OgwcxqFQzmT37PGXLrsTcfC6rVl3TGXp4Via72791Zy9degVf3xWYmc2lYcNNPHwYm6VMYGAsnTrtws5uIebmc2nadAuXLj35x3p3777PtWsRTJ5cTydJeGbixHoMGlSN9HRN5rbz50N5992fsLdfiLX1fNq02c6NGxGZ+58Nixw8+JBmzbZibj4XF5dFfP31H2RkaOvRDpsEcfToo8zHMKdhmr93AW/ceJMqVX7EzGwujo4/0KXLboKD4zP3JyWlMWrUUUqXXo6JyRysrefzzjtbuXw5TKfeAwce0KDBRmxs5mNvv5DOnXcRFJT1cX0Z//YYgbYnp2XLbVhbz8fZeRGjRx+jR4+9NGq0Kce2z5t3gbJlV2JqOofixZcwYMABYmNTAPD0XMbDh7H8+OONzOGG7IYefvvtPvXqbcDCYi5ubovp1+8A0dHJL9VOIyMDLCyMUKl0ty9ffpUKFVZhYjIHD4+ljB9/IvO5f+bHH69Tvry2LVWq/MjBgw8xNJzF6tXXAe2wiaHhLJYvv4qLyyLs7Bbi56d9DHfs8KdGjbWYms7BxWURQ4YcIiEhNbPupKQ0Bgw4gLv7EkxM5lC27Epmzjync/x/eiyze+xDQuLp0WMvJUosxcxsLrVqrePXX/116lSpZrJo0SV69dqHnd1CrKzm0aHDrzx5kvBSj68omiRRKAQdO5bJHH54JjY2hT17AujUqZxOWUVRaNVqO/v3P+D77xuyf/+HjB9fl4MHA+nX7wAA335bh5YtvXBxseDUqc60avX8jHj8+JN8/XVN1q5tyTvveOrU7eqqLf/X29SpDQDo1atSjvEvXHiRfv0O0KpVKXbsaMubb7rSp88BnTIREYnUrbuBCxeesHBhUzZubI1Go/DWW5u4eTPnIZK9ex9gYKDK8azexcWCBQua8sYbLgAcPhxI3bobUBRYtepdli9vRlBQHHXrbuDWLd3jfPLJbho0cGfXrg/o3Lkc06efY/nya4C2W7daNSeqVXPK8hj+kxMnHtO162+0b1+aPXs+YM6cxhw8GEinTrsyy3z66R5WrrzOqFG12b//Q2bPbsSNGxF07rwrc6Lf2rU3aNbsJ0qUsGLjxtbMmdOYU6eCqVNnA2FheftQz81jFBGRyFtvbSIwMJZVq95l/vwm/PTTHTZsuJljvRs33mTEiKMMHFiVffs+ZOzYOqxd68fnnx8E4Oef38fFxYKWLb1yHG7YteserVtvx8nJnC1b2vD992/x88936dhxV5ayf5eersm8paSk8+BBDEOHHub27ad8+mmFzHJTp56hT5/9vP12SXbubMegQdX4/vtz9OmzP7PMmjU36NZtL/XqFWfHjrZ8+KEvbdv+QkaG7kTMjAyFWbPOs2JFc+bMaUS5cvZs2HCTtm1/oWxZO375pS3jx9dl7Vo/3n//l8zn94svDrNnTwAzZzZi3772vP++N8OH/8GqVddy9Vj+3ZMnCdSsuY6jRx/x3Xf12bbtPTw9rWnb9hfWr/fTKfvNN8fJyNCwaVNrZsxoyM6d9/nii8P/+vgK8YwMPRSCVq1KYWFhpDP88PPPd3FyMqd+/eI6ZUNCErCwMGLWrEbUr+8OQKNGHvj7R7Fs2VUAvL1tcXQ01+mOTUjQnrF16FCG7t2z/9I3MTHU6c68dy+amTPP88EHpRk9+s1s76MoCpMmnaZjxzLMmdMYgGbNPImNTWXJkiuZ5ebMuUBkZDInTnSiZEkbAFq08KJcuZWMHXuCrVvfy7b+oKBYHBzMsLTM3fj9yJFHKV26GL/99gEGBurMeLy9lzN27Am2bHl+nN69K/Ptt9q5DU2aePDLL/7s2nWPvn2r8Oabblhba4/5Il28x449wtzciK+/roWJifbtZG9vyrlzoSiKQlqahri4VBYsaEKHDmUBaNiwBLGxqXz11RGePEnEycmcESOO0ry5Jxs2tM6su1694pQvv4qZM88zfXrDXMf0Mo/R/PkXiYtL4/Llj3Bzs/zzcXDF1zfnSZ1//PEILy8bBg6shlqtomHDElhaGvH0qbY3oFo1Z0xMDHB0NM/xMR0//iRVqzqxffv7qP7sBjA2NmDs2BM8eZKAs3P2cxkePozFyGh2lu2lSxdj0aK36devCgAxMSlMmnSKvn2rMG9ek8y229ub0avXPoYOrUGFCg58++1x2rTx5n//aw5A8+ZeGBmpGTXqWJZjjB79Jq1aeQPa98PXXx/l3Xc9WbeulU4cb7+9ld9+u0+rVt788ccj3nnHk48/1r4GGjXywNLSOHMuzL89ln83e/Z5wsOTuHOnR+b7q2XLUrz99haGDfuDTp3KoVZrH89KlRxYtapF5n3Png1l69bb2dYrRHakR6EQmJkZ0aaNt87ww6ZNt+jYsUzmh+Uzbm6WHDrUkXr1ivPgQQwHDjxgwYKLnDgRTEpKxr8eq2rV3I2Nx8am8N57P+PiYs6PP7bIEsczt28/JSwskTZtvHW2d+hQRuf/Bw8GUrWqI8WLW2We9anVKlq08OLAgYc5xmFoqM5yFpeThIRUzp0LpUOHMplfgAC2tqa0aePNkSNBOuXr1NH9snJ3tyIhIS1Xx8pJw4YlSEhIo2LF1YwadZRjxx7RrJknY8fWRaVSYWxswN69H9KhQ1keP47j8OFAli69wq5d2kmrKSnp3L79lNDQBDp1KqtTt7e3LXXquGZpx4vI7WN06FAQdeu6ZSYJACVL2lC3bs5JU+PGJbh9+ylvvLGWiRNPcv58KJ07l+Pzz6vnKrakpDQuXnxCu3aldV5vHTuW5fbtnjkmCaDtDTt3rgvnznVh//4PadDAHTc3S378sQX9+1fNrO/UqWC
      "text/plain": [
       "<Figure size 640x480 with 2 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "cm2 = confusion_matrix(y_test, y_pred)\n",
    "CMatrix = ConfusionMatrixDisplay(confusion_matrix=cm2, display_labels=xgb.classes_)\n",
    "CMatrix.plot()\n",
    "plt.title('Matriz de Confusão Logistic Regression', color=\"darkblue\")\n",
    "plt.xlabel('Previsões', color=\"darkblue\")\n",
    "plt.ylabel('Real', color=\"darkblue\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.52      0.77      0.62      1565\n",
      "           1       0.77      0.58      0.66      1578\n",
      "           2       0.78      0.82      0.80      1534\n",
      "           3       0.66      0.50      0.57       584\n",
      "           4       0.71      0.58      0.64      1349\n",
      "           5       0.61      0.56      0.58      1015\n",
      "\n",
      "    accuracy                           0.66      7625\n",
      "   macro avg       0.67      0.64      0.65      7625\n",
      "weighted avg       0.68      0.66      0.66      7625\n",
      "\n"
     ]
    }
   ],
   "source": [
    "\n",
    "print(classification_report(y_test, previsoesxgb))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "65bf7c3e26fe4a3eb4a49169477f0d65",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/5 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Melhores Parâmetros: {'var_smoothing': 1e-05}\n",
      "Melhor accuracy: 0.31687985071238955\n"
     ]
    }
   ],
   "source": [
    "scaler = StandardScaler()\n",
    "X_train_scaled = scaler.fit_transform(X_train_balanced)\n",
    "param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}\n",
    "model = GaussianNB()\n",
    "grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)\n",
    "n_combinations = len(list(product(*param_grid.values()))) \n",
    "with tqdm(total=n_combinations) as pbar:   \n",
    "    grid_search.fit(X_train_scaled, y_train)\n",
    "    pbar.update(n_combinations)\n",
    "print(\"Melhores Parâmetros:\", grid_search.best_params_)\n",
    "print(\"Melhor accuracy:\", grid_search.best_score_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc=StandardScaler()\n",
    "x_treino2=sc.fit_transform(X_train_scaled)\n",
    "x_teste2=sc.transform(X_test)\n",
    "naive=GaussianNB(var_smoothing= 1e-05)\n",
    "naive.fit(x_treino2, y_train_balanced)\n",
    "previsoesnb = naive.predict(x_teste2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}