Files
INTUIA/Testes/Sentiment Analysis.ipynb
T

1519 lines
228 KiB
Plaintext
Raw Normal View History

2026-03-15 13:27:50 +00:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## BIBLIOTECA"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import numpy as np\n",
"from matplotlib import pyplot as plt\n",
"import statistics as st\n",
"from statistics import mode, mean\n",
"from scipy.stats import spearmanr, pearsonr, skew, kendalltau, norm\n",
"from collections import Counter\n",
"from datetime import datetime\n",
"from keras.models import load_model\n",
"import warnings\n",
"#!pip install wordcloud\n",
"import string\n",
"from wordcloud import WordCloud\n",
"from xgboost import XGBClassifier\n",
"import nltk\n",
"import torch\n",
"from torch import nn\n",
"from torch.utils.data import DataLoader, Dataset\n",
"#nltk.download('all')\n",
"import spacy\n",
"import pickle \n",
"from sklearn.datasets import load_files, make_classification\n",
"from docx import Document\n",
"import os\n",
"import re\n",
"import pdfplumber\n",
"import gensim\n",
"#!pip install gensim\n",
"from gensim.models import FastText\n",
"from nltk.corpus import stopwords\n",
"from joblib import Parallel, delayed\n",
"from nltk.tokenize import word_tokenize\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"from textblob import TextBlob\n",
"from deep_translator import GoogleTranslator\n",
"from pyannote.audio import Pipeline\n",
"from transformers import pipeline\n",
"from nltk.stem import WordNetLemmatizer\n",
"from reportlab.lib.pagesizes import A4\n",
"from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Image\n",
"from reportlab.lib import colors\n",
"from PIL import Image, ImageOps\n",
"from tkinter import filedialog\n",
"from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, ParameterGrid\n",
"import tabula\n",
"import PyPDF2\n",
"import pdfplumber\n",
"from datetime import datetime\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.svm import SVC\n",
"from imblearn.combine import SMOTEENN\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"import joblib\n",
"from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
"import fitz\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay, classification_report, f1_score, precision_score, recall_score, roc_auc_score\n",
"from tqdm.notebook import tqdm\n",
"from tensorflow.keras.models import Sequential, load_model\n",
"from tensorflow.keras.optimizers import Adam\n",
"from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, Flatten, Dropout, BatchNormalization\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from keras.utils import to_categorical\n",
"from itertools import product\n",
"#!python -m spacy download pt_core_news_sm"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## FUNÇÕES"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"def grafico(a):\n",
" if a == 1:\n",
" custom_params = {\"axes.spines.right\": False, \"axes.spines.top\": False}\n",
" sns.set_theme(style=\"ticks\", rc=custom_params, palette = \"pastel\")\n",
" palette = \"pastel\"\n",
" elif a == 2:\n",
" sns.set_theme(style=\"white\", palette = \"Set2\")\n",
" palette = \"Set2\"\n",
" elif a == 3:\n",
" sns.set_theme(style=\"whitegrid\", palette = \"pastel\")\n",
" palette = \"pastel\"\n",
" elif a == 4:\n",
" sns.set_theme()\n",
" palette = \"husl\"\n",
" return palette"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Tratamento para portugues"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<>:8: SyntaxWarning: invalid escape sequence '\\s'\n",
"<>:8: SyntaxWarning: invalid escape sequence '\\s'\n",
"C:\\Users\\garci\\AppData\\Local\\Temp\\ipykernel_40992\\3345901888.py:8: SyntaxWarning: invalid escape sequence '\\s'\n",
" df_copy[column_name] = df_copy[column_name].str.replace('[^a-z\\s]', '')\n"
]
}
],
"source": [
"portuguesstopwords = set(stopwords.words('portuguese'))\n",
"nlp = spacy.load(\"pt_core_news_sm\")\n",
"def preprocess_text_column_pt(df, column_name):\n",
" if column_name not in df.columns:\n",
" raise ValueError(f\"A coluna '{column_name}' não existe no dataset.\")\n",
" df_copy = df.copy()\n",
" df_copy[column_name] = df_copy[column_name].str.lower() \n",
" df_copy[column_name] = df_copy[column_name].str.replace('[^a-z\\s]', '') \n",
" df_copy[column_name] = df_copy[column_name].str.strip() \n",
" df_copy[column_name] = df_copy[column_name].str.replace(r'\\s+', ' ', regex=True) \n",
" df_copy[column_name] = df_copy[column_name].str.replace(r'\\d+', '', regex=True)\n",
" df_copy[column_name] = df_copy[column_name].str.translate(str.maketrans('', '', string.punctuation))\n",
" df_copy[column_name] = df_copy[column_name].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in portuguesstopwords]))\n",
" df_copy[column_name] = df_copy[column_name].str.replace(r'http\\S+|https\\S+|www\\S+', '', regex=True) \n",
" df_copy[column_name] = df_copy[column_name].str.replace(r'\\S+@\\S+', '', regex=True) \n",
" df_copy[column_name] = df_copy[column_name].apply(lambda x: ' '.join([word for word in x.split() if len(word) > 2]))\n",
" df_copy['TextoLema'] = df_copy[column_name].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)])) \n",
"\n",
" return df_copy['TextoLema']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Tratamento para ingles"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<>:8: SyntaxWarning: invalid escape sequence '\\s'\n",
"<>:8: SyntaxWarning: invalid escape sequence '\\s'\n",
"C:\\Users\\garci\\AppData\\Local\\Temp\\ipykernel_40992\\4039191732.py:8: SyntaxWarning: invalid escape sequence '\\s'\n",
" df_copy[column_name] = df_copy[column_name].str.replace('[^a-z\\s]', '', regex=True)\n"
]
}
],
"source": [
"english_stopwords = set(stopwords.words('english'))\n",
"nlp = spacy.load(\"en_core_web_sm\")\n",
"def preprocess_text_column(df, column_name):\n",
" if column_name not in df.columns:\n",
" raise ValueError(f\"The column '{column_name}' does not exist in the dataset.\")\n",
" df_copy = df.copy()\n",
" df_copy[column_name] = df_copy[column_name].str.lower()\n",
" df_copy[column_name] = df_copy[column_name].str.replace('[^a-z\\s]', '', regex=True)\n",
" df_copy[column_name] = df_copy[column_name].str.replace(r'\\s+', ' ', regex=True).str.strip()\n",
" df_copy[column_name] = df_copy[column_name].str.replace(r'\\d+', '', regex=True)\n",
" df_copy[column_name] = df_copy[column_name].str.translate(str.maketrans('', '', string.punctuation))\n",
" df_copy[column_name] = df_copy[column_name].str.replace(r'http\\S+|https\\S+|www\\S+', '', regex=True)\n",
" df_copy[column_name] = df_copy[column_name].str.replace(r'\\S+@\\S+', '', regex=True)\n",
" df_copy[column_name] = df_copy[column_name].apply(lambda x: ' '.join([word for word in x.split() if word not in english_stopwords and len(word) > 2]))\n",
" df_copy['LemmatizedText'] = df_copy[column_name].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x) if not token.is_punct and not token.is_stop]))\n",
" return df_copy['LemmatizedText']"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Text emotion\n",
"0 That game hurt. Sadness\n",
"1 Man I love reddit. Disgust\n",
"2 So happy for [NAME]. So sad he's not here. Ima... Sadness\n",
"3 I just came home, what the fuck is this lineup... Disgust\n",
"4 By far the coolest thing I've seen on this thr... Happiness\n"
]
}
],
"source": [
"colunas = [\"Text\", \"emotion\"]\n",
"dataset3 = pd.read_excel(\"C:\\\\Users\\\\garci\\\\OneDrive\\\\Área de Trabalho\\\\Programa PJM\\\\Programa final\\\\Dataset\\\\GoEmotions_parateste.xlsx\",names=colunas)\n",
"print(dataset3.head())\n"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Sadness', 'Disgust', 'Happiness', 'Surprise', 'Anger', 'Fear'],\n",
" dtype=object)"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset3[\"emotion\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>V. Absolutos</th>\n",
" <th>V. Relativos (%)</th>\n",
" </tr>\n",
" <tr>\n",
" <th>emotion</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Happiness</th>\n",
" <td>7907</td>\n",
" <td>20.67</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Anger</th>\n",
" <td>7844</td>\n",
" <td>20.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Disgust</th>\n",
" <td>7678</td>\n",
" <td>20.07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Sadness</th>\n",
" <td>6758</td>\n",
" <td>17.66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Surprise</th>\n",
" <td>5144</td>\n",
" <td>13.45</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Fear</th>\n",
" <td>2927</td>\n",
" <td>7.65</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" V. Absolutos V. Relativos (%)\n",
"emotion \n",
"Happiness 7907 20.67\n",
"Anger 7844 20.50\n",
"Disgust 7678 20.07\n",
"Sadness 6758 17.66\n",
"Surprise 5144 13.45\n",
"Fear 2927 7.65"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame({'V. Absolutos':dataset3['emotion'].value_counts(), 'V. Relativos (%)':(dataset3['emotion'].value_counts()*100/dataset3.shape[0]).round(2)})"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\garci\\AppData\\Local\\Temp\\ipykernel_40992\\1100090595.py:3: FutureWarning: \n",
"\n",
"Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.\n",
"\n",
" sns.countplot(x=\"emotion\", data=dataset3,order= dataset3['emotion'].value_counts().index,palette=\"husl\")\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA9gAAAJICAYAAACaO0yGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACEgElEQVR4nOzdd1yV5f/H8fdhgwoKqLhw4EDNmTgqR66y1MRRZo4cZaWpmZqluVeKo1Ib7kpz5EgcaWnO1NScqZnmVkRBxcE8h98f/DzfTjgQbzwcfD0fDx/BdV/3dX8OXAHvc9/XfZuSk5OTBQAAAAAAHoqTvQsAAAAAACArIGADAAAAAGAAAjYAAAAAAAYgYAMAAAAAYAACNgAAAAAABiBgAwAAAABgAAI2AAAAAAAGIGADAAAAAGAAAjYAAHeRnJxs7xIAAIADIWADABxSnTrzZTKFWf85OYUpR45PVaXKt/rssz+UlGSx6V+kyNd6/fXVaR5/+fJj6tDh/v1ff321ihT5Ot3HuZvZsw/KZArTyZPX0rzPDz/8JX//KXJyCpOLy3g5O4/Xm2+ufeha7GHDhtM23987/fvppxP2LlOSFBubqGefXWD9mru4jJePz2c6ceKqvUsDADxiLvYuAACA9KpUKY+mTq0vSTKbkxUdHafVq//Re+/9qs2bz2rBgiZycjJJkpYufUne3u5pHnvChF1p6vfxxzXUs2flBy/+Pl58sZi2bWujfPmypXmf6tXz65dfWikx0Sw3N2d5e7upaNGchtf2KE2ZUk+VK+e947bSpf0ecTV35uRk0uef19WtW0lycXGSu7uzChf2VvbsbvYuDQDwiBGwAQAOy9vbTdWr57dpa9IkSMHBfurZc72+//6wXnutjCSpUqU7h7SHFRSUM0PGzZ3bS7lzez3QPgUL5lDBgjkypB57KVPGL9X3OLNxd3fRE0/ktncZAIBMgEvEAQBZTvfulVSgQHZ9+eU+a9t/L93+/vvDqlBhjjw9Jyl37ilq23alzp+/ISnl8vONG89q48azMpnCtGHDaesly199tU+FC38lb+/P9PPPJ1NdIi5JiYkW9eixTrlyfa6cOT9Xhw6rdOnSLev2OnXmq06d+Tb73B5/w4bTku58ifiqVf/o6afnKVu2Scqf/wu9/fbPunYt3rp906Yzeu65H5Qr1+dyc5ugokW/1pAhW2Wx/G8t+bVr8erd+1cFBU2Th8dEPfHELM2ceeCeX8/bta1de1K1as2Xp+cklSgxXV98sdemX1xckoYP36bg4Jny8JioEiWm65NPdtgcv06d+WrbdqVatvxR2bJNUoMGi+557LSYPfugPDwmasuWswoJ+VYeHhNVqtQMhYcf119/RatevYXy8pqk4sWna/78Izb7/v33FbVs+aMCAqYqW7ZJevbZBdq69ZxNn5iYeL377joVKPClsmWbpJCQb7Vy5XHrdrPZoqlT96hcudny9JykwMCv1L//JsXFJdmMs3nzWdWuPV9eXpPk6zs51bywWJI1cOAWFS36tdzdJ6po0a/14YeblJhofuivEQDg0SBgAwCyHCcnk+rVC9SOHRdSrcWWpK1bz6ldu1Vq0aKEVq9urokTn9W6daf16qsrJElTp9ZXpUp5VKlSHm3b1sbmEuWhQ3/T+PF1NGVKPT311J3PrC5YcER//BGpOXMaKSystlauPKEXXlgsszl1LWm1YsVxNW68RHnzZtOiRU31ySe1tGTJ39aa9+2LVL16i+Tv76kFC5ooPDxUNWsW1NCh27Rw4V+SUtYKP/PM95o797D69auqH39sppo1C6pz5zUaNWr7fWt45ZVwVa6cR8uWvaQGDQrrnXd+sYbs5ORkNWmyVGPH/q4uXcopPDxUrVqV0oABW/TWWz//5+vzl3LkcNPy5aHq1y/knsc0m5OVlGRJ9e+/X8vERItefXWlunatoOXLQ+Xl5arXXlupxo2X6MUXiyk8PFT582dXhw6rdfbsdUnSoUOX9eST3+rkyRh9/nk9zZvXWCaT9OyzC7Rx45n/P75FDRv+oLlzD+ujj6pp+fJQBQf7qVmzH/XbbylBvGvXn9Wr168KDS2u5cubqXv3Svr88z/00kvLrDfK27TpzP8HfRctXNhEkyY9qw0bzujZZxcoNjZRkvTJJ79r6tS9GjSohtaubam3366oceN2asSI+39vAACZA5eIAwCypICAbEpMtCgqKlZ589quY968+ay8vFz1wQdV5e6e8qvQz89DO3dGKDk5WWXK+MvbO2X97H8vT37nnYpq2bLUPY/t7++pNWtaKFu2lDFy5/ZSs2bLtHr1CTVuHJSu1zNkyG+qWDGPFi9uKpMpZV25m5uzBg3aqsuXb2n//ktq0KCwvv32Beu68wYNimj58uPasOGMWrcO1uzZf+rgwcv67bc2qlEj5XU991xRJSZaNHz4dr31VgX5+nretYbmzUto0qS61v3On7+p4cO36a23Kuinn07ol19O6fvvG6t162Dr8b28XPTxx1vVs2dllS3r//91O+nLLxtYv/b3Ur/+nc9wly3rp4MHO1o/t1iSNWBANXXpUl6SdOVKnFq3XqFevSqrd+8qkqScOd1Vpcp32rUrQgUL5tDQodvk7u6sX399RTlypHyvXnyxmJ54Yrb69t2o339vq9WrT2jHjgtatqyZXnqpuCTp2WcDdfz4Va1ff1o5c7prxowDGj26pvr3r2Z93fnzZ1e7dqu0evUJvfBCMX344WaVKuWrFSuay9k55fxG9er5VKbMLM2ceVDdulXSxo1nVKVKXnXsWE6SVLt2IXl5uShnTo/7fp0AAJkDZ7ABAFnS7Sds/X8WtVG7diHdvJmoJ56YrQ8/3KTNm8+qYcMiGjToKWt4vZuKFfPc99gvvljMGq6llHXhLi5O2rTp7AO9httiYxP1xx8XFRpawqa+V14J1l9/dZa/v5fatSurVataKCHBrP37L2nx4qMaPHirkpIsio9PuVR5w4YzKlLE2xqub2vbtrTi4pK0ffuFe9bRoUNZm89btCihCxdu6ujRK9qw4YxcXJzUqlXJ/4ydsgb+9hlhKeXmZGkJ15L05ZcNtHNn21T/FixokqrvU08VsH6cN2/K+vVq1fJZ2/z8Ut48uHo15bL6DRvOqHHjYtZwLUkuLk5q3bqUdu2K0I0bCdqy5ZxcXZ3UpMn/3hhxcjLpt9/aaODAGtq4MeV7+uqrwTa1tG4dLGdnkzZsOKNbtxK1ffsFvfhiMSUny3oWvlixnCpd2k8//3xKkvTss4X088+nVLPm9xo37ncdOnRZ3btXtn4NAQCZH2ewAQBZ0tmz1+Xp6WINVf9Wo0Z+rVrVXBMm7NaECbs1ZszvypvXSwMGVNe77977juDZs7ve99gBAbZnzJ2cTPL399SVK3EP9iL+X3R0nJKTpTx57n7Ts9jYRL377np9++0hJSaaVbSoj556qoBcXZ2sbzZER8elqu3f9d4OnndToIDtDdRu1xMdHafo6Dj5+3taz87ea+y0fA1vK1Uql6pUCUhT39tXHfxbtmx3P9a9vh7JyVJMTIKiomLl5+dpvSrgTmPc3uffXFyc5O/vqatX43XlSpwslmR98snv+uST31ON4emZ8udY375VlT27m2bOPKgPPtikfv02qWxZP33+eT09+2zg3V84ACDTIGADALKcpCSLNmw4o6efLpAq8N323HNF9dxzRXXrVqLWrz+tTz/9Qz16rFf16vkUEpLvjvuk1e3QdZvZbNHly7HWQGoymVKtDb9xI/Gu4/n4uMtkks0NsSTp1q1Ebdp0VtWq5dMHH2zSDz8c1cKFTVS/fqD1DHqePFOs/X19PXTs2JVU41+4cFNSyqXt93L5cqzNXdMvXrz1/8fwkq+vhy5fjpXZbLH5ml+4cCNNY9uDr6+HIiJupmq//fXw8/NQzpzuioqKVXJyss3VA3/8cVEmU8oYkhQRcVOFC/tYtycmmnX5cqz8/T3l7Z3y/XvvvSf16qulUx3PyyvlzzEnJ5O6daukbt0qKTLyplatOqGRI7erefMfdfHiO3Jzczb09QMAjMcl4gCALOerr/bpwoW
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"grafico(4)\n",
"plt.figure(figsize=(10, 6), dpi=100)\n",
"sns.countplot(x=\"emotion\", data=dataset3,order= dataset3['emotion'].value_counts().index,palette=\"husl\")\n",
"plt.xlabel('Emoçoes', color=\"Darkblue\")\n",
"plt.xticks(rotation=90)\n",
"plt.ylabel('Frequência', color='Darkblue')\n",
"plt.title('Distribuição por Emoções', color='Darkblue')\n",
"for i, value in enumerate(dataset3['emotion'].value_counts()):\n",
" plt.text(i, value+250, str(value),ha='center',color='Darkblue')\n",
"plt.tight_layout()\n",
"plt.show() "
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Text</th>\n",
" <th>emotion</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>That game hurt.</td>\n",
" <td>Sadness</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Man I love reddit.</td>\n",
" <td>Disgust</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>So happy for [NAME]. So sad he's not here. Ima...</td>\n",
" <td>Sadness</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>I just came home, what the fuck is this lineup...</td>\n",
" <td>Disgust</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>By far the coolest thing I've seen on this thr...</td>\n",
" <td>Happiness</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38253</th>\n",
" <td>I just called the Capitol Police. They are not...</td>\n",
" <td>Anger</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38254</th>\n",
" <td>What a great photo and you two look so happy. 😍</td>\n",
" <td>Happiness</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38255</th>\n",
" <td>Well, I'm glad you're out of all that now. How...</td>\n",
" <td>Happiness</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38256</th>\n",
" <td>Everyone likes [NAME].</td>\n",
" <td>Disgust</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38257</th>\n",
" <td>The FDA has plenty to criticize. But like here...</td>\n",
" <td>Anger</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>38258 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" Text emotion\n",
"0 That game hurt. Sadness\n",
"1 Man I love reddit. Disgust\n",
"2 So happy for [NAME]. So sad he's not here. Ima... Sadness\n",
"3 I just came home, what the fuck is this lineup... Disgust\n",
"4 By far the coolest thing I've seen on this thr... Happiness\n",
"... ... ...\n",
"38253 I just called the Capitol Police. They are not... Anger\n",
"38254 What a great photo and you two look so happy. 😍 Happiness\n",
"38255 Well, I'm glad you're out of all that now. How... Happiness\n",
"38256 Everyone likes [NAME]. Disgust\n",
"38257 The FDA has plenty to criticize. But like here... Anger\n",
"\n",
"[38258 rows x 2 columns]"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset3 = dataset3.dropna()\n",
"dataset3"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Text</th>\n",
" <th>emotion</th>\n",
" <th>emotion_label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>That game hurt.</td>\n",
" <td>Sadness</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Man I love reddit.</td>\n",
" <td>Disgust</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>So happy for [NAME]. So sad he's not here. Ima...</td>\n",
" <td>Sadness</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>I just came home, what the fuck is this lineup...</td>\n",
" <td>Disgust</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>By far the coolest thing I've seen on this thr...</td>\n",
" <td>Happiness</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38253</th>\n",
" <td>I just called the Capitol Police. They are not...</td>\n",
" <td>Anger</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38254</th>\n",
" <td>What a great photo and you two look so happy. 😍</td>\n",
" <td>Happiness</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38255</th>\n",
" <td>Well, I'm glad you're out of all that now. How...</td>\n",
" <td>Happiness</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38256</th>\n",
" <td>Everyone likes [NAME].</td>\n",
" <td>Disgust</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38257</th>\n",
" <td>The FDA has plenty to criticize. But like here...</td>\n",
" <td>Anger</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>38258 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" Text emotion \\\n",
"0 That game hurt. Sadness \n",
"1 Man I love reddit. Disgust \n",
"2 So happy for [NAME]. So sad he's not here. Ima... Sadness \n",
"3 I just came home, what the fuck is this lineup... Disgust \n",
"4 By far the coolest thing I've seen on this thr... Happiness \n",
"... ... ... \n",
"38253 I just called the Capitol Police. They are not... Anger \n",
"38254 What a great photo and you two look so happy. 😍 Happiness \n",
"38255 Well, I'm glad you're out of all that now. How... Happiness \n",
"38256 Everyone likes [NAME]. Disgust \n",
"38257 The FDA has plenty to criticize. But like here... Anger \n",
"\n",
" emotion_label \n",
"0 4 \n",
"1 2 \n",
"2 4 \n",
"3 2 \n",
"4 1 \n",
"... ... \n",
"38253 0 \n",
"38254 1 \n",
"38255 1 \n",
"38256 2 \n",
"38257 0 \n",
"\n",
"[38258 rows x 3 columns]"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"emotion_mapping = {\"Anger\":0,\"Happiness\":1,\"Disgust\":2,\"Fear\":3, \"Sadness\":4,\"Surprise\":5}\n",
"dataset3[\"emotion_label\"] = dataset3[\"emotion\"].map(emotion_mapping)\n",
"dataset3"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"dataset3['TextoLema']=preprocess_text_column(dataset3, 'Text')"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Text</th>\n",
" <th>emotion</th>\n",
" <th>emotion_label</th>\n",
" <th>TextoLema</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>That game hurt.</td>\n",
" <td>Sadness</td>\n",
" <td>4</td>\n",
" <td>game hurt</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Man I love reddit.</td>\n",
" <td>Disgust</td>\n",
" <td>2</td>\n",
" <td>man love reddit</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>So happy for [NAME]. So sad he's not here. Ima...</td>\n",
" <td>Sadness</td>\n",
" <td>4</td>\n",
" <td>happy sad s imagine team instead ugh</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>I just came home, what the fuck is this lineup...</td>\n",
" <td>Disgust</td>\n",
" <td>2</td>\n",
" <td>come home fuck lineup love mad bastard</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>By far the coolest thing I've seen on this thr...</td>\n",
" <td>Happiness</td>\n",
" <td>1</td>\n",
" <td>far cool thing ve see thread</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38253</th>\n",
" <td>I just called the Capitol Police. They are not...</td>\n",
" <td>Anger</td>\n",
" <td>0</td>\n",
" <td>call capitol police affect shutdown fuck shit</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38254</th>\n",
" <td>What a great photo and you two look so happy. 😍</td>\n",
" <td>Happiness</td>\n",
" <td>1</td>\n",
" <td>great photo look happy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38255</th>\n",
" <td>Well, I'm glad you're out of all that now. How...</td>\n",
" <td>Happiness</td>\n",
" <td>1</td>\n",
" <td>glad awful way act think healthy boundary hostile</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38256</th>\n",
" <td>Everyone likes [NAME].</td>\n",
" <td>Disgust</td>\n",
" <td>2</td>\n",
" <td>like</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38257</th>\n",
" <td>The FDA has plenty to criticize. But like here...</td>\n",
" <td>Anger</td>\n",
" <td>0</td>\n",
" <td>fda plenty criticize like usually criticize ho...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>38258 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" Text emotion \\\n",
"0 That game hurt. Sadness \n",
"1 Man I love reddit. Disgust \n",
"2 So happy for [NAME]. So sad he's not here. Ima... Sadness \n",
"3 I just came home, what the fuck is this lineup... Disgust \n",
"4 By far the coolest thing I've seen on this thr... Happiness \n",
"... ... ... \n",
"38253 I just called the Capitol Police. They are not... Anger \n",
"38254 What a great photo and you two look so happy. 😍 Happiness \n",
"38255 Well, I'm glad you're out of all that now. How... Happiness \n",
"38256 Everyone likes [NAME]. Disgust \n",
"38257 The FDA has plenty to criticize. But like here... Anger \n",
"\n",
" emotion_label TextoLema \n",
"0 4 game hurt \n",
"1 2 man love reddit \n",
"2 4 happy sad s imagine team instead ugh \n",
"3 2 come home fuck lineup love mad bastard \n",
"4 1 far cool thing ve see thread \n",
"... ... ... \n",
"38253 0 call capitol police affect shutdown fuck shit \n",
"38254 1 great photo look happy \n",
"38255 1 glad awful way act think healthy boundary hostile \n",
"38256 2 like \n",
"38257 0 fda plenty criticize like usually criticize ho... \n",
"\n",
"[38258 rows x 4 columns]"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset3 = dataset3.dropna(subset=['TextoLema'])\n",
"#dataset3 = dataset3[dataset3['TextoLema'].str.split().str.len() > 2]\n",
"dataset3"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"dataset3.to_excel(\"C:\\\\Users\\\\garci\\\\OneDrive\\\\Área de Trabalho\\\\Programa PJM\\\\Programa final\\\\Dataset\\\\GoemotionsEN.xlsx\", index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## PREPARAÇÃO DOS DADOS"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Text</th>\n",
" <th>emotion</th>\n",
" <th>emotion_label</th>\n",
" <th>TextoLema</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>That game hurt.</td>\n",
" <td>Sadness</td>\n",
" <td>4</td>\n",
" <td>game hurt</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Man I love reddit.</td>\n",
" <td>Disgust</td>\n",
" <td>2</td>\n",
" <td>man love reddit</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>So happy for [NAME]. So sad he's not here. Ima...</td>\n",
" <td>Sadness</td>\n",
" <td>4</td>\n",
" <td>happy sad s imagine team instead ugh</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>I just came home, what the fuck is this lineup...</td>\n",
" <td>Disgust</td>\n",
" <td>2</td>\n",
" <td>come home fuck lineup love mad bastard</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>By far the coolest thing I've seen on this thr...</td>\n",
" <td>Happiness</td>\n",
" <td>1</td>\n",
" <td>far cool thing ve see thread</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38253</th>\n",
" <td>I just called the Capitol Police. They are not...</td>\n",
" <td>Anger</td>\n",
" <td>0</td>\n",
" <td>call capitol police affect shutdown fuck shit</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38254</th>\n",
" <td>What a great photo and you two look so happy. 😍</td>\n",
" <td>Happiness</td>\n",
" <td>1</td>\n",
" <td>great photo look happy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38255</th>\n",
" <td>Well, I'm glad you're out of all that now. How...</td>\n",
" <td>Happiness</td>\n",
" <td>1</td>\n",
" <td>glad awful way act think healthy boundary hostile</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38256</th>\n",
" <td>Everyone likes [NAME].</td>\n",
" <td>Disgust</td>\n",
" <td>2</td>\n",
" <td>like</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38257</th>\n",
" <td>The FDA has plenty to criticize. But like here...</td>\n",
" <td>Anger</td>\n",
" <td>0</td>\n",
" <td>fda plenty criticize like usually criticize ho...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>38258 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" Text emotion \\\n",
"0 That game hurt. Sadness \n",
"1 Man I love reddit. Disgust \n",
"2 So happy for [NAME]. So sad he's not here. Ima... Sadness \n",
"3 I just came home, what the fuck is this lineup... Disgust \n",
"4 By far the coolest thing I've seen on this thr... Happiness \n",
"... ... ... \n",
"38253 I just called the Capitol Police. They are not... Anger \n",
"38254 What a great photo and you two look so happy. 😍 Happiness \n",
"38255 Well, I'm glad you're out of all that now. How... Happiness \n",
"38256 Everyone likes [NAME]. Disgust \n",
"38257 The FDA has plenty to criticize. But like here... Anger \n",
"\n",
" emotion_label TextoLema \n",
"0 4 game hurt \n",
"1 2 man love reddit \n",
"2 4 happy sad s imagine team instead ugh \n",
"3 2 come home fuck lineup love mad bastard \n",
"4 1 far cool thing ve see thread \n",
"... ... ... \n",
"38253 0 call capitol police affect shutdown fuck shit \n",
"38254 1 great photo look happy \n",
"38255 1 glad awful way act think healthy boundary hostile \n",
"38256 2 like \n",
"38257 0 fda plenty criticize like usually criticize ho... \n",
"\n",
"[38258 rows x 4 columns]"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset3=pd.read_excel(\"C:\\\\Users\\\\garci\\\\OneDrive\\\\Área de Trabalho\\\\Programa PJM\\\\Programa final\\\\Dataset\\\\GoemotionsEN.xlsx\")\n",
"dataset3"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"135"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset3['TextoLema'].isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"dataset3 = dataset3.dropna(subset=['TextoLema'])"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"X=dataset3['TextoLema']\n",
"y=dataset3['emotion_label']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"vectorizer"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ddafd85f3262424b9f8c4e242a293b68",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/192 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Melhores parâmetros do CountVectorizer (menor esparsidade): (500, 1, 0.5, (1, 3))\n",
"Esparsidade: 98.23%\n"
]
}
],
"source": [
"vectorizer_params = {'max_features': [500, 1000, 2000, 5000],'min_df': [1, 2, 3, 5],'max_df': [0.5, 0.6, 0.7, 0.8],'ngram_range': [(1, 1), (1, 2), (1, 3)]}\n",
"combinations = list(product(vectorizer_params['max_features'], vectorizer_params['min_df'], vectorizer_params['max_df'], vectorizer_params['ngram_range']))\n",
"best_sparsity = float('inf')\n",
"best_params = {}\n",
"with tqdm(total=len(combinations)) as pbar:\n",
" for combo in combinations:\n",
" max_features, min_df, max_df, ngram_range = combo\n",
" vectorizer = CountVectorizer(max_features=max_features,min_df=min_df, max_df=max_df,ngram_range=ngram_range)\n",
" X_vec = vectorizer.fit_transform(dataset3['Text'])\n",
" sparsity = 100 * (1 - X_vec.nnz / (X_vec.shape[0] * X_vec.shape[1]))\n",
" if sparsity < best_sparsity:\n",
" best_sparsity = sparsity\n",
" best_params = combo\n",
" pbar.update(1)\n",
"print(f\"Melhores parâmetros do CountVectorizer (menor esparsidade): {best_params}\")\n",
"print(f\"Esparsidade: {best_sparsity:.2f}%\")"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Modelos//vectorizerVF.joblib']"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorizer = CountVectorizer(max_features = 500, min_df = 1, max_df = 0.5,ngram_range=(1,3))\n",
"X = vectorizer.fit_transform(dataset3['TextoLema']).toarray()\n",
"joblib.dump(vectorizer, 'Modelos//vectorizerVF.joblib')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"treino 60%, val=20% e teste=20%"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)\n",
"X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## REGRESSÃO LOGISTICA"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"undersample = RandomUnderSampler(random_state=42)\n",
"X_train_balanced, y_train_balanced = undersample.fit_resample(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f1578f07fc5643c9b02b67e18e2006f2",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/84 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Melhores Parâmetros: {'C': 0.001, 'class_weight': None, 'max_iter': 500, 'solver': 'liblinear'}\n",
"Melhor accuracy: 0.6463089802130898\n"
]
}
],
"source": [
"param_grid = {\n",
" 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],\n",
" 'solver': ['lbfgs', 'liblinear', 'saga'],\n",
" 'class_weight': [None, 'balanced'],\n",
" 'max_iter': [500, 1000]\n",
"}\n",
"scaler = StandardScaler()\n",
"X_train_scaled = scaler.fit_transform(X_train_balanced)\n",
"grid_search = GridSearchCV(estimator=LogisticRegression(random_state=42), param_grid=param_grid,scoring='accuracy',cv=3, n_jobs=-1)\n",
"n_combinations = len(list(product(*param_grid.values()))) \n",
"with tqdm(total=n_combinations) as pbar: \n",
" grid_search.fit(X_train_scaled, y_train_balanced)\n",
" pbar.update(n_combinations) \n",
"print(\"Melhores Parâmetros:\", grid_search.best_params_)\n",
"print(\"Melhor accuracy:\", grid_search.best_score_)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Melhores Parâmetros: {'C': 0.001, 'class_weight': None, 'max_iter': 500, 'solver': 'liblinear'}\n",
"\n",
"Melhor accuracy: 0.6601487361454299"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"classifier = LogisticRegression(C=0.001,solver='liblinear',max_iter=500, class_weight=None, random_state = 42)\n",
"classifier.fit(X_train_balanced,y_train_balanced)\n",
"y_pred = classifier.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Modelos//logistic_regression_model.joblib']"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"joblib.dump(classifier, 'Modelos//logistic_regression_model.joblib')"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgoAAAHJCAYAAADkVRHSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACx5klEQVR4nOzdd1yV1R/A8c+97I1sEBEEcc8cuXKV5irN0jQt907L1DTNbZp75fq5co+0TM2VI/fe4kJRUECG7A33+f1xE7sBhXIBje/79bov9HnOPc/33Pl9zjnPuSpFURSEEEIIIbKhLuwAhBBCCPHqkkRBCCGEEDmSREEIIYQQOZJEQQghhBA5kkRBCCGEEDmSREEIIYQQOZJEQQghhBA5kkRBCCGEEDmSROE/RNbOEq8ieV0K8XqTRKEANWq0CZVqJnXrbsixzMcf70Slmkm3bnteqO4TJx7TqtX2fy03fvwJVKqZL1R3bhw5EohKNZMjRwL1Ul90dDITJ56kcuXVWFnNw8npB5o23cLOnff0Un92rl0Lp1q1NZiYzKF8+ZV6qzclJZ2mTbdgZDQbA4NZGBrOwspqHnfuPM1z3d267cHTc5keoswdlWom48efyHX55cuvMmzYkcz/r159HZVqJg8exLx0DN267UGlmpnlZmk5j0qVVjN79vmXrvtV9qKPvRD6YljYARQ1arWK06eDefQoDnd3K519CQmpL/1F+L//XcXPL/Jfy/XqVZl33/V6qWMUlFu3ImnRYhsZGQpDhlSnShUnEhLSWL/ej/fe+5lJk+oxZkwdvR93woSTPHwYy88/v4+Tk7ne6lWpVMyb15jExHQMDdWYmBjg4WGNlZWx3o5RUE6d6pzldftPJk8+TaNGJTL/36pVKU6d6oyrq0We4nBxseDnn9/P/L+iQGhoAkuXXuGrr45gZmZI//5V83SMV82LPvZC6IskCgWsenUnbtyIZOvW23z5ZQ2dfTt33sfCwohixUzz7fju7lav9IdNWloGHTrsxMjIgDNnPsbJ6fkXyvvv+9Cnz36+/fYEbdp4U6WKk16PHRmZTKVKDrRsWUqv9RobG1CxoqNe6ywsb77plqf7Ozqa4+iY9yTMxMQg21haty5FqVLLWbXq+n8uUcjrYy/Ey5KhhwJmYWFEq1al2Lr1TpZ9mzff4sMPfTE01H1aIiISGTjwd0qWXIqx8Wzs7BbSrt0vmd233brt4ccfb/DwYSwq1UxWr77OgwcxqFQzmT37PGXLrsTcfC6rVl3TGXp4Via72791Zy9degVf3xWYmc2lYcNNPHwYm6VMYGAsnTrtws5uIebmc2nadAuXLj35x3p3777PtWsRTJ5cTydJeGbixHoMGlSN9HRN5rbz50N5992fsLdfiLX1fNq02c6NGxGZ+58Nixw8+JBmzbZibj4XF5dFfP31H2RkaOvRDpsEcfToo8zHMKdhmr93AW/ceJMqVX7EzGwujo4/0KXLboKD4zP3JyWlMWrUUUqXXo6JyRysrefzzjtbuXw5TKfeAwce0KDBRmxs5mNvv5DOnXcRFJT1cX0Z//YYgbYnp2XLbVhbz8fZeRGjRx+jR4+9NGq0Kce2z5t3gbJlV2JqOofixZcwYMABYmNTAPD0XMbDh7H8+OONzOGG7IYefvvtPvXqbcDCYi5ubovp1+8A0dHJL9VOIyMDLCyMUKl0ty9ffpUKFVZhYjIHD4+ljB9/IvO5f+bHH69Tvry2LVWq/MjBgw8xNJzF6tXXAe2wiaHhLJYvv4qLyyLs7Bbi56d9DHfs8KdGjbWYms7BxWURQ4YcIiEhNbPupKQ0Bgw4gLv7EkxM5lC27Epmzjync/x/eiyze+xDQuLp0WMvJUosxcxsLrVqrePXX/116lSpZrJo0SV69dqHnd1CrKzm0aHDrzx5kvBSj68omiRRKAQdO5bJHH54JjY2hT17AujUqZxOWUVRaNVqO/v3P+D77xuyf/+HjB9fl4MHA+nX7wAA335bh5YtvXBxseDUqc60avX8jHj8+JN8/XVN1q5tyTvveOrU7eqqLf/X29SpDQDo1atSjvEvXHiRfv0O0KpVKXbsaMubb7rSp88BnTIREYnUrbuBCxeesHBhUzZubI1Go/DWW5u4eTPnIZK9ex9gYKDK8azexcWCBQua8sYbLgAcPhxI3bobUBRYtepdli9vRlBQHHXrbuDWLd3jfPLJbho0cGfXrg/o3Lkc06efY/nya4C2W7daNSeqVXPK8hj+kxMnHtO162+0b1+aPXs+YM6cxhw8GEinTrsyy3z66R5WrrzOqFG12b//Q2bPbsSNGxF07rwrc6Lf2rU3aNbsJ0qUsGLjxtbMmdOYU6eCqVNnA2FheftQz81jFBGRyFtvbSIwMJZVq95l/vwm/PTTHTZsuJljvRs33mTEiKMMHFiVffs+ZOzYOqxd68fnnx8E4Oef38fFxYKWLb1yHG7YteserVtvx8nJnC1b2vD992/x88936dhxV5ayf5eersm8paSk8+BBDEOHHub27ad8+mmFzHJTp56hT5/9vP12SXbubMegQdX4/vtz9OmzP7PMmjU36NZtL/XqFWfHjrZ8+KEvbdv+QkaG7kTMjAyFWbPOs2JFc+bMaUS5cvZs2HCTtm1/oWxZO375pS3jx9dl7Vo/3n//l8zn94svDrNnTwAzZzZi3772vP++N8OH/8GqVddy9Vj+3ZMnCdSsuY6jRx/x3Xf12bbtPTw9rWnb9hfWr/fTKfvNN8fJyNCwaVNrZsxoyM6d9/nii8P/+vgK8YwMPRSCVq1KYWFhpDP88PPPd3FyMqd+/eI6ZUNCErCwMGLWrEbUr+8OQKNGHvj7R7Fs2VUAvL1tcXQ01+mOTUjQnrF16FCG7t2z/9I3MTHU6c68dy+amTPP88EHpRk9+s1s76MoCpMmnaZjxzLMmdMYgGbNPImNTWXJkiuZ5ebMuUBkZDInTnSiZEkbAFq08KJcuZWMHXuCrVvfy7b+oKBYHBzMsLTM3fj9yJFHKV26GL/99gEGBurMeLy9lzN27Am2bHl+nN69K/Ptt9q5DU2aePDLL/7s2nWPvn2r8Oabblhba4/5Il28x449wtzciK+/roWJifbtZG9vyrlzoSiKQlqahri4VBYsaEKHDmUBaNiwBLGxqXz11RGePEnEycmcESOO0ry5Jxs2tM6su1694pQvv4qZM88zfXrDXMf0Mo/R/PkXiYtL4/Llj3Bzs/zzcXDF1zfnSZ1//PEILy8bBg6shlqtomHDElhaGvH0qbY3oFo1Z0xMDHB0NM/xMR0//iRVqzqxffv7qP7sBjA2NmDs2BM8eZKAs3P2cxkePozFyGh2lu2lSxdj0aK36devCgAxMSlMmnSKvn2rMG9ek8y229ub0avXPoYOrUGFCg58++1x2rTx5n//aw5A8+ZeGBmpGTXqWJZjjB79Jq1aeQPa98PXXx/l3Xc9WbeulU4cb7+9ld9+u0+rVt788ccj3nnHk48/1r4GGjXywNLSOHMuzL89ln83e/Z5wsOTuHOnR+b7q2XLUrz99haGDfuDTp3KoVZrH89KlRxYtapF5n3Png1l69bb2dYrRHakR6EQmJkZ0aaNt87ww6ZNt+jYsUzmh+Uzbm6WHDrUkXr1ivPgQQwHDjxgwYKLnDgRTEpKxr8eq2rV3I2Nx8am8N57P+PiYs6PP7bIEsczt28/JSwskTZtvHW2d+hQRuf/Bw8GUrWqI8WLW2We9anVKlq08OLAgYc5xmFoqM5yFpeThIRUzp0LpUOHMplfgAC2tqa0aePNkSNBOuXr1NH9snJ3tyIhIS1Xx8pJw4YlSEhIo2LF1YwadZRjxx7RrJknY8fWRaVSYWxswN69H9KhQ1keP47j8OFAli69wq5d2kmrKSnp3L79lNDQBDp1KqtTt7e3LXXquGZpx4vI7WN06FAQdeu6ZSYJACVL2lC3bs5JU+PGJbh9+ylvvLGWiRNPcv58KJ07l+Pzz6vnKrakpDQuXnxCu3aldV5vHTuW5fbtnjkmCaDtDTt3rgvnznVh//4PadDAHTc3S378sQX9+1fNrO/UqWC
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"cm = confusion_matrix(y_test, y_pred)\n",
"CMatrix = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classifier.classes_)\n",
"CMatrix.plot()\n",
"plt.title('Matriz de Confusão Logistic Regression', color=\"darkblue\")\n",
"plt.xlabel('Previsões', color=\"darkblue\")\n",
"plt.ylabel('Real', color=\"darkblue\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.68 0.49 0.57 1565\n",
" 1 0.73 0.55 0.63 1578\n",
" 2 0.77 0.78 0.77 1534\n",
" 3 0.26 0.59 0.36 584\n",
" 4 0.65 0.55 0.60 1349\n",
" 5 0.46 0.58 0.51 1015\n",
"\n",
" accuracy 0.59 7625\n",
" macro avg 0.59 0.59 0.57 7625\n",
"weighted avg 0.64 0.59 0.60 7625\n",
"\n"
]
}
],
"source": [
"classes = np.unique(y)\n",
"print(classification_report(y_test.tolist(), y_pred.tolist(), labels=classes.tolist()))"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "75d04b694d4247f2a8297581f2b64cc9",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/13122 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[101], line 17\u001b[0m\n\u001b[0;32m 15\u001b[0m n_combinations \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mlist\u001b[39m(product(\u001b[38;5;241m*\u001b[39mparam_grid\u001b[38;5;241m.\u001b[39mvalues()))) \n\u001b[0;32m 16\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m tqdm(total\u001b[38;5;241m=\u001b[39mn_combinations) \u001b[38;5;28;01mas\u001b[39;00m pbar: \n\u001b[1;32m---> 17\u001b[0m \u001b[43mgrid_search\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train_scaled\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train_balanced\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 18\u001b[0m pbar\u001b[38;5;241m.\u001b[39mupdate(n_combinations)\n\u001b[0;32m 19\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMelhores Parâmetros:\u001b[39m\u001b[38;5;124m\"\u001b[39m, grid_search\u001b[38;5;241m.\u001b[39mbest_params_)\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\model_selection\\_search.py:1019\u001b[0m, in \u001b[0;36mBaseSearchCV.fit\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 1013\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_format_results(\n\u001b[0;32m 1014\u001b[0m all_candidate_params, n_splits, all_out, all_more_results\n\u001b[0;32m 1015\u001b[0m )\n\u001b[0;32m 1017\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m results\n\u001b[1;32m-> 1019\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_search\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevaluate_candidates\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1021\u001b[0m \u001b[38;5;66;03m# multimetric is determined here because in the case of a callable\u001b[39;00m\n\u001b[0;32m 1022\u001b[0m \u001b[38;5;66;03m# self.scoring the return type is only known after calling\u001b[39;00m\n\u001b[0;32m 1023\u001b[0m first_test_score \u001b[38;5;241m=\u001b[39m all_out[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_scores\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\model_selection\\_search.py:1573\u001b[0m, in \u001b[0;36mGridSearchCV._run_search\u001b[1;34m(self, evaluate_candidates)\u001b[0m\n\u001b[0;32m 1571\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_run_search\u001b[39m(\u001b[38;5;28mself\u001b[39m, evaluate_candidates):\n\u001b[0;32m 1572\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Search all candidates in param_grid\"\"\"\u001b[39;00m\n\u001b[1;32m-> 1573\u001b[0m \u001b[43mevaluate_candidates\u001b[49m\u001b[43m(\u001b[49m\u001b[43mParameterGrid\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparam_grid\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\model_selection\\_search.py:965\u001b[0m, in \u001b[0;36mBaseSearchCV.fit.<locals>.evaluate_candidates\u001b[1;34m(candidate_params, cv, more_results)\u001b[0m\n\u001b[0;32m 957\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mverbose \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m 958\u001b[0m \u001b[38;5;28mprint\u001b[39m(\n\u001b[0;32m 959\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFitting \u001b[39m\u001b[38;5;132;01m{0}\u001b[39;00m\u001b[38;5;124m folds for each of \u001b[39m\u001b[38;5;132;01m{1}\u001b[39;00m\u001b[38;5;124m candidates,\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 960\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m totalling \u001b[39m\u001b[38;5;132;01m{2}\u001b[39;00m\u001b[38;5;124m fits\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[0;32m 961\u001b[0m n_splits, n_candidates, n_candidates \u001b[38;5;241m*\u001b[39m n_splits\n\u001b[0;32m 962\u001b[0m )\n\u001b[0;32m 963\u001b[0m )\n\u001b[1;32m--> 965\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mparallel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 966\u001b[0m \u001b[43m \u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_fit_and_score\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 967\u001b[0m \u001b[43m \u001b[49m\u001b[43mclone\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbase_estimator\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 968\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 969\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 970\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 971\u001b[0m \u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtest\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 972\u001b[0m \u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparameters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 973\u001b[0m \u001b[43m \u001b[49m\u001b[43msplit_progress\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msplit_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_splits\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 974\u001b[0m \u001b[43m \u001b[49m\u001b[43mcandidate_progress\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcand_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_candidates\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 975\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfit_and_score_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 976\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 977\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mcand_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43msplit_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mproduct\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 978\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43menumerate\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcandidate_params\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\utils\\parallel.py:74\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 69\u001b[0m config \u001b[38;5;241m=\u001b[39m get_config()\n\u001b[0;32m 70\u001b[0m iterable_with_config \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 71\u001b[0m (_with_config(delayed_func, config), args, kwargs)\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m delayed_func, args, kwargs \u001b[38;5;129;01min\u001b[39;00m iterable\n\u001b[0;32m 73\u001b[0m )\n\u001b[1;32m---> 74\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43miterable_with_config\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\joblib\\parallel.py:2007\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 2001\u001b[0m \u001b[38;5;66;03m# The first item from the output is blank, but it makes the interpreter\u001b[39;00m\n\u001b[0;32m 2002\u001b[0m \u001b[38;5;66;03m# progress until it enters the Try/Except block of the generator and\u001b[39;00m\n\u001b[0;32m 2003\u001b[0m \u001b[38;5;66;03m# reaches the first `yield` statement. This starts the asynchronous\u001b[39;00m\n\u001b[0;32m 2004\u001b[0m \u001b[38;5;66;03m# dispatch of the tasks to the workers.\u001b[39;00m\n\u001b[0;32m 2005\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 2007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\joblib\\parallel.py:1650\u001b[0m, in \u001b[0;36mParallel._get_outputs\u001b[1;34m(self, iterator, pre_dispatch)\u001b[0m\n\u001b[0;32m 1647\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[0;32m 1649\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backend\u001b[38;5;241m.\u001b[39mretrieval_context():\n\u001b[1;32m-> 1650\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_retrieve()\n\u001b[0;32m 1652\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mGeneratorExit\u001b[39;00m:\n\u001b[0;32m 1653\u001b[0m \u001b[38;5;66;03m# The generator has been garbage collected before being fully\u001b[39;00m\n\u001b[0;32m 1654\u001b[0m \u001b[38;5;66;03m# consumed. This aborts the remaining tasks if possible and warn\u001b[39;00m\n\u001b[0;32m 1655\u001b[0m \u001b[38;5;66;03m# the user if necessary.\u001b[39;00m\n\u001b[0;32m 1656\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\joblib\\parallel.py:1762\u001b[0m, in \u001b[0;36mParallel._retrieve\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1757\u001b[0m \u001b[38;5;66;03m# If the next job is not ready for retrieval yet, we just wait for\u001b[39;00m\n\u001b[0;32m 1758\u001b[0m \u001b[38;5;66;03m# async callbacks to progress.\u001b[39;00m\n\u001b[0;32m 1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ((\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jobs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m\n\u001b[0;32m 1760\u001b[0m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jobs[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mget_status(\n\u001b[0;32m 1761\u001b[0m timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtimeout) \u001b[38;5;241m==\u001b[39m TASK_PENDING)):\n\u001b[1;32m-> 1762\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0.01\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1763\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[0;32m 1765\u001b[0m \u001b[38;5;66;03m# We need to be careful: the job list can be filling up as\u001b[39;00m\n\u001b[0;32m 1766\u001b[0m \u001b[38;5;66;03m# we empty it and Python list are not thread-safe by\u001b[39;00m\n\u001b[0;32m 1767\u001b[0m \u001b[38;5;66;03m# default hence the use of the lock\u001b[39;00m\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"param_grid = {\n",
" 'n_estimators': [50, 100, 200],\n",
" 'learning_rate': [0.01, 0.1, 0.2],\n",
" 'max_depth': [3, 5, 10],\n",
" 'subsample': [0.8, 1.0],\n",
" 'colsample_bytree': [0.6, 0.8, 1.0],\n",
" 'gamma': [0, 0.1, 0.5],\n",
" 'min_child_weight': [1, 3, 5],\n",
" 'reg_alpha': [0, 0.01, 0.1],\n",
" 'reg_lambda': [1, 1.5, 2]\n",
"}\n",
"scaler = StandardScaler()\n",
"X_train_scaled = scaler.fit_transform(X_train_balanced)\n",
"grid_search = GridSearchCV(estimator=XGBClassifier(random_state=42, tree_method='gpu_hist'),param_grid=param_grid,scoring='accuracy',cv=2, n_jobs=-1)\n",
"n_combinations = len(list(product(*param_grid.values()))) \n",
"with tqdm(total=n_combinations) as pbar: \n",
" grid_search.fit(X_train_scaled, y_train_balanced)\n",
" pbar.update(n_combinations)\n",
"print(\"Melhores Parâmetros:\", grid_search.best_params_)\n",
"print(\"Melhor accuracy:\", grid_search.best_score_)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Melhores Parâmetros: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.8}\n",
"\n",
"Melhor accuracy: 0.6592488954270308"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\garci\\anaconda32\\Lib\\site-packages\\xgboost\\core.py:158: UserWarning: [15:50:34] WARNING: C:\\buildkite-agent\\builds\\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\\xgboost\\xgboost-ci-windows\\src\\learner.cc:740: \n",
"Parameters: { \"gama\" } are not used.\n",
"\n",
" warnings.warn(smsg, UserWarning)\n"
]
}
],
"source": [
"xgb = XGBClassifier(colsample_bytree=0.8, gama=0.1, learning_rate= 0.1, max_depth=5,min_child_weight=1, n_estimators=100, subsample=0.8, reg_alpha=0, reg_lambda=1)\n",
"xgb.fit(X_train_scaled,y_train_balanced)\n",
"previsoesxgb = xgb.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Modelos//logistic_regression_model.joblib']"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"joblib.dump(xgb, 'Modelos//logistic_regression_model.joblib')"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgoAAAHJCAYAAADkVRHSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACx5klEQVR4nOzdd1yV1R/A8c+97I1sEBEEcc8cuXKV5irN0jQt907L1DTNbZp75fq5co+0TM2VI/fe4kJRUECG7A33+f1xE7sBhXIBje/79bov9HnOPc/33Pl9zjnPuSpFURSEEEIIIbKhLuwAhBBCCPHqkkRBCCGEEDmSREEIIYQQOZJEQQghhBA5kkRBCCGEEDmSREEIIYQQOZJEQQghhBA5kkRBCCGEEDmSROE/RNbOEq8ieV0K8XqTRKEANWq0CZVqJnXrbsixzMcf70Slmkm3bnteqO4TJx7TqtX2fy03fvwJVKqZL1R3bhw5EohKNZMjRwL1Ul90dDITJ56kcuXVWFnNw8npB5o23cLOnff0Un92rl0Lp1q1NZiYzKF8+ZV6qzclJZ2mTbdgZDQbA4NZGBrOwspqHnfuPM1z3d267cHTc5keoswdlWom48efyHX55cuvMmzYkcz/r159HZVqJg8exLx0DN267UGlmpnlZmk5j0qVVjN79vmXrvtV9qKPvRD6YljYARQ1arWK06eDefQoDnd3K519CQmpL/1F+L//XcXPL/Jfy/XqVZl33/V6qWMUlFu3ImnRYhsZGQpDhlSnShUnEhLSWL/ej/fe+5lJk+oxZkwdvR93woSTPHwYy88/v4+Tk7ne6lWpVMyb15jExHQMDdWYmBjg4WGNlZWx3o5RUE6d6pzldftPJk8+TaNGJTL/36pVKU6d6oyrq0We4nBxseDnn9/P/L+iQGhoAkuXXuGrr45gZmZI//5V83SMV82LPvZC6IskCgWsenUnbtyIZOvW23z5ZQ2dfTt33sfCwohixUzz7fju7lav9IdNWloGHTrsxMjIgDNnPsbJ6fkXyvvv+9Cnz36+/fYEbdp4U6WKk16PHRmZTKVKDrRsWUqv9RobG1CxoqNe6ywsb77plqf7Ozqa4+iY9yTMxMQg21haty5FqVLLWbXq+n8uUcjrYy/Ey5KhhwJmYWFEq1al2Lr1TpZ9mzff4sMPfTE01H1aIiISGTjwd0qWXIqx8Wzs7BbSrt0vmd233brt4ccfb/DwYSwq1UxWr77OgwcxqFQzmT37PGXLrsTcfC6rVl3TGXp4Via72791Zy9degVf3xWYmc2lYcNNPHwYm6VMYGAsnTrtws5uIebmc2nadAuXLj35x3p3777PtWsRTJ5cTydJeGbixHoMGlSN9HRN5rbz50N5992fsLdfiLX1fNq02c6NGxGZ+58Nixw8+JBmzbZibj4XF5dFfP31H2RkaOvRDpsEcfToo8zHMKdhmr93AW/ceJMqVX7EzGwujo4/0KXLboKD4zP3JyWlMWrUUUqXXo6JyRysrefzzjtbuXw5TKfeAwce0KDBRmxs5mNvv5DOnXcRFJT1cX0Z//YYgbYnp2XLbVhbz8fZeRGjRx+jR4+9NGq0Kce2z5t3gbJlV2JqOofixZcwYMABYmNTAPD0XMbDh7H8+OONzOGG7IYefvvtPvXqbcDCYi5ubovp1+8A0dHJL9VOIyMDLCyMUKl0ty9ffpUKFVZhYjIHD4+ljB9/IvO5f+bHH69Tvry2LVWq/MjBgw8xNJzF6tXXAe2wiaHhLJYvv4qLyyLs7Bbi56d9DHfs8KdGjbWYms7BxWURQ4YcIiEhNbPupKQ0Bgw4gLv7EkxM5lC27Epmzjync/x/eiyze+xDQuLp0WMvJUosxcxsLrVqrePXX/116lSpZrJo0SV69dqHnd1CrKzm0aHDrzx5kvBSj68omiRRKAQdO5bJHH54JjY2hT17AujUqZxOWUVRaNVqO/v3P+D77xuyf/+HjB9fl4MHA+nX7wAA335bh5YtvXBxseDUqc60avX8jHj8+JN8/XVN1q5tyTvveOrU7eqqLf/X29SpDQDo1atSjvEvXHiRfv0O0KpVKXbsaMubb7rSp88BnTIREYnUrbuBCxeesHBhUzZubI1Go/DWW5u4eTPnIZK9ex9gYKDK8azexcWCBQua8sYbLgAcPhxI3bobUBRYtepdli9vRlBQHHXrbuDWLd3jfPLJbho0cGfXrg/o3Lkc06efY/nya4C2W7daNSeqVXPK8hj+kxMnHtO162+0b1+aPXs+YM6cxhw8GEinTrsyy3z66R5WrrzOqFG12b//Q2bPbsSNGxF07rwrc6Lf2rU3aNbsJ0qUsGLjxtbMmdOYU6eCqVNnA2FheftQz81jFBGRyFtvbSIwMJZVq95l/vwm/PTTHTZsuJljvRs33mTEiKMMHFiVffs+ZOzYOqxd68fnnx8E4Oef38fFxYKWLb1yHG7YteserVtvx8nJnC1b2vD992/x88936dhxV5ayf5eersm8paSk8+BBDEOHHub27ad8+mmFzHJTp56hT5/9vP12SXbubMegQdX4/vtz9OmzP7PMmjU36NZtL/XqFWfHjrZ8+KEvbdv+QkaG7kTMjAyFWbPOs2JFc+bMaUS5cvZs2HCTtm1/oWxZO375pS3jx9dl7Vo/3n//l8zn94svDrNnTwAzZzZi3772vP++N8OH/8GqVddy9Vj+3ZMnCdSsuY6jRx/x3Xf12bbtPTw9rWnb9hfWr/fTKfvNN8fJyNCwaVNrZsxoyM6d9/nii8P/+vgK8YwMPRSCVq1KYWFhpDP88PPPd3FyMqd+/eI6ZUNCErCwMGLWrEbUr+8OQKNGHvj7R7Fs2VUAvL1tcXQ01+mOTUjQnrF16FCG7t2z/9I3MTHU6c68dy+amTPP88EHpRk9+s1s76MoCpMmnaZjxzLMmdMYgGbNPImNTWXJkiuZ5ebMuUBkZDInTnSiZEkbAFq08KJcuZWMHXuCrVvfy7b+oKBYHBzMsLTM3fj9yJFHKV26GL/99gEGBurMeLy9lzN27Am2bHl+nN69K/Ptt9q5DU2aePDLL/7s2nWPvn2r8Oabblhba4/5Il28x449wtzciK+/roWJifbtZG9vyrlzoSiKQlqahri4VBYsaEKHDmUBaNiwBLGxqXz11RGePEnEycmcESOO0ry5Jxs2tM6su1694pQvv4qZM88zfXrDXMf0Mo/R/PkXiYtL4/Llj3Bzs/zzcXDF1zfnSZ1//PEILy8bBg6shlqtomHDElhaGvH0qbY3oFo1Z0xMDHB0NM/xMR0//iRVqzqxffv7qP7sBjA2NmDs2BM8eZKAs3P2cxkePozFyGh2lu2lSxdj0aK36devCgAxMSlMmnSKvn2rMG9ek8y229ub0avXPoYOrUGFCg58++1x2rTx5n//aw5A8+ZeGBmpGTXqWJZjjB79Jq1aeQPa98PXXx/l3Xc9WbeulU4cb7+9ld9+u0+rVt788ccj3nnHk48/1r4GGjXywNLSOHMuzL89ln83e/Z5wsOTuHOnR+b7q2XLUrz99haGDfuDTp3KoVZrH89KlRxYtapF5n3Png1l69bb2dYrRHakR6EQmJkZ0aaNt87ww6ZNt+jYsUzmh+Uzbm6WHDrUkXr1ivPgQQwHDjxgwYKLnDgRTEpKxr8eq2rV3I2Nx8am8N57P+PiYs6PP7bIEsczt28/JSwskTZtvHW2d+hQRuf/Bw8GUrWqI8WLW2We9anVKlq08OLAgYc5xmFoqM5yFpeThIRUzp0LpUOHMplfgAC2tqa0aePNkSNBOuXr1NH9snJ3tyIhIS1Xx8pJw4YlSEhIo2LF1YwadZRjxx7RrJknY8fWRaVSYWxswN69H9KhQ1keP47j8OFAli69wq5d2kmrKSnp3L79lNDQBDp1KqtTt7e3LXXquGZpx4vI7WN06FAQdeu6ZSYJACVL2lC3bs5JU+PGJbh9+ylvvLGWiRNPcv58KJ07l+Pzz6vnKrakpDQuXnxCu3aldV5vHTuW5fbtnjkmCaDtDTt3rgvnznVh//4PadDAHTc3S378sQX9+1fNrO/UqWC
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"cm2 = confusion_matrix(y_test, y_pred)\n",
"CMatrix = ConfusionMatrixDisplay(confusion_matrix=cm2, display_labels=xgb.classes_)\n",
"CMatrix.plot()\n",
"plt.title('Matriz de Confusão Logistic Regression', color=\"darkblue\")\n",
"plt.xlabel('Previsões', color=\"darkblue\")\n",
"plt.ylabel('Real', color=\"darkblue\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.52 0.77 0.62 1565\n",
" 1 0.77 0.58 0.66 1578\n",
" 2 0.78 0.82 0.80 1534\n",
" 3 0.66 0.50 0.57 584\n",
" 4 0.71 0.58 0.64 1349\n",
" 5 0.61 0.56 0.58 1015\n",
"\n",
" accuracy 0.66 7625\n",
" macro avg 0.67 0.64 0.65 7625\n",
"weighted avg 0.68 0.66 0.66 7625\n",
"\n"
]
}
],
"source": [
"\n",
"print(classification_report(y_test, previsoesxgb))"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "65bf7c3e26fe4a3eb4a49169477f0d65",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/5 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Melhores Parâmetros: {'var_smoothing': 1e-05}\n",
"Melhor accuracy: 0.31687985071238955\n"
]
}
],
"source": [
"scaler = StandardScaler()\n",
"X_train_scaled = scaler.fit_transform(X_train_balanced)\n",
"param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}\n",
"model = GaussianNB()\n",
"grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)\n",
"n_combinations = len(list(product(*param_grid.values()))) \n",
"with tqdm(total=n_combinations) as pbar: \n",
" grid_search.fit(X_train_scaled, y_train)\n",
" pbar.update(n_combinations)\n",
"print(\"Melhores Parâmetros:\", grid_search.best_params_)\n",
"print(\"Melhor accuracy:\", grid_search.best_score_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sc=StandardScaler()\n",
"x_treino2=sc.fit_transform(X_train_scaled)\n",
"x_teste2=sc.transform(X_test)\n",
"naive=GaussianNB(var_smoothing= 1e-05)\n",
"naive.fit(x_treino2, y_train_balanced)\n",
"previsoesnb = naive.predict(x_teste2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}