1519 lines
228 KiB
Plaintext
1519 lines
228 KiB
Plaintext
|
|
{
|
|||
|
|
"cells": [
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"## BIBLIOTECA"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 62,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"import pandas as pd\n",
|
|||
|
|
"import seaborn as sns\n",
|
|||
|
|
"import numpy as np\n",
|
|||
|
|
"from matplotlib import pyplot as plt\n",
|
|||
|
|
"import statistics as st\n",
|
|||
|
|
"from statistics import mode, mean\n",
|
|||
|
|
"from scipy.stats import spearmanr, pearsonr, skew, kendalltau, norm\n",
|
|||
|
|
"from collections import Counter\n",
|
|||
|
|
"from datetime import datetime\n",
|
|||
|
|
"from keras.models import load_model\n",
|
|||
|
|
"import warnings\n",
|
|||
|
|
"#!pip install wordcloud\n",
|
|||
|
|
"import string\n",
|
|||
|
|
"from wordcloud import WordCloud\n",
|
|||
|
|
"from xgboost import XGBClassifier\n",
|
|||
|
|
"import nltk\n",
|
|||
|
|
"import torch\n",
|
|||
|
|
"from torch import nn\n",
|
|||
|
|
"from torch.utils.data import DataLoader, Dataset\n",
|
|||
|
|
"#nltk.download('all')\n",
|
|||
|
|
"import spacy\n",
|
|||
|
|
"import pickle \n",
|
|||
|
|
"from sklearn.datasets import load_files, make_classification\n",
|
|||
|
|
"from docx import Document\n",
|
|||
|
|
"import os\n",
|
|||
|
|
"import re\n",
|
|||
|
|
"import pdfplumber\n",
|
|||
|
|
"import gensim\n",
|
|||
|
|
"#!pip install gensim\n",
|
|||
|
|
"from gensim.models import FastText\n",
|
|||
|
|
"from nltk.corpus import stopwords\n",
|
|||
|
|
"from joblib import Parallel, delayed\n",
|
|||
|
|
"from nltk.tokenize import word_tokenize\n",
|
|||
|
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|||
|
|
"from textblob import TextBlob\n",
|
|||
|
|
"from deep_translator import GoogleTranslator\n",
|
|||
|
|
"from pyannote.audio import Pipeline\n",
|
|||
|
|
"from transformers import pipeline\n",
|
|||
|
|
"from nltk.stem import WordNetLemmatizer\n",
|
|||
|
|
"from reportlab.lib.pagesizes import A4\n",
|
|||
|
|
"from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Image\n",
|
|||
|
|
"from reportlab.lib import colors\n",
|
|||
|
|
"from PIL import Image, ImageOps\n",
|
|||
|
|
"from tkinter import filedialog\n",
|
|||
|
|
"from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, ParameterGrid\n",
|
|||
|
|
"import tabula\n",
|
|||
|
|
"import PyPDF2\n",
|
|||
|
|
"import pdfplumber\n",
|
|||
|
|
"from datetime import datetime\n",
|
|||
|
|
"from sklearn.neighbors import KNeighborsClassifier\n",
|
|||
|
|
"from sklearn.naive_bayes import GaussianNB\n",
|
|||
|
|
"from sklearn.svm import SVC\n",
|
|||
|
|
"from imblearn.combine import SMOTEENN\n",
|
|||
|
|
"from sklearn.tree import DecisionTreeClassifier\n",
|
|||
|
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|||
|
|
"import joblib\n",
|
|||
|
|
"from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
|
|||
|
|
"import fitz\n",
|
|||
|
|
"from sklearn.linear_model import LogisticRegression\n",
|
|||
|
|
"from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay, classification_report, f1_score, precision_score, recall_score, roc_auc_score\n",
|
|||
|
|
"from tqdm.notebook import tqdm\n",
|
|||
|
|
"from tensorflow.keras.models import Sequential, load_model\n",
|
|||
|
|
"from tensorflow.keras.optimizers import Adam\n",
|
|||
|
|
"from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, Flatten, Dropout, BatchNormalization\n",
|
|||
|
|
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
|
|||
|
|
"from keras.utils import to_categorical\n",
|
|||
|
|
"from itertools import product\n",
|
|||
|
|
"#!python -m spacy download pt_core_news_sm"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"## FUNÇÕES"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 63,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"def grafico(a):\n",
|
|||
|
|
" if a == 1:\n",
|
|||
|
|
" custom_params = {\"axes.spines.right\": False, \"axes.spines.top\": False}\n",
|
|||
|
|
" sns.set_theme(style=\"ticks\", rc=custom_params, palette = \"pastel\")\n",
|
|||
|
|
" palette = \"pastel\"\n",
|
|||
|
|
" elif a == 2:\n",
|
|||
|
|
" sns.set_theme(style=\"white\", palette = \"Set2\")\n",
|
|||
|
|
" palette = \"Set2\"\n",
|
|||
|
|
" elif a == 3:\n",
|
|||
|
|
" sns.set_theme(style=\"whitegrid\", palette = \"pastel\")\n",
|
|||
|
|
" palette = \"pastel\"\n",
|
|||
|
|
" elif a == 4:\n",
|
|||
|
|
" sns.set_theme()\n",
|
|||
|
|
" palette = \"husl\"\n",
|
|||
|
|
" return palette"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"### Tratamento para portugues"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 64,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stderr",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"<>:8: SyntaxWarning: invalid escape sequence '\\s'\n",
|
|||
|
|
"<>:8: SyntaxWarning: invalid escape sequence '\\s'\n",
|
|||
|
|
"C:\\Users\\garci\\AppData\\Local\\Temp\\ipykernel_40992\\3345901888.py:8: SyntaxWarning: invalid escape sequence '\\s'\n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.replace('[^a-z\\s]', '')\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"portuguesstopwords = set(stopwords.words('portuguese'))\n",
|
|||
|
|
"nlp = spacy.load(\"pt_core_news_sm\")\n",
|
|||
|
|
"def preprocess_text_column_pt(df, column_name):\n",
|
|||
|
|
" if column_name not in df.columns:\n",
|
|||
|
|
" raise ValueError(f\"A coluna '{column_name}' não existe no dataset.\")\n",
|
|||
|
|
" df_copy = df.copy()\n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.lower() \n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.replace('[^a-z\\s]', '') \n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.strip() \n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.replace(r'\\s+', ' ', regex=True) \n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.replace(r'\\d+', '', regex=True)\n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.translate(str.maketrans('', '', string.punctuation))\n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in portuguesstopwords]))\n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.replace(r'http\\S+|https\\S+|www\\S+', '', regex=True) \n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.replace(r'\\S+@\\S+', '', regex=True) \n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].apply(lambda x: ' '.join([word for word in x.split() if len(word) > 2]))\n",
|
|||
|
|
" df_copy['TextoLema'] = df_copy[column_name].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)])) \n",
|
|||
|
|
"\n",
|
|||
|
|
" return df_copy['TextoLema']"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"### Tratamento para ingles"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 65,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stderr",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"<>:8: SyntaxWarning: invalid escape sequence '\\s'\n",
|
|||
|
|
"<>:8: SyntaxWarning: invalid escape sequence '\\s'\n",
|
|||
|
|
"C:\\Users\\garci\\AppData\\Local\\Temp\\ipykernel_40992\\4039191732.py:8: SyntaxWarning: invalid escape sequence '\\s'\n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.replace('[^a-z\\s]', '', regex=True)\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"english_stopwords = set(stopwords.words('english'))\n",
|
|||
|
|
"nlp = spacy.load(\"en_core_web_sm\")\n",
|
|||
|
|
"def preprocess_text_column(df, column_name):\n",
|
|||
|
|
" if column_name not in df.columns:\n",
|
|||
|
|
" raise ValueError(f\"The column '{column_name}' does not exist in the dataset.\")\n",
|
|||
|
|
" df_copy = df.copy()\n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.lower()\n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.replace('[^a-z\\s]', '', regex=True)\n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.replace(r'\\s+', ' ', regex=True).str.strip()\n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.replace(r'\\d+', '', regex=True)\n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.translate(str.maketrans('', '', string.punctuation))\n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.replace(r'http\\S+|https\\S+|www\\S+', '', regex=True)\n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].str.replace(r'\\S+@\\S+', '', regex=True)\n",
|
|||
|
|
" df_copy[column_name] = df_copy[column_name].apply(lambda x: ' '.join([word for word in x.split() if word not in english_stopwords and len(word) > 2]))\n",
|
|||
|
|
" df_copy['LemmatizedText'] = df_copy[column_name].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x) if not token.is_punct and not token.is_stop]))\n",
|
|||
|
|
" return df_copy['LemmatizedText']"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 66,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
" Text emotion\n",
|
|||
|
|
"0 That game hurt. Sadness\n",
|
|||
|
|
"1 Man I love reddit. Disgust\n",
|
|||
|
|
"2 So happy for [NAME]. So sad he's not here. Ima... Sadness\n",
|
|||
|
|
"3 I just came home, what the fuck is this lineup... Disgust\n",
|
|||
|
|
"4 By far the coolest thing I've seen on this thr... Happiness\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"colunas = [\"Text\", \"emotion\"]\n",
|
|||
|
|
"dataset3 = pd.read_excel(\"C:\\\\Users\\\\garci\\\\OneDrive\\\\Área de Trabalho\\\\Programa PJM\\\\Programa final\\\\Dataset\\\\GoEmotions_parateste.xlsx\",names=colunas)\n",
|
|||
|
|
"print(dataset3.head())\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 67,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/plain": [
|
|||
|
|
"array(['Sadness', 'Disgust', 'Happiness', 'Surprise', 'Anger', 'Fear'],\n",
|
|||
|
|
" dtype=object)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 67,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"dataset3[\"emotion\"].unique()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 68,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/html": [
|
|||
|
|
"<div>\n",
|
|||
|
|
"<style scoped>\n",
|
|||
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
|
" vertical-align: middle;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe tbody tr th {\n",
|
|||
|
|
" vertical-align: top;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe thead th {\n",
|
|||
|
|
" text-align: right;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"</style>\n",
|
|||
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
|
" <thead>\n",
|
|||
|
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
|
" <th></th>\n",
|
|||
|
|
" <th>V. Absolutos</th>\n",
|
|||
|
|
" <th>V. Relativos (%)</th>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>emotion</th>\n",
|
|||
|
|
" <th></th>\n",
|
|||
|
|
" <th></th>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </thead>\n",
|
|||
|
|
" <tbody>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>Happiness</th>\n",
|
|||
|
|
" <td>7907</td>\n",
|
|||
|
|
" <td>20.67</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>Anger</th>\n",
|
|||
|
|
" <td>7844</td>\n",
|
|||
|
|
" <td>20.50</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>Disgust</th>\n",
|
|||
|
|
" <td>7678</td>\n",
|
|||
|
|
" <td>20.07</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>Sadness</th>\n",
|
|||
|
|
" <td>6758</td>\n",
|
|||
|
|
" <td>17.66</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>Surprise</th>\n",
|
|||
|
|
" <td>5144</td>\n",
|
|||
|
|
" <td>13.45</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>Fear</th>\n",
|
|||
|
|
" <td>2927</td>\n",
|
|||
|
|
" <td>7.65</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </tbody>\n",
|
|||
|
|
"</table>\n",
|
|||
|
|
"</div>"
|
|||
|
|
],
|
|||
|
|
"text/plain": [
|
|||
|
|
" V. Absolutos V. Relativos (%)\n",
|
|||
|
|
"emotion \n",
|
|||
|
|
"Happiness 7907 20.67\n",
|
|||
|
|
"Anger 7844 20.50\n",
|
|||
|
|
"Disgust 7678 20.07\n",
|
|||
|
|
"Sadness 6758 17.66\n",
|
|||
|
|
"Surprise 5144 13.45\n",
|
|||
|
|
"Fear 2927 7.65"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 68,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"pd.DataFrame({'V. Absolutos':dataset3['emotion'].value_counts(), 'V. Relativos (%)':(dataset3['emotion'].value_counts()*100/dataset3.shape[0]).round(2)})"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 69,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stderr",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"C:\\Users\\garci\\AppData\\Local\\Temp\\ipykernel_40992\\1100090595.py:3: FutureWarning: \n",
|
|||
|
|
"\n",
|
|||
|
|
"Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.\n",
|
|||
|
|
"\n",
|
|||
|
|
" sns.countplot(x=\"emotion\", data=dataset3,order= dataset3['emotion'].value_counts().index,palette=\"husl\")\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA9gAAAJICAYAAACaO0yGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACEgElEQVR4nOzdd1yV5f/H8fdhgwoKqLhw4EDNmTgqR66y1MRRZo4cZaWpmZqluVeKo1Ib7kpz5EgcaWnO1NScqZnmVkRBxcE8h98f/DzfTjgQbzwcfD0fDx/BdV/3dX8OXAHvc9/XfZuSk5OTBQAAAAAAHoqTvQsAAAAAACArIGADAAAAAGAAAjYAAAAAAAYgYAMAAAAAYAACNgAAAAAABiBgAwAAAABgAAI2AAAAAAAGIGADAAAAAGAAAjYAAHeRnJxs7xIAAIADIWADABxSnTrzZTKFWf85OYUpR45PVaXKt/rssz+UlGSx6V+kyNd6/fXVaR5/+fJj6tDh/v1ff321ihT5Ot3HuZvZsw/KZArTyZPX0rzPDz/8JX//KXJyCpOLy3g5O4/Xm2+ufeha7GHDhtM23987/fvppxP2LlOSFBubqGefXWD9mru4jJePz2c6ceKqvUsDADxiLvYuAACA9KpUKY+mTq0vSTKbkxUdHafVq//Re+/9qs2bz2rBgiZycjJJkpYufUne3u5pHnvChF1p6vfxxzXUs2flBy/+Pl58sZi2bWujfPmypXmf6tXz65dfWikx0Sw3N2d5e7upaNGchtf2KE2ZUk+VK+e947bSpf0ecTV35uRk0uef19WtW0lycXGSu7uzChf2VvbsbvYuDQDwiBGwAQAOy9vbTdWr57dpa9IkSMHBfurZc72+//6wXnutjCSpUqU7h7SHFRSUM0PGzZ3bS7lzez3QPgUL5lDBgjkypB57KVPGL9X3OLNxd3fRE0/ktncZAIBMgEvEAQBZTvfulVSgQHZ9+eU+a9t/L93+/vvDqlBhjjw9Jyl37ilq23alzp+/ISnl8vONG89q48azMpnCtGHDaesly199tU+FC38lb+/P9PPPJ1NdIi5JiYkW9eixTrlyfa6cOT9Xhw6rdOnSLev2OnXmq06d+Tb73B5/w4bTku58ifiqVf/o6afnKVu2Scqf/wu9/fbPunYt3rp906Yzeu65H5Qr1+dyc5ugokW/1pAhW2Wx/G8t+bVr8erd+1cFBU2Th8dEPfHELM2ceeCeX8/bta1de1K1as2Xp+cklSgxXV98sdemX1xckoYP36bg4Jny8JioEiWm65NPdtgcv06d+WrbdqVatvxR2bJNUoMGi+557LSYPfugPDwmasuWswoJ+VYeHhNVqtQMhYcf119/RatevYXy8pqk4sWna/78Izb7/v33FbVs+aMCAqYqW7ZJevbZBdq69ZxNn5iYeL377joVKPClsmWbpJCQb7Vy5XHrdrPZoqlT96hcudny9JykwMCv1L//JsXFJdmMs3nzWdWuPV9eXpPk6zs51bywWJI1cOAWFS36tdzdJ6po0a/14YeblJhofuivEQDg0SBgAwCyHCcnk+rVC9SOHRdSrcWWpK1bz6ldu1Vq0aKEVq9urokTn9W6daf16qsrJElTp9ZXpUp5VKlSHm3b1sbmEuWhQ3/T+PF1NGVKPT311J3PrC5YcER//BGpOXMaKSystlauPKEXXlgsszl1LWm1YsVxNW68RHnzZtOiRU31ySe1tGTJ39aa9+2LVL16i+Tv76kFC5ooPDxUNWsW1NCh27Rw4V+SUtYKP/PM95o797D69auqH39sppo1C6pz5zUaNWr7fWt45ZVwVa6cR8uWvaQGDQrrnXd+sYbs5ORkNWmyVGPH/q4uXcopPDxUrVqV0oABW/TWWz//5+vzl3LkcNPy5aHq1y/knsc0m5OVlGRJ9e+/X8vERItefXWlunatoOXLQ+Xl5arXXlupxo2X6MUXiyk8PFT582dXhw6rdfbsdUnSoUOX9eST3+rkyRh9/nk9zZvXWCaT9OyzC7Rx45n/P75FDRv+oLlzD+ujj6pp+fJQBQf7qVmzH/XbbylBvGvXn9Wr168KDS2u5cubqXv3Svr88z/00kvLrDfK27TpzP8HfRctXNhEkyY9qw0bzujZZxcoNjZRkvTJJ79r6tS9GjSohtaubam3366oceN2asSI+39vAACZA5eIAwCypICAbEpMtCgqKlZ589quY968+ay8vFz1wQdV5e6e8qvQz89DO3dGKDk5WWXK+MvbO2X97H8vT37nnYpq2bLUPY/t7++pNWtaKFu2lDFy5/ZSs2bLtHr1CTVuHJSu1zNkyG+qWDGPFi9uKpMpZV25m5uzBg3aqsuXb2n//ktq0KCwvv32Beu68wYNimj58uPasOGMWrcO1uzZf+rgwcv67bc2qlEj5XU991xRJSZaNHz4dr31VgX5+nretYbmzUto0qS61v3On7+p4cO36a23Kuinn07ol19O6fvvG6t162Dr8b28XPTxx1vVs2dllS3r//91O+nLLxtYv/b3Ur/+nc9wly3rp4MHO1o/t1iSNWBANXXpUl6SdOVKnFq3XqFevSqrd+8qkqScOd1Vpcp32rUrQgUL5tDQodvk7u6sX399RTlypHyvXnyxmJ54Yrb69t2o339vq9WrT2jHjgtatqyZXnqpuCTp2WcDdfz4Va1ff1o5c7prxowDGj26pvr3r2Z93fnzZ1e7dqu0evUJvfBCMX344WaVKuWrFSuay9k55fxG9er5VKbMLM2ceVDdulXSxo1nVKVKXnXsWE6SVLt2IXl5uShnTo/7fp0AAJkDZ7ABAFnS7Sds/X8WtVG7diHdvJmoJ56YrQ8/3KTNm8+qYcMiGjToKWt4vZuKFfPc99gvvljMGq6llHXhLi5O2rTp7AO9httiYxP1xx8XFRpawqa+V14J1l9/dZa/v5fatSurVataKCHBrP37L2nx4qMaPHirkpIsio9PuVR5w4YzKlLE2xqub2vbtrTi4pK0ffuFe9bRoUNZm89btCihCxdu6ujRK9qw4YxcXJzUqlXJ/4ydsgb+9hlhKeXmZGkJ15L05ZcNtHNn21T/FixokqrvU08VsH6cN2/K+vVq1fJZ2/z8Ut48uHo15bL6DRvOqHHjYtZwLUkuLk5q3bqUdu2K0I0bCdqy5ZxcXZ3UpMn/3hhxcjLpt9/aaODAGtq4MeV7+uqrwTa1tG4dLGdnkzZsOKNbtxK1ffsFvfhiMSUny3oWvlixnCpd2k8//3xKkvTss4X088+nVLPm9xo37ncdOnRZ3btXtn4NAQCZH2ewAQBZ0tmz1+Xp6WINVf9Wo0Z+rVrVXBMm7NaECbs1ZszvypvXSwMGVNe77977juDZs7ve99gBAbZnzJ2cTPL399SVK3EP9iL+X3R0nJKTpTx57n7Ts9jYRL377np9++0hJSaaVbSoj556qoBcXZ2sbzZER8elqu3f9d4OnndToIDtDdRu1xMdHafo6Dj5+3taz87ea+y0fA1vK1Uql6pUCUhT39tXHfxbtmx3P9a9vh7JyVJMTIKiomLl5+dpvSrgTmPc3uffXFyc5O/vqatX43XlSpwslmR98snv+uST31ON4emZ8udY375VlT27m2bOPKgPPtikfv02qWxZP33+eT09+2zg3V84ACDTIGADALKcpCSLNmw4o6efLpAq8N323HNF9dxzRXXrVqLWrz+tTz/9Qz16rFf16vkUEpLvjvuk1e3QdZvZbNHly7HWQGoymVKtDb9xI/Gu4/n4uMtkks0NsSTp1q1Ebdp0VtWq5dMHH2zSDz8c1cKFTVS/fqD1DHqePFOs/X19PXTs2JVU41+4cFNSyqXt93L5cqzNXdMvXrz1/8fwkq+vhy5fjpXZbLH5ml+4cCNNY9uDr6+HIiJupmq//fXw8/NQzpzuioqKVXJyss3VA3/8cVEmU8oYkhQRcVOFC/tYtycmmnX5cqz8/T3l7Z3y/XvvvSf16qulUx3PyyvlzzEnJ5O6daukbt0qKTLyplatOqGRI7erefMfdfHiO3Jzczb09QMAjMcl4gCALOerr/bpwoW
|
|||
|
|
"text/plain": [
|
|||
|
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"grafico(4)\n",
|
|||
|
|
"plt.figure(figsize=(10, 6), dpi=100)\n",
|
|||
|
|
"sns.countplot(x=\"emotion\", data=dataset3,order= dataset3['emotion'].value_counts().index,palette=\"husl\")\n",
|
|||
|
|
"plt.xlabel('Emoçoes', color=\"Darkblue\")\n",
|
|||
|
|
"plt.xticks(rotation=90)\n",
|
|||
|
|
"plt.ylabel('Frequência', color='Darkblue')\n",
|
|||
|
|
"plt.title('Distribuição por Emoções', color='Darkblue')\n",
|
|||
|
|
"for i, value in enumerate(dataset3['emotion'].value_counts()):\n",
|
|||
|
|
" plt.text(i, value+250, str(value),ha='center',color='Darkblue')\n",
|
|||
|
|
"plt.tight_layout()\n",
|
|||
|
|
"plt.show() "
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 70,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/html": [
|
|||
|
|
"<div>\n",
|
|||
|
|
"<style scoped>\n",
|
|||
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
|
" vertical-align: middle;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe tbody tr th {\n",
|
|||
|
|
" vertical-align: top;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe thead th {\n",
|
|||
|
|
" text-align: right;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"</style>\n",
|
|||
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
|
" <thead>\n",
|
|||
|
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
|
" <th></th>\n",
|
|||
|
|
" <th>Text</th>\n",
|
|||
|
|
" <th>emotion</th>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </thead>\n",
|
|||
|
|
" <tbody>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>0</th>\n",
|
|||
|
|
" <td>That game hurt.</td>\n",
|
|||
|
|
" <td>Sadness</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>1</th>\n",
|
|||
|
|
" <td>Man I love reddit.</td>\n",
|
|||
|
|
" <td>Disgust</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>2</th>\n",
|
|||
|
|
" <td>So happy for [NAME]. So sad he's not here. Ima...</td>\n",
|
|||
|
|
" <td>Sadness</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>3</th>\n",
|
|||
|
|
" <td>I just came home, what the fuck is this lineup...</td>\n",
|
|||
|
|
" <td>Disgust</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>4</th>\n",
|
|||
|
|
" <td>By far the coolest thing I've seen on this thr...</td>\n",
|
|||
|
|
" <td>Happiness</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>...</th>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38253</th>\n",
|
|||
|
|
" <td>I just called the Capitol Police. They are not...</td>\n",
|
|||
|
|
" <td>Anger</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38254</th>\n",
|
|||
|
|
" <td>What a great photo and you two look so happy. 😍</td>\n",
|
|||
|
|
" <td>Happiness</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38255</th>\n",
|
|||
|
|
" <td>Well, I'm glad you're out of all that now. How...</td>\n",
|
|||
|
|
" <td>Happiness</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38256</th>\n",
|
|||
|
|
" <td>Everyone likes [NAME].</td>\n",
|
|||
|
|
" <td>Disgust</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38257</th>\n",
|
|||
|
|
" <td>The FDA has plenty to criticize. But like here...</td>\n",
|
|||
|
|
" <td>Anger</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </tbody>\n",
|
|||
|
|
"</table>\n",
|
|||
|
|
"<p>38258 rows × 2 columns</p>\n",
|
|||
|
|
"</div>"
|
|||
|
|
],
|
|||
|
|
"text/plain": [
|
|||
|
|
" Text emotion\n",
|
|||
|
|
"0 That game hurt. Sadness\n",
|
|||
|
|
"1 Man I love reddit. Disgust\n",
|
|||
|
|
"2 So happy for [NAME]. So sad he's not here. Ima... Sadness\n",
|
|||
|
|
"3 I just came home, what the fuck is this lineup... Disgust\n",
|
|||
|
|
"4 By far the coolest thing I've seen on this thr... Happiness\n",
|
|||
|
|
"... ... ...\n",
|
|||
|
|
"38253 I just called the Capitol Police. They are not... Anger\n",
|
|||
|
|
"38254 What a great photo and you two look so happy. 😍 Happiness\n",
|
|||
|
|
"38255 Well, I'm glad you're out of all that now. How... Happiness\n",
|
|||
|
|
"38256 Everyone likes [NAME]. Disgust\n",
|
|||
|
|
"38257 The FDA has plenty to criticize. But like here... Anger\n",
|
|||
|
|
"\n",
|
|||
|
|
"[38258 rows x 2 columns]"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 70,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"dataset3 = dataset3.dropna()\n",
|
|||
|
|
"dataset3"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 71,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/html": [
|
|||
|
|
"<div>\n",
|
|||
|
|
"<style scoped>\n",
|
|||
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
|
" vertical-align: middle;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe tbody tr th {\n",
|
|||
|
|
" vertical-align: top;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe thead th {\n",
|
|||
|
|
" text-align: right;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"</style>\n",
|
|||
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
|
" <thead>\n",
|
|||
|
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
|
" <th></th>\n",
|
|||
|
|
" <th>Text</th>\n",
|
|||
|
|
" <th>emotion</th>\n",
|
|||
|
|
" <th>emotion_label</th>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </thead>\n",
|
|||
|
|
" <tbody>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>0</th>\n",
|
|||
|
|
" <td>That game hurt.</td>\n",
|
|||
|
|
" <td>Sadness</td>\n",
|
|||
|
|
" <td>4</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>1</th>\n",
|
|||
|
|
" <td>Man I love reddit.</td>\n",
|
|||
|
|
" <td>Disgust</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>2</th>\n",
|
|||
|
|
" <td>So happy for [NAME]. So sad he's not here. Ima...</td>\n",
|
|||
|
|
" <td>Sadness</td>\n",
|
|||
|
|
" <td>4</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>3</th>\n",
|
|||
|
|
" <td>I just came home, what the fuck is this lineup...</td>\n",
|
|||
|
|
" <td>Disgust</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>4</th>\n",
|
|||
|
|
" <td>By far the coolest thing I've seen on this thr...</td>\n",
|
|||
|
|
" <td>Happiness</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>...</th>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38253</th>\n",
|
|||
|
|
" <td>I just called the Capitol Police. They are not...</td>\n",
|
|||
|
|
" <td>Anger</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38254</th>\n",
|
|||
|
|
" <td>What a great photo and you two look so happy. 😍</td>\n",
|
|||
|
|
" <td>Happiness</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38255</th>\n",
|
|||
|
|
" <td>Well, I'm glad you're out of all that now. How...</td>\n",
|
|||
|
|
" <td>Happiness</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38256</th>\n",
|
|||
|
|
" <td>Everyone likes [NAME].</td>\n",
|
|||
|
|
" <td>Disgust</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38257</th>\n",
|
|||
|
|
" <td>The FDA has plenty to criticize. But like here...</td>\n",
|
|||
|
|
" <td>Anger</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </tbody>\n",
|
|||
|
|
"</table>\n",
|
|||
|
|
"<p>38258 rows × 3 columns</p>\n",
|
|||
|
|
"</div>"
|
|||
|
|
],
|
|||
|
|
"text/plain": [
|
|||
|
|
" Text emotion \\\n",
|
|||
|
|
"0 That game hurt. Sadness \n",
|
|||
|
|
"1 Man I love reddit. Disgust \n",
|
|||
|
|
"2 So happy for [NAME]. So sad he's not here. Ima... Sadness \n",
|
|||
|
|
"3 I just came home, what the fuck is this lineup... Disgust \n",
|
|||
|
|
"4 By far the coolest thing I've seen on this thr... Happiness \n",
|
|||
|
|
"... ... ... \n",
|
|||
|
|
"38253 I just called the Capitol Police. They are not... Anger \n",
|
|||
|
|
"38254 What a great photo and you two look so happy. 😍 Happiness \n",
|
|||
|
|
"38255 Well, I'm glad you're out of all that now. How... Happiness \n",
|
|||
|
|
"38256 Everyone likes [NAME]. Disgust \n",
|
|||
|
|
"38257 The FDA has plenty to criticize. But like here... Anger \n",
|
|||
|
|
"\n",
|
|||
|
|
" emotion_label \n",
|
|||
|
|
"0 4 \n",
|
|||
|
|
"1 2 \n",
|
|||
|
|
"2 4 \n",
|
|||
|
|
"3 2 \n",
|
|||
|
|
"4 1 \n",
|
|||
|
|
"... ... \n",
|
|||
|
|
"38253 0 \n",
|
|||
|
|
"38254 1 \n",
|
|||
|
|
"38255 1 \n",
|
|||
|
|
"38256 2 \n",
|
|||
|
|
"38257 0 \n",
|
|||
|
|
"\n",
|
|||
|
|
"[38258 rows x 3 columns]"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 71,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"emotion_mapping = {\"Anger\":0,\"Happiness\":1,\"Disgust\":2,\"Fear\":3, \"Sadness\":4,\"Surprise\":5}\n",
|
|||
|
|
"dataset3[\"emotion_label\"] = dataset3[\"emotion\"].map(emotion_mapping)\n",
|
|||
|
|
"dataset3"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 72,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"dataset3['TextoLema']=preprocess_text_column(dataset3, 'Text')"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 73,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/html": [
|
|||
|
|
"<div>\n",
|
|||
|
|
"<style scoped>\n",
|
|||
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
|
" vertical-align: middle;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe tbody tr th {\n",
|
|||
|
|
" vertical-align: top;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe thead th {\n",
|
|||
|
|
" text-align: right;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"</style>\n",
|
|||
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
|
" <thead>\n",
|
|||
|
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
|
" <th></th>\n",
|
|||
|
|
" <th>Text</th>\n",
|
|||
|
|
" <th>emotion</th>\n",
|
|||
|
|
" <th>emotion_label</th>\n",
|
|||
|
|
" <th>TextoLema</th>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </thead>\n",
|
|||
|
|
" <tbody>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>0</th>\n",
|
|||
|
|
" <td>That game hurt.</td>\n",
|
|||
|
|
" <td>Sadness</td>\n",
|
|||
|
|
" <td>4</td>\n",
|
|||
|
|
" <td>game hurt</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>1</th>\n",
|
|||
|
|
" <td>Man I love reddit.</td>\n",
|
|||
|
|
" <td>Disgust</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>man love reddit</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>2</th>\n",
|
|||
|
|
" <td>So happy for [NAME]. So sad he's not here. Ima...</td>\n",
|
|||
|
|
" <td>Sadness</td>\n",
|
|||
|
|
" <td>4</td>\n",
|
|||
|
|
" <td>happy sad s imagine team instead ugh</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>3</th>\n",
|
|||
|
|
" <td>I just came home, what the fuck is this lineup...</td>\n",
|
|||
|
|
" <td>Disgust</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>come home fuck lineup love mad bastard</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>4</th>\n",
|
|||
|
|
" <td>By far the coolest thing I've seen on this thr...</td>\n",
|
|||
|
|
" <td>Happiness</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>far cool thing ve see thread</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>...</th>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38253</th>\n",
|
|||
|
|
" <td>I just called the Capitol Police. They are not...</td>\n",
|
|||
|
|
" <td>Anger</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" <td>call capitol police affect shutdown fuck shit</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38254</th>\n",
|
|||
|
|
" <td>What a great photo and you two look so happy. 😍</td>\n",
|
|||
|
|
" <td>Happiness</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>great photo look happy</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38255</th>\n",
|
|||
|
|
" <td>Well, I'm glad you're out of all that now. How...</td>\n",
|
|||
|
|
" <td>Happiness</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>glad awful way act think healthy boundary hostile</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38256</th>\n",
|
|||
|
|
" <td>Everyone likes [NAME].</td>\n",
|
|||
|
|
" <td>Disgust</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>like</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38257</th>\n",
|
|||
|
|
" <td>The FDA has plenty to criticize. But like here...</td>\n",
|
|||
|
|
" <td>Anger</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" <td>fda plenty criticize like usually criticize ho...</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </tbody>\n",
|
|||
|
|
"</table>\n",
|
|||
|
|
"<p>38258 rows × 4 columns</p>\n",
|
|||
|
|
"</div>"
|
|||
|
|
],
|
|||
|
|
"text/plain": [
|
|||
|
|
" Text emotion \\\n",
|
|||
|
|
"0 That game hurt. Sadness \n",
|
|||
|
|
"1 Man I love reddit. Disgust \n",
|
|||
|
|
"2 So happy for [NAME]. So sad he's not here. Ima... Sadness \n",
|
|||
|
|
"3 I just came home, what the fuck is this lineup... Disgust \n",
|
|||
|
|
"4 By far the coolest thing I've seen on this thr... Happiness \n",
|
|||
|
|
"... ... ... \n",
|
|||
|
|
"38253 I just called the Capitol Police. They are not... Anger \n",
|
|||
|
|
"38254 What a great photo and you two look so happy. 😍 Happiness \n",
|
|||
|
|
"38255 Well, I'm glad you're out of all that now. How... Happiness \n",
|
|||
|
|
"38256 Everyone likes [NAME]. Disgust \n",
|
|||
|
|
"38257 The FDA has plenty to criticize. But like here... Anger \n",
|
|||
|
|
"\n",
|
|||
|
|
" emotion_label TextoLema \n",
|
|||
|
|
"0 4 game hurt \n",
|
|||
|
|
"1 2 man love reddit \n",
|
|||
|
|
"2 4 happy sad s imagine team instead ugh \n",
|
|||
|
|
"3 2 come home fuck lineup love mad bastard \n",
|
|||
|
|
"4 1 far cool thing ve see thread \n",
|
|||
|
|
"... ... ... \n",
|
|||
|
|
"38253 0 call capitol police affect shutdown fuck shit \n",
|
|||
|
|
"38254 1 great photo look happy \n",
|
|||
|
|
"38255 1 glad awful way act think healthy boundary hostile \n",
|
|||
|
|
"38256 2 like \n",
|
|||
|
|
"38257 0 fda plenty criticize like usually criticize ho... \n",
|
|||
|
|
"\n",
|
|||
|
|
"[38258 rows x 4 columns]"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 73,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"dataset3 = dataset3.dropna(subset=['TextoLema'])\n",
|
|||
|
|
"#dataset3 = dataset3[dataset3['TextoLema'].str.split().str.len() > 2]\n",
|
|||
|
|
"dataset3"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 74,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"dataset3.to_excel(\"C:\\\\Users\\\\garci\\\\OneDrive\\\\Área de Trabalho\\\\Programa PJM\\\\Programa final\\\\Dataset\\\\GoemotionsEN.xlsx\", index=False)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"## PREPARAÇÃO DOS DADOS"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 75,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/html": [
|
|||
|
|
"<div>\n",
|
|||
|
|
"<style scoped>\n",
|
|||
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
|
" vertical-align: middle;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe tbody tr th {\n",
|
|||
|
|
" vertical-align: top;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe thead th {\n",
|
|||
|
|
" text-align: right;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"</style>\n",
|
|||
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
|
" <thead>\n",
|
|||
|
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
|
" <th></th>\n",
|
|||
|
|
" <th>Text</th>\n",
|
|||
|
|
" <th>emotion</th>\n",
|
|||
|
|
" <th>emotion_label</th>\n",
|
|||
|
|
" <th>TextoLema</th>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </thead>\n",
|
|||
|
|
" <tbody>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>0</th>\n",
|
|||
|
|
" <td>That game hurt.</td>\n",
|
|||
|
|
" <td>Sadness</td>\n",
|
|||
|
|
" <td>4</td>\n",
|
|||
|
|
" <td>game hurt</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>1</th>\n",
|
|||
|
|
" <td>Man I love reddit.</td>\n",
|
|||
|
|
" <td>Disgust</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>man love reddit</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>2</th>\n",
|
|||
|
|
" <td>So happy for [NAME]. So sad he's not here. Ima...</td>\n",
|
|||
|
|
" <td>Sadness</td>\n",
|
|||
|
|
" <td>4</td>\n",
|
|||
|
|
" <td>happy sad s imagine team instead ugh</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>3</th>\n",
|
|||
|
|
" <td>I just came home, what the fuck is this lineup...</td>\n",
|
|||
|
|
" <td>Disgust</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>come home fuck lineup love mad bastard</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>4</th>\n",
|
|||
|
|
" <td>By far the coolest thing I've seen on this thr...</td>\n",
|
|||
|
|
" <td>Happiness</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>far cool thing ve see thread</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>...</th>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38253</th>\n",
|
|||
|
|
" <td>I just called the Capitol Police. They are not...</td>\n",
|
|||
|
|
" <td>Anger</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" <td>call capitol police affect shutdown fuck shit</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38254</th>\n",
|
|||
|
|
" <td>What a great photo and you two look so happy. 😍</td>\n",
|
|||
|
|
" <td>Happiness</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>great photo look happy</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38255</th>\n",
|
|||
|
|
" <td>Well, I'm glad you're out of all that now. How...</td>\n",
|
|||
|
|
" <td>Happiness</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>glad awful way act think healthy boundary hostile</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38256</th>\n",
|
|||
|
|
" <td>Everyone likes [NAME].</td>\n",
|
|||
|
|
" <td>Disgust</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>like</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>38257</th>\n",
|
|||
|
|
" <td>The FDA has plenty to criticize. But like here...</td>\n",
|
|||
|
|
" <td>Anger</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" <td>fda plenty criticize like usually criticize ho...</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </tbody>\n",
|
|||
|
|
"</table>\n",
|
|||
|
|
"<p>38258 rows × 4 columns</p>\n",
|
|||
|
|
"</div>"
|
|||
|
|
],
|
|||
|
|
"text/plain": [
|
|||
|
|
" Text emotion \\\n",
|
|||
|
|
"0 That game hurt. Sadness \n",
|
|||
|
|
"1 Man I love reddit. Disgust \n",
|
|||
|
|
"2 So happy for [NAME]. So sad he's not here. Ima... Sadness \n",
|
|||
|
|
"3 I just came home, what the fuck is this lineup... Disgust \n",
|
|||
|
|
"4 By far the coolest thing I've seen on this thr... Happiness \n",
|
|||
|
|
"... ... ... \n",
|
|||
|
|
"38253 I just called the Capitol Police. They are not... Anger \n",
|
|||
|
|
"38254 What a great photo and you two look so happy. 😍 Happiness \n",
|
|||
|
|
"38255 Well, I'm glad you're out of all that now. How... Happiness \n",
|
|||
|
|
"38256 Everyone likes [NAME]. Disgust \n",
|
|||
|
|
"38257 The FDA has plenty to criticize. But like here... Anger \n",
|
|||
|
|
"\n",
|
|||
|
|
" emotion_label TextoLema \n",
|
|||
|
|
"0 4 game hurt \n",
|
|||
|
|
"1 2 man love reddit \n",
|
|||
|
|
"2 4 happy sad s imagine team instead ugh \n",
|
|||
|
|
"3 2 come home fuck lineup love mad bastard \n",
|
|||
|
|
"4 1 far cool thing ve see thread \n",
|
|||
|
|
"... ... ... \n",
|
|||
|
|
"38253 0 call capitol police affect shutdown fuck shit \n",
|
|||
|
|
"38254 1 great photo look happy \n",
|
|||
|
|
"38255 1 glad awful way act think healthy boundary hostile \n",
|
|||
|
|
"38256 2 like \n",
|
|||
|
|
"38257 0 fda plenty criticize like usually criticize ho... \n",
|
|||
|
|
"\n",
|
|||
|
|
"[38258 rows x 4 columns]"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 75,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"dataset3=pd.read_excel(\"C:\\\\Users\\\\garci\\\\OneDrive\\\\Área de Trabalho\\\\Programa PJM\\\\Programa final\\\\Dataset\\\\GoemotionsEN.xlsx\")\n",
|
|||
|
|
"dataset3"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 76,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/plain": [
|
|||
|
|
"135"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 76,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"dataset3['TextoLema'].isnull().sum()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 77,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"dataset3 = dataset3.dropna(subset=['TextoLema'])"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 78,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"X=dataset3['TextoLema']\n",
|
|||
|
|
"y=dataset3['emotion_label']"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"vectorizer"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 79,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"application/vnd.jupyter.widget-view+json": {
|
|||
|
|
"model_id": "ddafd85f3262424b9f8c4e242a293b68",
|
|||
|
|
"version_major": 2,
|
|||
|
|
"version_minor": 0
|
|||
|
|
},
|
|||
|
|
"text/plain": [
|
|||
|
|
" 0%| | 0/192 [00:00<?, ?it/s]"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"Melhores parâmetros do CountVectorizer (menor esparsidade): (500, 1, 0.5, (1, 3))\n",
|
|||
|
|
"Esparsidade: 98.23%\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"vectorizer_params = {'max_features': [500, 1000, 2000, 5000],'min_df': [1, 2, 3, 5],'max_df': [0.5, 0.6, 0.7, 0.8],'ngram_range': [(1, 1), (1, 2), (1, 3)]}\n",
|
|||
|
|
"combinations = list(product(vectorizer_params['max_features'], vectorizer_params['min_df'], vectorizer_params['max_df'], vectorizer_params['ngram_range']))\n",
|
|||
|
|
"best_sparsity = float('inf')\n",
|
|||
|
|
"best_params = {}\n",
|
|||
|
|
"with tqdm(total=len(combinations)) as pbar:\n",
|
|||
|
|
" for combo in combinations:\n",
|
|||
|
|
" max_features, min_df, max_df, ngram_range = combo\n",
|
|||
|
|
" vectorizer = CountVectorizer(max_features=max_features,min_df=min_df, max_df=max_df,ngram_range=ngram_range)\n",
|
|||
|
|
" X_vec = vectorizer.fit_transform(dataset3['Text'])\n",
|
|||
|
|
" sparsity = 100 * (1 - X_vec.nnz / (X_vec.shape[0] * X_vec.shape[1]))\n",
|
|||
|
|
" if sparsity < best_sparsity:\n",
|
|||
|
|
" best_sparsity = sparsity\n",
|
|||
|
|
" best_params = combo\n",
|
|||
|
|
" pbar.update(1)\n",
|
|||
|
|
"print(f\"Melhores parâmetros do CountVectorizer (menor esparsidade): {best_params}\")\n",
|
|||
|
|
"print(f\"Esparsidade: {best_sparsity:.2f}%\")"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 83,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/plain": [
|
|||
|
|
"['Modelos//vectorizerVF.joblib']"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 83,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"vectorizer = CountVectorizer(max_features = 500, min_df = 1, max_df = 0.5,ngram_range=(1,3))\n",
|
|||
|
|
"X = vectorizer.fit_transform(dataset3['TextoLema']).toarray()\n",
|
|||
|
|
"joblib.dump(vectorizer, 'Modelos//vectorizerVF.joblib')"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"treino 60%, val=20% e teste=20%"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 84,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)\n",
|
|||
|
|
"X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"## REGRESSÃO LOGISTICA"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 85,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|||
|
|
"\n",
|
|||
|
|
"undersample = RandomUnderSampler(random_state=42)\n",
|
|||
|
|
"X_train_balanced, y_train_balanced = undersample.fit_resample(X_train, y_train)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 87,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"application/vnd.jupyter.widget-view+json": {
|
|||
|
|
"model_id": "f1578f07fc5643c9b02b67e18e2006f2",
|
|||
|
|
"version_major": 2,
|
|||
|
|
"version_minor": 0
|
|||
|
|
},
|
|||
|
|
"text/plain": [
|
|||
|
|
" 0%| | 0/84 [00:00<?, ?it/s]"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"Melhores Parâmetros: {'C': 0.001, 'class_weight': None, 'max_iter': 500, 'solver': 'liblinear'}\n",
|
|||
|
|
"Melhor accuracy: 0.6463089802130898\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"param_grid = {\n",
|
|||
|
|
" 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],\n",
|
|||
|
|
" 'solver': ['lbfgs', 'liblinear', 'saga'],\n",
|
|||
|
|
" 'class_weight': [None, 'balanced'],\n",
|
|||
|
|
" 'max_iter': [500, 1000]\n",
|
|||
|
|
"}\n",
|
|||
|
|
"scaler = StandardScaler()\n",
|
|||
|
|
"X_train_scaled = scaler.fit_transform(X_train_balanced)\n",
|
|||
|
|
"grid_search = GridSearchCV(estimator=LogisticRegression(random_state=42), param_grid=param_grid,scoring='accuracy',cv=3, n_jobs=-1)\n",
|
|||
|
|
"n_combinations = len(list(product(*param_grid.values()))) \n",
|
|||
|
|
"with tqdm(total=n_combinations) as pbar: \n",
|
|||
|
|
" grid_search.fit(X_train_scaled, y_train_balanced)\n",
|
|||
|
|
" pbar.update(n_combinations) \n",
|
|||
|
|
"print(\"Melhores Parâmetros:\", grid_search.best_params_)\n",
|
|||
|
|
"print(\"Melhor accuracy:\", grid_search.best_score_)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"Melhores Parâmetros: {'C': 0.001, 'class_weight': None, 'max_iter': 500, 'solver': 'liblinear'}\n",
|
|||
|
|
"\n",
|
|||
|
|
"Melhor accuracy: 0.6601487361454299"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 88,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"classifier = LogisticRegression(C=0.001,solver='liblinear',max_iter=500, class_weight=None, random_state = 42)\n",
|
|||
|
|
"classifier.fit(X_train_balanced,y_train_balanced)\n",
|
|||
|
|
"y_pred = classifier.predict(X_test)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 89,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/plain": [
|
|||
|
|
"['Modelos//logistic_regression_model.joblib']"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 89,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"joblib.dump(classifier, 'Modelos//logistic_regression_model.joblib')"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 90,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgoAAAHJCAYAAADkVRHSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACx5klEQVR4nOzdd1yV1R/A8c+97I1sEBEEcc8cuXKV5irN0jQt907L1DTNbZp75fq5co+0TM2VI/fe4kJRUECG7A33+f1xE7sBhXIBje/79bov9HnOPc/33Pl9zjnPuSpFURSEEEIIIbKhLuwAhBBCCPHqkkRBCCGEEDmSREEIIYQQOZJEQQghhBA5kkRBCCGEEDmSREEIIYQQOZJEQQghhBA5kkRBCCGEEDmSROE/RNbOEq8ieV0K8XqTRKEANWq0CZVqJnXrbsixzMcf70Slmkm3bnteqO4TJx7TqtX2fy03fvwJVKqZL1R3bhw5EohKNZMjRwL1Ul90dDITJ56kcuXVWFnNw8npB5o23cLOnff0Un92rl0Lp1q1NZiYzKF8+ZV6qzclJZ2mTbdgZDQbA4NZGBrOwspqHnfuPM1z3d267cHTc5keoswdlWom48efyHX55cuvMmzYkcz/r159HZVqJg8exLx0DN267UGlmpnlZmk5j0qVVjN79vmXrvtV9qKPvRD6YljYARQ1arWK06eDefQoDnd3K519CQmpL/1F+L//XcXPL/Jfy/XqVZl33/V6qWMUlFu3ImnRYhsZGQpDhlSnShUnEhLSWL/ej/fe+5lJk+oxZkwdvR93woSTPHwYy88/v4+Tk7ne6lWpVMyb15jExHQMDdWYmBjg4WGNlZWx3o5RUE6d6pzldftPJk8+TaNGJTL/36pVKU6d6oyrq0We4nBxseDnn9/P/L+iQGhoAkuXXuGrr45gZmZI//5V83SMV82LPvZC6IskCgWsenUnbtyIZOvW23z5ZQ2dfTt33sfCwohixUzz7fju7lav9IdNWloGHTrsxMjIgDNnPsbJ6fkXyvvv+9Cnz36+/fYEbdp4U6WKk16PHRmZTKVKDrRsWUqv9RobG1CxoqNe6ywsb77plqf7Ozqa4+iY9yTMxMQg21haty5FqVLLWbXq+n8uUcjrYy/Ey5KhhwJmYWFEq1al2Lr1TpZ9mzff4sMPfTE01H1aIiISGTjwd0qWXIqx8Wzs7BbSrt0vmd233brt4ccfb/DwYSwq1UxWr77OgwcxqFQzmT37PGXLrsTcfC6rVl3TGXp4Via72791Zy9degVf3xWYmc2lYcNNPHwYm6VMYGAsnTrtws5uIebmc2nadAuXLj35x3p3777PtWsRTJ5cTydJeGbixHoMGlSN9HRN5rbz50N5992fsLdfiLX1fNq02c6NGxGZ+58Nixw8+JBmzbZibj4XF5dFfP31H2RkaOvRDpsEcfToo8zHMKdhmr93AW/ceJMqVX7EzGwujo4/0KXLboKD4zP3JyWlMWrUUUqXXo6JyRysrefzzjtbuXw5TKfeAwce0KDBRmxs5mNvv5DOnXcRFJT1cX0Z//YYgbYnp2XLbVhbz8fZeRGjRx+jR4+9NGq0Kce2z5t3gbJlV2JqOofixZcwYMABYmNTAPD0XMbDh7H8+OONzOGG7IYefvvtPvXqbcDCYi5ubovp1+8A0dHJL9VOIyMDLCyMUKl0ty9ffpUKFVZhYjIHD4+ljB9/IvO5f+bHH69Tvry2LVWq/MjBgw8xNJzF6tXXAe2wiaHhLJYvv4qLyyLs7Bbi56d9DHfs8KdGjbWYms7BxWURQ4YcIiEhNbPupKQ0Bgw4gLv7EkxM5lC27Epmzjync/x/eiyze+xDQuLp0WMvJUosxcxsLrVqrePXX/116lSpZrJo0SV69dqHnd1CrKzm0aHDrzx5kvBSj68omiRRKAQdO5bJHH54JjY2hT17AujUqZxOWUVRaNVqO/v3P+D77xuyf/+HjB9fl4MHA+nX7wAA335bh5YtvXBxseDUqc60avX8jHj8+JN8/XVN1q5tyTvveOrU7eqqLf/X29SpDQDo1atSjvEvXHiRfv0O0KpVKXbsaMubb7rSp88BnTIREYnUrbuBCxeesHBhUzZubI1Go/DWW5u4eTPnIZK9ex9gYKDK8azexcWCBQua8sYbLgAcPhxI3bobUBRYtepdli9vRlBQHHXrbuDWLd3jfPLJbho0cGfXrg/o3Lkc06efY/nya4C2W7daNSeqVXPK8hj+kxMnHtO162+0b1+aPXs+YM6cxhw8GEinTrsyy3z66R5WrrzOqFG12b//Q2bPbsSNGxF07rwrc6Lf2rU3aNbsJ0qUsGLjxtbMmdOYU6eCqVNnA2FheftQz81jFBGRyFtvbSIwMJZVq95l/vwm/PTTHTZsuJljvRs33mTEiKMMHFiVffs+ZOzYOqxd68fnnx8E4Oef38fFxYKWLb1yHG7YteserVtvx8nJnC1b2vD992/x88936dhxV5ayf5eersm8paSk8+BBDEOHHub27ad8+mmFzHJTp56hT5/9vP12SXbubMegQdX4/vtz9OmzP7PMmjU36NZtL/XqFWfHjrZ8+KEvbdv+QkaG7kTMjAyFWbPOs2JFc+bMaUS5cvZs2HCTtm1/oWxZO375pS3jx9dl7Vo/3n//l8zn94svDrNnTwAzZzZi3772vP++N8OH/8GqVddy9Vj+3ZMnCdSsuY6jRx/x3Xf12bbtPTw9rWnb9hfWr/fTKfvNN8fJyNCwaVNrZsxoyM6d9/nii8P/+vgK8YwMPRSCVq1KYWFhpDP88PPPd3FyMqd+/eI6ZUNCErCwMGLWrEbUr+8OQKNGHvj7R7Fs2VUAvL1tcXQ01+mOTUjQnrF16FCG7t2z/9I3MTHU6c68dy+amTPP88EHpRk9+s1s76MoCpMmnaZjxzLMmdMYgGbNPImNTWXJkiuZ5ebMuUBkZDInTnSiZEkbAFq08KJcuZWMHXuCrVvfy7b+oKBYHBzMsLTM3fj9yJFHKV26GL/99gEGBurMeLy9lzN27Am2bHl+nN69K/Ptt9q5DU2aePDLL/7s2nWPvn2r8Oabblhba4/5Il28x449wtzciK+/roWJifbtZG9vyrlzoSiKQlqahri4VBYsaEKHDmUBaNiwBLGxqXz11RGePEnEycmcESOO0ry5Jxs2tM6su1694pQvv4qZM88zfXrDXMf0Mo/R/PkXiYtL4/Llj3Bzs/zzcXDF1zfnSZ1//PEILy8bBg6shlqtomHDElhaGvH0qbY3oFo1Z0xMDHB0NM/xMR0//iRVqzqxffv7qP7sBjA2NmDs2BM8eZKAs3P2cxkePozFyGh2lu2lSxdj0aK36devCgAxMSlMmnSKvn2rMG9ek8y229ub0avXPoYOrUGFCg58++1x2rTx5n//aw5A8+ZeGBmpGTXqWJZjjB79Jq1aeQPa98PXXx/l3Xc9WbeulU4cb7+9ld9+u0+rVt788ccj3nnHk48/1r4GGjXywNLSOHMuzL89ln83e/Z5wsOTuHOnR+b7q2XLUrz99haGDfuDTp3KoVZrH89KlRxYtapF5n3Png1l69bb2dYrRHakR6EQmJkZ0aaNt87ww6ZNt+jYsUzmh+Uzbm6WHDrUkXr1ivPgQQwHDjxgwYKLnDgRTEpKxr8eq2rV3I2Nx8am8N57P+PiYs6PP7bIEsczt28/JSwskTZtvHW2d+hQRuf/Bw8GUrWqI8WLW2We9anVKlq08OLAgYc5xmFoqM5yFpeThIRUzp0LpUOHMplfgAC2tqa0aePNkSNBOuXr1NH9snJ3tyIhIS1Xx8pJw4YlSEhIo2LF1YwadZRjxx7RrJknY8fWRaVSYWxswN69H9KhQ1keP47j8OFAli69wq5d2kmrKSnp3L79lNDQBDp1KqtTt7e3LXXquGZpx4vI7WN06FAQdeu6ZSYJACVL2lC3bs5JU+PGJbh9+ylvvLGWiRNPcv58KJ07l+Pzz6vnKrakpDQuXnxCu3aldV5vHTuW5fbtnjkmCaDtDTt3rgvnznVh//4PadDAHTc3S378sQX9+1fNrO/UqWC
|
|||
|
|
"text/plain": [
|
|||
|
|
"<Figure size 640x480 with 2 Axes>"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"cm = confusion_matrix(y_test, y_pred)\n",
|
|||
|
|
"CMatrix = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classifier.classes_)\n",
|
|||
|
|
"CMatrix.plot()\n",
|
|||
|
|
"plt.title('Matriz de Confusão Logistic Regression', color=\"darkblue\")\n",
|
|||
|
|
"plt.xlabel('Previsões', color=\"darkblue\")\n",
|
|||
|
|
"plt.ylabel('Real', color=\"darkblue\")\n",
|
|||
|
|
"plt.show()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 91,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
" precision recall f1-score support\n",
|
|||
|
|
"\n",
|
|||
|
|
" 0 0.68 0.49 0.57 1565\n",
|
|||
|
|
" 1 0.73 0.55 0.63 1578\n",
|
|||
|
|
" 2 0.77 0.78 0.77 1534\n",
|
|||
|
|
" 3 0.26 0.59 0.36 584\n",
|
|||
|
|
" 4 0.65 0.55 0.60 1349\n",
|
|||
|
|
" 5 0.46 0.58 0.51 1015\n",
|
|||
|
|
"\n",
|
|||
|
|
" accuracy 0.59 7625\n",
|
|||
|
|
" macro avg 0.59 0.59 0.57 7625\n",
|
|||
|
|
"weighted avg 0.64 0.59 0.60 7625\n",
|
|||
|
|
"\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"classes = np.unique(y)\n",
|
|||
|
|
"print(classification_report(y_test.tolist(), y_pred.tolist(), labels=classes.tolist()))"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 101,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"application/vnd.jupyter.widget-view+json": {
|
|||
|
|
"model_id": "75d04b694d4247f2a8297581f2b64cc9",
|
|||
|
|
"version_major": 2,
|
|||
|
|
"version_minor": 0
|
|||
|
|
},
|
|||
|
|
"text/plain": [
|
|||
|
|
" 0%| | 0/13122 [00:00<?, ?it/s]"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"ename": "KeyboardInterrupt",
|
|||
|
|
"evalue": "",
|
|||
|
|
"output_type": "error",
|
|||
|
|
"traceback": [
|
|||
|
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
|||
|
|
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
|||
|
|
"Cell \u001b[1;32mIn[101], line 17\u001b[0m\n\u001b[0;32m 15\u001b[0m n_combinations \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mlist\u001b[39m(product(\u001b[38;5;241m*\u001b[39mparam_grid\u001b[38;5;241m.\u001b[39mvalues()))) \n\u001b[0;32m 16\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m tqdm(total\u001b[38;5;241m=\u001b[39mn_combinations) \u001b[38;5;28;01mas\u001b[39;00m pbar: \n\u001b[1;32m---> 17\u001b[0m \u001b[43mgrid_search\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train_scaled\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train_balanced\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 18\u001b[0m pbar\u001b[38;5;241m.\u001b[39mupdate(n_combinations)\n\u001b[0;32m 19\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMelhores Parâmetros:\u001b[39m\u001b[38;5;124m\"\u001b[39m, grid_search\u001b[38;5;241m.\u001b[39mbest_params_)\n",
|
|||
|
|
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
|||
|
|
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\model_selection\\_search.py:1019\u001b[0m, in \u001b[0;36mBaseSearchCV.fit\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 1013\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_format_results(\n\u001b[0;32m 1014\u001b[0m all_candidate_params, n_splits, all_out, all_more_results\n\u001b[0;32m 1015\u001b[0m )\n\u001b[0;32m 1017\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m results\n\u001b[1;32m-> 1019\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_search\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevaluate_candidates\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1021\u001b[0m \u001b[38;5;66;03m# multimetric is determined here because in the case of a callable\u001b[39;00m\n\u001b[0;32m 1022\u001b[0m \u001b[38;5;66;03m# self.scoring the return type is only known after calling\u001b[39;00m\n\u001b[0;32m 1023\u001b[0m first_test_score \u001b[38;5;241m=\u001b[39m all_out[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_scores\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
|
|||
|
|
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\model_selection\\_search.py:1573\u001b[0m, in \u001b[0;36mGridSearchCV._run_search\u001b[1;34m(self, evaluate_candidates)\u001b[0m\n\u001b[0;32m 1571\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_run_search\u001b[39m(\u001b[38;5;28mself\u001b[39m, evaluate_candidates):\n\u001b[0;32m 1572\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Search all candidates in param_grid\"\"\"\u001b[39;00m\n\u001b[1;32m-> 1573\u001b[0m \u001b[43mevaluate_candidates\u001b[49m\u001b[43m(\u001b[49m\u001b[43mParameterGrid\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparam_grid\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
|
|||
|
|
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\model_selection\\_search.py:965\u001b[0m, in \u001b[0;36mBaseSearchCV.fit.<locals>.evaluate_candidates\u001b[1;34m(candidate_params, cv, more_results)\u001b[0m\n\u001b[0;32m 957\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mverbose \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m 958\u001b[0m \u001b[38;5;28mprint\u001b[39m(\n\u001b[0;32m 959\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFitting \u001b[39m\u001b[38;5;132;01m{0}\u001b[39;00m\u001b[38;5;124m folds for each of \u001b[39m\u001b[38;5;132;01m{1}\u001b[39;00m\u001b[38;5;124m candidates,\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 960\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m totalling \u001b[39m\u001b[38;5;132;01m{2}\u001b[39;00m\u001b[38;5;124m fits\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[0;32m 961\u001b[0m n_splits, n_candidates, n_candidates \u001b[38;5;241m*\u001b[39m n_splits\n\u001b[0;32m 962\u001b[0m )\n\u001b[0;32m 963\u001b[0m )\n\u001b[1;32m--> 965\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mparallel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 966\u001b[0m \u001b[43m \u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_fit_and_score\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 967\u001b[0m \u001b[43m \u001b[49m\u001b[43mclone\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbase_estimator\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 968\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 969\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 970\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 971\u001b[0m \u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtest\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 972\u001b[0m \u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparameters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 973\u001b[0m \u001b[43m \u001b[49m\u001b[43msplit_progress\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msplit_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_splits\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 974\u001b[0m \u001b[43m \u001b[49m\u001b[43mcandidate_progress\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcand_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_candidates\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 975\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfit_and_score_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 976\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 977\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mcand_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43msplit_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mproduct\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 978\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43menumerate\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcandidate_params\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\
|
|||
|
|
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\utils\\parallel.py:74\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 69\u001b[0m config \u001b[38;5;241m=\u001b[39m get_config()\n\u001b[0;32m 70\u001b[0m iterable_with_config \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 71\u001b[0m (_with_config(delayed_func, config), args, kwargs)\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m delayed_func, args, kwargs \u001b[38;5;129;01min\u001b[39;00m iterable\n\u001b[0;32m 73\u001b[0m )\n\u001b[1;32m---> 74\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43miterable_with_config\u001b[49m\u001b[43m)\u001b[49m\n",
|
|||
|
|
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\joblib\\parallel.py:2007\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 2001\u001b[0m \u001b[38;5;66;03m# The first item from the output is blank, but it makes the interpreter\u001b[39;00m\n\u001b[0;32m 2002\u001b[0m \u001b[38;5;66;03m# progress until it enters the Try/Except block of the generator and\u001b[39;00m\n\u001b[0;32m 2003\u001b[0m \u001b[38;5;66;03m# reaches the first `yield` statement. This starts the asynchronous\u001b[39;00m\n\u001b[0;32m 2004\u001b[0m \u001b[38;5;66;03m# dispatch of the tasks to the workers.\u001b[39;00m\n\u001b[0;32m 2005\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 2007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n",
|
|||
|
|
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\joblib\\parallel.py:1650\u001b[0m, in \u001b[0;36mParallel._get_outputs\u001b[1;34m(self, iterator, pre_dispatch)\u001b[0m\n\u001b[0;32m 1647\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[0;32m 1649\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backend\u001b[38;5;241m.\u001b[39mretrieval_context():\n\u001b[1;32m-> 1650\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_retrieve()\n\u001b[0;32m 1652\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mGeneratorExit\u001b[39;00m:\n\u001b[0;32m 1653\u001b[0m \u001b[38;5;66;03m# The generator has been garbage collected before being fully\u001b[39;00m\n\u001b[0;32m 1654\u001b[0m \u001b[38;5;66;03m# consumed. This aborts the remaining tasks if possible and warn\u001b[39;00m\n\u001b[0;32m 1655\u001b[0m \u001b[38;5;66;03m# the user if necessary.\u001b[39;00m\n\u001b[0;32m 1656\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
|
|||
|
|
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\joblib\\parallel.py:1762\u001b[0m, in \u001b[0;36mParallel._retrieve\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1757\u001b[0m \u001b[38;5;66;03m# If the next job is not ready for retrieval yet, we just wait for\u001b[39;00m\n\u001b[0;32m 1758\u001b[0m \u001b[38;5;66;03m# async callbacks to progress.\u001b[39;00m\n\u001b[0;32m 1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ((\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jobs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m\n\u001b[0;32m 1760\u001b[0m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jobs[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mget_status(\n\u001b[0;32m 1761\u001b[0m timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtimeout) \u001b[38;5;241m==\u001b[39m TASK_PENDING)):\n\u001b[1;32m-> 1762\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0.01\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1763\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[0;32m 1765\u001b[0m \u001b[38;5;66;03m# We need to be careful: the job list can be filling up as\u001b[39;00m\n\u001b[0;32m 1766\u001b[0m \u001b[38;5;66;03m# we empty it and Python list are not thread-safe by\u001b[39;00m\n\u001b[0;32m 1767\u001b[0m \u001b[38;5;66;03m# default hence the use of the lock\u001b[39;00m\n",
|
|||
|
|
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"param_grid = {\n",
|
|||
|
|
" 'n_estimators': [50, 100, 200],\n",
|
|||
|
|
" 'learning_rate': [0.01, 0.1, 0.2],\n",
|
|||
|
|
" 'max_depth': [3, 5, 10],\n",
|
|||
|
|
" 'subsample': [0.8, 1.0],\n",
|
|||
|
|
" 'colsample_bytree': [0.6, 0.8, 1.0],\n",
|
|||
|
|
" 'gamma': [0, 0.1, 0.5],\n",
|
|||
|
|
" 'min_child_weight': [1, 3, 5],\n",
|
|||
|
|
" 'reg_alpha': [0, 0.01, 0.1],\n",
|
|||
|
|
" 'reg_lambda': [1, 1.5, 2]\n",
|
|||
|
|
"}\n",
|
|||
|
|
"scaler = StandardScaler()\n",
|
|||
|
|
"X_train_scaled = scaler.fit_transform(X_train_balanced)\n",
|
|||
|
|
"grid_search = GridSearchCV(estimator=XGBClassifier(random_state=42, tree_method='gpu_hist'),param_grid=param_grid,scoring='accuracy',cv=2, n_jobs=-1)\n",
|
|||
|
|
"n_combinations = len(list(product(*param_grid.values()))) \n",
|
|||
|
|
"with tqdm(total=n_combinations) as pbar: \n",
|
|||
|
|
" grid_search.fit(X_train_scaled, y_train_balanced)\n",
|
|||
|
|
" pbar.update(n_combinations)\n",
|
|||
|
|
"print(\"Melhores Parâmetros:\", grid_search.best_params_)\n",
|
|||
|
|
"print(\"Melhor accuracy:\", grid_search.best_score_)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"Melhores Parâmetros: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.8}\n",
|
|||
|
|
"\n",
|
|||
|
|
"Melhor accuracy: 0.6592488954270308"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 93,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stderr",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"c:\\Users\\garci\\anaconda32\\Lib\\site-packages\\xgboost\\core.py:158: UserWarning: [15:50:34] WARNING: C:\\buildkite-agent\\builds\\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\\xgboost\\xgboost-ci-windows\\src\\learner.cc:740: \n",
|
|||
|
|
"Parameters: { \"gama\" } are not used.\n",
|
|||
|
|
"\n",
|
|||
|
|
" warnings.warn(smsg, UserWarning)\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"xgb = XGBClassifier(colsample_bytree=0.8, gama=0.1, learning_rate= 0.1, max_depth=5,min_child_weight=1, n_estimators=100, subsample=0.8, reg_alpha=0, reg_lambda=1)\n",
|
|||
|
|
"xgb.fit(X_train_scaled,y_train_balanced)\n",
|
|||
|
|
"previsoesxgb = xgb.predict(X_test)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 98,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/plain": [
|
|||
|
|
"['Modelos//logistic_regression_model.joblib']"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 98,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"joblib.dump(xgb, 'Modelos//logistic_regression_model.joblib')"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 94,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgoAAAHJCAYAAADkVRHSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACx5klEQVR4nOzdd1yV1R/A8c+97I1sEBEEcc8cuXKV5irN0jQt907L1DTNbZp75fq5co+0TM2VI/fe4kJRUECG7A33+f1xE7sBhXIBje/79bov9HnOPc/33Pl9zjnPuSpFURSEEEIIIbKhLuwAhBBCCPHqkkRBCCGEEDmSREEIIYQQOZJEQQghhBA5kkRBCCGEEDmSREEIIYQQOZJEQQghhBA5kkRBCCGEEDmSROE/RNbOEq8ieV0K8XqTRKEANWq0CZVqJnXrbsixzMcf70Slmkm3bnteqO4TJx7TqtX2fy03fvwJVKqZL1R3bhw5EohKNZMjRwL1Ul90dDITJ56kcuXVWFnNw8npB5o23cLOnff0Un92rl0Lp1q1NZiYzKF8+ZV6qzclJZ2mTbdgZDQbA4NZGBrOwspqHnfuPM1z3d267cHTc5keoswdlWom48efyHX55cuvMmzYkcz/r159HZVqJg8exLx0DN267UGlmpnlZmk5j0qVVjN79vmXrvtV9qKPvRD6YljYARQ1arWK06eDefQoDnd3K519CQmpL/1F+L//XcXPL/Jfy/XqVZl33/V6qWMUlFu3ImnRYhsZGQpDhlSnShUnEhLSWL/ej/fe+5lJk+oxZkwdvR93woSTPHwYy88/v4+Tk7ne6lWpVMyb15jExHQMDdWYmBjg4WGNlZWx3o5RUE6d6pzldftPJk8+TaNGJTL/36pVKU6d6oyrq0We4nBxseDnn9/P/L+iQGhoAkuXXuGrr45gZmZI//5V83SMV82LPvZC6IskCgWsenUnbtyIZOvW23z5ZQ2dfTt33sfCwohixUzz7fju7lav9IdNWloGHTrsxMjIgDNnPsbJ6fkXyvvv+9Cnz36+/fYEbdp4U6WKk16PHRmZTKVKDrRsWUqv9RobG1CxoqNe6ywsb77plqf7Ozqa4+iY9yTMxMQg21haty5FqVLLWbXq+n8uUcjrYy/Ey5KhhwJmYWFEq1al2Lr1TpZ9mzff4sMPfTE01H1aIiISGTjwd0qWXIqx8Wzs7BbSrt0vmd233brt4ccfb/DwYSwq1UxWr77OgwcxqFQzmT37PGXLrsTcfC6rVl3TGXp4Via72791Zy9degVf3xWYmc2lYcNNPHwYm6VMYGAsnTrtws5uIebmc2nadAuXLj35x3p3777PtWsRTJ5cTydJeGbixHoMGlSN9HRN5rbz50N5992fsLdfiLX1fNq02c6NGxGZ+58Nixw8+JBmzbZibj4XF5dFfP31H2RkaOvRDpsEcfToo8zHMKdhmr93AW/ceJMqVX7EzGwujo4/0KXLboKD4zP3JyWlMWrUUUqXXo6JyRysrefzzjtbuXw5TKfeAwce0KDBRmxs5mNvv5DOnXcRFJT1cX0Z//YYgbYnp2XLbVhbz8fZeRGjRx+jR4+9NGq0Kce2z5t3gbJlV2JqOofixZcwYMABYmNTAPD0XMbDh7H8+OONzOGG7IYefvvtPvXqbcDCYi5ubovp1+8A0dHJL9VOIyMDLCyMUKl0ty9ffpUKFVZhYjIHD4+ljB9/IvO5f+bHH69Tvry2LVWq/MjBgw8xNJzF6tXXAe2wiaHhLJYvv4qLyyLs7Bbi56d9DHfs8KdGjbWYms7BxWURQ4YcIiEhNbPupKQ0Bgw4gLv7EkxM5lC27Epmzjync/x/eiyze+xDQuLp0WMvJUosxcxsLrVqrePXX/116lSpZrJo0SV69dqHnd1CrKzm0aHDrzx5kvBSj68omiRRKAQdO5bJHH54JjY2hT17AujUqZxOWUVRaNVqO/v3P+D77xuyf/+HjB9fl4MHA+nX7wAA335bh5YtvXBxseDUqc60avX8jHj8+JN8/XVN1q5tyTvveOrU7eqqLf/X29SpDQDo1atSjvEvXHiRfv0O0KpVKXbsaMubb7rSp88BnTIREYnUrbuBCxeesHBhUzZubI1Go/DWW5u4eTPnIZK9ex9gYKDK8azexcWCBQua8sYbLgAcPhxI3bobUBRYtepdli9vRlBQHHXrbuDWLd3jfPLJbho0cGfXrg/o3Lkc06efY/nya4C2W7daNSeqVXPK8hj+kxMnHtO162+0b1+aPXs+YM6cxhw8GEinTrsyy3z66R5WrrzOqFG12b//Q2bPbsSNGxF07rwrc6Lf2rU3aNbsJ0qUsGLjxtbMmdOYU6eCqVNnA2FheftQz81jFBGRyFtvbSIwMJZVq95l/vwm/PTTHTZsuJljvRs33mTEiKMMHFiVffs+ZOzYOqxd68fnnx8E4Oef38fFxYKWLb1yHG7YteserVtvx8nJnC1b2vD992/x88936dhxV5ayf5eersm8paSk8+BBDEOHHub27ad8+mmFzHJTp56hT5/9vP12SXbubMegQdX4/vtz9OmzP7PMmjU36NZtL/XqFWfHjrZ8+KEvbdv+QkaG7kTMjAyFWbPOs2JFc+bMaUS5cvZs2HCTtm1/oWxZO375pS3jx9dl7Vo/3n//l8zn94svDrNnTwAzZzZi3772vP++N8OH/8GqVddy9Vj+3ZMnCdSsuY6jRx/x3Xf12bbtPTw9rWnb9hfWr/fTKfvNN8fJyNCwaVNrZsxoyM6d9/nii8P/+vgK8YwMPRSCVq1KYWFhpDP88PPPd3FyMqd+/eI6ZUNCErCwMGLWrEbUr+8OQKNGHvj7R7Fs2VUAvL1tcXQ01+mOTUjQnrF16FCG7t2z/9I3MTHU6c68dy+amTPP88EHpRk9+s1s76MoCpMmnaZjxzLMmdMYgGbNPImNTWXJkiuZ5ebMuUBkZDInTnSiZEkbAFq08KJcuZWMHXuCrVvfy7b+oKBYHBzMsLTM3fj9yJFHKV26GL/99gEGBurMeLy9lzN27Am2bHl+nN69K/Ptt9q5DU2aePDLL/7s2nWPvn2r8Oabblhba4/5Il28x449wtzciK+/roWJifbtZG9vyrlzoSiKQlqahri4VBYsaEKHDmUBaNiwBLGxqXz11RGePEnEycmcESOO0ry5Jxs2tM6su1694pQvv4qZM88zfXrDXMf0Mo/R/PkXiYtL4/Llj3Bzs/zzcXDF1zfnSZ1//PEILy8bBg6shlqtomHDElhaGvH0qbY3oFo1Z0xMDHB0NM/xMR0//iRVqzqxffv7qP7sBjA2NmDs2BM8eZKAs3P2cxkePozFyGh2lu2lSxdj0aK36devCgAxMSlMmnSKvn2rMG9ek8y229ub0avXPoYOrUGFCg58++1x2rTx5n//aw5A8+ZeGBmpGTXqWJZjjB79Jq1aeQPa98PXXx/l3Xc9WbeulU4cb7+9ld9+u0+rVt788ccj3nnHk48/1r4GGjXywNLSOHMuzL89ln83e/Z5wsOTuHOnR+b7q2XLUrz99haGDfuDTp3KoVZrH89KlRxYtapF5n3Png1l69bb2dYrRHakR6EQmJkZ0aaNt87ww6ZNt+jYsUzmh+Uzbm6WHDrUkXr1ivPgQQwHDjxgwYKLnDgRTEpKxr8eq2rV3I2Nx8am8N57P+PiYs6PP7bIEsczt28/JSwskTZtvHW2d+hQRuf/Bw8GUrWqI8WLW2We9anVKlq08OLAgYc5xmFoqM5yFpeThIRUzp0LpUOHMplfgAC2tqa0aePNkSNBOuXr1NH9snJ3tyIhIS1Xx8pJw4YlSEhIo2LF1YwadZRjxx7RrJknY8fWRaVSYWxswN69H9KhQ1keP47j8OFAli69wq5d2kmrKSnp3L79lNDQBDp1KqtTt7e3LXXquGZpx4vI7WN06FAQdeu6ZSYJACVL2lC3bs5JU+PGJbh9+ylvvLGWiRNPcv58KJ07l+Pzz6vnKrakpDQuXnxCu3aldV5vHTuW5fbtnjkmCaDtDTt3rgvnznVh//4PadDAHTc3S378sQX9+1fNrO/UqWC
|
|||
|
|
"text/plain": [
|
|||
|
|
"<Figure size 640x480 with 2 Axes>"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"cm2 = confusion_matrix(y_test, y_pred)\n",
|
|||
|
|
"CMatrix = ConfusionMatrixDisplay(confusion_matrix=cm2, display_labels=xgb.classes_)\n",
|
|||
|
|
"CMatrix.plot()\n",
|
|||
|
|
"plt.title('Matriz de Confusão Logistic Regression', color=\"darkblue\")\n",
|
|||
|
|
"plt.xlabel('Previsões', color=\"darkblue\")\n",
|
|||
|
|
"plt.ylabel('Real', color=\"darkblue\")\n",
|
|||
|
|
"plt.show()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 96,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
" precision recall f1-score support\n",
|
|||
|
|
"\n",
|
|||
|
|
" 0 0.52 0.77 0.62 1565\n",
|
|||
|
|
" 1 0.77 0.58 0.66 1578\n",
|
|||
|
|
" 2 0.78 0.82 0.80 1534\n",
|
|||
|
|
" 3 0.66 0.50 0.57 584\n",
|
|||
|
|
" 4 0.71 0.58 0.64 1349\n",
|
|||
|
|
" 5 0.61 0.56 0.58 1015\n",
|
|||
|
|
"\n",
|
|||
|
|
" accuracy 0.66 7625\n",
|
|||
|
|
" macro avg 0.67 0.64 0.65 7625\n",
|
|||
|
|
"weighted avg 0.68 0.66 0.66 7625\n",
|
|||
|
|
"\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"\n",
|
|||
|
|
"print(classification_report(y_test, previsoesxgb))"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 97,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"application/vnd.jupyter.widget-view+json": {
|
|||
|
|
"model_id": "65bf7c3e26fe4a3eb4a49169477f0d65",
|
|||
|
|
"version_major": 2,
|
|||
|
|
"version_minor": 0
|
|||
|
|
},
|
|||
|
|
"text/plain": [
|
|||
|
|
" 0%| | 0/5 [00:00<?, ?it/s]"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"Melhores Parâmetros: {'var_smoothing': 1e-05}\n",
|
|||
|
|
"Melhor accuracy: 0.31687985071238955\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"scaler = StandardScaler()\n",
|
|||
|
|
"X_train_scaled = scaler.fit_transform(X_train_balanced)\n",
|
|||
|
|
"param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}\n",
|
|||
|
|
"model = GaussianNB()\n",
|
|||
|
|
"grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)\n",
|
|||
|
|
"n_combinations = len(list(product(*param_grid.values()))) \n",
|
|||
|
|
"with tqdm(total=n_combinations) as pbar: \n",
|
|||
|
|
" grid_search.fit(X_train_scaled, y_train)\n",
|
|||
|
|
" pbar.update(n_combinations)\n",
|
|||
|
|
"print(\"Melhores Parâmetros:\", grid_search.best_params_)\n",
|
|||
|
|
"print(\"Melhor accuracy:\", grid_search.best_score_)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"sc=StandardScaler()\n",
|
|||
|
|
"x_treino2=sc.fit_transform(X_train_scaled)\n",
|
|||
|
|
"x_teste2=sc.transform(X_test)\n",
|
|||
|
|
"naive=GaussianNB(var_smoothing= 1e-05)\n",
|
|||
|
|
"naive.fit(x_treino2, y_train_balanced)\n",
|
|||
|
|
"previsoesnb = naive.predict(x_teste2)"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"metadata": {
|
|||
|
|
"kernelspec": {
|
|||
|
|
"display_name": "base",
|
|||
|
|
"language": "python",
|
|||
|
|
"name": "python3"
|
|||
|
|
},
|
|||
|
|
"language_info": {
|
|||
|
|
"codemirror_mode": {
|
|||
|
|
"name": "ipython",
|
|||
|
|
"version": 3
|
|||
|
|
},
|
|||
|
|
"file_extension": ".py",
|
|||
|
|
"mimetype": "text/x-python",
|
|||
|
|
"name": "python",
|
|||
|
|
"nbconvert_exporter": "python",
|
|||
|
|
"pygments_lexer": "ipython3",
|
|||
|
|
"version": "3.12.3"
|
|||
|
|
}
|
|||
|
|
},
|
|||
|
|
"nbformat": 4,
|
|||
|
|
"nbformat_minor": 2
|
|||
|
|
}
|