{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## BIBLIOTECA" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import seaborn as sns\n", "import numpy as np\n", "from matplotlib import pyplot as plt\n", "import statistics as st\n", "from statistics import mode, mean\n", "from scipy.stats import spearmanr, pearsonr, skew, kendalltau, norm\n", "from collections import Counter\n", "from datetime import datetime\n", "from keras.models import load_model\n", "import warnings\n", "#!pip install wordcloud\n", "import string\n", "from wordcloud import WordCloud\n", "from xgboost import XGBClassifier\n", "import nltk\n", "import torch\n", "from torch import nn\n", "from torch.utils.data import DataLoader, Dataset\n", "#nltk.download('all')\n", "import spacy\n", "import pickle \n", "from sklearn.datasets import load_files, make_classification\n", "from docx import Document\n", "import os\n", "import re\n", "import pdfplumber\n", "import gensim\n", "#!pip install gensim\n", "from gensim.models import FastText\n", "from nltk.corpus import stopwords\n", "from joblib import Parallel, delayed\n", "from nltk.tokenize import word_tokenize\n", "from imblearn.under_sampling import RandomUnderSampler\n", "from textblob import TextBlob\n", "from deep_translator import GoogleTranslator\n", "from pyannote.audio import Pipeline\n", "from transformers import pipeline\n", "from nltk.stem import WordNetLemmatizer\n", "from reportlab.lib.pagesizes import A4\n", "from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Image\n", "from reportlab.lib import colors\n", "from PIL import Image, ImageOps\n", "from tkinter import filedialog\n", "from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, ParameterGrid\n", "import tabula\n", "import PyPDF2\n", "import pdfplumber\n", "from datetime import datetime\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.svm import SVC\n", "from imblearn.combine import SMOTEENN\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.ensemble import RandomForestClassifier\n", "import joblib\n", "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n", "import fitz\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay, classification_report, f1_score, precision_score, recall_score, roc_auc_score\n", "from tqdm.notebook import tqdm\n", "from tensorflow.keras.models import Sequential, load_model\n", "from tensorflow.keras.optimizers import Adam\n", "from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, Flatten, Dropout, BatchNormalization\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "from keras.utils import to_categorical\n", "from itertools import product\n", "#!python -m spacy download pt_core_news_sm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## FUNÇÕES" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "def grafico(a):\n", " if a == 1:\n", " custom_params = {\"axes.spines.right\": False, \"axes.spines.top\": False}\n", " sns.set_theme(style=\"ticks\", rc=custom_params, palette = \"pastel\")\n", " palette = \"pastel\"\n", " elif a == 2:\n", " sns.set_theme(style=\"white\", palette = \"Set2\")\n", " palette = \"Set2\"\n", " elif a == 3:\n", " sns.set_theme(style=\"whitegrid\", palette = \"pastel\")\n", " palette = \"pastel\"\n", " elif a == 4:\n", " sns.set_theme()\n", " palette = \"husl\"\n", " return palette" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Tratamento para portugues" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "<>:8: SyntaxWarning: invalid escape sequence '\\s'\n", "<>:8: SyntaxWarning: invalid escape sequence '\\s'\n", "C:\\Users\\garci\\AppData\\Local\\Temp\\ipykernel_40992\\3345901888.py:8: SyntaxWarning: invalid escape sequence '\\s'\n", " df_copy[column_name] = df_copy[column_name].str.replace('[^a-z\\s]', '')\n" ] } ], "source": [ "portuguesstopwords = set(stopwords.words('portuguese'))\n", "nlp = spacy.load(\"pt_core_news_sm\")\n", "def preprocess_text_column_pt(df, column_name):\n", " if column_name not in df.columns:\n", " raise ValueError(f\"A coluna '{column_name}' não existe no dataset.\")\n", " df_copy = df.copy()\n", " df_copy[column_name] = df_copy[column_name].str.lower() \n", " df_copy[column_name] = df_copy[column_name].str.replace('[^a-z\\s]', '') \n", " df_copy[column_name] = df_copy[column_name].str.strip() \n", " df_copy[column_name] = df_copy[column_name].str.replace(r'\\s+', ' ', regex=True) \n", " df_copy[column_name] = df_copy[column_name].str.replace(r'\\d+', '', regex=True)\n", " df_copy[column_name] = df_copy[column_name].str.translate(str.maketrans('', '', string.punctuation))\n", " df_copy[column_name] = df_copy[column_name].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in portuguesstopwords]))\n", " df_copy[column_name] = df_copy[column_name].str.replace(r'http\\S+|https\\S+|www\\S+', '', regex=True) \n", " df_copy[column_name] = df_copy[column_name].str.replace(r'\\S+@\\S+', '', regex=True) \n", " df_copy[column_name] = df_copy[column_name].apply(lambda x: ' '.join([word for word in x.split() if len(word) > 2]))\n", " df_copy['TextoLema'] = df_copy[column_name].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)])) \n", "\n", " return df_copy['TextoLema']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Tratamento para ingles" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "<>:8: SyntaxWarning: invalid escape sequence '\\s'\n", "<>:8: SyntaxWarning: invalid escape sequence '\\s'\n", "C:\\Users\\garci\\AppData\\Local\\Temp\\ipykernel_40992\\4039191732.py:8: SyntaxWarning: invalid escape sequence '\\s'\n", " df_copy[column_name] = df_copy[column_name].str.replace('[^a-z\\s]', '', regex=True)\n" ] } ], "source": [ "english_stopwords = set(stopwords.words('english'))\n", "nlp = spacy.load(\"en_core_web_sm\")\n", "def preprocess_text_column(df, column_name):\n", " if column_name not in df.columns:\n", " raise ValueError(f\"The column '{column_name}' does not exist in the dataset.\")\n", " df_copy = df.copy()\n", " df_copy[column_name] = df_copy[column_name].str.lower()\n", " df_copy[column_name] = df_copy[column_name].str.replace('[^a-z\\s]', '', regex=True)\n", " df_copy[column_name] = df_copy[column_name].str.replace(r'\\s+', ' ', regex=True).str.strip()\n", " df_copy[column_name] = df_copy[column_name].str.replace(r'\\d+', '', regex=True)\n", " df_copy[column_name] = df_copy[column_name].str.translate(str.maketrans('', '', string.punctuation))\n", " df_copy[column_name] = df_copy[column_name].str.replace(r'http\\S+|https\\S+|www\\S+', '', regex=True)\n", " df_copy[column_name] = df_copy[column_name].str.replace(r'\\S+@\\S+', '', regex=True)\n", " df_copy[column_name] = df_copy[column_name].apply(lambda x: ' '.join([word for word in x.split() if word not in english_stopwords and len(word) > 2]))\n", " df_copy['LemmatizedText'] = df_copy[column_name].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x) if not token.is_punct and not token.is_stop]))\n", " return df_copy['LemmatizedText']" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Text emotion\n", "0 That game hurt. Sadness\n", "1 Man I love reddit. Disgust\n", "2 So happy for [NAME]. So sad he's not here. Ima... Sadness\n", "3 I just came home, what the fuck is this lineup... Disgust\n", "4 By far the coolest thing I've seen on this thr... Happiness\n" ] } ], "source": [ "colunas = [\"Text\", \"emotion\"]\n", "dataset3 = pd.read_excel(\"C:\\\\Users\\\\garci\\\\OneDrive\\\\Área de Trabalho\\\\Programa PJM\\\\Programa final\\\\Dataset\\\\GoEmotions_parateste.xlsx\",names=colunas)\n", "print(dataset3.head())\n" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Sadness', 'Disgust', 'Happiness', 'Surprise', 'Anger', 'Fear'],\n", " dtype=object)" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset3[\"emotion\"].unique()" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | V. Absolutos | \n", "V. Relativos (%) | \n", "
|---|---|---|
| emotion | \n", "\n", " | \n", " |
| Happiness | \n", "7907 | \n", "20.67 | \n", "
| Anger | \n", "7844 | \n", "20.50 | \n", "
| Disgust | \n", "7678 | \n", "20.07 | \n", "
| Sadness | \n", "6758 | \n", "17.66 | \n", "
| Surprise | \n", "5144 | \n", "13.45 | \n", "
| Fear | \n", "2927 | \n", "7.65 | \n", "
| \n", " | Text | \n", "emotion | \n", "
|---|---|---|
| 0 | \n", "That game hurt. | \n", "Sadness | \n", "
| 1 | \n", "Man I love reddit. | \n", "Disgust | \n", "
| 2 | \n", "So happy for [NAME]. So sad he's not here. Ima... | \n", "Sadness | \n", "
| 3 | \n", "I just came home, what the fuck is this lineup... | \n", "Disgust | \n", "
| 4 | \n", "By far the coolest thing I've seen on this thr... | \n", "Happiness | \n", "
| ... | \n", "... | \n", "... | \n", "
| 38253 | \n", "I just called the Capitol Police. They are not... | \n", "Anger | \n", "
| 38254 | \n", "What a great photo and you two look so happy. 😍 | \n", "Happiness | \n", "
| 38255 | \n", "Well, I'm glad you're out of all that now. How... | \n", "Happiness | \n", "
| 38256 | \n", "Everyone likes [NAME]. | \n", "Disgust | \n", "
| 38257 | \n", "The FDA has plenty to criticize. But like here... | \n", "Anger | \n", "
38258 rows × 2 columns
\n", "| \n", " | Text | \n", "emotion | \n", "emotion_label | \n", "
|---|---|---|---|
| 0 | \n", "That game hurt. | \n", "Sadness | \n", "4 | \n", "
| 1 | \n", "Man I love reddit. | \n", "Disgust | \n", "2 | \n", "
| 2 | \n", "So happy for [NAME]. So sad he's not here. Ima... | \n", "Sadness | \n", "4 | \n", "
| 3 | \n", "I just came home, what the fuck is this lineup... | \n", "Disgust | \n", "2 | \n", "
| 4 | \n", "By far the coolest thing I've seen on this thr... | \n", "Happiness | \n", "1 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "
| 38253 | \n", "I just called the Capitol Police. They are not... | \n", "Anger | \n", "0 | \n", "
| 38254 | \n", "What a great photo and you two look so happy. 😍 | \n", "Happiness | \n", "1 | \n", "
| 38255 | \n", "Well, I'm glad you're out of all that now. How... | \n", "Happiness | \n", "1 | \n", "
| 38256 | \n", "Everyone likes [NAME]. | \n", "Disgust | \n", "2 | \n", "
| 38257 | \n", "The FDA has plenty to criticize. But like here... | \n", "Anger | \n", "0 | \n", "
38258 rows × 3 columns
\n", "| \n", " | Text | \n", "emotion | \n", "emotion_label | \n", "TextoLema | \n", "
|---|---|---|---|---|
| 0 | \n", "That game hurt. | \n", "Sadness | \n", "4 | \n", "game hurt | \n", "
| 1 | \n", "Man I love reddit. | \n", "Disgust | \n", "2 | \n", "man love reddit | \n", "
| 2 | \n", "So happy for [NAME]. So sad he's not here. Ima... | \n", "Sadness | \n", "4 | \n", "happy sad s imagine team instead ugh | \n", "
| 3 | \n", "I just came home, what the fuck is this lineup... | \n", "Disgust | \n", "2 | \n", "come home fuck lineup love mad bastard | \n", "
| 4 | \n", "By far the coolest thing I've seen on this thr... | \n", "Happiness | \n", "1 | \n", "far cool thing ve see thread | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 38253 | \n", "I just called the Capitol Police. They are not... | \n", "Anger | \n", "0 | \n", "call capitol police affect shutdown fuck shit | \n", "
| 38254 | \n", "What a great photo and you two look so happy. 😍 | \n", "Happiness | \n", "1 | \n", "great photo look happy | \n", "
| 38255 | \n", "Well, I'm glad you're out of all that now. How... | \n", "Happiness | \n", "1 | \n", "glad awful way act think healthy boundary hostile | \n", "
| 38256 | \n", "Everyone likes [NAME]. | \n", "Disgust | \n", "2 | \n", "like | \n", "
| 38257 | \n", "The FDA has plenty to criticize. But like here... | \n", "Anger | \n", "0 | \n", "fda plenty criticize like usually criticize ho... | \n", "
38258 rows × 4 columns
\n", "| \n", " | Text | \n", "emotion | \n", "emotion_label | \n", "TextoLema | \n", "
|---|---|---|---|---|
| 0 | \n", "That game hurt. | \n", "Sadness | \n", "4 | \n", "game hurt | \n", "
| 1 | \n", "Man I love reddit. | \n", "Disgust | \n", "2 | \n", "man love reddit | \n", "
| 2 | \n", "So happy for [NAME]. So sad he's not here. Ima... | \n", "Sadness | \n", "4 | \n", "happy sad s imagine team instead ugh | \n", "
| 3 | \n", "I just came home, what the fuck is this lineup... | \n", "Disgust | \n", "2 | \n", "come home fuck lineup love mad bastard | \n", "
| 4 | \n", "By far the coolest thing I've seen on this thr... | \n", "Happiness | \n", "1 | \n", "far cool thing ve see thread | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 38253 | \n", "I just called the Capitol Police. They are not... | \n", "Anger | \n", "0 | \n", "call capitol police affect shutdown fuck shit | \n", "
| 38254 | \n", "What a great photo and you two look so happy. 😍 | \n", "Happiness | \n", "1 | \n", "great photo look happy | \n", "
| 38255 | \n", "Well, I'm glad you're out of all that now. How... | \n", "Happiness | \n", "1 | \n", "glad awful way act think healthy boundary hostile | \n", "
| 38256 | \n", "Everyone likes [NAME]. | \n", "Disgust | \n", "2 | \n", "like | \n", "
| 38257 | \n", "The FDA has plenty to criticize. But like here... | \n", "Anger | \n", "0 | \n", "fda plenty criticize like usually criticize ho... | \n", "
38258 rows × 4 columns
\n", "