{ "cells": [ { "cell_type": "markdown", "id": "aa5c1491", "metadata": { "papermill": { "duration": 0.012997, "end_time": "2024-06-08T05:23:03.349402", "exception": false, "start_time": "2024-06-08T05:23:03.336405", "status": "completed" }, "tags": [] }, "source": [ "## 导入库" ] }, { "cell_type": "code", "execution_count": 1, "id": "2ff051d2", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:23:03.375582Z", "iopub.status.busy": "2024-06-08T05:23:03.375090Z", "iopub.status.idle": "2024-06-08T05:23:06.135626Z", "shell.execute_reply": "2024-06-08T05:23:06.134675Z" }, "papermill": { "duration": 2.776387, "end_time": "2024-06-08T05:23:06.138287", "exception": false, "start_time": "2024-06-08T05:23:03.361900", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "import re\n", "import nltk\n", "from nltk.corpus.reader.tagged import ToktokTokenizer\n", "from bs4 import BeautifulSoup\n", "import pandas as pd\n", "import numpy as np\n", "\n", "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score, classification_report\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "from scipy.stats import chi2_contingency\n", "\n", "import warnings\n", "from sklearn.exceptions import UndefinedMetricWarning\n", "warnings.filterwarnings(\"ignore\", category=UndefinedMetricWarning)" ] }, { "cell_type": "markdown", "id": "afa5b530", "metadata": { "papermill": { "duration": 0.012953, "end_time": "2024-06-08T05:23:06.163346", "exception": false, "start_time": "2024-06-08T05:23:06.150393", "status": "completed" }, "tags": [] }, "source": [ "## 读取数据" ] }, { "cell_type": "code", "execution_count": 2, "id": "fa696287", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:23:06.188980Z", "iopub.status.busy": "2024-06-08T05:23:06.187975Z", "iopub.status.idle": "2024-06-08T05:23:08.649105Z", "shell.execute_reply": "2024-06-08T05:23:08.647767Z" }, "papermill": { "duration": 2.476289, "end_time": "2024-06-08T05:23:08.651497", "exception": false, "start_time": "2024-06-08T05:23:06.175208", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0textEmotion
00i seriously hate one subject to death but now ...hate
11im so full of life i feel appalledneutral
22i sit here to write i start to dig out my feel...neutral
33ive been really angry with r and i feel like a...anger
44i feel suspicious if there is no one outside l...neutral
\n", "
" ], "text/plain": [ " Unnamed: 0 text Emotion\n", "0 0 i seriously hate one subject to death but now ... hate\n", "1 1 im so full of life i feel appalled neutral\n", "2 2 i sit here to write i start to dig out my feel... neutral\n", "3 3 ive been really angry with r and i feel like a... anger\n", "4 4 i feel suspicious if there is no one outside l... neutral" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('/kaggle/input/emotion-analysis-based-on-text/emotion_sentimen_dataset.csv', encoding='utf-8')\n", "df.head()" ] }, { "cell_type": "markdown", "id": "faba2959", "metadata": { "papermill": { "duration": 0.011328, "end_time": "2024-06-08T05:23:08.674710", "exception": false, "start_time": "2024-06-08T05:23:08.663382", "status": "completed" }, "tags": [] }, "source": [ "## 对情绪标签编码" ] }, { "cell_type": "code", "execution_count": 3, "id": "b5e83375", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:23:08.698154Z", "iopub.status.busy": "2024-06-08T05:23:08.697788Z", "iopub.status.idle": "2024-06-08T05:23:08.930494Z", "shell.execute_reply": "2024-06-08T05:23:08.929386Z" }, "papermill": { "duration": 0.247874, "end_time": "2024-06-08T05:23:08.933292", "exception": false, "start_time": "2024-06-08T05:23:08.685418", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "(839555,)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "label_encoder = LabelEncoder()\n", "\n", "emotion_encoded = label_encoder.fit_transform(df['Emotion'])\n", "emotion_encoded.shape" ] }, { "cell_type": "markdown", "id": "161c9099", "metadata": { "papermill": { "duration": 0.011352, "end_time": "2024-06-08T05:23:08.957289", "exception": false, "start_time": "2024-06-08T05:23:08.945937", "status": "completed" }, "tags": [] }, "source": [ "## 预测" ] }, { "cell_type": "code", "execution_count": 4, "id": "63682798", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:23:08.981965Z", "iopub.status.busy": "2024-06-08T05:23:08.981632Z", "iopub.status.idle": "2024-06-08T05:23:08.990389Z", "shell.execute_reply": "2024-06-08T05:23:08.989469Z" }, "papermill": { "duration": 0.023631, "end_time": "2024-06-08T05:23:08.992489", "exception": false, "start_time": "2024-06-08T05:23:08.968858", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# 该词汇表由Sima Anjali提供\n", "top3_words_per_class = {\n", " 'empty': ['empty', 'void', 'hollow'],\n", " 'sadness': ['sad', 'melancholy', 'depressed'],\n", " 'enthusiasm': ['enthusiastic', 'excited', 'eager'],\n", " 'neutral': ['neutral', 'indifferent', 'unbiased'],\n", " 'worry': ['worry', 'anxiety', 'concern'],\n", " 'surprise': ['surprise', 'astonishment', 'shock'],\n", " 'love': ['love', 'affection', 'adoration'],\n", " 'fun': ['fun', 'joyful', 'amusing'],\n", " 'hate': ['hate', 'detest', 'loathe'],\n", " 'happiness': ['happy', 'joy', 'content'],\n", " 'boredom': ['boredom', 'tedium', 'monotony'],\n", " 'relief': ['relief', 'ease', 'comfort'],\n", " 'anger': ['angry', 'rage', 'outrage']\n", "}\n", "\n", "def predict_emotion(comment):\n", " for emotion, keywords in top3_words_per_class.items():\n", " if any(keyword in comment.lower() for keyword in keywords):\n", " return emotion\n", " return \"neutral\"" ] }, { "cell_type": "code", "execution_count": 5, "id": "c12c07da", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:23:09.017368Z", "iopub.status.busy": "2024-06-08T05:23:09.016618Z", "iopub.status.idle": "2024-06-08T05:23:13.305818Z", "shell.execute_reply": "2024-06-08T05:23:13.304644Z" }, "papermill": { "duration": 4.304215, "end_time": "2024-06-08T05:23:13.308313", "exception": false, "start_time": "2024-06-08T05:23:09.004098", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 1.0\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " anger 1.00 1.00 1.00 2461\n", " boredom 1.00 1.00 1.00 30\n", " empty 1.00 1.00 1.00 1153\n", " enthusiasm 1.00 1.00 1.00 1877\n", " fun 1.00 1.00 1.00 2055\n", " happiness 1.00 1.00 1.00 5364\n", " hate 1.00 1.00 1.00 3012\n", " love 1.00 1.00 1.00 8045\n", " neutral 1.00 1.00 1.00 134788\n", " relief 1.00 1.00 1.00 3373\n", " sadness 1.00 1.00 1.00 3535\n", " surprise 1.00 1.00 1.00 1356\n", " worry 1.00 1.00 1.00 862\n", "\n", " accuracy 1.00 167911\n", " macro avg 1.00 1.00 1.00 167911\n", "weighted avg 1.00 1.00 1.00 167911\n", "\n" ] } ], "source": [ "# 确定top3_words_per_class没有运用最后20%的数据\n", "split = int(len(df['text']) * 0.8)\n", "test_texts = df['text'][split:]\n", "test_labels = df['Emotion'][split:]\n", "\n", "# 预测\n", "predicted_emotions = test_texts.apply(lambda x: predict_emotion(x))\n", "test_labels_encoded = label_encoder.transform(test_labels)\n", "predicted_emotions_encoded = label_encoder.transform(predicted_emotions)\n", "accuracy = accuracy_score(test_labels_encoded, predicted_emotions_encoded)\n", "report = classification_report(test_labels_encoded, predicted_emotions_encoded, target_names=label_encoder.classes_)\n", "\n", "print('Accuracy:', accuracy)\n", "print('Classification Report:')\n", "print(report)" ] }, { "cell_type": "code", "execution_count": 6, "id": "06ad6c4f", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:23:13.334840Z", "iopub.status.busy": "2024-06-08T05:23:13.334489Z", "iopub.status.idle": "2024-06-08T05:23:13.342566Z", "shell.execute_reply": "2024-06-08T05:23:13.341504Z" }, "papermill": { "duration": 0.023472, "end_time": "2024-06-08T05:23:13.344745", "exception": false, "start_time": "2024-06-08T05:23:13.321273", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# 自己计算得到的top3_words_per_class\n", "top3_words_per_class = {\n", " 'anger': ['word', 'strong', 'anger'],\n", " 'boredom': ['work', 'tortured', 'read'],\n", " 'empty': ['want', 'try', 'trying'],\n", " 'enthusiasm': ['eager', 'loved', 'happy'],\n", " 'fun': ['doomed', 'brain', 'normal'],\n", " 'happiness': ['cool', 'content', 'contented'],\n", " 'hate': ['towards', 'hate', 'today'],\n", " 'love': ['love', 'toward', 'hated'],\n", " 'neutral': ['invigorated', 'valuable', 'shaky'],\n", " 'relief': ['comfort', 'uncomfortable', 'pain'],\n", " 'sadness': ['worry', 'excited', 'sadness'],\n", " 'surprise': ['unpleasant', 'joyful', 'shocked'],\n", " 'worry': ['worried', 'issues', 'questions']\n", "}" ] }, { "cell_type": "code", "execution_count": 7, "id": "7c7a20d0", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:23:13.370388Z", "iopub.status.busy": "2024-06-08T05:23:13.369555Z", "iopub.status.idle": "2024-06-08T05:23:17.017386Z", "shell.execute_reply": "2024-06-08T05:23:17.016139Z" }, "papermill": { "duration": 3.663146, "end_time": "2024-06-08T05:23:17.019737", "exception": false, "start_time": "2024-06-08T05:23:13.356591", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.7027532442782188\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " anger 0.02 0.04 0.03 2461\n", " boredom 0.00 0.13 0.00 30\n", " empty 0.01 0.12 0.02 1153\n", " enthusiasm 0.10 0.29 0.15 1877\n", " fun 0.02 0.01 0.02 2055\n", " happiness 0.59 0.22 0.32 5364\n", " hate 0.40 0.79 0.53 3012\n", " love 0.91 0.49 0.64 8045\n", " neutral 0.92 0.80 0.85 134788\n", " relief 0.55 0.47 0.51 3373\n", " sadness 0.16 0.06 0.09 3535\n", " surprise 0.39 0.34 0.36 1356\n", " worry 0.02 0.02 0.02 862\n", "\n", " accuracy 0.70 167911\n", " macro avg 0.31 0.29 0.27 167911\n", "weighted avg 0.83 0.70 0.75 167911\n", "\n" ] } ], "source": [ "# 预测\n", "predicted_emotions = test_texts.apply(lambda x: predict_emotion(x))\n", "test_labels_encoded = label_encoder.transform(test_labels)\n", "predicted_emotions_encoded = label_encoder.transform(predicted_emotions)\n", "accuracy = accuracy_score(test_labels_encoded, predicted_emotions_encoded)\n", "report = classification_report(test_labels_encoded, predicted_emotions_encoded, target_names=label_encoder.classes_)\n", "\n", "print('Accuracy:', accuracy)\n", "print('Classification Report:')\n", "print(report)" ] }, { "cell_type": "markdown", "id": "3d410b88", "metadata": { "papermill": { "duration": 0.011311, "end_time": "2024-06-08T05:23:17.042637", "exception": false, "start_time": "2024-06-08T05:23:17.031326", "status": "completed" }, "tags": [] }, "source": [ "Sima Anjali 的词汇表通过观察数据容易得到, 确实非常巧妙, 原文[链接](https://www.kaggle.com/code/simaanjali/sentiment-analysis) ,但是我希望用一种更通用的方式获得top3_words_per_class, 下面是我的一些尝试, 最后翻车了..." ] }, { "cell_type": "markdown", "id": "e86265bd", "metadata": { "papermill": { "duration": 0.011214, "end_time": "2024-06-08T05:23:17.065375", "exception": false, "start_time": "2024-06-08T05:23:17.054161", "status": "completed" }, "tags": [] }, "source": [ "## 数据预处理" ] }, { "cell_type": "markdown", "id": "c1c71821", "metadata": { "papermill": { "duration": 0.011364, "end_time": "2024-06-08T05:23:17.088406", "exception": false, "start_time": "2024-06-08T05:23:17.077042", "status": "completed" }, "tags": [] }, "source": [ "### 去掉'Unnamed:0'列" ] }, { "cell_type": "code", "execution_count": 8, "id": "cdff5bf0", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:23:17.113369Z", "iopub.status.busy": "2024-06-08T05:23:17.113012Z", "iopub.status.idle": "2024-06-08T05:23:17.122983Z", "shell.execute_reply": "2024-06-08T05:23:17.121993Z" }, "papermill": { "duration": 0.024995, "end_time": "2024-06-08T05:23:17.125067", "exception": false, "start_time": "2024-06-08T05:23:17.100072", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "'i seriously hate one subject to death but now i feel reluctant to drop it'" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "if 'Unnamed: 0' in df.columns:\n", " del df['Unnamed: 0']\n", "df.loc[0]['text']" ] }, { "cell_type": "markdown", "id": "61ae191d", "metadata": { "papermill": { "duration": 0.011624, "end_time": "2024-06-08T05:23:17.148755", "exception": false, "start_time": "2024-06-08T05:23:17.137131", "status": "completed" }, "tags": [] }, "source": [ "### 检查缺失值" ] }, { "cell_type": "code", "execution_count": 9, "id": "414114e3", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:23:17.174199Z", "iopub.status.busy": "2024-06-08T05:23:17.173501Z", "iopub.status.idle": "2024-06-08T05:23:17.358895Z", "shell.execute_reply": "2024-06-08T05:23:17.357895Z" }, "papermill": { "duration": 0.200364, "end_time": "2024-06-08T05:23:17.361025", "exception": false, "start_time": "2024-06-08T05:23:17.160661", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "text False\n", "Emotion False\n", "dtype: bool" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull().any()" ] }, { "cell_type": "markdown", "id": "ac15176e", "metadata": { "papermill": { "duration": 0.011211, "end_time": "2024-06-08T05:23:17.384265", "exception": false, "start_time": "2024-06-08T05:23:17.373054", "status": "completed" }, "tags": [] }, "source": [ "没有缺失值" ] }, { "cell_type": "markdown", "id": "6821ea4e", "metadata": { "papermill": { "duration": 0.011459, "end_time": "2024-06-08T05:23:17.407497", "exception": false, "start_time": "2024-06-08T05:23:17.396038", "status": "completed" }, "tags": [] }, "source": [ "### 统计标签个数" ] }, { "cell_type": "code", "execution_count": 10, "id": "d82f1289", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:23:17.432218Z", "iopub.status.busy": "2024-06-08T05:23:17.431428Z", "iopub.status.idle": "2024-06-08T05:23:17.572340Z", "shell.execute_reply": "2024-06-08T05:23:17.571339Z" }, "papermill": { "duration": 0.15559, "end_time": "2024-06-08T05:23:17.574515", "exception": false, "start_time": "2024-06-08T05:23:17.418925", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "Emotion\n", "neutral 674538\n", "love 39553\n", "happiness 27175\n", "sadness 17481\n", "relief 16729\n", "hate 15267\n", "anger 12336\n", "fun 10075\n", "enthusiasm 9304\n", "surprise 6954\n", "empty 5542\n", "worry 4475\n", "boredom 126\n", "Name: count, dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['Emotion'].value_counts()" ] }, { "cell_type": "markdown", "id": "1c6dda6f", "metadata": { "papermill": { "duration": 0.010949, "end_time": "2024-06-08T05:23:17.597138", "exception": false, "start_time": "2024-06-08T05:23:17.586189", "status": "completed" }, "tags": [] }, "source": [ "neutral情绪居多" ] }, { "cell_type": "markdown", "id": "5b4e35fa", "metadata": { "papermill": { "duration": 0.011264, "end_time": "2024-06-08T05:23:17.619973", "exception": false, "start_time": "2024-06-08T05:23:17.608709", "status": "completed" }, "tags": [] }, "source": [ "### 过滤HTML" ] }, { "cell_type": "code", "execution_count": 11, "id": "b81a048f", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:23:17.644904Z", "iopub.status.busy": "2024-06-08T05:23:17.644515Z", "iopub.status.idle": "2024-06-08T05:23:17.649773Z", "shell.execute_reply": "2024-06-08T05:23:17.648770Z" }, "papermill": { "duration": 0.019875, "end_time": "2024-06-08T05:23:17.651971", "exception": false, "start_time": "2024-06-08T05:23:17.632096", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "def noiseremovel_text(text):\n", " soup = BeautifulSoup(text, \"html.parser\")\n", " text = soup.get_text()\n", " text = re.sub(r'\\[[^]]*\\]', '', text) \n", " return text" ] }, { "cell_type": "code", "execution_count": 12, "id": "898231fd", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:23:17.676917Z", "iopub.status.busy": "2024-06-08T05:23:17.676337Z", "iopub.status.idle": "2024-06-08T05:23:17.683051Z", "shell.execute_reply": "2024-06-08T05:23:17.682233Z" }, "papermill": { "duration": 0.021283, "end_time": "2024-06-08T05:23:17.684999", "exception": false, "start_time": "2024-06-08T05:23:17.663716", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "'I really enjoyed the latest episode of my favorite show! Check out this link for a recap. #bestshowever '" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample_text = '
I really enjoyed the latest episode of my favorite show! [https://t.co/xyz123] Check out this link for a recap. #bestshowever [Ad: Stream now on MyStreamingService for 50% off!]
'\n", "trans_sample_text = noiseremovel_text(sample_text)\n", "trans_sample_text" ] }, { "cell_type": "code", "execution_count": 13, "id": "0d72d5c9", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:23:17.709551Z", "iopub.status.busy": "2024-06-08T05:23:17.709183Z", "iopub.status.idle": "2024-06-08T05:24:03.532030Z", "shell.execute_reply": "2024-06-08T05:24:03.530933Z" }, "papermill": { "duration": 45.837974, "end_time": "2024-06-08T05:24:03.534723", "exception": false, "start_time": "2024-06-08T05:23:17.696749", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "df['text'] = df['text'].apply(noiseremovel_text)" ] }, { "cell_type": "markdown", "id": "bf25f79c", "metadata": { "papermill": { "duration": 0.011823, "end_time": "2024-06-08T05:24:03.559258", "exception": false, "start_time": "2024-06-08T05:24:03.547435", "status": "completed" }, "tags": [] }, "source": [ "### 移除stopwords" ] }, { "cell_type": "code", "execution_count": 14, "id": "9d38274f", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:24:03.585450Z", "iopub.status.busy": "2024-06-08T05:24:03.585072Z", "iopub.status.idle": "2024-06-08T05:24:03.676534Z", "shell.execute_reply": "2024-06-08T05:24:03.675293Z" }, "papermill": { "duration": 0.107285, "end_time": "2024-06-08T05:24:03.679106", "exception": false, "start_time": "2024-06-08T05:24:03.571821", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /usr/share/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] } ], "source": [ "nltk.download('stopwords')\n", "stopwords = nltk.corpus.stopwords.words('english')\n", "stop_wr = set(stopwords)\n", "\n", "def remove_stopwords(text, stop_words):\n", " tokenizers = ToktokTokenizer()\n", " #提取单词和缩写\n", " words = re.findall(r'\\w+|\\.\\.+', text)\n", " stop_words = set(stop_words)\n", " filtokens = [i for i in words if i.lower() not in stop_words]\n", " # 连接\n", " filtered_text = ' '.join(filtokens)\n", " return filtered_text" ] }, { "cell_type": "code", "execution_count": 15, "id": "57c9cf73", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:24:03.705606Z", "iopub.status.busy": "2024-06-08T05:24:03.705244Z", "iopub.status.idle": "2024-06-08T05:24:03.711894Z", "shell.execute_reply": "2024-06-08T05:24:03.710852Z" }, "papermill": { "duration": 0.023006, "end_time": "2024-06-08T05:24:03.714544", "exception": false, "start_time": "2024-06-08T05:24:03.691538", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "'seriously hate one subject death feel reluctant drop'" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample_text = 'i seriously hate one subject to death but now i feel reluctant to drop it'\n", "trans_sample_text = remove_stopwords(sample_text, stop_wr)\n", "trans_sample_text" ] }, { "cell_type": "code", "execution_count": 16, "id": "b17dda19", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:24:03.739617Z", "iopub.status.busy": "2024-06-08T05:24:03.739057Z", "iopub.status.idle": "2024-06-08T05:24:19.365883Z", "shell.execute_reply": "2024-06-08T05:24:19.365027Z" }, "papermill": { "duration": 15.64211, "end_time": "2024-06-08T05:24:19.368550", "exception": false, "start_time": "2024-06-08T05:24:03.726440", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "df['text'] = df['text'].apply(remove_stopwords, stop_words=stop_wr)" ] }, { "cell_type": "markdown", "id": "44f136c5", "metadata": { "papermill": { "duration": 0.012191, "end_time": "2024-06-08T05:24:19.393559", "exception": false, "start_time": "2024-06-08T05:24:19.381368", "status": "completed" }, "tags": [] }, "source": [ "## 获取每个类别的最重要的关键词top10" ] }, { "cell_type": "code", "execution_count": 17, "id": "64b126b5", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:24:19.419737Z", "iopub.status.busy": "2024-06-08T05:24:19.419349Z", "iopub.status.idle": "2024-06-08T05:27:35.741652Z", "shell.execute_reply": "2024-06-08T05:27:35.740529Z" }, "papermill": { "duration": 196.369291, "end_time": "2024-06-08T05:27:35.774860", "exception": false, "start_time": "2024-06-08T05:24:19.405569", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
LogisticRegression(max_iter=1000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "LogisticRegression(max_iter=1000)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#使用TF-IDF进行文本向量化\n", "vectorizer = TfidfVectorizer(max_features=1000)\n", "X_tfidf = vectorizer.fit_transform(df['text'])\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X_tfidf, emotion_encoded, test_size=0.2, random_state=42)\n", "\n", "model = LogisticRegression(max_iter=1000)\n", "model.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 18, "id": "20e50fb7", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:27:35.832932Z", "iopub.status.busy": "2024-06-08T05:27:35.832299Z", "iopub.status.idle": "2024-06-08T05:27:36.137172Z", "shell.execute_reply": "2024-06-08T05:27:36.136281Z" }, "papermill": { "duration": 0.337164, "end_time": "2024-06-08T05:27:36.139818", "exception": false, "start_time": "2024-06-08T05:27:35.802654", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.9709548510818231\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.97 0.64 0.77 2489\n", " 1 0.00 0.00 0.00 21\n", " 2 1.00 0.62 0.76 1096\n", " 3 1.00 0.96 0.98 1839\n", " 4 0.98 0.84 0.91 1977\n", " 5 0.98 0.89 0.94 5370\n", " 6 0.98 0.90 0.94 3018\n", " 7 0.99 0.93 0.96 8001\n", " 8 0.97 1.00 0.98 134999\n", " 9 0.98 0.73 0.84 3396\n", " 10 1.00 0.92 0.96 3428\n", " 11 0.99 0.86 0.92 1372\n", " 12 0.99 0.57 0.73 905\n", "\n", " accuracy 0.97 167911\n", " macro avg 0.91 0.76 0.82 167911\n", "weighted avg 0.97 0.97 0.97 167911\n", "\n" ] } ], "source": [ "y_pred = model.predict(X_test)\n", "print('Accuracy:', accuracy_score(y_test, y_pred))\n", "\n", "print(\"Classification Report:\")\n", "print(classification_report(y_test, y_pred))" ] }, { "cell_type": "code", "execution_count": 19, "id": "94c44c84", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:27:36.167246Z", "iopub.status.busy": "2024-06-08T05:27:36.166470Z", "iopub.status.idle": "2024-06-08T05:27:36.175612Z", "shell.execute_reply": "2024-06-08T05:27:36.174381Z" }, "papermill": { "duration": 0.025314, "end_time": "2024-06-08T05:27:36.177947", "exception": false, "start_time": "2024-06-08T05:27:36.152633", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "anger: angry, discouraged, feel, gives, upon, happening, support, anger, strong, word\n", "boredom: tortured, dull, dissatisfied, productive, ignored, uncertain, read, work, anything, blank\n", "empty: empty, sad, sadness, love, try, trying, want, aching, tend, depressed\n", "enthusiasm: excited, eager, love, loved, happy, passionate, peaceful, surprised, appreciative, hopeful\n", "fun: funny, fun, joyful, doomed, joy, brain, opportunity, happy, normal, money\n", "happiness: happy, enjoy, unhappy, content, discontent, joy, contented, cool, useful, piece\n", "hate: hate, whatever, hated, towards, comments, angry, feel, today, person, toward\n", "love: love, loved, beloved, lovely, unloved, toward, towards, hate, hated, happy\n", "neutral: adventurous, blessed, bitchy, restless, rude, valuable, overwhelmed, shaky, invigorated, horny\n", "relief: comfortable, uncomfortable, pleased, please, comfort, sweet, sense, pain, relaxed, restless\n", "sadness: sad, depressed, melancholy, sadness, love, shocked, excited, deeply, worry, feel\n", "surprise: surprised, shocked, love, pleasant, lovely, loved, unpleasant, party, joyful, generous\n", "worry: worry, anxiety, love, far, worried, issues, selfish, petty, questions, caring\n" ] } ], "source": [ "# 获取每个类别的最重要的关键词\n", "feature_names = vectorizer.get_feature_names_out()\n", "for i, class_label in enumerate(label_encoder.classes_):\n", " tops = model.coef_[i].argsort()[-10:][::-1]\n", " print(f\"{class_label}: {', '.join([feature_names[idx] for idx in tops])}\")" ] }, { "cell_type": "markdown", "id": "09b2a27e", "metadata": { "papermill": { "duration": 0.01225, "end_time": "2024-06-08T05:27:36.203284", "exception": false, "start_time": "2024-06-08T05:27:36.191034", "status": "completed" }, "tags": [] }, "source": [ "## 运用卡方检测筛选top3" ] }, { "cell_type": "code", "execution_count": 20, "id": "a943a202", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:27:36.230609Z", "iopub.status.busy": "2024-06-08T05:27:36.229925Z", "iopub.status.idle": "2024-06-08T05:27:36.237886Z", "shell.execute_reply": "2024-06-08T05:27:36.237062Z" }, "papermill": { "duration": 0.023883, "end_time": "2024-06-08T05:27:36.239991", "exception": false, "start_time": "2024-06-08T05:27:36.216108", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# 获取每个类别的最重要的关键词\n", "feature_names = vectorizer.get_feature_names_out()\n", "top_words = {}\n", "for i, class_label in enumerate(label_encoder.classes_):\n", " tops = model.coef_[i].argsort()[-10:][::-1]\n", " top_words[class_label] = [feature_names[j] for j in tops]" ] }, { "cell_type": "code", "execution_count": 21, "id": "8372df21", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:27:36.268429Z", "iopub.status.busy": "2024-06-08T05:27:36.268047Z", "iopub.status.idle": "2024-06-08T05:28:35.516736Z", "shell.execute_reply": "2024-06-08T05:28:35.515692Z" }, "papermill": { "duration": 59.266701, "end_time": "2024-06-08T05:28:35.519498", "exception": false, "start_time": "2024-06-08T05:27:36.252797", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# 合并所有情绪类别的top10词汇\n", "all_top_words = set(word for words in top_words.values() for word in words)\n", "\n", "split = int(len(df['text'])*0.8)\n", "\n", "# 使用这些词作为特征进行向量化\n", "vectorizer = CountVectorizer(vocabulary=all_top_words)\n", "X = vectorizer.fit_transform(df['text'][:split])\n", "features = vectorizer.get_feature_names_out()\n", "\n", "# 创建频率矩阵\n", "word_counts = np.zeros((len(label_encoder.classes_), len(features)))\n", "for i, text in enumerate(X):\n", " emotion_index = y_train[i] if i < len(y_train) else y_test[i - len(y_train)]\n", " word_counts[emotion_index] += text.toarray()[0]" ] }, { "cell_type": "code", "execution_count": 22, "id": "d20b693d", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:28:35.547702Z", "iopub.status.busy": "2024-06-08T05:28:35.547307Z", "iopub.status.idle": "2024-06-08T05:28:35.595153Z", "shell.execute_reply": "2024-06-08T05:28:35.594241Z" }, "papermill": { "duration": 0.064429, "end_time": "2024-06-08T05:28:35.597579", "exception": false, "start_time": "2024-06-08T05:28:35.533150", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# 进行卡方检验\n", "chi2_results = []\n", "for i, word in enumerate(features):\n", " # 构建一个2x2的表格来计算每个词的卡方值\n", " contingency_table = np.array([word_counts[:, i], np.sum(word_counts, axis=1) - word_counts[:, i]]).T\n", " chi2, p, dof, ex = chi2_contingency(contingency_table)\n", " chi2_results.append((word, chi2, p))\n" ] }, { "cell_type": "code", "execution_count": 23, "id": "aaa08844", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:28:35.626231Z", "iopub.status.busy": "2024-06-08T05:28:35.625871Z", "iopub.status.idle": "2024-06-08T05:28:35.639429Z", "shell.execute_reply": "2024-06-08T05:28:35.638498Z" }, "papermill": { "duration": 0.031212, "end_time": "2024-06-08T05:28:35.642109", "exception": false, "start_time": "2024-06-08T05:28:35.610897", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Word Chi2 Statistic p-value\n", "26 eager 25.034678 0.014659\n", "18 cool 24.842598 0.015587\n", "24 doomed 24.435655 0.017736\n", "11 brain 23.400643 0.024511\n", "43 invigorated 22.573347 0.031574\n", ".. ... ... ...\n", "9 blank 4.887556 0.961629\n", "61 piece 4.496386 0.972726\n", "80 sweet 4.249451 0.978503\n", "35 gives 3.867413 0.985712\n", "1 adventurous 2.220789 0.998985\n", "\n", "[102 rows x 3 columns]\n" ] } ], "source": [ "# 输出结果\n", "chi2_df = pd.DataFrame(chi2_results, columns=['Word', 'Chi2 Statistic', 'p-value'])\n", "print(chi2_df.sort_values(by='p-value'))" ] }, { "cell_type": "code", "execution_count": 24, "id": "f4b13938", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:28:35.671275Z", "iopub.status.busy": "2024-06-08T05:28:35.670897Z", "iopub.status.idle": "2024-06-08T05:28:35.689349Z", "shell.execute_reply": "2024-06-08T05:28:35.688498Z" }, "papermill": { "duration": 0.036, "end_time": "2024-06-08T05:28:35.691802", "exception": false, "start_time": "2024-06-08T05:28:35.655802", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# 删除重复关键词并获取每个标签的top3关键词\n", "selected_words = set()\n", "top3_words_per_class = {}\n", "for class_label in label_encoder.classes_:\n", " class_df = chi2_df[chi2_df['Word'].isin(top_words[class_label])]\n", " class_df = class_df.sort_values(by='p-value')\n", " top3_words = []\n", " for word in class_df['Word']:\n", " if word not in selected_words:\n", " top3_words.append(word)\n", " selected_words.add(word)\n", " if len(top3_words) == 3:\n", " break\n", " top3_words_per_class[class_label] = top3_words\n" ] }, { "cell_type": "code", "execution_count": 25, "id": "1569abf8", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:28:35.721410Z", "iopub.status.busy": "2024-06-08T05:28:35.721021Z", "iopub.status.idle": "2024-06-08T05:28:35.726801Z", "shell.execute_reply": "2024-06-08T05:28:35.725762Z" }, "papermill": { "duration": 0.02376, "end_time": "2024-06-08T05:28:35.729446", "exception": false, "start_time": "2024-06-08T05:28:35.705686", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "anger: word, strong, anger\n", "boredom: work, tortured, read\n", "empty: want, try, trying\n", "enthusiasm: eager, loved, happy\n", "fun: doomed, brain, normal\n", "happiness: cool, content, contented\n", "hate: towards, hate, today\n", "love: love, toward, hated\n", "neutral: invigorated, valuable, shaky\n", "relief: comfort, uncomfortable, pain\n", "sadness: worry, excited, sadness\n", "surprise: unpleasant, joyful, shocked\n", "worry: worried, issues, questions\n" ] } ], "source": [ "# 输出结果\n", "for class_label, words in top3_words_per_class.items():\n", " print(f\"{class_label}: {', '.join(words)}\")" ] }, { "cell_type": "code", "execution_count": 26, "id": "3713ea76", "metadata": { "execution": { "iopub.execute_input": "2024-06-08T05:28:35.758343Z", "iopub.status.busy": "2024-06-08T05:28:35.757518Z", "iopub.status.idle": "2024-06-08T05:28:35.765106Z", "shell.execute_reply": "2024-06-08T05:28:35.764191Z" }, "papermill": { "duration": 0.024201, "end_time": "2024-06-08T05:28:35.767261", "exception": false, "start_time": "2024-06-08T05:28:35.743060", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "{'anger': ['word', 'strong', 'anger'],\n", " 'boredom': ['work', 'tortured', 'read'],\n", " 'empty': ['want', 'try', 'trying'],\n", " 'enthusiasm': ['eager', 'loved', 'happy'],\n", " 'fun': ['doomed', 'brain', 'normal'],\n", " 'happiness': ['cool', 'content', 'contented'],\n", " 'hate': ['towards', 'hate', 'today'],\n", " 'love': ['love', 'toward', 'hated'],\n", " 'neutral': ['invigorated', 'valuable', 'shaky'],\n", " 'relief': ['comfort', 'uncomfortable', 'pain'],\n", " 'sadness': ['worry', 'excited', 'sadness'],\n", " 'surprise': ['unpleasant', 'joyful', 'shocked'],\n", " 'worry': ['worried', 'issues', 'questions']}" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top3_words_per_class" ] }, { "cell_type": "code", "execution_count": null, "id": "98ed9e0d", "metadata": { "papermill": { "duration": 0.013978, "end_time": "2024-06-08T05:28:35.856734", "exception": false, "start_time": "2024-06-08T05:28:35.842756", "status": "completed" }, "tags": [] }, "outputs": [], "source": [] } ], "metadata": { "kaggle": { "accelerator": "none", "dataSources": [ { "datasetId": 4540583, "sourceId": 7763359, "sourceType": "datasetVersion" } ], "dockerImageVersionId": 30732, "isGpuEnabled": false, "isInternetEnabled": true, "language": "python", "sourceType": "notebook" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" }, "papermill": { "default_parameters": {}, "duration": 336.058127, "end_time": "2024-06-08T05:28:36.592538", "environment_variables": {}, "exception": null, "input_path": "__notebook__.ipynb", "output_path": "__notebook__.ipynb", "parameters": {}, "start_time": "2024-06-08T05:23:00.534411", "version": "2.5.0" } }, "nbformat": 4, "nbformat_minor": 5 }