{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "aa5c1491",
   "metadata": {
    "papermill": {
     "duration": 0.012997,
     "end_time": "2024-06-08T05:23:03.349402",
     "exception": false,
     "start_time": "2024-06-08T05:23:03.336405",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## 导入库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2ff051d2",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:23:03.375582Z",
     "iopub.status.busy": "2024-06-08T05:23:03.375090Z",
     "iopub.status.idle": "2024-06-08T05:23:06.135626Z",
     "shell.execute_reply": "2024-06-08T05:23:06.134675Z"
    },
    "papermill": {
     "duration": 2.776387,
     "end_time": "2024-06-08T05:23:06.138287",
     "exception": false,
     "start_time": "2024-06-08T05:23:03.361900",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import re\n",
    "import nltk\n",
    "from nltk.corpus.reader.tagged import ToktokTokenizer\n",
    "from bs4 import BeautifulSoup\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score, classification_report\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "from scipy.stats import chi2_contingency\n",
    "\n",
    "import warnings\n",
    "from sklearn.exceptions import UndefinedMetricWarning\n",
    "warnings.filterwarnings(\"ignore\", category=UndefinedMetricWarning)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "afa5b530",
   "metadata": {
    "papermill": {
     "duration": 0.012953,
     "end_time": "2024-06-08T05:23:06.163346",
     "exception": false,
     "start_time": "2024-06-08T05:23:06.150393",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "fa696287",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:23:06.188980Z",
     "iopub.status.busy": "2024-06-08T05:23:06.187975Z",
     "iopub.status.idle": "2024-06-08T05:23:08.649105Z",
     "shell.execute_reply": "2024-06-08T05:23:08.647767Z"
    },
    "papermill": {
     "duration": 2.476289,
     "end_time": "2024-06-08T05:23:08.651497",
     "exception": false,
     "start_time": "2024-06-08T05:23:06.175208",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>text</th>\n",
       "      <th>Emotion</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>i seriously hate one subject to death but now ...</td>\n",
       "      <td>hate</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>im so full of life i feel appalled</td>\n",
       "      <td>neutral</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>i sit here to write i start to dig out my feel...</td>\n",
       "      <td>neutral</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>ive been really angry with r and i feel like a...</td>\n",
       "      <td>anger</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>i feel suspicious if there is no one outside l...</td>\n",
       "      <td>neutral</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0                                               text  Emotion\n",
       "0           0  i seriously hate one subject to death but now ...     hate\n",
       "1           1                 im so full of life i feel appalled  neutral\n",
       "2           2  i sit here to write i start to dig out my feel...  neutral\n",
       "3           3  ive been really angry with r and i feel like a...    anger\n",
       "4           4  i feel suspicious if there is no one outside l...  neutral"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Location of the emotion dataset inside the Kaggle input directory.\n",
    "DATA_PATH = '/kaggle/input/emotion-analysis-based-on-text/emotion_sentimen_dataset.csv'\n",
    "df = pd.read_csv(DATA_PATH, encoding='utf-8')\n",
    "\n",
    "# Preview the first rows.\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "faba2959",
   "metadata": {
    "papermill": {
     "duration": 0.011328,
     "end_time": "2024-06-08T05:23:08.674710",
     "exception": false,
     "start_time": "2024-06-08T05:23:08.663382",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## 对情绪标签编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b5e83375",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:23:08.698154Z",
     "iopub.status.busy": "2024-06-08T05:23:08.697788Z",
     "iopub.status.idle": "2024-06-08T05:23:08.930494Z",
     "shell.execute_reply": "2024-06-08T05:23:08.929386Z"
    },
    "papermill": {
     "duration": 0.247874,
     "end_time": "2024-06-08T05:23:08.933292",
     "exception": false,
     "start_time": "2024-06-08T05:23:08.685418",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(839555,)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fit the encoder on the full label column, then map every label to its integer id.\n",
    "label_encoder = LabelEncoder().fit(df['Emotion'])\n",
    "emotion_encoded = label_encoder.transform(df['Emotion'])\n",
    "\n",
    "# One encoded label per row.\n",
    "emotion_encoded.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "161c9099",
   "metadata": {
    "papermill": {
     "duration": 0.011352,
     "end_time": "2024-06-08T05:23:08.957289",
     "exception": false,
     "start_time": "2024-06-08T05:23:08.945937",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## 预测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "63682798",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:23:08.981965Z",
     "iopub.status.busy": "2024-06-08T05:23:08.981632Z",
     "iopub.status.idle": "2024-06-08T05:23:08.990389Z",
     "shell.execute_reply": "2024-06-08T05:23:08.989469Z"
    },
    "papermill": {
     "duration": 0.023631,
     "end_time": "2024-06-08T05:23:08.992489",
     "exception": false,
     "start_time": "2024-06-08T05:23:08.968858",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# This keyword table was provided by Sima Anjali.\n",
    "# Maps each emotion label to three lowercase keywords.\n",
    "top3_words_per_class = {\n",
    "    'empty': ['empty', 'void', 'hollow'],\n",
    "    'sadness': ['sad', 'melancholy', 'depressed'],\n",
    "    'enthusiasm': ['enthusiastic', 'excited', 'eager'],\n",
    "    'neutral': ['neutral', 'indifferent', 'unbiased'],\n",
    "    'worry': ['worry', 'anxiety', 'concern'],\n",
    "    'surprise': ['surprise', 'astonishment', 'shock'],\n",
    "    'love': ['love', 'affection', 'adoration'],\n",
    "    'fun': ['fun', 'joyful', 'amusing'],\n",
    "    'hate': ['hate', 'detest', 'loathe'],\n",
    "    'happiness': ['happy', 'joy', 'content'],\n",
    "    'boredom': ['boredom', 'tedium', 'monotony'],\n",
    "    'relief': ['relief', 'ease', 'comfort'],\n",
    "    'anger': ['angry', 'rage', 'outrage']\n",
    "}\n",
    "\n",
    "def predict_emotion(comment, keyword_map=None):\n",
    "    \"\"\"Return the first emotion whose keyword occurs in ``comment``.\n",
    "\n",
    "    Matching is a case-insensitive substring test, so the keyword 'sad'\n",
    "    also matches 'sadness'. Emotions are tried in the mapping's iteration\n",
    "    order and the first hit wins.\n",
    "\n",
    "    Args:\n",
    "        comment: Text to classify.\n",
    "        keyword_map: Optional mapping of emotion label -> iterable of\n",
    "            lowercase keywords. Defaults to the notebook-level\n",
    "            ``top3_words_per_class`` as it is bound at call time.\n",
    "\n",
    "    Returns:\n",
    "        The matched emotion label, or \"neutral\" when no keyword occurs.\n",
    "    \"\"\"\n",
    "    if keyword_map is None:\n",
    "        keyword_map = top3_words_per_class\n",
    "    # Lowercase once instead of once per keyword.\n",
    "    lowered = comment.lower()\n",
    "    for emotion, keywords in keyword_map.items():\n",
    "        if any(keyword in lowered for keyword in keywords):\n",
    "            return emotion\n",
    "    return \"neutral\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c12c07da",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:23:09.017368Z",
     "iopub.status.busy": "2024-06-08T05:23:09.016618Z",
     "iopub.status.idle": "2024-06-08T05:23:13.305818Z",
     "shell.execute_reply": "2024-06-08T05:23:13.304644Z"
    },
    "papermill": {
     "duration": 4.304215,
     "end_time": "2024-06-08T05:23:13.308313",
     "exception": false,
     "start_time": "2024-06-08T05:23:09.004098",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 1.0\n",
      "Classification Report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "       anger       1.00      1.00      1.00      2461\n",
      "     boredom       1.00      1.00      1.00        30\n",
      "       empty       1.00      1.00      1.00      1153\n",
      "  enthusiasm       1.00      1.00      1.00      1877\n",
      "         fun       1.00      1.00      1.00      2055\n",
      "   happiness       1.00      1.00      1.00      5364\n",
      "        hate       1.00      1.00      1.00      3012\n",
      "        love       1.00      1.00      1.00      8045\n",
      "     neutral       1.00      1.00      1.00    134788\n",
      "      relief       1.00      1.00      1.00      3373\n",
      "     sadness       1.00      1.00      1.00      3535\n",
      "    surprise       1.00      1.00      1.00      1356\n",
      "       worry       1.00      1.00      1.00       862\n",
      "\n",
      "    accuracy                           1.00    167911\n",
      "   macro avg       1.00      1.00      1.00    167911\n",
      "weighted avg       1.00      1.00      1.00    167911\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Hold out the final 20% of rows for evaluation; the intent is that\n",
    "# top3_words_per_class was not derived from these rows.\n",
    "split = int(0.8 * len(df['text']))\n",
    "test_texts, test_labels = df['text'][split:], df['Emotion'][split:]\n",
    "\n",
    "# Predict with the keyword rule, then score on integer-encoded labels.\n",
    "predicted_emotions = test_texts.apply(predict_emotion)\n",
    "test_labels_encoded = label_encoder.transform(test_labels)\n",
    "predicted_emotions_encoded = label_encoder.transform(predicted_emotions)\n",
    "accuracy = accuracy_score(test_labels_encoded, predicted_emotions_encoded)\n",
    "report = classification_report(\n",
    "    test_labels_encoded,\n",
    "    predicted_emotions_encoded,\n",
    "    target_names=label_encoder.classes_,\n",
    ")\n",
    "\n",
    "print('Accuracy:', accuracy)\n",
    "print('Classification Report:', report, sep='\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "06ad6c4f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:23:13.334840Z",
     "iopub.status.busy": "2024-06-08T05:23:13.334489Z",
     "iopub.status.idle": "2024-06-08T05:23:13.342566Z",
     "shell.execute_reply": "2024-06-08T05:23:13.341504Z"
    },
    "papermill": {
     "duration": 0.023472,
     "end_time": "2024-06-08T05:23:13.344745",
     "exception": false,
     "start_time": "2024-06-08T05:23:13.321273",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Keyword table derived by the notebook author's own computation.\n",
    "# Rebinding this name changes what predict_emotion matches against, because it\n",
    "# looks the table up at call time.\n",
    "top3_words_per_class = {\n",
    "       'anger': ['word', 'strong', 'anger'],\n",
    "       'boredom': ['work', 'tortured', 'read'],\n",
    "       'empty': ['want', 'try', 'trying'],\n",
    "       'enthusiasm': ['eager', 'loved', 'happy'],\n",
    "       'fun': ['doomed', 'brain', 'normal'],\n",
    "       'happiness': ['cool', 'content', 'contented'],\n",
    "       'hate': ['towards', 'hate', 'today'],\n",
    "       'love': ['love', 'toward', 'hated'],\n",
    "       'neutral': ['invigorated', 'valuable', 'shaky'],\n",
    "       'relief': ['comfort', 'uncomfortable', 'pain'],\n",
    "       'sadness': ['worry', 'excited', 'sadness'],\n",
    "       'surprise': ['unpleasant', 'joyful', 'shocked'],\n",
    "       'worry': ['worried', 'issues', 'questions']\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7c7a20d0",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:23:13.370388Z",
     "iopub.status.busy": "2024-06-08T05:23:13.369555Z",
     "iopub.status.idle": "2024-06-08T05:23:17.017386Z",
     "shell.execute_reply": "2024-06-08T05:23:17.016139Z"
    },
    "papermill": {
     "duration": 3.663146,
     "end_time": "2024-06-08T05:23:17.019737",
     "exception": false,
     "start_time": "2024-06-08T05:23:13.356591",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.7027532442782188\n",
      "Classification Report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "       anger       0.02      0.04      0.03      2461\n",
      "     boredom       0.00      0.13      0.00        30\n",
      "       empty       0.01      0.12      0.02      1153\n",
      "  enthusiasm       0.10      0.29      0.15      1877\n",
      "         fun       0.02      0.01      0.02      2055\n",
      "   happiness       0.59      0.22      0.32      5364\n",
      "        hate       0.40      0.79      0.53      3012\n",
      "        love       0.91      0.49      0.64      8045\n",
      "     neutral       0.92      0.80      0.85    134788\n",
      "      relief       0.55      0.47      0.51      3373\n",
      "     sadness       0.16      0.06      0.09      3535\n",
      "    surprise       0.39      0.34      0.36      1356\n",
      "       worry       0.02      0.02      0.02       862\n",
      "\n",
      "    accuracy                           0.70    167911\n",
      "   macro avg       0.31      0.29      0.27    167911\n",
      "weighted avg       0.83      0.70      0.75    167911\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Re-score the held-out rows; predict_emotion uses whichever\n",
    "# top3_words_per_class is currently bound.\n",
    "predicted_emotions = test_texts.apply(predict_emotion)\n",
    "test_labels_encoded = label_encoder.transform(test_labels)\n",
    "predicted_emotions_encoded = label_encoder.transform(predicted_emotions)\n",
    "accuracy = accuracy_score(test_labels_encoded, predicted_emotions_encoded)\n",
    "report = classification_report(\n",
    "    test_labels_encoded,\n",
    "    predicted_emotions_encoded,\n",
    "    target_names=label_encoder.classes_,\n",
    ")\n",
    "\n",
    "print('Accuracy:', accuracy)\n",
    "print('Classification Report:', report, sep='\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3d410b88",
   "metadata": {
    "papermill": {
     "duration": 0.011311,
     "end_time": "2024-06-08T05:23:17.042637",
     "exception": false,
     "start_time": "2024-06-08T05:23:17.031326",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "Sima Anjali 的词汇表通过观察数据容易得到，确实非常巧妙（原文[链接](https://www.kaggle.com/code/simaanjali/sentiment-analysis)）。但是我希望用一种更通用的方式获得 top3_words_per_class，下面是我的一些尝试，最后翻车了……"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e86265bd",
   "metadata": {
    "papermill": {
     "duration": 0.011214,
     "end_time": "2024-06-08T05:23:17.065375",
     "exception": false,
     "start_time": "2024-06-08T05:23:17.054161",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## 数据预处理"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c1c71821",
   "metadata": {
    "papermill": {
     "duration": 0.011364,
     "end_time": "2024-06-08T05:23:17.088406",
     "exception": false,
     "start_time": "2024-06-08T05:23:17.077042",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "### 去掉 'Unnamed: 0' 列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "cdff5bf0",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:23:17.113369Z",
     "iopub.status.busy": "2024-06-08T05:23:17.113012Z",
     "iopub.status.idle": "2024-06-08T05:23:17.122983Z",
     "shell.execute_reply": "2024-06-08T05:23:17.121993Z"
    },
    "papermill": {
     "duration": 0.024995,
     "end_time": "2024-06-08T05:23:17.125067",
     "exception": false,
     "start_time": "2024-06-08T05:23:17.100072",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'i seriously hate one subject to death but now i feel reluctant to drop it'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drop the leftover CSV index column; errors='ignore' keeps this cell safe to re-run\n",
    "# after the column is already gone.\n",
    "df = df.drop(columns=['Unnamed: 0'], errors='ignore')\n",
    "\n",
    "# Single label-based lookup (row 0, column 'text') instead of chained indexing.\n",
    "df.loc[0, 'text']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "61ae191d",
   "metadata": {
    "papermill": {
     "duration": 0.011624,
     "end_time": "2024-06-08T05:23:17.148755",
     "exception": false,
     "start_time": "2024-06-08T05:23:17.137131",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "### 检查缺失值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "414114e3",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:23:17.174199Z",
     "iopub.status.busy": "2024-06-08T05:23:17.173501Z",
     "iopub.status.idle": "2024-06-08T05:23:17.358895Z",
     "shell.execute_reply": "2024-06-08T05:23:17.357895Z"
    },
    "papermill": {
     "duration": 0.200364,
     "end_time": "2024-06-08T05:23:17.361025",
     "exception": false,
     "start_time": "2024-06-08T05:23:17.160661",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "text       False\n",
       "Emotion    False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-column flag: True when the column contains at least one missing value.\n",
    "df.isnull().any()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ac15176e",
   "metadata": {
    "papermill": {
     "duration": 0.011211,
     "end_time": "2024-06-08T05:23:17.384265",
     "exception": false,
     "start_time": "2024-06-08T05:23:17.373054",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "没有缺失值"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6821ea4e",
   "metadata": {
    "papermill": {
     "duration": 0.011459,
     "end_time": "2024-06-08T05:23:17.407497",
     "exception": false,
     "start_time": "2024-06-08T05:23:17.396038",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "### 统计标签个数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "d82f1289",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:23:17.432218Z",
     "iopub.status.busy": "2024-06-08T05:23:17.431428Z",
     "iopub.status.idle": "2024-06-08T05:23:17.572340Z",
     "shell.execute_reply": "2024-06-08T05:23:17.571339Z"
    },
    "papermill": {
     "duration": 0.15559,
     "end_time": "2024-06-08T05:23:17.574515",
     "exception": false,
     "start_time": "2024-06-08T05:23:17.418925",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Emotion\n",
       "neutral       674538\n",
       "love           39553\n",
       "happiness      27175\n",
       "sadness        17481\n",
       "relief         16729\n",
       "hate           15267\n",
       "anger          12336\n",
       "fun            10075\n",
       "enthusiasm      9304\n",
       "surprise        6954\n",
       "empty           5542\n",
       "worry           4475\n",
       "boredom          126\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of rows per emotion label.\n",
    "df['Emotion'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1c6dda6f",
   "metadata": {
    "papermill": {
     "duration": 0.010949,
     "end_time": "2024-06-08T05:23:17.597138",
     "exception": false,
     "start_time": "2024-06-08T05:23:17.586189",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "neutral情绪居多"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5b4e35fa",
   "metadata": {
    "papermill": {
     "duration": 0.011264,
     "end_time": "2024-06-08T05:23:17.619973",
     "exception": false,
     "start_time": "2024-06-08T05:23:17.608709",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "### 过滤HTML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "b81a048f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:23:17.644904Z",
     "iopub.status.busy": "2024-06-08T05:23:17.644515Z",
     "iopub.status.idle": "2024-06-08T05:23:17.649773Z",
     "shell.execute_reply": "2024-06-08T05:23:17.648770Z"
    },
    "papermill": {
     "duration": 0.019875,
     "end_time": "2024-06-08T05:23:17.651971",
     "exception": false,
     "start_time": "2024-06-08T05:23:17.632096",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def noiseremovel_text(text):\n",
    "    \"\"\"Strip HTML markup and square-bracketed segments from ``text``.\n",
    "\n",
    "    The input is parsed with BeautifulSoup's \"html.parser\" and reduced to its\n",
    "    text content; every ``[...]`` span is then deleted.\n",
    "    \"\"\"\n",
    "    plain = BeautifulSoup(text, \"html.parser\").get_text()\n",
    "    return re.sub(r'\\[[^]]*\\]', '', plain)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "898231fd",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:23:17.676917Z",
     "iopub.status.busy": "2024-06-08T05:23:17.676337Z",
     "iopub.status.idle": "2024-06-08T05:23:17.683051Z",
     "shell.execute_reply": "2024-06-08T05:23:17.682233Z"
    },
    "papermill": {
     "duration": 0.021283,
     "end_time": "2024-06-08T05:23:17.684999",
     "exception": false,
     "start_time": "2024-06-08T05:23:17.663716",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'I really enjoyed the latest episode of my favorite show!  Check out this link for a recap. #bestshowever '"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Smoke test: HTML tags and [bracketed] segments should be stripped from the sample.\n",
    "sample_text = '<div>I really enjoyed the latest episode of my favorite show! [https://t.co/xyz123] Check out this link for a recap. #bestshowever [Ad: Stream now on MyStreamingService for 50% off!]</div>'\n",
    "trans_sample_text = noiseremovel_text(sample_text)\n",
    "trans_sample_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "0d72d5c9",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:23:17.709551Z",
     "iopub.status.busy": "2024-06-08T05:23:17.709183Z",
     "iopub.status.idle": "2024-06-08T05:24:03.532030Z",
     "shell.execute_reply": "2024-06-08T05:24:03.530933Z"
    },
    "papermill": {
     "duration": 45.837974,
     "end_time": "2024-06-08T05:24:03.534723",
     "exception": false,
     "start_time": "2024-06-08T05:23:17.696749",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Clean every row; the 'text' column is overwritten with the cleaned strings.\n",
    "df['text'] = df['text'].apply(noiseremovel_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bf25f79c",
   "metadata": {
    "papermill": {
     "duration": 0.011823,
     "end_time": "2024-06-08T05:24:03.559258",
     "exception": false,
     "start_time": "2024-06-08T05:24:03.547435",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "### 移除stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "9d38274f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:24:03.585450Z",
     "iopub.status.busy": "2024-06-08T05:24:03.585072Z",
     "iopub.status.idle": "2024-06-08T05:24:03.676534Z",
     "shell.execute_reply": "2024-06-08T05:24:03.675293Z"
    },
    "papermill": {
     "duration": 0.107285,
     "end_time": "2024-06-08T05:24:03.679106",
     "exception": false,
     "start_time": "2024-06-08T05:24:03.571821",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to /usr/share/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "# Download NLTK's stopword corpus and keep the English list as a set.\n",
    "nltk.download('stopwords')\n",
    "stopwords = nltk.corpus.stopwords.words('english')\n",
    "stop_wr = set(stopwords)\n",
    "\n",
    "def remove_stopwords(text, stop_words):\n",
    "    \"\"\"Drop stopwords from ``text`` and re-join the remaining tokens with spaces.\n",
    "\n",
    "    Tokens are runs of word characters or runs of two or more dots; all other\n",
    "    punctuation is discarded, so an apostrophe splits a contraction into two tokens.\n",
    "\n",
    "    Args:\n",
    "        text: Input string.\n",
    "        stop_words: Collection of lowercase stopwords. Tokens are lowercased only\n",
    "            for the comparison; surviving tokens keep their original case.\n",
    "\n",
    "    Returns:\n",
    "        The surviving tokens joined by single spaces.\n",
    "    \"\"\"\n",
    "    # Build a set only when the caller did not already pass one, so repeated calls\n",
    "    # (e.g. via Series.apply) do not copy the collection each time.\n",
    "    if not isinstance(stop_words, (set, frozenset)):\n",
    "        stop_words = set(stop_words)\n",
    "    tokens = re.findall(r'\\w+|\\.\\.+', text)\n",
    "    return ' '.join(token for token in tokens if token.lower() not in stop_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "57c9cf73",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:24:03.705606Z",
     "iopub.status.busy": "2024-06-08T05:24:03.705244Z",
     "iopub.status.idle": "2024-06-08T05:24:03.711894Z",
     "shell.execute_reply": "2024-06-08T05:24:03.710852Z"
    },
    "papermill": {
     "duration": 0.023006,
     "end_time": "2024-06-08T05:24:03.714544",
     "exception": false,
     "start_time": "2024-06-08T05:24:03.691538",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'seriously hate one subject death feel reluctant drop'"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Smoke test: run stopword removal on one sample sentence and show the result.\n",
    "sample_text = 'i seriously hate one subject to death but now i feel reluctant to drop it'\n",
    "trans_sample_text = remove_stopwords(sample_text, stop_wr)\n",
    "trans_sample_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "b17dda19",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:24:03.739617Z",
     "iopub.status.busy": "2024-06-08T05:24:03.739057Z",
     "iopub.status.idle": "2024-06-08T05:24:19.365883Z",
     "shell.execute_reply": "2024-06-08T05:24:19.365027Z"
    },
    "papermill": {
     "duration": 15.64211,
     "end_time": "2024-06-08T05:24:19.368550",
     "exception": false,
     "start_time": "2024-06-08T05:24:03.726440",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Remove stopwords from every row; the 'text' column is overwritten with the result.\n",
    "df['text'] = df['text'].apply(remove_stopwords, stop_words=stop_wr)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "44f136c5",
   "metadata": {
    "papermill": {
     "duration": 0.012191,
     "end_time": "2024-06-08T05:24:19.393559",
     "exception": false,
     "start_time": "2024-06-08T05:24:19.381368",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## 获取每个类别的最重要的关键词top10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "64b126b5",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:24:19.419737Z",
     "iopub.status.busy": "2024-06-08T05:24:19.419349Z",
     "iopub.status.idle": "2024-06-08T05:27:35.741652Z",
     "shell.execute_reply": "2024-06-08T05:27:35.740529Z"
    },
    "papermill": {
     "duration": 196.369291,
     "end_time": "2024-06-08T05:27:35.774860",
     "exception": false,
     "start_time": "2024-06-08T05:24:19.405569",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 
div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LogisticRegression(max_iter=1000)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LogisticRegression</label><div class=\"sk-toggleable__content\"><pre>LogisticRegression(max_iter=1000)</pre></div></div></div></div></div>"
      ],
      "text/plain": [
       "LogisticRegression(max_iter=1000)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#使用TF-IDF进行文本向量化\n",
    "vectorizer = TfidfVectorizer(max_features=1000)\n",
    "X_tfidf = vectorizer.fit_transform(df['text'])\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X_tfidf, emotion_encoded, test_size=0.2, random_state=42)\n",
    "\n",
    "model = LogisticRegression(max_iter=1000)\n",
    "model.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "20e50fb7",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:27:35.832932Z",
     "iopub.status.busy": "2024-06-08T05:27:35.832299Z",
     "iopub.status.idle": "2024-06-08T05:27:36.137172Z",
     "shell.execute_reply": "2024-06-08T05:27:36.136281Z"
    },
    "papermill": {
     "duration": 0.337164,
     "end_time": "2024-06-08T05:27:36.139818",
     "exception": false,
     "start_time": "2024-06-08T05:27:35.802654",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.9709548510818231\n",
      "Classification Report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.97      0.64      0.77      2489\n",
      "           1       0.00      0.00      0.00        21\n",
      "           2       1.00      0.62      0.76      1096\n",
      "           3       1.00      0.96      0.98      1839\n",
      "           4       0.98      0.84      0.91      1977\n",
      "           5       0.98      0.89      0.94      5370\n",
      "           6       0.98      0.90      0.94      3018\n",
      "           7       0.99      0.93      0.96      8001\n",
      "           8       0.97      1.00      0.98    134999\n",
      "           9       0.98      0.73      0.84      3396\n",
      "          10       1.00      0.92      0.96      3428\n",
      "          11       0.99      0.86      0.92      1372\n",
      "          12       0.99      0.57      0.73       905\n",
      "\n",
      "    accuracy                           0.97    167911\n",
      "   macro avg       0.91      0.76      0.82    167911\n",
      "weighted avg       0.97      0.97      0.97    167911\n",
      "\n"
     ]
    }
   ],
   "source": [
    "y_pred = model.predict(X_test)\n",
    "print('Accuracy:', accuracy_score(y_test, y_pred))\n",
    "\n",
    "print(\"Classification Report:\")\n",
    "print(classification_report(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "94c44c84",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:27:36.167246Z",
     "iopub.status.busy": "2024-06-08T05:27:36.166470Z",
     "iopub.status.idle": "2024-06-08T05:27:36.175612Z",
     "shell.execute_reply": "2024-06-08T05:27:36.174381Z"
    },
    "papermill": {
     "duration": 0.025314,
     "end_time": "2024-06-08T05:27:36.177947",
     "exception": false,
     "start_time": "2024-06-08T05:27:36.152633",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "anger: angry, discouraged, feel, gives, upon, happening, support, anger, strong, word\n",
      "boredom: tortured, dull, dissatisfied, productive, ignored, uncertain, read, work, anything, blank\n",
      "empty: empty, sad, sadness, love, try, trying, want, aching, tend, depressed\n",
      "enthusiasm: excited, eager, love, loved, happy, passionate, peaceful, surprised, appreciative, hopeful\n",
      "fun: funny, fun, joyful, doomed, joy, brain, opportunity, happy, normal, money\n",
      "happiness: happy, enjoy, unhappy, content, discontent, joy, contented, cool, useful, piece\n",
      "hate: hate, whatever, hated, towards, comments, angry, feel, today, person, toward\n",
      "love: love, loved, beloved, lovely, unloved, toward, towards, hate, hated, happy\n",
      "neutral: adventurous, blessed, bitchy, restless, rude, valuable, overwhelmed, shaky, invigorated, horny\n",
      "relief: comfortable, uncomfortable, pleased, please, comfort, sweet, sense, pain, relaxed, restless\n",
      "sadness: sad, depressed, melancholy, sadness, love, shocked, excited, deeply, worry, feel\n",
      "surprise: surprised, shocked, love, pleasant, lovely, loved, unpleasant, party, joyful, generous\n",
      "worry: worry, anxiety, love, far, worried, issues, selfish, petty, questions, caring\n"
     ]
    }
   ],
   "source": [
    "# 获取每个类别的最重要的关键词\n",
    "feature_names = vectorizer.get_feature_names_out()\n",
    "for i, class_label in enumerate(label_encoder.classes_):\n",
    "    tops = model.coef_[i].argsort()[-10:][::-1]\n",
    "    print(f\"{class_label}: {', '.join([feature_names[idx] for idx in tops])}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "09b2a27e",
   "metadata": {
    "papermill": {
     "duration": 0.01225,
     "end_time": "2024-06-08T05:27:36.203284",
     "exception": false,
     "start_time": "2024-06-08T05:27:36.191034",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## 运用卡方检测筛选top3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "a943a202",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:27:36.230609Z",
     "iopub.status.busy": "2024-06-08T05:27:36.229925Z",
     "iopub.status.idle": "2024-06-08T05:27:36.237886Z",
     "shell.execute_reply": "2024-06-08T05:27:36.237062Z"
    },
    "papermill": {
     "duration": 0.023883,
     "end_time": "2024-06-08T05:27:36.239991",
     "exception": false,
     "start_time": "2024-06-08T05:27:36.216108",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 获取每个类别的最重要的关键词\n",
    "feature_names = vectorizer.get_feature_names_out()\n",
    "top_words = {}\n",
    "for i, class_label in enumerate(label_encoder.classes_):\n",
    "    tops = model.coef_[i].argsort()[-10:][::-1]\n",
    "    top_words[class_label] = [feature_names[j] for j in tops]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "8372df21",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:27:36.268429Z",
     "iopub.status.busy": "2024-06-08T05:27:36.268047Z",
     "iopub.status.idle": "2024-06-08T05:28:35.516736Z",
     "shell.execute_reply": "2024-06-08T05:28:35.515692Z"
    },
    "papermill": {
     "duration": 59.266701,
     "end_time": "2024-06-08T05:28:35.519498",
     "exception": false,
     "start_time": "2024-06-08T05:27:36.252797",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 合并所有情绪类别的top10词汇\n",
    "all_top_words = set(word for words in top_words.values() for word in words)\n",
    "\n",
    "split = int(len(df['text'])*0.8)\n",
    "\n",
    "# 使用这些词作为特征进行向量化\n",
    "vectorizer = CountVectorizer(vocabulary=all_top_words)\n",
    "X = vectorizer.fit_transform(df['text'][:split])\n",
    "features = vectorizer.get_feature_names_out()\n",
    "\n",
    "# 创建频率矩阵\n",
    "word_counts = np.zeros((len(label_encoder.classes_), len(features)))\n",
    "for i, text in enumerate(X):\n",
    "    emotion_index = y_train[i] if i < len(y_train) else y_test[i - len(y_train)]\n",
    "    word_counts[emotion_index] += text.toarray()[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "d20b693d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:28:35.547702Z",
     "iopub.status.busy": "2024-06-08T05:28:35.547307Z",
     "iopub.status.idle": "2024-06-08T05:28:35.595153Z",
     "shell.execute_reply": "2024-06-08T05:28:35.594241Z"
    },
    "papermill": {
     "duration": 0.064429,
     "end_time": "2024-06-08T05:28:35.597579",
     "exception": false,
     "start_time": "2024-06-08T05:28:35.533150",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 进行卡方检验\n",
    "chi2_results = []\n",
    "for i, word in enumerate(features):\n",
    "    # 构建一个2x2的表格来计算每个词的卡方值\n",
    "    contingency_table = np.array([word_counts[:, i], np.sum(word_counts, axis=1) - word_counts[:, i]]).T\n",
    "    chi2, p, dof, ex = chi2_contingency(contingency_table)\n",
    "    chi2_results.append((word, chi2, p))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "aaa08844",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:28:35.626231Z",
     "iopub.status.busy": "2024-06-08T05:28:35.625871Z",
     "iopub.status.idle": "2024-06-08T05:28:35.639429Z",
     "shell.execute_reply": "2024-06-08T05:28:35.638498Z"
    },
    "papermill": {
     "duration": 0.031212,
     "end_time": "2024-06-08T05:28:35.642109",
     "exception": false,
     "start_time": "2024-06-08T05:28:35.610897",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "           Word  Chi2 Statistic   p-value\n",
      "26        eager       25.034678  0.014659\n",
      "18         cool       24.842598  0.015587\n",
      "24       doomed       24.435655  0.017736\n",
      "11        brain       23.400643  0.024511\n",
      "43  invigorated       22.573347  0.031574\n",
      "..          ...             ...       ...\n",
      "9         blank        4.887556  0.961629\n",
      "61        piece        4.496386  0.972726\n",
      "80        sweet        4.249451  0.978503\n",
      "35        gives        3.867413  0.985712\n",
      "1   adventurous        2.220789  0.998985\n",
      "\n",
      "[102 rows x 3 columns]\n"
     ]
    }
   ],
   "source": [
    "# 输出结果\n",
    "chi2_df = pd.DataFrame(chi2_results, columns=['Word', 'Chi2 Statistic', 'p-value'])\n",
    "print(chi2_df.sort_values(by='p-value'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "f4b13938",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:28:35.671275Z",
     "iopub.status.busy": "2024-06-08T05:28:35.670897Z",
     "iopub.status.idle": "2024-06-08T05:28:35.689349Z",
     "shell.execute_reply": "2024-06-08T05:28:35.688498Z"
    },
    "papermill": {
     "duration": 0.036,
     "end_time": "2024-06-08T05:28:35.691802",
     "exception": false,
     "start_time": "2024-06-08T05:28:35.655802",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 删除重复关键词并获取每个标签的top3关键词\n",
    "selected_words = set()\n",
    "top3_words_per_class = {}\n",
    "for class_label in label_encoder.classes_:\n",
    "    class_df = chi2_df[chi2_df['Word'].isin(top_words[class_label])]\n",
    "    class_df = class_df.sort_values(by='p-value')\n",
    "    top3_words = []\n",
    "    for word in class_df['Word']:\n",
    "        if word not in selected_words:\n",
    "            top3_words.append(word)\n",
    "            selected_words.add(word)\n",
    "        if len(top3_words) == 3:\n",
    "            break\n",
    "    top3_words_per_class[class_label] = top3_words\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "1569abf8",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:28:35.721410Z",
     "iopub.status.busy": "2024-06-08T05:28:35.721021Z",
     "iopub.status.idle": "2024-06-08T05:28:35.726801Z",
     "shell.execute_reply": "2024-06-08T05:28:35.725762Z"
    },
    "papermill": {
     "duration": 0.02376,
     "end_time": "2024-06-08T05:28:35.729446",
     "exception": false,
     "start_time": "2024-06-08T05:28:35.705686",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "anger: word, strong, anger\n",
      "boredom: work, tortured, read\n",
      "empty: want, try, trying\n",
      "enthusiasm: eager, loved, happy\n",
      "fun: doomed, brain, normal\n",
      "happiness: cool, content, contented\n",
      "hate: towards, hate, today\n",
      "love: love, toward, hated\n",
      "neutral: invigorated, valuable, shaky\n",
      "relief: comfort, uncomfortable, pain\n",
      "sadness: worry, excited, sadness\n",
      "surprise: unpleasant, joyful, shocked\n",
      "worry: worried, issues, questions\n"
     ]
    }
   ],
   "source": [
    "# 输出结果\n",
    "for class_label, words in top3_words_per_class.items():\n",
    "    print(f\"{class_label}: {', '.join(words)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "3713ea76",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-06-08T05:28:35.758343Z",
     "iopub.status.busy": "2024-06-08T05:28:35.757518Z",
     "iopub.status.idle": "2024-06-08T05:28:35.765106Z",
     "shell.execute_reply": "2024-06-08T05:28:35.764191Z"
    },
    "papermill": {
     "duration": 0.024201,
     "end_time": "2024-06-08T05:28:35.767261",
     "exception": false,
     "start_time": "2024-06-08T05:28:35.743060",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'anger': ['word', 'strong', 'anger'],\n",
       " 'boredom': ['work', 'tortured', 'read'],\n",
       " 'empty': ['want', 'try', 'trying'],\n",
       " 'enthusiasm': ['eager', 'loved', 'happy'],\n",
       " 'fun': ['doomed', 'brain', 'normal'],\n",
       " 'happiness': ['cool', 'content', 'contented'],\n",
       " 'hate': ['towards', 'hate', 'today'],\n",
       " 'love': ['love', 'toward', 'hated'],\n",
       " 'neutral': ['invigorated', 'valuable', 'shaky'],\n",
       " 'relief': ['comfort', 'uncomfortable', 'pain'],\n",
       " 'sadness': ['worry', 'excited', 'sadness'],\n",
       " 'surprise': ['unpleasant', 'joyful', 'shocked'],\n",
       " 'worry': ['worried', 'issues', 'questions']}"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "top3_words_per_class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98ed9e0d",
   "metadata": {
    "papermill": {
     "duration": 0.013978,
     "end_time": "2024-06-08T05:28:35.856734",
     "exception": false,
     "start_time": "2024-06-08T05:28:35.842756",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kaggle": {
   "accelerator": "none",
   "dataSources": [
    {
     "datasetId": 4540583,
     "sourceId": 7763359,
     "sourceType": "datasetVersion"
    }
   ],
   "dockerImageVersionId": 30732,
   "isGpuEnabled": false,
   "isInternetEnabled": true,
   "language": "python",
   "sourceType": "notebook"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  },
  "papermill": {
   "default_parameters": {},
   "duration": 336.058127,
   "end_time": "2024-06-08T05:28:36.592538",
   "environment_variables": {},
   "exception": null,
   "input_path": "__notebook__.ipynb",
   "output_path": "__notebook__.ipynb",
   "parameters": {},
   "start_time": "2024-06-08T05:23:00.534411",
   "version": "2.5.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}