Spaces:

DanilO0o
/

nlp_group_project

Sleeping

App Files Files Community

DanilO0o committed on Nov 29, 2024

Commit

edcd390

1 Parent(s): e1607f1

added new model

Browse files

Files changed (10) hide show

rugpt.py → app.py +0 -0
models/Sasha_best_lstm_model3.pth +3 -0
models/Sasha_best_model_bert.pth +3 -0
models/Sasha_logistic_model2.pkl +3 -0
models/bert_classifier.py +27 -0
models/lstm_attention.py +29 -0
models/text_preprocessor.py +55 -0
models/vectorizer.pkl +3 -0
pages/sasha_main_page_final.py +205 -0
requirements.txt +66 -0

rugpt.py → app.py RENAMED Viewed

File without changes

models/Sasha_best_lstm_model3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ebd4ef0cd62eb779c9f9b0dcf90bc23d63ec46884c035884a44258a5763ec1c6
+size 46108066

models/Sasha_best_model_bert.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:39d23a0fa07df356ffb6207961cab88d4efaab0b9d2a3fad4ad620f3f89a73bd
+size 117128865

models/Sasha_logistic_model2.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2467bdf90ee233cd38e6b3336fc5746dc6fd848d0ca1a53b002da85485366ac1
+size 401015

models/bert_classifier.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import torch
+from torch import nn
+from transformers import AutoModel
+class MyTinyBERT(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.bert = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
+        for param in self.bert.parameters():
+            param.requires_grad = True
+        # Разморозка последних слоёв
+        for name, param in self.bert.named_parameters():
+            if any(layer in name for layer in ['layer.7', 'layer.8', 'layer.9', 'layer.10', 'layer.11']):
+                param.requires_grad = True
+        self.linear = nn.Sequential(
+            nn.Linear(312, 256),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(256, 10))  # Для 10 классов
+    def forward(self, input_dict):
+        # Ожидается словарь с ключами "input_ids" и "attention_mask"
+        bert_out = self.bert(**input_dict)
+        # Используем скрытое состояние для [CLS] токена
+        normed_bert_out = nn.functional.normalize(bert_out.last_hidden_state[:, 0, :])
+        return self.linear(normed_bert_out)

models/lstm_attention.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import torch
+from torch import nn
+class LSTMAttention(nn.Module):
+    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
+        super(LSTMAttention, self).__init__()
+        self.embedding = nn.Embedding(vocab_size, embedding_dim)
+        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
+        self.attention = nn.Linear(hidden_dim * 2, 1)
+        self.fc = nn.Linear(hidden_dim * 2, output_dim)
+        self.dropout = nn.Dropout(0.5)
+    def forward(self, input_ids):
+        # Embedding слой
+        embedded = self.embedding(input_ids)  # (batch_size, seq_len, embedding_dim)
+        # LSTM слой
+        lstm_out, _ = self.lstm(embedded)  # (batch_size, seq_len, hidden_dim*2)
+        # Механизм внимания
+        attn_weights = torch.softmax(self.attention(lstm_out), dim=1)  # (batch_size, seq_len, 1)
+        # Вектор контекста
+        context_vector = torch.sum(attn_weights * lstm_out, dim=1)  # (batch_size, hidden_dim*2)
+        # Классификатор
+        output = self.fc(self.dropout(context_vector))  # (batch_size, output_dim)
+        return output, attn_weights.squeeze(-1)

models/text_preprocessor.py ADDED Viewed

	@@ -0,0 +1,55 @@

+#text_preprocessor.py
+import pandas as pd
+import re
+import string
+import pymorphy3
+from sklearn.base import BaseEstimator, TransformerMixin
+class MyCustomTextPreprocessor(BaseEstimator, TransformerMixin):
+    def __init__(self):
+        self.stop_words = self.get_stopwords_list()
+        self.morph = pymorphy3.MorphAnalyzer()
+    def fit(self, X, y=None):
+        return self
+    def transform(self, texts, y=None, lemmatize=True):
+        return [self.preprocess(text, lemmatize=lemmatize) for text in texts]
+    def get_stopwords_list(self):
+        url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt"
+        stopwords_cust = set(pd.read_csv(url, header=None, names=["stopwords"], encoding="utf-8")['stopwords'])
+        return stopwords_cust
+    def clean(self, text):
+        text = text.lower()
+        text = re.sub(r'http\S+', " ", text)
+        text = re.sub(r'@\w+', ' ', text)
+        text = re.sub(r'#\w+', ' ', text)
+        text = re.sub(r'\d+', ' ', text)
+        text = re.sub(r'[^\w\s,]', '', text)
+        text = text.translate(str.maketrans('', '', string.punctuation))
+        text = re.sub(r'<.*?>', ' ', text)
+        text = re.sub(r'[\u00A0\u2000-\u206F]', ' ', text)
+        text = re.sub(r'[a-zA-Z]', '', text)
+        text = re.sub(r'\s+', ' ', text).strip()
+        return text
+    def remove_stopwords(self, text):
+        return ' '.join([word for word in text.split() if word not in self.stop_words])
+    def lemmatize(self, text):
+        morph = self.morph
+        lemmatized_text = ''
+        for word in text.split():
+            lemmatized_text += morph.parse(word)[0].normal_form + " "
+        return lemmatized_text
+    def preprocess(self, text, lemmatize=True):
+        """Общая функция обработки текста с возможностью отключить лемматизацию"""
+        text = self.clean(text)
+        text = self.remove_stopwords(text)
+        if lemmatize:
+            text = self.lemmatize(text)
+        return text

models/vectorizer.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ad82a7fee20e9a6f702eb77245b35260e61b5cf4931616a3872da2ce3cb352ff
+size 238499

pages/sasha_main_page_final.py ADDED Viewed

	@@ -0,0 +1,205 @@

+from models.bert_classifier import MyTinyBERT
+from models.lstm_attention import LSTMAttention
+from models.text_preprocessor import MyCustomTextPreprocessor
+import streamlit as st
+from sklearn.utils.class_weight import compute_class_weight
+import torch.nn.functional as F
+import torch.optim as optim
+import joblib
+from torch import nn
+from sklearn.base import BaseEstimator, TransformerMixin
+from transformers import AutoTokenizer, AutoModel
+from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from torch.utils.data import DataLoader, TensorDataset
+from time import time
+from sklearn.feature_extraction.text import TfidfVectorizer
+import pymorphy3
+import string
+import re
+import pandas as pd
+import numpy as np
+import torch
+import sklearn
+import matplotlib.pyplot as plt
+import warnings
+warnings.simplefilter("ignore")
+# Metrics
+# custom
+# ======= Глобальная инициализация токенизатора =======
+tokenizer = AutoTokenizer.from_pretrained(
+    "cointegrated/rubert-tiny2")  # Для LSTM и BERT
+# ======= Инициализация обработчика текста =======
+preprocessor = MyCustomTextPreprocessor()
+# ======= Загрузка моделей и векторизатора =======
+# @st.cache_resource
+def load_resources():
+    # Загрузка TF-IDF векторизатора
+    vectorizer = joblib.load('models/vectorizer.pkl')  # TF-IDF
+    # Загрузка модели логистической регрессии
+    # Логистическая регрессия
+    model1 = joblib.load('models/Sasha_logistic_model2.pkl')
+    # Настройка модели LSTM
+    # Используем уже загруженный токенизатор
+    VOCAB_SIZE = len(tokenizer.get_vocab())
+    EMBEDDING_DIM = 128
+    HIDDEN_DIM = 256
+    OUTPUT_DIM = 10
+    model2 = LSTMAttention(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
+    model2.load_state_dict(torch.load(
+        'models/Sasha_best_lstm_model3.pth', map_location=torch.device('cpu')))
+    model2.eval()
+    # Настройка модели BERT
+    model3 = MyTinyBERT()
+    model3.load_state_dict(torch.load(
+        'models/Sasha_best_model_bert.pth', map_location=torch.device('cpu')))
+    model3.eval()
+    return model1, model2, model3, vectorizer
+# Загружаем ресурсы
+model1, model2, model3, vectorizer = load_resources()
+# ======= Предобработка текста =======
+def preprocess_for_model1(text):
+    """TF-IDF векторизация для логистической регрессии"""
+    processed_text = preprocessor.preprocess(
+        text, lemmatize=True)  # Лемматизация включена
+    return vectorizer.transform([processed_text])
+def preprocess_for_model2_and_model3(text):
+    """Общая обработка для LSTM и BERT моделей (без лемматизации)"""
+    processed_text = preprocessor.preprocess(
+        text, lemmatize=False)  # Лемматизация выключена
+    return processed_text
+def preprocess_for_model2(text, tokenizer):
+    """Токенизация для LSTM модели"""
+    processed_text = preprocess_for_model2_and_model3(text)
+    tokenized_data = tokenizer(
+        [processed_text],
+        padding=True,
+        truncation=True,
+        return_tensors="pt",
+        max_length=256
+    )
+    return tokenized_data["input_ids"], tokenized_data["attention_mask"]
+def preprocess_for_model3(text, tokenizer):
+    """Токенизация для BERT модели"""
+    processed_text = preprocess_for_model2_and_model3(text)
+    tokenized_data = tokenizer(
+        [processed_text],
+        padding=True,
+        truncation=True,
+        return_tensors="pt",
+        max_length=256
+    )
+    return tokenized_data
+# ======= Прогноз и визуализация =======
+def predict_and_visualize(text):
+    # ======= Модель 1 (Logistic Regression) =======
+    start_time = time()  # Начало времени предсказания
+    vectorized_text = preprocess_for_model1(text)
+    probs1 = model1.predict_proba(vectorized_text)[0]
+    model1_time = time() - start_time  # Рассчитываем время предсказания для модели 1
+    # ======= Модель 2 (LSTM & Attention) =======
+    start_time = time()  # Начало времени предсказания
+    input_ids, _ = preprocess_for_model2(
+        text, tokenizer)  # Получаем только input_ids
+    with torch.no_grad():
+        logits2, attn_weights = model2(input_ids)  # Передаём только input_ids
+        probs2 = torch.softmax(logits2, dim=1).numpy()[0]
+        attention_vector = attn_weights.cpu().numpy()[0]
+    model2_time = time() - start_time  # Рассчитываем время предсказ��ния для модели 2
+    # ======= Модель 3 (BERT) =======
+    start_time = time()  # Начало времени предсказания
+    tokenized_text = preprocess_for_model3(text, tokenizer)
+    with torch.no_grad():
+        logits3 = model3(tokenized_text)
+        probs3 = torch.softmax(logits3, dim=1).numpy()[0]
+    model3_time = time() - start_time  # Рассчитываем время предсказания для модели 3
+    # ======= Финальное предсказание =======
+    final_probs = (probs1 + probs2 + probs3) / 3
+    final_class = np.argmax(final_probs)
+    # ======= Визуализация =======
+    st.subheader("Распределение вероятностей")
+    for probs, model_name in zip([probs1, probs2, probs3], ['Model 1 (Logistic Regression)', 'Model 2 (LSTM)', 'Model 3 (BERT)']):
+        fig, ax = plt.subplots()
+        ax.bar(range(1, len(probs) + 1), probs)  # Сдвиг индекса на +1
+        ax.set_title(f'{model_name} Probabilities')
+        ax.set_xlabel('Class (1-10)')
+        ax.set_ylabel('Probability')
+        st.pyplot(fig)
+    # ======= Визуализация внимания (LSTM) =======
+    st.subheader("Веса внимания (LSTM)")
+    # Проверяем наличие attention weights
+    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
+    tokens = tokens[:len(attention_vector)]
+    attention_vector = attention_vector[:len(tokens)]
+    fig, ax = plt.subplots(figsize=(12, 6))
+    ax.bar(range(len(tokens)), attention_vector, align="center")
+    ax.set_xticks(range(len(tokens)))
+    ax.set_xticklabels(tokens, rotation=45, ha="right")
+    ax.set_title("Attention Weights (LSTM)")
+    ax.set_xlabel("Токены")
+    ax.set_ylabel("Вес внимания")
+    st.pyplot(fig)
+    # Итоговое предсказание
+    st.subheader("Итоговое предсказание")
+    # Смещение на +1
+    st.write(f"Наиболее вероятный класс: **{final_class + 1}**")
+    # Вывод времени выполнения
+    st.subheader("Время выполнения моделей")
+    st.write(f"Модель 1 (Logistic Regression): {model1_time:.4f} секунд")
+    st.write(f"Модель 2 (LSTM): {model2_time:.4f} секунд")
+    st.write(f"Модель 3 (BERT): {model3_time:.4f} секунд")
+    return final_class
+# ======= Streamlit UI =======
+st.title("Классификация текстов с 3 моделями")
+st.write("Введите текст отзыва, чтобы получить результаты классификации от трёх моделей.")
+# Ввод текста пользователем
+user_input = st.text_area("Введите текст отзыва:", "")
+if st.button("Классифицировать"):
+    if user_input.strip():
+        # Прогноз и визуализация
+        predict_and_visualize(user_input)
+    else:
+        st.warning("Введите текст для анализа.")
+st.subheader("F1 macro,  валидационная выборка")
+st.write(f'f1 macro valid logreg=0.2516')
+st.write(f'f1 macro valid lstm=0.2515')
+st.write(f'f1 macro valid bert=0.2709')

requirements.txt CHANGED Viewed

@@ -1,3 +1,69 @@
 streamlit
 torch
 transformers

 streamlit
 torch
 transformers
+altair==5.5.0
+attrs==24.2.0
+blinker==1.9.0
+cachetools==5.5.0
+certifi==2024.8.30
+charset-normalizer==3.4.0
+click==8.1.7
+contourpy==1.3.1
+cycler==0.12.1
+DAWG-Python==0.7.2
+filelock
+fonttools==4.55.0
+fsspec
+gitdb==4.0.11
+GitPython==3.1.43
+gmpy2
+huggingface-hub==0.26.3
+idna==3.10
+Jinja2
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+kiwisolver==1.4.7
+markdown-it-py==3.0.0
+MarkupSafe
+matplotlib==3.9.2
+mdurl==0.1.2
+mpmath
+narwhals==1.14.3
+networkx
+numpy
+packaging==24.2
+pandas==2.2.3
+pillow==11.0.0
+protobuf==5.29.0
+pyarrow==18.1.0
+pydeck==0.9.1
+Pygments==2.18.0
+pymorphy3==2.0.2
+pymorphy3-dicts-ru==2.4.417150.4580142
+pyparsing==3.2.0
+python-dateutil==2.9.0.post0
+pytz==2024.2
+PyYAML==6.0.2
+referencing==0.35.1
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.21.0
+safetensors==0.4.5
+scikit-learn==1.5.2
+scipy==1.14.1
+sentencepiece==0.2.0
+six==1.16.0
+smmap==5.0.1
+sympy
+tenacity==9.0.0
+threadpoolctl==3.5.0
+tokenizers==0.20.3
+toml==0.10.2
+tornado==6.4.2
+tqdm==4.67.1
+typing_extensions
+tzdata==2024.2
+urllib3==2.2.3
+watchdog==6.0.0