Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import re
|
3 |
+
import pandas as pd
|
4 |
+
import pymorphy2
|
5 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
+
import language_tool_python
|
7 |
+
import torch
|
8 |
+
from transformers import AutoTokenizer, AutoModel
|
9 |
+
import nltk
|
10 |
+
from nltk.corpus import stopwords
|
11 |
+
from tqdm import tqdm
|
12 |
+
import numpy as np
|
13 |
+
|
14 |
+
|
15 |
+
# Download Russian stop words (runs at import time; no-op if already cached).
nltk.download('stopwords')
stop_words = set(stopwords.words('russian'))

# Morphological analyzer and grammar checker for Russian.
# NOTE(review): `morph` and `tool` are initialized here but never referenced in
# this file's visible code — presumably used elsewhere or leftovers; confirm.
morph = pymorphy2.MorphAnalyzer()
tool = language_tool_python.LanguageTool('ru')

# Russian sentence-embedding BERT model (SBERT large), loaded eagerly at import time.
bert_model_name = "sberbank-ai/sbert_large_nlu_ru"
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
model = AutoModel.from_pretrained(bert_model_name)
|
27 |
+
|
28 |
+
def preprocess_text(text):
    """Normalize a job-title string before embedding.

    Lower-cases the input, strips punctuation/Latin letters/digits,
    translates it to Russian via `generate` and spell-corrects it via
    `correct_text`.

    Args:
        text: Raw profession name. Non-string input yields "".

    Returns:
        The cleaned, translated and spell-corrected string.
    """
    if not isinstance(text, str):
        return ""

    # Basic cleanup: case-fold, drop punctuation (including hyphens),
    # Latin letters and digits.  The original also ran re.sub(r'-', ...)
    # afterwards, but '-' is neither \w nor \s, so the first sub already
    # removes it — that statement was dead code and is gone.
    text = text.lower().strip()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'[a-zA-Z0-9]', ' ', text)
    # Collapse the space runs the substitutions introduce, so the
    # translation model sees normal word spacing.
    text = re.sub(r'\s+', ' ', text).strip()

    # Kazakh -> Russian translation, then spell correction.
    text = generate(text)
    text = correct_text(text)
    return text
|
41 |
+
|
42 |
+
def get_embedding(text):
    """Embed *text* with the SBERT model by mean-pooling the last hidden layer."""
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
    # Average over the token axis to get one vector per input.
    return hidden.mean(dim=1).squeeze()
|
50 |
+
def find_best_matches(profession, nkz_list, embeddings_nkz, top_n=10):
    """Return the top-N NKZ occupations most similar to *profession*.

    Args:
        profession: Raw profession name (preprocessed internally).
        nkz_list: DataFrame with NAME_RU / NAME_KZ / CODE columns,
            row-aligned with *embeddings_nkz*.
        embeddings_nkz: 2-D array of precomputed NKZ embeddings.
        top_n: Number of matches to return.

    Returns:
        DataFrame with columns profession, nkz_ru, nkz_kz, nkz_code and
        similarity, ordered by descending similarity.
    """
    # preprocess_text already strips punctuation (hyphens included), so the
    # original's extra re.sub(r'-', ...) and debug print were removed.
    processed = preprocess_text(profession)

    # Embed the query and score it against every NKZ embedding in one shot.
    query_vec = get_embedding(processed).numpy().reshape(1, -1)
    scores = cosine_similarity(query_vec, embeddings_nkz)[0]

    # Indices of the top-N scores, best first.
    top_idx = scores.argsort()[-top_n:][::-1]

    rows = [
        {
            'profession': profession,
            'nkz_ru': nkz_list.iloc[idx]['NAME_RU'],
            'nkz_kz': nkz_list.iloc[idx]['NAME_KZ'],
            'nkz_code': nkz_list.iloc[idx]['CODE'],
            'similarity': scores[idx],
        }
        for idx in top_idx
    ]
    return pd.DataFrame(rows)
|
76 |
+
|
77 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
78 |
+
def generate(text, **kwargs):
    """Run the seq2seq translation model on *text* and decode the best beam.

    Presumably Kazakh -> Russian (the model id is 'kazRush-kk-ru') — confirm.
    Extra **kwargs are forwarded to `model.generate`.
    """
    tokens = tr_tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        output_ids = tr_model.generate(**tokens, num_beams=5, **kwargs)
    return tr_tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
83 |
+
# Translation model + tokenizer, loaded eagerly at import time.
# NOTE: defined *after* `generate`, which is fine — the globals are only
# resolved when `generate` is actually called.
tr_model = AutoModelForSeq2SeqLM.from_pretrained('deepvk/kazRush-kk-ru')
tr_tokenizer = AutoTokenizer.from_pretrained('deepvk/kazRush-kk-ru')
|
85 |
+
|
86 |
+
|
87 |
+
import hunspell
|
88 |
+
|
89 |
+
def correct_text(text: str, dict_path: str = 'ru_RU.dic', aff_path: str = 'ru_RU.aff') -> str:
    """Spell-correct *text* word by word with hunspell.

    Each misspelled word is replaced by hunspell's first suggestion; words
    with no suggestions, and correctly spelled words, are kept unchanged.

    Args:
        text: Input text; words are taken to be whitespace-separated.
        dict_path: Path to the hunspell .dic file.
        aff_path: Path to the hunspell .aff file.

    Returns:
        The corrected words joined with single spaces.
    """
    # Cache HunSpell instances per dictionary pair: loading the dictionaries
    # is expensive, and the original rebuilt one on every call.
    spellers = getattr(correct_text, '_spellers', None)
    if spellers is None:
        spellers = correct_text._spellers = {}
    key = (dict_path, aff_path)
    h = spellers.get(key)
    if h is None:
        h = spellers[key] = hunspell.HunSpell(dict_path, aff_path)

    corrected_words = []
    for word in text.split():
        if h.spell(word):
            corrected_words.append(word)  # already correct
        else:
            suggestions = h.suggest(word)
            # Take the top suggestion, or keep the word if there is none.
            corrected_words.append(suggestions[0] if suggestions else word)

    return ' '.join(corrected_words)
|
112 |
+
|
113 |
+
# Script-level import: `st` was used below but never imported anywhere in the
# original file (NameError at runtime).  The duplicate `import numpy as np`
# statements (numpy is already imported at the top of the file) were removed.
import streamlit as st

# NKZ reference table; the EMB column holds stringified embedding vectors
# ("[0.1, 0.2, ...]") produced by a previous export step.
nkz_list = pd.read_csv('nkz_list_emb_df.csv')

# Parse each EMB string into a float array.  np.fromstring is deprecated for
# text parsing, so split the bracketed string explicitly instead.
nkz_list['EMB'] = nkz_list['EMB'].apply(
    lambda x: np.array(x[1:-1].split(','), dtype=float)
)

# Stack the per-row vectors into one (n_rows, dim) matrix for similarity search.
embeddings_nkz_df = np.vstack(nkz_list['EMB'])


text1 = st.text_input("Enter the occupation to compare:", "Оператор пульта управления")

if st.button("Find Similar Occupations"):
    try:
        # BUG FIX: the original passed the undefined name `nkz_df` here;
        # the DataFrame loaded above is `nkz_list`.
        results = find_best_matches(text1, nkz_list, embeddings_nkz_df, top_n=10)
        if results is not None:
            st.write("Similar Occupations:")
            st.dataframe(results)
        else:
            st.warning("No similar occupations found.")
    except Exception as e:
        # Surface any failure in the UI rather than crashing the app.
        st.error(f"An error occurred: {e}")
|