# text_preprocessor.py
"""Text preprocessing for Russian: cleaning, stopword removal, lemmatization."""

import re
import string

import pandas as pd
import pymorphy3
from sklearn.base import BaseEstimator, TransformerMixin


class MyCustomTextPreprocessor(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer that cleans, removes stopwords from,
    and (optionally) lemmatizes Russian text.

    Note: construction fetches the stopword list over the network and builds
    a pymorphy3 ``MorphAnalyzer``, so instantiation is comparatively slow.
    """

    def __init__(self):
        # Stopword set is downloaded once per instance (network access required).
        self.stop_words = self.get_stopwords_list()
        self.morph = pymorphy3.MorphAnalyzer()

    def fit(self, X, y=None):
        """Stateless transformer: nothing to fit, returns self."""
        return self

    def transform(self, texts, y=None, lemmatize=True):
        """Preprocess every text in *texts*.

        Pass ``lemmatize=False`` to skip the (slow) lemmatization step.
        Returns a list of processed strings.
        """
        return [self.preprocess(text, lemmatize=lemmatize) for text in texts]

    def get_stopwords_list(self):
        """Download the stopwords-iso Russian stopword list and return it as a set."""
        url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt"
        frame = pd.read_csv(url, header=None, names=["stopwords"], encoding="utf-8")
        return set(frame["stopwords"])

    def clean(self, text):
        """Lowercase *text* and strip URLs, mentions, hashtags, HTML tags,
        digits, punctuation, Latin letters and redundant whitespace."""
        text = text.lower()
        # BUGFIX: strip HTML tags BEFORE punctuation removal. In the original
        # order, '<' and '>' were already deleted by the punctuation passes, so
        # the tag regex never matched and tag names leaked into the output.
        text = re.sub(r'<.*?>', ' ', text)
        text = re.sub(r'http\S+', " ", text)   # URLs
        text = re.sub(r'@\w+', ' ', text)      # @mentions
        text = re.sub(r'#\w+', ' ', text)      # #hashtags
        text = re.sub(r'\d+', ' ', text)       # digits
        text = re.sub(r'[^\w\s,]', '', text)   # punctuation except commas...
        # ...then commas (and any remaining ASCII punctuation) via translate.
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Non-breaking space and typographic/general-punctuation range.
        text = re.sub(r'[\u00A0\u2000-\u206F]', ' ', text)
        text = re.sub(r'[a-zA-Z]', '', text)   # drop Latin letters, keep Cyrillic
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def remove_stopwords(self, text):
        """Drop whitespace-separated tokens found in the stopword set."""
        return ' '.join(word for word in text.split() if word not in self.stop_words)

    def lemmatize(self, text):
        """Replace every token with its pymorphy3 normal form (first parse)."""
        # join() avoids the quadratic '+=' concatenation of the original and
        # fixes the trailing space it left on every result.
        return ' '.join(self.morph.parse(word)[0].normal_form for word in text.split())

    def preprocess(self, text, lemmatize=True):
        """Full pipeline: clean -> remove stopwords -> (optionally) lemmatize."""
        text = self.clean(text)
        text = self.remove_stopwords(text)
        if lemmatize:
            text = self.lemmatize(text)
        return text