# text_preprocessor.py
import re
import string

import pandas as pd
import pymorphy3
from sklearn.base import BaseEstimator, TransformerMixin


class MyCustomTextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible preprocessor for Russian text:
    cleaning, stop-word removal and optional lemmatization."""

    def __init__(self):
        self.stop_words = self.get_stopwords_list()
        self.morph = pymorphy3.MorphAnalyzer()

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, texts, y=None, lemmatize=True):
        return [self.preprocess(text, lemmatize=lemmatize) for text in texts]

    def get_stopwords_list(self):
        # Load the Russian stop-word list from the stopwords-iso project.
        url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt"
        stopwords_cust = set(
            pd.read_csv(url, header=None, names=["stopwords"], encoding="utf-8")["stopwords"]
        )
        return stopwords_cust

    def clean(self, text):
        text = text.lower()
        text = re.sub(r'<.*?>', ' ', text)    # HTML tags (removed first, before angle brackets are stripped by later rules)
        text = re.sub(r'http\S+', ' ', text)  # URLs
        text = re.sub(r'@\w+', ' ', text)     # @mentions
        text = re.sub(r'#\w+', ' ', text)     # hashtags
        text = re.sub(r'\d+', ' ', text)      # digits
        text = re.sub(r'[^\w\s,]', '', text)  # everything except word characters, whitespace and commas
        text = text.translate(str.maketrans('', '', string.punctuation))  # remaining ASCII punctuation
        text = re.sub(r'[\u00A0\u2000-\u206F]', ' ', text)  # non-breaking space and other Unicode space/format characters
        text = re.sub(r'[a-zA-Z]', '', text)  # Latin letters (keep Cyrillic tokens only)
        text = re.sub(r'\s+', ' ', text).strip()  # collapse whitespace
        return text

    def remove_stopwords(self, text):
        return ' '.join(word for word in text.split() if word not in self.stop_words)

    def lemmatize(self, text):
        # Reduce each token to its normal form with pymorphy3.
        return ' '.join(self.morph.parse(word)[0].normal_form for word in text.split())

    def preprocess(self, text, lemmatize=True):
        """Run the full text-processing chain, with an option to disable lemmatization."""
        text = self.clean(text)
        text = self.remove_stopwords(text)
        if lemmatize:
            text = self.lemmatize(text)
        return text
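

# --- Usage sketch (not part of the original module) ---
# A minimal example of plugging the preprocessor into a scikit-learn Pipeline.
# The TfidfVectorizer step and the sample sentence below are illustrative
# assumptions, not taken from the original code.
if __name__ == "__main__":
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline

    pipeline = Pipeline([
        ("preprocess", MyCustomTextPreprocessor()),
        ("tfidf", TfidfVectorizer()),
    ])

    sample = ["Привет, мир! Это пример текста с https://example.com и @упоминанием."]
    features = pipeline.fit_transform(sample)
    print(features.shape)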