# text_preprocessor.py
import re
import string

import pandas as pd
import pymorphy3
from sklearn.base import BaseEstimator, TransformerMixin


class MyCustomTextPreprocessor(BaseEstimator, TransformerMixin):
    """Cleans, removes stopwords from and (optionally) lemmatizes Russian texts as a scikit-learn transformer."""

    def __init__(self):
        self.stop_words = self.get_stopwords_list()
        self.morph = pymorphy3.MorphAnalyzer()

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, texts, y=None, lemmatize=True):
        # The extra `lemmatize` flag lets callers skip lemmatization when invoking transform() directly.
        return [self.preprocess(text, lemmatize=lemmatize) for text in texts]
    def get_stopwords_list(self):
        # Download the stopwords-iso Russian stopword list and return it as a set.
        url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt"
        stopwords_cust = set(
            pd.read_csv(url, header=None, names=["stopwords"], encoding="utf-8")["stopwords"]
        )
        return stopwords_cust
    def clean(self, text):
        text = text.lower()
        text = re.sub(r'http\S+', ' ', text)                # URLs
        text = re.sub(r'<.*?>', ' ', text)                  # HTML tags, while the angle brackets are still present
        text = re.sub(r'@\w+', ' ', text)                   # @mentions
        text = re.sub(r'#\w+', ' ', text)                   # #hashtags
        text = re.sub(r'\d+', ' ', text)                    # digits
        text = re.sub(r'[^\w\s,]', '', text)                # symbols other than word chars, whitespace and commas
        text = text.translate(str.maketrans('', '', string.punctuation))  # remaining ASCII punctuation, incl. commas
        text = re.sub(r'[\u00A0\u2000-\u206F]', ' ', text)  # non-breaking and other Unicode space/punctuation chars
        text = re.sub(r'[a-zA-Z]', '', text)                # Latin letters
        text = re.sub(r'\s+', ' ', text).strip()            # collapse whitespace
        return text
    def remove_stopwords(self, text):
        return ' '.join(word for word in text.split() if word not in self.stop_words)

    def lemmatize(self, text):
        # Reduce every token to its normal form with pymorphy3.
        return ' '.join(self.morph.parse(word)[0].normal_form for word in text.split())
    def preprocess(self, text, lemmatize=True):
        """Full text-processing routine, with the option to disable lemmatization."""
        text = self.clean(text)
        text = self.remove_stopwords(text)
        if lemmatize:
            text = self.lemmatize(text)
        return text
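

# Minimal usage sketch (illustrative, not part of the module): the sample tweet and the
# TfidfVectorizer pairing are assumptions, shown only to demonstrate how the transformer
# plugs into a scikit-learn Pipeline.
if __name__ == "__main__":
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline

    preprocessor = MyCustomTextPreprocessor()
    sample = ["Проверка препроцессора: ссылка https://example.com, @упоминание, #тег и 2024!"]
    print(preprocessor.transform(sample))  # cleaned, stopword-free, lemmatized text

    # Because the class follows the BaseEstimator/TransformerMixin contract, it can sit
    # in front of a vectorizer inside a Pipeline (hypothetical downstream step).
    vectorizer_pipeline = Pipeline([
        ("preprocess", MyCustomTextPreprocessor()),
        ("tfidf", TfidfVectorizer()),
    ])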