import re
import json
import string

import emoji
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# word_tokenize and WordNetLemmatizer need the NLTK 'punkt' and 'wordnet' data packages.

# Lookup tables used by preprocess_text
with open(r'chat_words.json', 'r') as f:
    chat_words = json.load(f)

with open(r'number_meanings.json', 'r') as f:
    number_meanings = json.load(f)

lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Clean and normalize a raw text string for downstream NLP."""
    # Lowercase and collapse repeated whitespace
    text = text.lower()
    text = ' '.join(text.split())

    # Strip HTML tags and URLs
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Expand chat abbreviations via the chat_words lookup (keys are uppercase)
    new_text = []
    for word in text.split():
        if word.upper() in chat_words:
            new_text.append(chat_words[word.upper()])
        else:
            new_text.append(word)
    text = " ".join(new_text)

    # Convert emojis to their textual names, with no surrounding delimiters
    text = emoji.demojize(text, delimiters=("", ""))

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize, then lemmatize each token as a verb
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word, pos='v') for word in tokens]

    text = ' '.join(tokens)
    return text
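

# Minimal usage sketch (illustrative only): assumes chat_words.json and
# number_meanings.json exist in the working directory, and that the emoji
# and nltk packages plus the NLTK data mentioned above are installed.
if __name__ == '__main__':
    sample = "OMG check <b>this</b> out: https://example.com 😊!!!"
    print(preprocess_text(sample))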