# Hate-Speech-Model / preprocessing.py
# Kaala741's picture
# Upload 7 files
# 8a86e97 verified
import re
import json
import string
import emoji
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Load the JSON lookup tables at import time. Both paths are relative, so they
# are resolved against the current working directory of the importing process.
# The encoding is pinned to UTF-8 (the standard JSON text encoding) so loading
# does not depend on the platform's locale default.
# chat_words is consulted by preprocess_text; number_meanings is loaded here
# but is not referenced anywhere in the visible part of this module.
with open('chat_words.json', 'r', encoding='utf-8') as f:
    chat_words = json.load(f)
with open('number_meanings.json', 'r', encoding='utf-8') as f:
    number_meanings = json.load(f)

# Single lemmatizer instance, created once at import and shared by all calls.
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    """Normalize a raw text string into a cleaned, lemmatized sentence.

    The steps run in this fixed order:

    1. Lowercase the text and collapse every run of whitespace to one space.
    2. Remove ``<...>`` spans (HTML tags), then ``http(s)://`` and ``www.``
       URLs.
    3. Replace each whitespace-delimited token whose uppercased form is a key
       of the module-level ``chat_words`` table with the stored value. The
       value is inserted exactly as stored (it is not lowercased again), and
       a token only matches as a whole, so attached punctuation prevents a
       match unless the key contains it.
    4. Convert emojis to their textual names with empty delimiters.
    5. Delete every character listed in ``string.punctuation``.
    6. Tokenize with NLTK's ``word_tokenize`` and lemmatize each token as a
       verb (``pos='v'``) using the module-level ``lemmatizer``.

    Args:
        text: The input string. It must provide ``str.lower``; no type check
            is performed here.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Steps 1: case-fold and squeeze whitespace in a single expression.
    cleaned = ' '.join(text.lower().split())

    # Step 2: strip markup first, then links, in that order.
    for pattern in (r'<.*?>', r'https?://\S+|www\.\S+'):
        cleaned = re.sub(pattern, '', cleaned)

    # Step 3: expand chat abbreviations; unknown tokens pass through as-is.
    expanded_words = []
    for token in cleaned.split():
        key = token.upper()
        expanded_words.append(chat_words[key] if key in chat_words else token)
    expanded = " ".join(expanded_words)

    # Step 4: emojis become plain words (no surrounding delimiter characters).
    described = emoji.demojize(expanded, delimiters=("", ""))

    # Step 5: drop punctuation via a deletion-only translation table.
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = described.translate(punctuation_table)

    # Step 6: tokenize, lemmatize as verbs, and stitch back into a sentence.
    return ' '.join(
        lemmatizer.lemmatize(token, pos='v') for token in word_tokenize(stripped)
    )