# NOTE: the original paste included a "Spaces: Runtime error" status banner
# from the hosting page; it is preserved here as a comment so the file
# remains valid Python.
# Standard library
import re

# Third-party
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
class sentimentAnalysis():
    """Line-by-line sentiment analysis of a text file.

    Supports English (cardiffnlp twitter-roberta-base-sentiment, 3 labels)
    and Turkish (savasy/bert-base-turkish-sentiment-cased, binary).
    Results are printed to stdout; nothing is returned.
    """

    # Accepted spellings for each supported language (matching unchanged
    # from the original chained == comparisons).
    _ENGLISH = {"English", "İngilizce", "ingilizce", "english"}
    _TURKISH = {"Turkish", "Türkçe", "türkçe", "turkish"}

    def __init__(self, lang, text2analysePath):
        """
        :param lang: language name, e.g. "English" or "türkçe".
        :param text2analysePath: path to a UTF-8 file, one text per line.
        """
        self.lang = lang
        self.text2analysePath = text2analysePath
        # Index order matches the English model's output logits.
        self.engLabels = ["negative", "neutral", "positive"]
        nltk.download("stopwords")

    def downloadModels(self):
        """Load the model matching ``self.lang`` and analyse the input file.

        Prints an error message if the language is not recognised.
        """
        # `with` guarantees the file handle is closed — the original
        # opened it and never closed it (resource leak).
        with open(self.text2analysePath, 'r', encoding="utf-8") as txtt:
            if self.lang in self._ENGLISH:
                MODEL = "sentimentModels/cardiffnlp/twitter-roberta-base-sentiment"
                self.tokenizer = AutoTokenizer.from_pretrained(MODEL)
                self.model = AutoModelForSequenceClassification.from_pretrained(MODEL)
                # Cache the weights locally for subsequent runs.
                self.model.save_pretrained(MODEL)
                self.tokenizer.save_pretrained(MODEL)
                self.engPrepareText(txtt)
            elif self.lang in self._TURKISH:
                self.model = AutoModelForSequenceClassification.from_pretrained(
                    "savasy/bert-base-turkish-sentiment-cased")
                self.tokenizer = AutoTokenizer.from_pretrained(
                    "savasy/bert-base-turkish-sentiment-cased")
                self.sa = pipeline("sentiment-analysis",
                                   tokenizer=self.tokenizer, model=self.model)
                self.trPrepareText(txtt)
            else:
                print("Dil bulunamadı!------The language has not been found!")

    def _cleanLines(self, txtt, stopLang):
        """Lowercase each line, blank out non-alphanumerics, drop stopwords.

        :param txtt: open text-file handle.
        :param stopLang: NLTK stopword corpus name ("english"/"turkish").
        :return: list of cleaned strings, one per input line.
        """
        # Build the stopword set ONCE — the original rebuilt
        # set(stopwords.words(...)) for every single word.
        stops = set(stopwords.words(stopLang))
        cleaned = []
        for line in txtt.readlines():
            line = re.sub("[^a-zA-Z0-9ğüşöçıİĞÜŞÖÇ]", ' ', line.lower())
            cleaned.append(' '.join(w for w in line.split(' ') if w not in stops))
        return cleaned

    def engPrepareText(self, txtt):
        """Clean the English input and run the analysis."""
        dFen = pd.DataFrame(self._cleanLines(txtt, "english"), columns=["texts"])
        self.engAnalyse(dFen)

    def trPrepareText(self, txtt):
        """Clean the Turkish input and run the analysis."""
        dFtr = pd.DataFrame(self._cleanLines(txtt, "turkish"), columns=["metinler"])
        self.trAnalyse(dFtr)

    def engAnalyse(self, dFen):
        """Print softmax-ranked label scores for every row of ``dFen``."""
        for row in range(len(dFen)):
            text = dFen["texts"][row]
            encoded_input = self.tokenizer(text, return_tensors='pt')
            output = self.model(**encoded_input)
            scores = softmax(output[0][0].detach().numpy())
            # Label indices ordered best-to-worst; the original shadowed
            # the outer loop variable `i` here.
            ranking = np.argsort(scores)[::-1]
            print(f"text: {text}")
            for pos in range(scores.shape[0]):
                l = self.engLabels[ranking[pos]]
                s = scores[ranking[pos]]
                print(f"{pos + 1}) {l + ':'} {np.round(float(s), 4)}")

    def trAnalyse(self, dFtr):
        """Print positive/negative scores for every row of ``dFtr``.

        The binary pipeline reports one label with a score; the opposite
        label is reported as 1 - score.
        """
        for row in range(len(dFtr)):
            text = dFtr["metinler"][row]
            p = self.sa(text)[0]
            print(f"text: {text}")
            if p["label"] == "positive":
                print(f"1-) positive: {np.round(float(p['score']), 4)}")
                print(f"2-) negative: {np.round(float(1 - p['score']), 4)}")
            else:
                print(f"1-) positive: {np.round(float(1 - p['score']), 4)}")
                print(f"2-) negative: {np.round(float(p['score']), 4)}")
if __name__ == "__main__":
    # Guard so importing this module does not trigger downloads/analysis.
    lang = "ingilizce"
    path = "texts/denemeler/text.txt"
    # The original bound sA to downloadModels()'s return value, which is
    # None — keep the instance instead so it remains usable afterwards.
    sA = sentimentAnalysis(lang, path)
    sA.downloadModels()