import streamlit as st |
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification |
from GoogleNews import GoogleNews |
import matplotlib.pyplot as plt |
import seaborn as sns |
import pandas as pd |
import plotly.express as px |
import os |
import datetime |
from dotenv import load_dotenv |
import requests |
# Read variables from a local .env file into the process environment;
# API_URL and API_TOKEN are looked up later through os.getenv.
load_dotenv()

# Page header for the first section of the app (news search + emotions).
st.title('NLP系统展示:sunglasses:')
st.subheader("新闻搜索和情感分类")
def get_news(query, number_of_pages=5):
    """Collect Google News results from the last day for a keyword.

    Args:
        query: Search keyword, typically typed by the user.
        number_of_pages: Number of result pages to request. Defaults to 5,
            the value that used to be hard-coded in the body.

    Returns:
        A list holding the entries returned by ``GoogleNews.page_at`` for
        pages ``1..number_of_pages``, concatenated in page order. An empty
        list is returned without any network access when ``query`` is
        ``None``, empty, or only whitespace.
    """
    if not query or not query.strip():
        # Nothing to search for: avoid pointless requests to Google.
        return []

    googlenews = GoogleNews(lang='en', region='US', period='1d')
    googlenews.search(query)
    print("Total Pages: ", googlenews.total_count())

    final_list = []
    for page in range(1, number_of_pages + 1):
        # extend() grows the list in place instead of re-copying it per page.
        final_list.extend(googlenews.page_at(page))
    return final_list
@st.cache_resource
def _load_news_classifier():
    """Build the go_emotions classifier once and reuse it across reruns."""
    return pipeline(
        task="text-classification",
        model="SamLowe/roberta-base-go_emotions",
        top_k=None,
    )


def _format_emotion(prediction):
    """Render one ``{'label', 'score'}`` prediction as ``"label (12.34%)"``."""
    percentage = round(prediction['score'] * 100, 2)
    return f"{prediction['label']} ({percentage}%)"


# --- News search: fetch headlines for a keyword and classify their emotion ---
query = st.text_input("输入关键字")
if st.button("搜索"):
    with st.spinner("正在加载模型 ..."):
        classifier = _load_news_classifier()
    with st.spinner("正在加载最新新闻 ..."):
        allnews = get_news(query)

    if not allnews:
        # No headlines (blank keyword or no hits): skip the tables and charts
        # rather than indexing into empty results.
        st.warning("未找到相关新闻，请更换关键字后重试。")
    else:
        with st.spinner("最新新闻已收到, 分析情绪中 ..."):
            titles = [news["title"] for news in allnews]
            # One call for all headlines; each element of the result is the
            # list of label/score dicts for the headline at the same position.
            predictions = classifier(titles)
            # Build the table in one go instead of appending row by row.
            rows = [
                [
                    news["title"],
                    news["date"],
                    _format_emotion(ranked[0]),
                    _format_emotion(ranked[1]),
                ]
                for news, ranked in zip(allnews, predictions)
            ]
            df = pd.DataFrame(rows, columns=["sentence", "date", "best", "second"])
        st.dataframe(df)

        # Count the top label per headline directly from the predictions,
        # so the counts do not depend on how the display string is formatted.
        emotion_counts = pd.Series(
            [ranked[0]['label'] for ranked in predictions]
        ).value_counts()

        st.subheader("情绪分析柱状图")
        fig, ax = plt.subplots()
        sns.barplot(x=emotion_counts.index, y=emotion_counts.values, ax=ax)
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        ax.set_ylabel('Frequency')
        ax.set_xlabel('Emotions')
        st.pyplot(fig)
        # pyplot keeps every figure alive until closed; close it so figures
        # do not accumulate over repeated searches.
        plt.close(fig)

        st.subheader("情绪分析饼状图")
        fig, ax = plt.subplots()
        ax.pie(
            emotion_counts.values,
            labels=emotion_counts.index,
            autopct='%1.1f%%',
            startangle=90,
        )
        ax.axis('equal')
        st.pyplot(fig)
        plt.close(fig)
st.subheader("论坛情感统计")


@st.cache_resource
def load_model_and_tokenizer(model_name="Helsinki-NLP/opus-mt-zh-en"):
    """Load a seq2seq translation model together with its tokenizer.

    The result is cached by Streamlit per ``model_name``, so each checkpoint
    is loaded only once.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
            Defaults to the zh->en model that used to be hard-coded.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return loaded_tokenizer, loaded_model


# Module-level handles used by display_translations.
tokenizer, model_zh_en = load_model_and_tokenizer()
@st.cache_data(show_spinner=False)
def _fetch_comments(time, max_comments):
    """Download the comments for one day; raise on any failure.

    Only successful downloads are cached: a raised exception leaves the
    cache untouched, so the next call retries the request.
    """
    base_url = os.getenv("API_URL")
    if not base_url:
        raise RuntimeError("API_URL is not set")
    token = os.getenv("API_TOKEN")
    headers = {"Authorization": f"token {token}"}
    # A timeout keeps an unresponsive API from hanging the app indefinitely.
    response = requests.get(
        base_url,
        params={"day": str(time)},
        headers=headers,
        timeout=30,
    )
    if response.status_code != 200:
        raise RuntimeError(str(response.status_code))
    return pd.DataFrame(response.json()).head(max_comments)


def get_comments(time, max_comments=99999):
    """Return the comments for the given day as a DataFrame.

    Args:
        time: Day to query; its string form is sent as the ``day`` query
            parameter.
        max_comments: Upper bound on the number of rows returned.

    Returns:
        A DataFrame with at most ``max_comments`` rows, or ``None`` when the
        request fails (missing ``API_URL``, network error, non-200 status, or
        a payload that cannot be turned into a DataFrame). On failure an
        error is shown in the app and the failure is not cached.
    """
    try:
        return _fetch_comments(time, max_comments)
    except (requests.RequestException, ValueError, RuntimeError) as exc:
        print(f"Failed to retrieve data: {exc}")
        st.error("Failed to load comments.")
        return None
@st.cache_data(show_spinner=False)
def batch_translate(batch_texts, _model, _tokenizer):
    """Translate a list of texts in a single generate() call.

    Args:
        batch_texts: Texts to translate; this is the argument Streamlit
            hashes for the cache key.
        _model: Seq2seq model exposing ``generate``. The leading underscore
            excludes it from Streamlit's argument hashing.
        _tokenizer: Tokenizer matching ``_model``; also excluded from hashing.

    Returns:
        A list with one decoded translation per generated sequence.
    """
    encoded = _tokenizer(
        batch_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    generated = _model.generate(
        **encoded,
        max_length=512,
        num_beams=5,
        early_stopping=True,
    )
    return [
        _tokenizer.decode(sequence, skip_special_tokens=True)
        for sequence in generated
    ]
def display_translations(comments, batch_size=16):
    """Translate the ``content`` column batch by batch, showing progress.

    After every batch the rows translated so far are rendered into a single
    placeholder. When all batches are done, the translations are stored in a
    new ``content_en`` column on ``comments`` (the input is modified).

    Args:
        comments: DataFrame with a ``content`` column, or ``None``.
        batch_size: Number of texts handed to ``batch_translate`` per call.

    Returns:
        ``comments[['content', 'content_en']]``, or ``None`` when ``comments``
        is ``None`` or has no rows.
    """
    if comments is None:
        st.error("Failed to load comments.")
        return None
    if comments.empty:
        # An empty frame has nothing to translate and may lack the column.
        st.warning("No comments to translate.")
        return None

    placeholder = st.empty()
    source_texts = comments['content'].tolist()
    translated = []
    for start in range(0, len(source_texts), batch_size):
        batch_texts = source_texts[start:start + batch_size]
        translated.extend(batch_translate(batch_texts, model_zh_en, tokenizer))
        # Pair rows purely by position so the result does not depend on the
        # DataFrame's index labels.
        preview = comments[['content']].iloc[:len(translated)].copy()
        preview['content_en'] = translated
        placeholder.dataframe(preview)

    comments['content_en'] = translated
    return comments[['content', 'content_en']]
@st.cache_data(show_spinner=False)
def _cached_sentiment_analysis(batch_texts, model_id, _classifier):
    """Run the classifier; cached on ``(batch_texts, model_id)``."""
    return _classifier(batch_texts)


def batch_sentiment_analysis(batch_texts, _classifier):
    """Classify a batch of texts, reusing cached results per model.

    ``_classifier`` itself is not hashed by Streamlit, so a cache keyed only
    on ``batch_texts`` would hand back results computed by a previously
    selected model. An identifier read from the classifier is therefore
    added to the cache key.

    Args:
        batch_texts: List of texts to classify.
        _classifier: Callable classifier (e.g. a transformers pipeline).

    Returns:
        Whatever ``_classifier(batch_texts)`` returns.
    """
    # Falls back to None (the old texts-only key) if the classifier does not
    # expose ``model.name_or_path``.
    wrapped_model = getattr(_classifier, "model", None)
    model_id = getattr(wrapped_model, "name_or_path", None)
    return _cached_sentiment_analysis(batch_texts, model_id, _classifier)
def display_sentiments(translate_comments, batch_size=16):
    """Classify translated comments and chart the emotion distribution.

    The results table is re-rendered into one placeholder after each batch;
    a bar chart and a pie chart of the top emotions follow once all batches
    are done. Uses the module-level ``classifier``, which must be assigned
    before this function is called.

    Args:
        translate_comments: DataFrame with ``content`` and ``content_en``
            columns, or ``None``.
        batch_size: Number of texts passed to the classifier per call.

    Returns:
        None. All output is written to the Streamlit page.
    """
    if translate_comments is None:
        st.error("Failed to analyze sentiments.")
        return

    placeholder = st.empty()
    columns = ["content", "translation", "emotion", "score"]
    # Plain lists give positional access that does not depend on the
    # DataFrame's index labels.
    originals = translate_comments['content'].tolist()
    translations = translate_comments['content_en'].tolist()

    rows = []
    for start in range(0, len(translations), batch_size):
        batch_texts = translations[start:start + batch_size]
        model_outputs = batch_sentiment_analysis(batch_texts, classifier)
        for offset, output in enumerate(model_outputs):
            best = output[0]  # first entry is taken as the top emotion
            position = start + offset
            rows.append([
                originals[position],
                translations[position],
                best['label'],
                best['score'],
            ])
        placeholder.dataframe(pd.DataFrame(rows, columns=columns))

    if not rows:
        # Nothing was classified; empty charts would only be confusing.
        st.warning("No comments to analyze.")
        return

    df_sentiments = pd.DataFrame(rows, columns=columns)
    # Tidy two-column frame so plotly can refer to columns by name and the
    # axis label mapping below actually applies.
    emotion_counts = (
        df_sentiments['emotion']
        .value_counts()
        .rename_axis("emotion")
        .reset_index(name="count")
    )

    bar_fig = px.bar(
        emotion_counts,
        x="emotion",
        y="count",
        labels={"emotion": "Emotions", "count": "Frequency"},
        title="论坛情感分析柱状图",
    )
    st.plotly_chart(bar_fig, use_container_width=True)

    pie_fig = px.pie(
        emotion_counts,
        names="emotion",
        values="count",
        title="论坛情感分析饼状图",
        hole=.3,
    )
    st.plotly_chart(pie_fig, use_container_width=True)
@st.cache_resource
def _load_emotion_classifier(model_name):
    """Build the text-classification pipeline once per model name."""
    return pipeline(task="text-classification", model=model_name, top_k=None)


# --- Forum statistics: date picker, sidebar settings and the run button ---
today = datetime.date.today()
cur_date = today.strftime("%Y-%m-%d")
selected_date = st.date_input("选择日期", value=today)

model_translate = [
    "Helsinki-NLP/opus-mt-zh-en",
]
model_emo_analysis = [
    "orlco/google-bert-base-cased-fine-tune",
    "SamLowe/roberta-base-go_emotions",
]

# Defaults; each entry is overwritten by the matching sidebar widget below.
settings = {
    "max_comments": 99999,
    "translate_batch_size": 16,
    "sentiment_batch_size": 16,
    "model_translate": model_translate[0],
    "model_emo_analysis": model_emo_analysis[0],
}

with st.sidebar:
    st.title("设置")
    # Both selectboxes share the label "Model"; explicit keys keep them
    # distinct even if their option lists ever coincide.
    st.header("翻译模型")
    # NOTE: this selection is stored but not consumed here; translation uses
    # the module-level model_zh_en / tokenizer.
    settings["model_translate"] = st.selectbox(
        "Model", model_translate, key="model_translate_select"
    )
    st.header("情感分析模型")
    settings["model_emo_analysis"] = st.selectbox(
        "Model", model_emo_analysis, key="model_emo_analysis_select"
    )
    st.header("最大获取帖子数")
    settings["max_comments"] = st.number_input("Max Comments", 1, 99999, 99999)
    st.header("翻译批处理大小")
    settings["translate_batch_size"] = st.number_input(
        "Translate Batch Size", 1, 64, 16
    )
    st.header("情感分析批处理大小")
    settings["sentiment_batch_size"] = st.number_input(
        "Sentiment Analysis Batch Size", 1, 64, 16
    )

if st.button("统计"):
    with st.spinner("正在加载模型 ..."):
        # Cached per model name, so repeated clicks do not reload the weights.
        # The name ``classifier`` is read as a global by display_sentiments.
        classifier = _load_emotion_classifier(settings["model_emo_analysis"])
    with st.spinner("正在获取当天的帖子 ..."):
        comments = get_comments(selected_date, settings["max_comments"])
    if comments is not None:
        # Skip the raw table when loading failed instead of rendering None.
        st.dataframe(comments)
    with st.spinner("正在翻译帖子 ..."):
        translate_comments = display_translations(
            comments, settings["translate_batch_size"]
        )
    with st.spinner("正在分析评论的情感倾向 ..."):
        display_sentiments(translate_comments, settings["sentiment_batch_size"])