Commit e1603e5
Parent(s): 5d3374e
corrections

Files changed:
- app.py +182 -138
- sample_file.xlsx +0 -0
app.py
CHANGED
@@ -120,8 +120,55 @@ def analyze_sentiment(text):
        return "Positive"
    return "Neutral"

+def analyze_sentiment(text):
+    finbert_result = get_mapped_sentiment(finbert(text, truncation=True, max_length=512)[0])
+    roberta_result = get_mapped_sentiment(roberta(text, truncation=True, max_length=512)[0])
+    finbert_tone_result = get_mapped_sentiment(finbert_tone(text, truncation=True, max_length=512)[0])
+
+    # Count occurrences of each sentiment
+    sentiments = [finbert_result, roberta_result, finbert_tone_result]
+    sentiment_counts = {s: sentiments.count(s) for s in set(sentiments)}
+
+    # Return sentiment if at least two models agree, otherwise return Neutral
+    for sentiment, count in sentiment_counts.items():
+        if count >= 2:
+            return sentiment
+    return "Neutral"
+
+
+def detect_events(llm, text, entity):
+    template = """
+    Проанализируйте следующую новость о компании "{entity}" и определите наличие следующих событий:
+    1. Публикация отчетности и ключевые показатели (выручка, прибыль, EBITDA)
+    2. События на рынке ценных бумаг (погашение облигаций, выплата/невыплата купона, дефолт, реструктуризация)
+    3. Судебные иски или юридические действия против компании, акционеров, менеджеров
+
+    Новость: {text}
+
+    Ответьте в следующем формате:
+    Тип: ["Отчетность" или "РЦБ" или "Суд" или "Нет"]
+    Краткое описание: [краткое описание события на русском языке, не более 2 предложений]
+    """
+
+    prompt = PromptTemplate(template=template, input_variables=["entity", "text"])
+    chain = prompt | llm
+    response = chain.invoke({"entity": entity, "text": text})
+
+    event_type = "Нет"
+    summary = ""
+
+    try:
+        response_text = response.content if hasattr(response, 'content') else str(response)
+        if "Тип:" in response_text and "Краткое описание:" in response_text:
+            type_part, summary_part = response_text.split("Краткое описание:")
+            event_type = type_part.split("Тип:")[1].strip()
+            summary = summary_part.strip()
+    except Exception as e:
+        st.warning(f"Ошибка при анализе событий: {str(e)}")
+
+    return event_type, summary

-def fuzzy_deduplicate(df, column, threshold=
+def fuzzy_deduplicate(df, column, threshold=50):
    seen_texts = []
    indices_to_keep = []
    for i, text in enumerate(df[column]):
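Note on this hunk: the added analyze_sentiment resolves disagreement between the three transformer pipelines (finbert, roberta, finbert_tone, presumably initialized elsewhere in app.py) by simple majority vote. A minimal self-contained sketch of just the voting rule, with the model calls stubbed out and majority_sentiment a hypothetical helper name, behaves like this:

    from collections import Counter

    def majority_sentiment(labels, default="Neutral"):
        # Pick the label at least two of the three models agree on;
        # on a three-way split, fall back to the default.
        label, count = Counter(labels).most_common(1)[0]
        return label if count >= 2 else default

    print(majority_sentiment(["Negative", "Negative", "Positive"]))  # Negative
    print(majority_sentiment(["Negative", "Neutral", "Positive"]))   # Neutral

The loop in the committed version returns the first label whose count reaches 2, which is equivalent since only one label out of three can appear twice.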
@@ -273,96 +320,64 @@ def generate_sentiment_visualization(df):
    return fig

def process_file(uploaded_file, model_choice):
-    old_stdout = sys.stdout
-    #sys.stdout = output_capture
-
+    df = None
    try:
        df = pd.read_excel(uploaded_file, sheet_name='Публикации')
        llm = init_langchain_llm(model_choice)
+
+        # Validate required columns
        required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
-            st.error(f"Error: The following required columns are missing
-
-            # Initialize LLM
-            llm = init_langchain_llm(model_choice)
-            if not llm:
-                st.error("Не удалось инициализировать нейросеть. Пожалуйста, проверьте настройки и попробуйте снова.")
-                st.stop()
-
-            # Deduplication
-            original_news_count = len(df)
-            df = df.groupby('Объект', group_keys=False).apply(
-                lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
-            ).reset_index(drop=True)
-
-            remaining_news_count = len(df)
-            duplicates_removed = original_news_count - remaining_news_count
-            st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
+            st.error(f"Error: The following required columns are missing: {', '.join(missing_columns)}")
+            return df if df is not None else None

-        # Initialize progress
+        # Initialize progress tracking
        progress_bar = st.progress(0)
        status_text = st.empty()

-        #
+        # Initialize new columns
        df['Translated'] = ''
        df['Sentiment'] = ''
        df['Impact'] = ''
        df['Reasoning'] = ''
+        df['Event_Type'] = ''
+        df['Event_Summary'] = ''

-        translated_text = translate_text(llm, row['Выдержки из текста'])
-        df.at[index, 'Translated'] = translated_text
-
-        sentiment = analyze_sentiment(translated_text)
-        df.at[index, 'Sentiment'] = sentiment
-
-        if sentiment == "Negative":
-            impact, reasoning = estimate_impact(llm, translated_text, row['Объект'])
-            df.at[index, 'Impact'] = impact
-            df.at[index, 'Reasoning'] = reasoning
-
-        # Update progress
-        progress = (index + 1) / len(df)
-        progress_bar.progress(progress)
-        status_text.text(f"Проанализировано {index + 1} из {len(df)} новостей")
-
-        # Display results with color coding
-        display_sentiment_results(row, sentiment,
-                                  impact if sentiment == "Negative" else None,
-                                  reasoning if sentiment == "Negative" else None)
-
-    [old lines 337-358: removed code not rendered in the diff view]
+        # Process each news item
+        for index, row in df.iterrows():
+            try:
+                # Translate and analyze sentiment
+                translated_text = translate_text(llm, row['Выдержки из текста'])
+                df.at[index, 'Translated'] = translated_text
+
+                sentiment = analyze_sentiment(translated_text)
+                df.at[index, 'Sentiment'] = sentiment
+
+                # Detect events
+                event_type, event_summary = detect_events(llm, row['Выдержки из текста'], row['Объект'])
+                df.at[index, 'Event_Type'] = event_type
+                df.at[index, 'Event_Summary'] = event_summary
+
+                if sentiment == "Negative":
+                    impact, reasoning = estimate_impact(llm, translated_text, row['Объект'])
+                    df.at[index, 'Impact'] = impact
+                    df.at[index, 'Reasoning'] = reasoning
+
+                # Update progress
+                progress = (index + 1) / len(df)
+                progress_bar.progress(progress)
+                status_text.text(f"Проанализировано {index + 1} из {len(df)} новостей")
+
+            except Exception as e:
+                st.warning(f"Ошибка при обработке новости {index + 1}: {str(e)}")
+                continue

        return df

    except Exception as e:
-        sys.stdout = old_stdout
        st.error(f"❌ Ошибка при обработке файла: {str(e)}")
+        return df if df is not None else None

def create_analysis_data(df):
    analysis_data = []
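Note on this hunk: the reworked process_file wraps each row in its own try/except so that one malformed news item no longer aborts the whole file, and the df = None initialization plus the "return df if df is not None else None" paths let the caller receive whatever was processed before a failure. A self-contained sketch of that skip-and-continue pattern, where process_row is a hypothetical stand-in for the translate/sentiment/event calls:

    import pandas as pd

    def process_rows(df, process_row):
        # Mirror the loop added in process_file: log and skip failing rows
        # instead of aborting, then return the partially filled frame.
        df['Result'] = ''
        errors = []
        for index, row in df.iterrows():
            try:
                df.at[index, 'Result'] = process_row(row)
            except Exception as e:
                errors.append((index, str(e)))
                continue
        return df, errors

    df = pd.DataFrame({'text': ['ok', None, 'also ok']})
    df, errors = process_rows(df, lambda row: row['text'].upper())
    print(df)      # row 1 keeps '' in 'Result'; rows 0 and 2 are filled
    print(errors)  # [(1, "'NoneType' object has no attribute 'upper'")]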
@@ -388,74 +403,90 @@ def create_analysis_data(df):
def create_output_file(df, uploaded_file, llm):
    wb = load_workbook("sample_file.xlsx")

-    [old lines 391-421: removed code not rendered in the diff view]
-    ws.cell(row=
-    ws.cell(row=
-    ws.cell(row=
-    ws.cell(row=
-    [old lines 426-452: removed code, including a truncated comment, not rendered in the diff view]
-    if 'Тех.приложение' not in wb.sheetnames:
-        wb.create_sheet('Тех.приложение')
-    ws = wb['Тех.приложение']
-    for r_idx, row in enumerate(dataframe_to_rows(tech_df, index=False, header=True), start=1):
-        for c_idx, value in enumerate(row, start=1):
-            ws.cell(row=r_idx, column=c_idx, value=value)
+    try:
+        # Update 'Мониторинг' sheet with events
+        ws = wb['Мониторинг']
+        row_idx = 4
+        for _, row in df.iterrows():
+            if row['Event_Type'] != 'Нет':
+                ws.cell(row=row_idx, column=5, value=row['Объект'])  # Column E
+                ws.cell(row=row_idx, column=6, value=row['Заголовок'])  # Column F
+                ws.cell(row=row_idx, column=7, value=row['Event_Type'])  # Column G
+                ws.cell(row=row_idx, column=8, value=row['Event_Summary'])  # Column H
+                ws.cell(row=row_idx, column=9, value=row['Выдержки из текста'])  # Column I
+                row_idx += 1
+
+        # Sort entities by number of negative publications
+        entity_stats = pd.DataFrame({
+            'Объект': df['Объект'].unique(),
+            'Всего': df.groupby('Объект').size(),
+            'Негативные': df[df['Sentiment'] == 'Negative'].groupby('Объект').size().fillna(0).astype(int),
+            'Позитивные': df[df['Sentiment'] == 'Positive'].groupby('Объект').size().fillna(0).astype(int)
+        }).sort_values('Негативные', ascending=False)
+
+        # Calculate most negative impact for each entity
+        entity_impacts = {}
+        for entity in df['Объект'].unique():
+            entity_df = df[df['Объект'] == entity]
+            negative_impacts = entity_df[entity_df['Sentiment'] == 'Negative']['Impact']
+            entity_impacts[entity] = negative_impacts.iloc[0] if len(negative_impacts) > 0 else 'Неопределенный эффект'
+
+        # Update 'Сводка' sheet
+        ws = wb['Сводка']
+        for idx, (entity, row) in enumerate(entity_stats.iterrows(), start=4):
+            ws.cell(row=idx, column=5, value=entity)  # Column E
+            ws.cell(row=idx, column=6, value=row['Всего'])  # Column F
+            ws.cell(row=idx, column=7, value=row['Негативные'])  # Column G
+            ws.cell(row=idx, column=8, value=row['Позитивные'])  # Column H
+            ws.cell(row=idx, column=9, value=entity_impacts[entity])  # Column I
+
+        # Update 'Значимые' sheet
+        ws = wb['Значимые']
+        row_idx = 3
+        for _, row in df.iterrows():
+            if row['Sentiment'] in ['Negative', 'Positive']:
+                ws.cell(row=row_idx, column=3, value=row['Объект'])  # Column C
+                ws.cell(row=row_idx, column=4, value='релевантно')  # Column D
+                ws.cell(row=row_idx, column=5, value=row['Sentiment'])  # Column E
+                ws.cell(row=row_idx, column=6, value=row['Impact'])  # Column F
+                ws.cell(row=row_idx, column=7, value=row['Заголовок'])  # Column G
+                ws.cell(row=row_idx, column=8, value=row['Выдержки из текста'])  # Column H
+                row_idx += 1
+
+        # Copy 'Публикации' sheet
+        original_df = pd.read_excel(uploaded_file, sheet_name='Публикации')
+        ws = wb['Публикации']
+        for r_idx, row in enumerate(dataframe_to_rows(original_df, index=False, header=True), start=1):
+            for c_idx, value in enumerate(row, start=1):
+                ws.cell(row=r_idx, column=c_idx, value=value)

+        # Update 'Анализ' sheet
+        ws = wb['Анализ']
+        row_idx = 4
+        for _, row in df[df['Sentiment'] == 'Negative'].iterrows():
+            ws.cell(row=row_idx, column=5, value=row['Объект'])  # Column E
+            ws.cell(row=row_idx, column=6, value=row['Заголовок'])  # Column F
+            ws.cell(row=row_idx, column=7, value="Риск убытка")  # Column G
+
+            # Translate reasoning if it exists
+            if pd.notna(row['Reasoning']):
+                translated_reasoning = translate_reasoning_to_russian(llm, row['Reasoning'])
+                ws.cell(row=row_idx, column=8, value=translated_reasoning)  # Column H
+
+            ws.cell(row=row_idx, column=9, value=row['Выдержки из текста'])  # Column I
+            row_idx += 1

+        # Update 'Тех.приложение' sheet
+        tech_df = df[['Объект', 'Заголовок', 'Выдержки из текста', 'Translated', 'Sentiment', 'Impact', 'Reasoning']]
+        if 'Тех.приложение' not in wb.sheetnames:
+            wb.create_sheet('Тех.приложение')
+        ws = wb['Тех.приложение']
+        for r_idx, row in enumerate(dataframe_to_rows(tech_df, index=False, header=True), start=1):
+            for c_idx, value in enumerate(row, start=1):
+                ws.cell(row=r_idx, column=c_idx, value=value)

+    except Exception as e:
+        st.warning(f"Ошибка при создании выходного файла: {str(e)}")

    output = io.BytesIO()
    wb.save(output)
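One caveat worth flagging in the entity_stats block above, offered as an observation rather than a claim about tested behavior: the fillna(0) calls run before the DataFrame constructor aligns the three groupby Series on a shared index, so entities with no negative (or positive) mentions can still end up as NaN, and the positional df['Объект'].unique() array is not guaranteed to line up with that index. An alignment-safe equivalent, sketched with a hypothetical helper name:

    import pandas as pd

    def build_entity_stats(df):
        # Aggregate per-entity counts on one shared index so the
        # total/negative/positive columns line up row by row.
        entities = df['Объект'].unique()
        total = df.groupby('Объект').size().reindex(entities, fill_value=0)
        negative = (df[df['Sentiment'] == 'Negative']
                    .groupby('Объект').size().reindex(entities, fill_value=0))
        positive = (df[df['Sentiment'] == 'Positive']
                    .groupby('Объект').size().reindex(entities, fill_value=0))
        return (pd.DataFrame({'Всего': total, 'Негативные': negative,
                              'Позитивные': positive})
                .sort_values('Негативные', ascending=False))

    df = pd.DataFrame({'Объект': ['A', 'A', 'B'],
                       'Sentiment': ['Negative', 'Positive', 'Neutral']})
    print(build_entity_stats(df))  # B gets 0 negatives instead of NaN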
@@ -464,7 +495,7 @@ def create_output_file(df, uploaded_file, llm):

def main():
    with st.sidebar:
-        st.title("::: AI-анализ мониторинга новостей (v.3.
+        st.title("::: AI-анализ мониторинга новостей (v.3.30):::")
        st.subheader("по материалам СКАН-ИНТЕРФАКС ")

    model_choice = st.radio(
@@ -532,6 +563,19 @@ def main():
    preview_df = st.session_state.processed_df[['Объект', 'Заголовок', 'Sentiment', 'Impact']].head()
    st.dataframe(preview_df)

+    # Add preview of Monitoring results
+    st.subheader("Предпросмотр мониторинга событий и риск-факторов эмитентов")
+    monitoring_df = st.session_state.processed_df[
+        (st.session_state.processed_df['Event_Type'] != 'Нет') &
+        (st.session_state.processed_df['Event_Type'].notna())
+    ][['Объект', 'Заголовок', 'Event_Type', 'Event_Summary']].head()
+
+    if len(monitoring_df) > 0:
+        st.dataframe(monitoring_df)
+    else:
+        st.info("Не обнаружено значимых событий для мониторинга")
+
+
    analysis_df = create_analysis_data(st.session_state.processed_df)
    st.subheader("Анализ")
    st.dataframe(analysis_df)
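Note on the monitoring preview filter: both conditions are needed, since pandas evaluates NaN != 'Нет' as True, so dropping the notna() check would let rows that were never event-tagged leak into the preview. A quick self-contained check, with column names as in the app:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'Объект': ['A', 'B', 'C'],
                       'Event_Type': ['Суд', 'Нет', np.nan]})
    mask = (df['Event_Type'] != 'Нет') & (df['Event_Type'].notna())
    print(df[mask])  # only row A; row C is excluded by notna(), not by !=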
sample_file.xlsx
CHANGED
Binary files a/sample_file.xlsx and b/sample_file.xlsx differ