import streamlit as st
import pandas as pd
import time
import matplotlib.pyplot as plt
from openpyxl.utils.dataframe import dataframe_to_rows
import io
from rapidfuzz import fuzz
import os
from openpyxl import load_workbook
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from io import StringIO, BytesIO
import sys
import contextlib
from langchain_openai import ChatOpenAI # Updated import
import pdfkit
from jinja2 import Template
import time
from tenacity import retry, stop_after_attempt, wait_exponential
from typing import Optional
import torch
from transformers import (
pipeline,
AutoModelForSeq2SeqLM,
AutoTokenizer,
AutoModelForCausalLM # 4 Qwen
)
from threading import Event
import threading
from queue import Queue
from deep_translator import GoogleTranslator
from googletrans import Translator as LegacyTranslator
import plotly.graph_objects as go
from datetime import datetime
import plotly.express as px
class ProcessControl:
    """Pause/stop switchboard built on two ``threading.Event`` flags.

    ``pause_event`` is *set* while work may proceed and *cleared* while
    paused; ``stop_event`` is set once a stop has been requested.
    """

    def __init__(self):
        """Create both flags, with the pause flag set (i.e. not paused)."""
        self.pause_event = Event()
        self.stop_event = Event()
        # A set pause_event means "running", so begin in that state.
        self.pause_event.set()

    def pause(self):
        """Clear the pause flag so ``wait_if_paused`` blocks."""
        self.pause_event.clear()

    def resume(self):
        """Set the pause flag so blocked waiters continue."""
        self.pause_event.set()

    def stop(self):
        """Request a stop and release anything blocked on the pause flag."""
        self.stop_event.set()
        # Also lift any pause so waiters are released after the stop request.
        self.pause_event.set()

    def reset(self):
        """Return to the initial state: not stopped, not paused."""
        self.stop_event.clear()
        self.pause_event.set()

    def is_paused(self):
        """Return True while the pause flag is cleared."""
        running = self.pause_event.is_set()
        return not running

    def is_stopped(self):
        """Return True once a stop has been requested."""
        return self.stop_event.is_set()

    def wait_if_paused(self):
        """Block the calling thread until the pause flag is set."""
        self.pause_event.wait()
class FallbackLLMSystem:
    """Local MT5 fallback exposing a minimal LLM-like interface.

    Loads ``google/mt5-small`` and offers ``invoke``/``__call__`` (both return
    an object with a ``content`` attribute) plus ``detect_events`` for news
    classification. Status and errors are reported through Streamlit.
    """

    def __init__(self):
        """Load the MT5 tokenizer and model and place the model on GPU if available.

        Raises:
            Exception: any loading error, re-raised after it is shown via
                ``st.error``.
        """
        try:
            self.model_name = "google/mt5-small"
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            self.model = self.model.to(self.device)
            st.success(f"пока все в порядке: запущена MT5 model на = {self.device} =")
        except Exception as e:
            st.error(f"Ошибка запуска модели MT5: {str(e)}")
            raise

    @staticmethod
    def _prompt_to_text(prompt_args):
        """Turn whatever a chain hands over into the plain prompt text."""
        if isinstance(prompt_args, dict):
            template_result = prompt_args.get('template_result', '')
            if not template_result:
                # No pre-rendered prompt: build one from entity/news if present.
                entity = prompt_args.get('entity', '')
                news = prompt_args.get('news', '')
                template_result = f"Analyze news about {entity}: {news}"
            return template_result
        # Objects exposing to_string() (e.g. a LangChain prompt value) carry the
        # rendered prompt there; str() on them would yield a repr instead.
        to_string = getattr(prompt_args, "to_string", None)
        if callable(to_string):
            return to_string()
        return str(prompt_args)

    def invoke(self, prompt_args):
        """Generate a reply for ``prompt_args`` with MT5.

        Args:
            prompt_args: a dict (``template_result``, or ``entity``/``news``),
                an object with ``to_string()``, or anything convertible with
                ``str()``.

        Returns:
            An object whose ``content`` attribute holds the decoded model
            output, or a fixed "uncertain effect" reply if generation fails.
        """
        try:
            template_result = self._prompt_to_text(prompt_args)
            inputs = self.tokenizer(
                template_result,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(self.device)
            outputs = self.model.generate(
                **inputs,
                max_length=200,
                num_return_sequences=1,
                do_sample=False,
                pad_token_id=self.tokenizer.pad_token_id
            )
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Callers only read ``.content`` from the result.
            return type('Response', (), {'content': response})()
        except Exception as e:
            st.warning(f"MT5 generation error: {str(e)}")
            # Best-effort: hand back a parseable default instead of raising.
            return type('Response', (), {
                'content': 'Impact: Неопределенный эффект\nReasoning: Ошибка анализа'
            })()

    def __or__(self, other):
        """Support ``self | other``: run this model, then pass the result to ``other``."""
        if callable(other):
            return lambda x: other(self(x))
        return NotImplemented

    def __rrshift__(self, other):
        """Support ``other >> self``: run ``other`` first, then this model.

        The left operand of ``>>`` is applied first, so the composition order
        is the reverse of ``__or__``.
        """
        if callable(other):
            return lambda x: self(other(x))
        return NotImplemented

    def __call__(self, prompt_args):
        """Alias for ``invoke`` so the instance can be used as a plain callable."""
        return self.invoke(prompt_args)

    def _generate_event_response(self, prompt: str) -> str:
        """Run MT5 on ``prompt``, retrying once on CPU after a CUDA OOM."""
        # Tokenize before touching the device so the batch always exists for
        # the CPU retry, even if the very first GPU transfer runs out of memory.
        encoded = self.tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )
        # One set of generation arguments for both the GPU and the CPU path.
        generation_kwargs = {
            "max_length": 300,
            "num_return_sequences": 1,
            "do_sample": False,
            "pad_token_id": self.tokenizer.pad_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
            "no_repeat_ngram_size": 3,  # discourage repeated phrases
        }
        try:
            outputs = self.model.generate(**encoded.to(self.device), **generation_kwargs)
        except torch.cuda.OutOfMemoryError:
            st.warning("GPU memory exceeded, falling back to CPU")
            torch.cuda.empty_cache()
            self.model = self.model.to('cpu')
            try:
                outputs = self.model.generate(**encoded.to('cpu'), **generation_kwargs)
            finally:
                # Restore the configured device even if the CPU attempt failed,
                # so later calls do not mix CPU weights with GPU inputs.
                self.model = self.model.to(self.device)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def detect_events(self, text: str, entity: str) -> tuple[str, str]:
        """Classify a news text into an event type and produce a short summary.

        Args:
            text: the news text to analyze.
            entity: the company/entity name the news is about.

        Returns:
            ``(event_type, summary)``. ``event_type`` is one of "Отчетность",
            "РЦБ", "Суд" or "Нет"; on invalid input or any failure the type is
            "Нет" and the summary carries a short error description.
        """
        event_type = "Нет"
        summary = ""
        if not text or not entity or not isinstance(text, str) or not isinstance(entity, str):
            return event_type, "Invalid input"
        try:
            text = text.strip()
            entity = entity.strip()
            prompt = f"""Analyze the following news about {entity}:
Text: {text}
Task: Identify the main event type and provide a brief summary.
Event types:
1. Отчетность - Events related to financial reports, earnings, revenue, EBITDA
2. РЦБ - Events related to securities, bonds, stock market, defaults, restructuring
3. Суд - Events related to legal proceedings, lawsuits, arbitration
4. Нет - No significant events detected
Required output format:
Тип: [event type]
Краткое описание: [1-2 sentence summary]"""
            response = self._generate_event_response(prompt)

            if "Тип:" in response and "Краткое описание:" in response:
                parts = response.split("Краткое описание:")
                type_pieces = parts[0].split("Тип:")
                if len(type_pieces) < 2:
                    # "Тип:" only occurs after the summary marker: wrong layout.
                    st.warning("Error parsing model response format")
                    return "Нет", "Error parsing response"
                # The prompt shows the type in brackets, so tolerate an echoed
                # "[Суд]" or a quoted / dot-terminated value.
                type_part = type_pieces[1].strip().strip("[]\"'«».").strip()

                valid_types = ("Отчетность", "РЦБ", "Суд", "Нет")
                if type_part in valid_types:
                    event_type = type_part
                else:
                    # No exact label: look for category keywords anywhere in
                    # the response; the first category with a hit wins.
                    category_keywords = {
                        "Отчетность": ["отчет", "выручка", "прибыль", "ebitda", "финанс"],
                        "РЦБ": ["облигаци", "купон", "дефолт", "реструктуризац", "ценные бумаги"],
                        "Суд": ["суд", "иск", "арбитраж", "разбирательств"]
                    }
                    full_text = response.lower()
                    for event_category, keywords in category_keywords.items():
                        if any(keyword in full_text for keyword in keywords):
                            event_type = event_category
                            break

                summary = parts[1].strip()
                if len(summary) > 200:
                    summary = summary[:197] + "..."
                # Make sure the summary names the entity it refers to.
                if entity.lower() not in summary.lower():
                    summary = f"Компания {entity}: {summary}"

            # Fall back to a canned description when no usable summary was parsed.
            if not summary or len(summary) < 5:
                default_summaries = {
                    "Отчетность": "Обнаружена информация о финансовой отчетности",
                    "РЦБ": "Обнаружена информация о ценных бумагах",
                    "Суд": "Обнаружена информация о судебном разбирательстве",
                    "Нет": "Значимых событий не обнаружено"
                }
                summary = f"{default_summaries.get(event_type, 'Требуется дополнительный анализ')} ({entity})"
            return event_type, summary
        except Exception as e:
            st.warning(f"Event detection error: {str(e)}")
            # Map the error text to a coarse, user-readable cause.
            message = str(e)
            if "CUDA" in message:
                return "Нет", "GPU error - falling back to CPU needed"
            elif "tokenizer" in message:
                return "Нет", "Text processing error"
            elif "model" in message:
                return "Нет", "Model inference error"
            else:
                return "Нет", "Ошибка анализа"
def ensure_groq_llm():
    """Build the Groq-hosted chat model used for impact estimation.

    Returns:
        A ``ChatOpenAI`` client pointed at the Groq OpenAI-compatible
        endpoint, or ``None`` when the ``groq_key`` secret is missing or
        construction fails (the reason is shown via ``st.error``).
    """
    try:
        key_present = 'groq_key' in st.secrets
        if key_present:
            return ChatOpenAI(
                base_url="https://api.groq.com/openai/v1",
                model="llama-3.1-70b-versatile",
                openai_api_key=st.secrets['groq_key'],
                temperature=0.0
            )
        # No key configured: tell the user which secret name is expected.
        st.error("Groq API key not found in secrets. Please add it with the key 'groq_key'.")
        return None
    except Exception as exc:
        st.error(f"Error initializing Groq LLM: {str(exc)}")
        return None
def _parse_impact_response(response_text):
    """Extract the impact category and reasoning from an LLM reply.

    Returns:
        ``(impact, reasoning)``. Both are ``None`` when the reply does not
        follow the ``Impact: ... Reasoning: ...`` layout; ``impact`` alone is
        ``None`` when no known category is recognised.
    """
    if "Impact:" not in response_text or "Reasoning:" not in response_text:
        return None, None
    # Split on the FIRST marker only: the explanation may itself contain the
    # word "Reasoning:" again, which must stay part of the reasoning text.
    impact_part, _, reasoning_part = response_text.partition("Reasoning:")
    _, impact_marker, impact_raw = impact_part.partition("Impact:")
    if not impact_marker:
        # "Impact:" only appears after "Reasoning:" - not the expected layout.
        return None, None

    valid_impacts = (
        "Значительный риск убытков",
        "Умеренный риск убытков",
        "Незначительный риск убытков",
        "Вероятность прибыли",
        "Неопределенный эффект",
    )
    # Models often wrap the category in quotes, prefix its number or append the
    # English gloss from the prompt, so match by containment, ignoring case
    # and the е/ё spelling variant.
    normalized = impact_raw.casefold().replace("ё", "е")
    impact = None
    # Longest first: "незначительный риск убытков" contains
    # "значительный риск убытков" once case is ignored.
    for category in sorted(valid_impacts, key=len, reverse=True):
        if category.casefold() in normalized:
            impact = category
            break
    return impact, reasoning_part.strip()


def estimate_impact(llm, news_text, entity):
    """Estimate the financial impact of a news item about ``entity``.

    The Groq model from ``ensure_groq_llm`` is used when it can be created;
    otherwise the supplied ``llm`` is used.

    Args:
        llm: fallback model, used only when the Groq model is unavailable.
        news_text: the news text to assess.
        entity: the company/entity the news is about.

    Returns:
        ``(impact, reasoning)``. ``impact`` is one of the five categories named
        in the prompt. The defaults ("Неопределенный эффект" and a "no
        reasoning" message) are returned for whatever could not be obtained;
        errors are shown via ``st.warning`` rather than raised.
    """
    impact = "Неопределенный эффект"
    reasoning = "Не удалось получить обоснование"
    try:
        groq_llm = ensure_groq_llm()
        working_llm = groq_llm if groq_llm is not None else llm
        template = """
You are a financial analyst. Analyze this news piece about {entity} and assess its potential impact.
News: {news}
Classify the impact into one of these categories:
1. "Значительный риск убытков" (Significant loss risk)
2. "Умеренный риск убытков" (Moderate loss risk)
3. "Незначительный риск убытков" (Minor loss risk)
4. "Вероятность прибыли" (Potential profit)
5. "Неопределенный эффект" (Uncertain effect)
Provide a brief, fact-based reasoning for your assessment.
Format your response exactly as:
Impact: [category]
Reasoning: [explanation in 2-3 sentences]
"""
        prompt = PromptTemplate(template=template, input_variables=["entity", "news"])
        chain = prompt | working_llm
        response = chain.invoke({"entity": entity, "news": news_text})
        # Chat models return an object with ``content``; others may return text.
        response_text = response.content if hasattr(response, 'content') else str(response)

        parsed_impact, parsed_reasoning = _parse_impact_response(response_text)
        if parsed_impact is not None:
            impact = parsed_impact
        if parsed_reasoning is not None:
            reasoning = parsed_reasoning
    except Exception as e:
        st.warning(f"Error in impact estimation: {str(e)}")
    return impact, reasoning
class QwenSystem:
    """Wrapper around a locally loaded Qwen 2.5 Coder instruct model.

    ``invoke`` returns an object with a ``content`` attribute so results can
    be read the same way as those of a chat-model client.
    """

    def __init__(self):
        """Load the Qwen model and tokenizer with automatic dtype/device placement.

        Raises:
            Exception: any loading error, re-raised after it is shown via
                ``st.error``.
        """
        try:
            self.model_name = "Qwen/Qwen2.5-Coder-32B-Instruct"
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype="auto",
                device_map="auto"
            )
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            st.success(f"запустил Qwen2.5 model")
        except Exception as e:
            st.error(f"ошибка запуска Qwen2.5: {str(e)}")
            raise

    @staticmethod
    def _as_chat_messages(messages):
        """Normalise the caller's input into a list of chat message dicts."""
        if isinstance(messages, str):
            # list.extend() on a str would add one "message" per character.
            return [{"role": "user", "content": messages}]
        to_string = getattr(messages, "to_string", None)
        if callable(to_string):
            # e.g. a rendered prompt object: use its text as the user turn.
            return [{"role": "user", "content": to_string()}]
        if isinstance(messages, dict):
            # A single message dict; extend() would add only its keys.
            return [messages]
        return list(messages)

    def invoke(self, messages):
        """Generate a reply for ``messages`` using Qwen's chat template.

        Args:
            messages: a list of ``{"role", "content"}`` dicts; a plain string,
                a single message dict, or an object with ``to_string()`` is
                also accepted and treated as one user message.

        Returns:
            An object whose ``content`` attribute holds the generated text.

        Raises:
            Exception: any generation error, re-raised after it is shown via
                ``st.warning``.
        """
        try:
            chat_messages = [
                {"role": "system", "content": "You are wise financial analyst. You are a helpful assistant."}
            ]
            chat_messages.extend(self._as_chat_messages(messages))
            text = self.tokenizer.apply_chat_template(
                chat_messages,
                tokenize=False,
                add_generation_prompt=True
            )
            model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
            generated_ids = self.model.generate(
                **model_inputs,
                max_new_tokens=512,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id
            )
            # Drop the echoed prompt: keep only tokens past each input's length.
            generated_ids = [
                output_ids[len(input_ids):]
                for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
            ]
            response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
            # Callers only read ``.content`` from the result.
            return type('Response', (), {'content': response})()
        except Exception as e:
            st.warning(f"Qwen generation error: {str(e)}")
            raise
class ProcessingUI:
def __init__(self):
    """Seed the session state if needed, then build the widget layout.

    ``control`` and ``processing_stats`` are created in
    ``st.session_state`` only when the respective key is missing.
    """
    session = st.session_state
    if 'control' not in session:
        session.control = ProcessControl()
    if 'processing_stats' not in session:
        # Fresh statistics: start timestamp plus empty collections.
        session.processing_stats = dict(
            start_time=time.time(),
            entities={},
            events_timeline=[],
            negative_alerts=[],
            processing_speed=[],
        )
    self.setup_layout()
def setup_layout(self):
    """Render the control row, the progress bar and the four result tabs.

    Stores the timer placeholder in ``self.timer_display`` and the progress
    widget in ``self.progress_bar``; each tab is filled by its ``setup_*``
    method.
    """
    control = st.session_state.control

    # Control row: pause/resume toggle, stop button, timer placeholder.
    with st.container():
        pause_col, stop_col, timer_col = st.columns([2, 2, 1])
        with pause_col:
            pause_label = "▶️ Продолжить" if control.is_paused() else "⏸️ Пауза"
            if st.button(pause_label, use_container_width=True):
                # Toggle: resume when currently paused, otherwise pause.
                toggle = control.resume if control.is_paused() else control.pause
                toggle()
        with stop_col:
            if st.button("⏹️ Остановить", use_container_width=True):
                control.stop()
        with timer_col:
            self.timer_display = st.empty()

    # Markup slot ahead of the progress bar; as written it carries only a newline.
    st.markdown("\n", unsafe_allow_html=True)
    self.progress_bar = st.progress(0)

    # One tab per view, each populated by the matching setup method.
    tab_titles = [
        "📊 Основные метрики",
        "🏢 По организациям",
        "⚠️ Важные события",
        "📈 Аналитика"
    ]
    tab_builders = (
        self.setup_main_metrics_tab,
        self.setup_entity_tab,
        self.setup_events_tab,
        self.setup_analytics_tab,
    )
    for tab, build_tab in zip(st.tabs(tab_titles), tab_builders):
        with tab:
            build_tab()
def setup_main_metrics_tab(self):
    """Create placeholders for the four headline metrics and the recent-items area."""
    # Four side-by-side slots, later filled with metric widgets.
    total_col, negative_col, events_col, speed_col = st.columns(4)
    self.total_processed = total_col.empty()
    self.negative_count = negative_col.empty()
    self.events_count = events_col.empty()
    self.speed_metric = speed_col.empty()

    # Markup slot ahead of the recent-items area; as written it carries only a newline.
    st.markdown("\n", unsafe_allow_html=True)
    self.recent_items_container = st.empty()
def setup_entity_tab(self):
    """Create the entity filter and the placeholders for per-entity output."""
    # The option list starts empty; nothing is pre-selected.
    no_options_yet = []
    self.entity_filter = st.multiselect(
        "Фильтр по организациям:",
        options=no_options_yet,
        default=None,
    )

    # One wide column followed by three narrow ones, then chart and table slots.
    column_widths = [2, 1, 1, 1]
    self.entity_cols = st.columns(column_widths)
    self.entity_chart = st.empty()
    self.entity_table = st.empty()
def setup_events_tab(self):
    """Create the event-type filter and the container for the timeline."""
    # Selectable event types; nothing is pre-selected.
    selectable_types = ["Отчетность", "РЦБ", "Суд"]
    self.event_filter = st.multiselect(
        "Тип события:",
        options=selectable_types,
        default=None,
    )
    self.timeline_container = st.container()
def setup_analytics_tab(self):
    """Reserve three empty placeholders for the analytics charts.

    In order: processing speed, sentiment distribution, entity correlation.
    """
    for placeholder_name in ("speed_chart", "sentiment_chart", "correlation_chart"):
        setattr(self, placeholder_name, st.empty())
def update_stats(self, row, sentiment, event_type, processing_speed):
    """Record one processed row in the session statistics and refresh every view.

    Args:
        row: mapping-like record; its 'Объект' value is used as the entity key.
        sentiment: sentiment label; 'Negative' increments the negative counter.
        event_type: event label; anything other than 'Нет' counts as an event.
        processing_speed: value appended to the speed history.
    """
    stats = st.session_state.processing_stats
    entity = row['Объект']

    # Fetch this entity's counters, creating a zeroed record on first sight.
    entity_stats = stats['entities'].setdefault(entity, {
        'total': 0,
        'negative': 0,
        'events': 0,
        'timeline': []
    })
    entity_stats['total'] += 1
    if sentiment == 'Negative':
        entity_stats['negative'] += 1
    if event_type != 'Нет':
        entity_stats['events'] += 1

    stats['processing_speed'].append(processing_speed)

    # Push the new numbers to each tab.
    self._update_main_metrics(row, sentiment, event_type, processing_speed)
    self._update_entity_view()
    self._update_events_view(row, event_type)
    self._update_analytics()
def _update_main_metrics(self, row, sentiment, event_type, speed):
    """Refresh the headline metrics from the per-entity counters, then the recent items.

    Args:
        row: record forwarded to the recent-items update.
        sentiment: sentiment label forwarded to the recent-items update.
        event_type: event label forwarded to the recent-items update.
        speed: value shown in the speed metric with one decimal place.
    """
    per_entity = list(st.session_state.processing_stats['entities'].values())

    def _sum_of(field):
        # Total of one counter across all entities.
        return sum(record[field] for record in per_entity)

    self.total_processed.metric("Обработано", _sum_of('total'))
    self.negative_count.metric("Негативных", _sum_of('negative'))
    self.events_count.metric("Событий", _sum_of('events'))
    self.speed_metric.metric("Скорость", f"{speed:.1f} сообщ/сек")

    self._update_recent_items(row, sentiment, event_type)
def _update_recent_items(self, row, sentiment, event_type):
"""Update recent items display with custom styling"""
items_html = "
{row['Объект']}
{row['Заголовок']}
{datetime.now().strftime('%H:%M:%S')}