Spaces:

fschwartzer
/

streamlit_chatbot

Running

App Files Files Community

streamlit_chatbot / app.py

fschwartzer

Update app.py

a09ca43 verified 10 months ago

raw

history blame

8.43 kB

	import streamlit as st
	import pandas as pd
	from transformers import BartForConditionalGeneration, TapexTokenizer, T5ForConditionalGeneration, T5Tokenizer
	from prophet import Prophet

	# Read the custom stylesheet that gets inlined into the page header below.
	# The encoding is explicit so the result does not depend on the platform default.
	with open("style.css", "r", encoding="utf-8") as css:
	    css_style = css.read()

	# Combined markup: font import, inlined stylesheet and the HTML banner.
	# The @import rule must be the first rule inside <style>: browsers ignore an
	# @import that appears after other style rules, so placing it after
	# {css_style} kept the "Kanit" font used by the title from loading.
	html_content = f"""
	<style>
	@import url('https://fonts.googleapis.com/css2?family=Kanit:wght@700&display=swap');
	{css_style}
	</style>
	<div style='display: flex; flex-direction: column; align-items: flex-start;'>
	<div style='display: flex; align-items: center;'>
	<div style='width: 20px; height: 5px; background-color: green; margin-right: 0px;'></div>
	<div style='width: 20px; height: 5px; background-color: red; margin-right: 0px;'></div>
	<div style='width: 20px; height: 5px; background-color: yellow; margin-right: 18px;'></div>
	<span style='font-size: 38px; font-weight: normal; font-family: "Kanit", sans-serif;'>NOSTRADAMUS</span>
	</div>
	</div>
	"""

	# Render the combined markup in Streamlit (raw HTML must be allowed explicitly).
	st.markdown(html_content, unsafe_allow_html=True)

	# Seed the session-state entries used by the rest of the script. Each key is
	# only created when it is missing, so values survive Streamlit reruns.
	for state_key, initial_value in (('all_anomalies', pd.DataFrame()), ('history', [])):
	    if state_key not in st.session_state:
	        st.session_state[state_key] = initial_value

	# Load the translation and TAPEX models.
	@st.cache_resource
	def _load_models():
	    """Load the translation/TAPEX models and tokenizers once per server process.

	    Streamlit re-executes this script on every user interaction; without
	    caching, all checkpoints below would be loaded again on each rerun.

	    Returns:
	        Tuple of (pt->en model, en->pt model, TAPEX model, TAPEX tokenizer,
	        T5 tokenizer), in that order.
	    """
	    pt_en = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-pt-en-t5")
	    en_pt = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-en-pt-t5")
	    tapex = BartForConditionalGeneration.from_pretrained("microsoft/tapex-large-finetuned-wtq")
	    tapex_tok = TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
	    # NOTE(review): this single T5 tokenizer (from the pt->en checkpoint) is
	    # used for both translation directions elsewhere in the file — confirm
	    # the en->pt checkpoint shares the same vocabulary.
	    t5_tok = T5Tokenizer.from_pretrained("unicamp-dl/translation-pt-en-t5")
	    return pt_en, en_pt, tapex, tapex_tok, t5_tok


	# Module-level names kept unchanged for the functions that reference them.
	pt_en_translator, en_pt_translator, tapex_model, tapex_tokenizer, tokenizer = _load_models()

	def translate(text, model, tokenizer, source_lang="pt", target_lang="en", max_new_tokens=128):
	    """Translate *text* with a seq2seq model and return the decoded string.

	    Args:
	        text: Text to translate.
	        model: Object exposing ``generate(input_ids, ...)``.
	        tokenizer: Object exposing ``encode(...)`` and ``decode(...)``.
	        source_lang: Source language code ("pt" or "en").
	        target_lang: Target language code ("en" or "pt").
	        max_new_tokens: Upper bound on generated tokens. Passed explicitly so
	            the output is not cut off by the library's short default limit.

	    Returns:
	        The translated text, or "" when *text* is empty or whitespace only.
	    """
	    # Nothing to translate: avoid running the model on the bare task prefix.
	    if not text.strip():
	        return ""

	    # The unicamp-dl T5 translation checkpoints are documented to expect a
	    # task prefix in front of the input. The language arguments were
	    # previously accepted but never used, so no prefix was sent. Unknown
	    # language pairs fall back to the un-prefixed text.
	    task_prefixes = {
	        ("pt", "en"): "translate Portuguese to English: ",
	        ("en", "pt"): "translate English to Portuguese: ",
	    }
	    prefix = task_prefixes.get((source_lang, target_lang), "")

	    input_ids = tokenizer.encode(prefix + text, return_tensors="pt", add_special_tokens=True)
	    outputs = model.generate(input_ids, max_new_tokens=max_new_tokens)
	    return tokenizer.decode(outputs[0], skip_special_tokens=True)

	def response(user_question, table_data):
	    """Answer a Portuguese question about *table_data* using TAPEX.

	    The question is translated to English, answered by the TAPEX model over
	    the table, and the answer is translated back to Portuguese.

	    Args:
	        user_question: Question text in Portuguese.
	        table_data: pandas DataFrame to query.

	    Returns:
	        The answer translated to Portuguese.
	    """
	    question_en = translate(user_question, pt_en_translator, tokenizer, source_lang="pt", target_lang="en")

	    # TAPEX flattens the table into text, so every cell has to be a string.
	    # The anomalies table holds timestamps and floats; cast a copy to str
	    # instead of handing the raw values to the tokenizer.
	    table_as_text = table_data.astype(str)

	    encoding = tapex_tokenizer(table=table_as_text, query=[question_en], padding=True, return_tensors="pt", truncation=True)
	    outputs = tapex_model.generate(**encoding)
	    response_en = tapex_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

	    # NOTE(review): `tokenizer` is loaded from the pt->en checkpoint and is
	    # reused here with the en->pt model — confirm both share a vocabulary.
	    response_pt = translate(response_en, en_pt_translator, tokenizer, source_lang="en", target_lang="pt")
	    return response_pt

	def load_data(uploaded_file):
	    """Read an uploaded CSV or XLSX file into a DataFrame.

	    Args:
	        uploaded_file: File-like object with a ``name`` attribute; the
	            extension of ``name`` selects the reader.

	    Returns:
	        The parsed pandas DataFrame.

	    Raises:
	        ValueError: If the extension is neither ``.csv`` nor ``.xlsx``.
	            Previously this case fell through and failed with an unbound
	            local variable.
	    """
	    # Compare case-insensitively so names such as "DATA.CSV" are accepted.
	    file_name = uploaded_file.name.lower()
	    if file_name.endswith('.csv'):
	        return pd.read_csv(uploaded_file, quotechar='"', encoding='utf-8')
	    if file_name.endswith('.xlsx'):
	        return pd.read_excel(uploaded_file)
	    raise ValueError(f"Unsupported file type: {uploaded_file.name!r} (expected .csv or .xlsx)")

	def preprocess_data(df):
	    """Turn the raw sheet into a frame with a label column and date columns.

	    Row 1 of *df* (columns 9 up to, but excluding, the last one) supplies the
	    headers and rows 2+ supply the data; missing cells become 0. Headers of
	    the form "Mon/Year" with Portuguese month abbreviations (e.g. "Jan/2023")
	    become ``pd.Timestamp`` labels, headers that cannot be parsed become
	    ``NaT``, and the first column is labelled 'Rotulo'.

	    Args:
	        df: Raw DataFrame as returned by the file loader.

	    Returns:
	        The cleaned DataFrame, or an empty DataFrame when *df* is too small
	        to contain the header row, a data row and at least one sliced column.
	    """
	    # Row 1 is the header and data starts at row 2; the 9:-1 column slice is
	    # only non-empty with 11+ columns. Return an empty frame rather than
	    # failing with an IndexError further down.
	    if df.shape[0] < 3 or df.shape[1] < 11:
	        return pd.DataFrame()

	    month_dict = {
	        'Jan': '01', 'Fev': '02', 'Mar': '03', 'Abr': '04',
	        'Mai': '05', 'Jun': '06', 'Jul': '07', 'Ago': '08',
	        'Set': '09', 'Out': '10', 'Nov': '11', 'Dez': '12',
	    }

	    def convert_column_name(column_name):
	        """Convert a "Mon/Year" header to "MM/YYYY"; pass other headers through."""
	        # The label-column header is kept untouched.
	        if column_name == 'Rótulos de Linha':
	            return column_name

	        parts = str(column_name).split('/')
	        if len(parts) < 2:
	            # Not a "Mon/Year" header: leave it alone, it is coerced to NaT below.
	            return column_name

	        month = parts[0].strip()
	        # Keep only the digits of the year in case of stray characters.
	        year = ''.join(filter(str.isdigit, parts[1]))
	        # Unknown month names map to '00', which fails date parsing (-> NaT).
	        month_number = month_dict.get(month, '00')
	        return f"{month_number}/{year}"

	    new_df = df.iloc[2:, 9:-1].fillna(0)
	    new_df.columns = df.iloc[1, 9:-1]
	    # Drop " (<digits>)" suffixes from the headers.
	    new_df.columns = new_df.columns.str.replace(r" \(\d+\)", "", regex=True)

	    converted = [convert_column_name(col) for col in new_df.columns]
	    # An explicit format keeps parsing deterministic instead of relying on
	    # per-element format inference; anything not matching becomes NaT.
	    parsed = pd.to_datetime(converted, format="%m/%Y", errors='coerce')

	    # Label the first column by position. Renaming by its (NaT) key would
	    # also rename every other unparseable column to 'Rotulo'.
	    labels = list(parsed)
	    labels[0] = 'Rotulo'
	    new_df.columns = labels

	    df_clean = new_df.copy()
	    return df_clean

	def apply_prophet(df_clean):
	    """Fit one Prophet model per row of *df_clean* and collect its anomalies.

	    Each row is treated as a time series: the ``pd.Timestamp`` column labels
	    are the dates and the row's cells are the values. An observation is an
	    anomaly when it falls outside the model's 95% interval
	    (``yhat_lower`` .. ``yhat_upper``).

	    Args:
	        df_clean: DataFrame with a 'Rotulo' column (group label) and
	            Timestamp-labelled value columns.

	    Returns:
	        DataFrame with columns 'ds', 'real' and 'group' holding every
	        detected anomaly; an empty DataFrame when the input is empty or no
	        group could be processed.
	    """
	    if df_clean.empty:
	        st.error("DataFrame está vazio após o pré-processamento.")
	        return pd.DataFrame()

	    # Debugging: show the structure of df_clean.
	    st.write("Estrutura do DataFrame df_clean:")
	    st.write(df_clean)

	    # The date columns are the same for every row, so compute them once.
	    date_columns = [col for col in df_clean.columns if isinstance(col, pd.Timestamp)]

	    # Per-group anomaly frames, concatenated once at the end.
	    anomaly_frames = []

	    for _, row in df_clean.iterrows():
	        group = row['Rotulo']

	        # Cells may arrive as strings (e.g. from a CSV whose header rows sit
	        # inside the data), so coerce to numbers; unparseable cells become
	        # NaN and are dropped below.
	        data = pd.DataFrame({
	            'ds': date_columns,
	            'y': pd.to_numeric(pd.Series(row[date_columns].values), errors='coerce'),
	        })

	        # Debugging: show the data passed into Prophet.
	        st.write(f"Dados para Prophet - Grupo {group}:")
	        st.write(data)

	        # Keep strictly positive observations only, ordered by date.
	        data = data[data['y'] > 0].dropna().sort_values('ds').reset_index(drop=True)

	        # Prophet needs at least two observations.
	        if len(data) < 2:
	            st.write(f"Pular grupo {group} por não ter observações suficientes.")
	            continue

	        try:
	            model = Prophet(interval_width=0.95)
	            model.fit(data)
	        except ValueError as e:
	            st.write(f"Pular grupo {group} devido ao erro: {e}")
	            continue

	        # The history uses first-of-month timestamps, so extend it with
	        # month-start ('MS') dates rather than month-end ones.
	        future = model.make_future_dataframe(periods=12, freq='MS')
	        forecast = model.predict(future)

	        # Attach the observed values by date instead of by position, so the
	        # result stays correct even if forecast and input rows do not line
	        # up one-to-one. Future rows get NaN and never count as anomalies.
	        forecast = forecast.merge(data.rename(columns={'y': 'real'}), on='ds', how='left')
	        outside_interval = (forecast['real'] < forecast['yhat_lower']) | (forecast['real'] > forecast['yhat_upper'])
	        # Copy so that adding the 'group' column does not write into a slice.
	        anomalies = forecast[outside_interval].copy()

	        # Debugging: show the anomalies detected for this group.
	        st.write(f"Anomalias detectadas para o grupo {group}:")
	        st.write(anomalies)

	        anomalies['group'] = group
	        anomaly_frames.append(anomalies[['ds', 'real', 'group']])

	    if not anomaly_frames:
	        return pd.DataFrame()
	    return pd.concat(anomaly_frames, ignore_index=True)

	tab1, tab2 = st.tabs(["Meta Prophet", "Microsoft TAPEX"])

	# File upload widget.
	uploaded_file = st.file_uploader("Carregue um arquivo CSV ou XLSX", type=['csv', 'xlsx'])

	with tab1:
	    if uploaded_file:
	        # Run Prophet once per uploaded file. The previous guard tested
	        # whether 'all_anomalies' was missing from the session state, but
	        # that key is always pre-seeded at startup, so the model never ran.
	        # Remember which upload was processed instead.
	        file_key = (uploaded_file.name, uploaded_file.size)
	        if st.session_state.get('processed_file') != file_key:
	            df = load_data(uploaded_file)
	            df_clean = preprocess_data(df)

	            if df_clean.empty:
	                st.warning("Não há dados válidos para processar.")
	            else:
	                with st.spinner('Aplicando modelo de série temporal...'):
	                    all_anomalies = apply_prophet(df_clean)
	                st.session_state['all_anomalies'] = all_anomalies
	                st.session_state['processed_file'] = file_key
	                # New data: allow the question currently in the box to be
	                # answered again against the new table.
	                st.session_state['last_question'] = None

	with tab2:
	    anomalies_table = st.session_state.get('all_anomalies')
	    # Only allow questions once there are anomalies to query.
	    if anomalies_table is not None and not anomalies_table.empty:
	        user_question = st.text_input("Escreva sua questão aqui:", "")

	        # The text box keeps its value across reruns, so answer a question
	        # only when it differs from the last one answered. Otherwise every
	        # rerun (e.g. a button click) would append the same exchange again.
	        if user_question and user_question != st.session_state.get('last_question'):
	            bot_response = response(user_question, anomalies_table)
	            st.session_state['history'].append(('👤', user_question))
	            st.session_state['history'].append(('🤖', bot_response))
	            st.session_state['last_question'] = user_question

	        # Reserve the spot for the conversation above the button, but handle
	        # the button first so clearing takes effect in this same run.
	        history_area = st.container()
	        if st.button("Limpar histórico"):
	            st.session_state['history'] = []

	        with history_area:
	            # Each entry is (sender emoji, message). Messages are rendered as
	            # plain markdown: the bot text is model output derived from the
	            # uploaded file and should not be injected as raw HTML.
	            for sender, message in st.session_state['history']:
	                st.markdown(f"{sender} {message}")
	    else:
	        st.warning("Por favor, processe os dados no Meta Prophet primeiro.")