File size: 8,433 Bytes
e2af017
 
 
96bff79
e2af017
e727bfc
e69523f
e727bfc
 
 
 
63c894a
e727bfc
535c2d9
63c894a
535c2d9
 
b375123
 
 
 
f2de8aa
96bff79
e727bfc
 
 
 
 
3f82507
 
 
 
 
 
525bf5b
c8e42cd
 
 
 
525bf5b
c8e42cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525bf5b
 
 
 
 
 
c8e42cd
525bf5b
138e488
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378a4bc
 
e69523f
 
 
 
a09ca43
 
 
 
378a4bc
 
a09ca43
378a4bc
 
a09ca43
 
138e488
a09ca43
 
138e488
a09ca43
 
 
 
 
 
 
 
 
138e488
a09ca43
138e488
a09ca43
138e488
a09ca43
 
138e488
 
a09ca43
138e488
a09ca43
138e488
 
 
a09ca43
 
 
138e488
 
a09ca43
 
 
 
 
 
138e488
 
378a4bc
a09ca43
378a4bc
c8e42cd
b375123
 
525bf5b
 
b375123
 
 
 
 
edfa911
b375123
 
 
edfa911
 
 
 
 
b375123
 
edfa911
 
 
 
 
b375123
 
 
edfa911
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import streamlit as st
import pandas as pd
from transformers import BartForConditionalGeneration, TapexTokenizer, T5ForConditionalGeneration, T5Tokenizer
from prophet import Prophet

# Read the local stylesheet that gets inlined into the page header.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open("style.css", "r", encoding="utf-8") as css:
    css_style = css.read()

# Header markup: web-font import, the local CSS, three coloured bars and the
# application title.
# The @import rule is placed BEFORE the inlined stylesheet: CSS only honours
# @import when it precedes every other rule in the sheet, so emitting it after
# the contents of style.css would make browsers ignore the font import.
html_content = f"""
<style>
@import url('https://fonts.googleapis.com/css2?family=Kanit:wght@700&display=swap');
{css_style}
</style>
<div style='display: flex; flex-direction: column; align-items: flex-start;'>
    <div style='display: flex; align-items: center;'>
        <div style='width: 20px; height: 5px; background-color: green; margin-right: 0px;'></div>
        <div style='width: 20px; height: 5px; background-color: red; margin-right: 0px;'></div>
        <div style='width: 20px; height: 5px; background-color: yellow; margin-right: 18px;'></div>
        <span style='font-size: 38px; font-weight: normal; font-family: "Kanit", sans-serif;'>NOSTRADAMUS</span>
    </div>
</div>
"""

# Render the header; raw HTML must be enabled for the <style>/<div> markup.
st.markdown(html_content, unsafe_allow_html=True)

# Seed the session-state entries used by the rest of the script. Each entry is
# created only when missing, so values stored by an earlier run are kept.
for state_key, make_default in (('all_anomalies', pd.DataFrame), ('history', list)):
    if state_key not in st.session_state:
        st.session_state[state_key] = make_default()

# Checkpoint identifiers for the translation models and the TAPEX model.
_PT_EN_CHECKPOINT = "unicamp-dl/translation-pt-en-t5"
_EN_PT_CHECKPOINT = "unicamp-dl/translation-en-pt-t5"
_TAPEX_CHECKPOINT = "microsoft/tapex-large-finetuned-wtq"

# Load the two translation models (pt->en and en->pt).
pt_en_translator = T5ForConditionalGeneration.from_pretrained(_PT_EN_CHECKPOINT)
en_pt_translator = T5ForConditionalGeneration.from_pretrained(_EN_PT_CHECKPOINT)

# Load the TAPEX table question-answering model and its tokenizer.
tapex_model = BartForConditionalGeneration.from_pretrained(_TAPEX_CHECKPOINT)
tapex_tokenizer = TapexTokenizer.from_pretrained(_TAPEX_CHECKPOINT)

# A single T5 tokenizer is loaded, from the pt->en checkpoint.
# NOTE(review): presumably the en->pt checkpoint shares this vocabulary - confirm.
tokenizer = T5Tokenizer.from_pretrained(_PT_EN_CHECKPOINT)

def translate(text, model, tokenizer, source_lang="pt", target_lang="en"):
    """Translate ``text`` with a seq2seq model and return the decoded string.

    Args:
        text: Text to translate.
        model: Object exposing ``generate(input_ids)``.
        tokenizer: Object exposing ``encode`` and ``decode``.
        source_lang: Accepted for call-site readability; not used by the body.
        target_lang: Accepted for call-site readability; not used by the body.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    encoded = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")
    generated = model.generate(encoded)
    # Only the first generated sequence is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

def response(user_question, table_data):
    """Answer a Portuguese question about ``table_data`` using TAPEX.

    The question is translated to English, answered by the TAPEX model over
    the table, and the answer is translated back to Portuguese.

    Args:
        user_question: Question text written by the user (Portuguese).
        table_data: ``pandas.DataFrame`` holding the table to query.

    Returns:
        The model's answer translated to Portuguese.
    """
    question_en = translate(user_question, pt_en_translator, tokenizer, source_lang="pt", target_lang="en")

    # The table passed by the caller contains Timestamp and float cells.
    # TAPEX linearizes the table by joining cell values as text, so cast every
    # cell to ``str`` first; this is a no-op for tables that are already text.
    # NOTE(review): confirm against the installed transformers version.
    table_as_text = table_data.astype(str)

    encoding = tapex_tokenizer(table=table_as_text, query=[question_en], padding=True, return_tensors="pt", truncation=True)
    outputs = tapex_model.generate(**encoding)
    response_en = tapex_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # NOTE(review): the same ``tokenizer`` object (loaded from the pt->en
    # checkpoint) is used for the en->pt model - verify this is intended.
    response_pt = translate(response_en, en_pt_translator, tokenizer, source_lang="en", target_lang="pt")
    return response_pt

def load_data(uploaded_file):
    """Read an uploaded CSV or XLSX file into a DataFrame.

    The file type is chosen from the extension of ``uploaded_file.name``,
    compared case-insensitively so that names such as ``DATA.CSV`` work.

    Args:
        uploaded_file: File-like object with a ``name`` attribute.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If the file name ends with neither ``.csv`` nor ``.xlsx``.
    """
    file_name = uploaded_file.name.lower()
    if file_name.endswith('.csv'):
        return pd.read_csv(uploaded_file, quotechar='"', encoding='utf-8')
    if file_name.endswith('.xlsx'):
        return pd.read_excel(uploaded_file)
    # Fail with a clear message instead of falling through to an unbound local.
    raise ValueError(
        f"Formato de arquivo não suportado: {uploaded_file.name!r} (esperado .csv ou .xlsx)"
    )

def preprocess_data(df):
    """Extract the data region of the raw sheet and relabel its columns.

    Steps performed, in order:
      1. Keep rows from position 2 onward and columns from position 9 up to
         (but excluding) the last column; missing values become 0.
      2. Use row 1 of the same column range as the header.
      3. Remove every " (<digits>)" fragment from the header labels.
      4. Rewrite "<month abbreviation>/<year>" labels as "<MM>/<year>".
      5. Parse the labels with ``pd.to_datetime``; labels that cannot be
         parsed become ``NaT``.
      6. Rename the first column to 'Rotulo'.

    Args:
        df: Raw DataFrame as loaded from the uploaded file.

    Returns:
        A copy of the relabelled DataFrame.
    """
    # Data body: skip the first two rows and the first nine columns, and drop
    # the last column.
    new_df = df.iloc[2:,9:-1].fillna(0)
    # Header labels come from row 1 of the same column range.
    new_df.columns = df.iloc[1,9:-1]
    # Strip " (<digits>)" fragments from the labels.
    new_df.columns = new_df.columns.str.replace(r" \(\d+\)", "", regex=True)
    # Month abbreviation -> two-digit month number (abbreviations such as
    # 'Fev', 'Abr', 'Mai' appear to be Portuguese).
    month_dict = {
        'Jan': '01', 'Fev': '02', 'Mar': '03', 'Abr': '04',
        'Mai': '05', 'Jun': '06', 'Jul': '07', 'Ago': '08',
        'Set': '09', 'Out': '10', 'Nov': '11', 'Dez': '12'
    }
    
    def convert_column_name(column_name):
        """Turn a '<month>/<year>' label into '<MM>/<year>'.

        'Rótulos de Linha' is returned unchanged. Any other label is split on
        '/'; NOTE(review): a label without '/' raises IndexError and a
        non-string label raises AttributeError - confirm inputs never hit this.
        """
        # The label column is passed through untouched
        if column_name == 'Rótulos de Linha':
            return column_name
    
        # Otherwise split into the month and year parts
        parts = column_name.split('/')
        month = parts[0].strip()
        year = parts[1].strip()
    
        # Keep only the digits of the year in case there are extra characters
        year = ''.join(filter(str.isdigit, year))
    
        # Look up the month number in the dictionary
        month_number = month_dict.get(month, '00')  # Default '00' if month is not found
    
        # Return the formatted date string
        return f"{month_number}/{year}"

    new_df.columns = [convert_column_name(col) for col in new_df.columns]
    # Labels that are not parseable as dates are coerced to NaT instead of
    # raising.
    new_df.columns = pd.to_datetime(new_df.columns, errors='coerce')
    # Rename whatever label is in first position to 'Rotulo'.
    # NOTE(review): presumably the first column is 'Rótulos de Linha', which
    # was coerced to NaT above - verify; the mapping is keyed by label, so any
    # other column sharing that label would be renamed as well.
    new_df.rename(columns={new_df.columns[0]: 'Rotulo'}, inplace=True)
    df_clean = new_df.copy()
    return df_clean

def apply_prophet(df_clean):
    """Fit one Prophet model per row and collect points outside the interval.

    Every row of ``df_clean`` is treated as one time series: the columns whose
    label is a ``pd.Timestamp`` give the dates and the row's cells give the
    values. Observations that are missing, non-numeric or not greater than
    zero are discarded. A point is reported as an anomaly when its real value
    lies outside Prophet's ``[yhat_lower, yhat_upper]`` interval
    (``interval_width=0.95``).

    Progress and intermediate tables are written to the Streamlit page.

    Args:
        df_clean: DataFrame with a 'Rotulo' column and Timestamp-labelled
            value columns.

    Returns:
        DataFrame with columns 'ds', 'real' and 'group' holding all detected
        anomalies, or an empty DataFrame when the input is empty or no series
        could be processed.
    """
    if df_clean.empty:
        st.error("DataFrame está vazio após o pré-processamento.")
        return pd.DataFrame()

    # Debugging: show the structure of df_clean
    st.write("Estrutura do DataFrame df_clean:")
    st.write(df_clean)

    # The date columns are the same for every row, so find them once.
    date_columns = [col for col in df_clean.columns if isinstance(col, pd.Timestamp)]

    # Per-group anomaly frames; concatenated once at the end instead of
    # growing a DataFrame inside the loop.
    anomaly_frames = []

    # Process each row (one series per row)
    for _, row in df_clean.iterrows():
        label = row['Rotulo']

        # A row mixes the text label with the values, so its cells are
        # object-typed; coerce to numbers so the comparison below cannot fail
        # on text cells (unparseable cells become NaN and are dropped later).
        data = pd.DataFrame({
            'ds': date_columns,
            'y': pd.to_numeric(row[date_columns].values, errors='coerce'),
        })

        # Debugging: show the data passed into Prophet
        st.write(f"Dados para Prophet - Grupo {label}:")
        st.write(data)

        # Keep strictly positive observations only, ordered by date. The
        # ordering matters: the real values are attached to the forecast by
        # position below, which assumes the forecast rows come back in
        # chronological order.
        data = data[data['y'] > 0].dropna().sort_values('ds').reset_index(drop=True)

        # Ensure there is enough data for Prophet to run
        if len(data) < 2:
            st.write(f"Pular grupo {label} por não ter observações suficientes.")
            continue

        try:
            # Create and fit the Prophet model
            model = Prophet(interval_width=0.95)
            model.fit(data)
        except ValueError as e:
            st.write(f"Pular grupo {label} devido ao erro: {e}")
            continue

        # Forecast 12 additional periods after the history
        future = model.make_future_dataframe(periods=12, freq='M')
        forecast = model.predict(future)

        # Attach the real values; future rows have no observation
        padding = [None] * (len(forecast) - len(data))
        forecast['real'] = list(data['y']) + padding
        outside_interval = (
            (forecast['real'] < forecast['yhat_lower'])
            | (forecast['real'] > forecast['yhat_upper'])
        )
        # Copy so the 'group' column below is added to an independent frame,
        # not to a view of ``forecast``.
        anomalies = forecast[outside_interval].copy()

        # Debugging: show the anomalies detected
        st.write(f"Anomalias detectadas para o grupo {label}:")
        st.write(anomalies)

        # Tag with the group label and keep only the columns of interest
        anomalies['group'] = label
        anomaly_frames.append(anomalies[['ds', 'real', 'group']])

    if not anomaly_frames:
        return pd.DataFrame()

    # Return the dataframe of all anomalies
    return pd.concat(anomaly_frames, ignore_index=True)

tab1, tab2 = st.tabs(["Meta Prophet", "Microsoft TAPEX"])

# File upload widget
uploaded_file = st.file_uploader("Carregue um arquivo CSV ou XLSX", type=['csv', 'xlsx'])


def _clear_history():
    """Button callback: empty the conversation history.

    Registered with ``on_click`` so the history is cleared before the script
    reruns and the cleared state is what gets rendered.
    """
    st.session_state['history'] = []


with tab1:
    if uploaded_file:
        # Identify the upload by name and size so the expensive Prophet run
        # happens once per file. The presence of 'all_anomalies' in the
        # session state cannot be used for this: that key is pre-seeded with
        # an empty DataFrame at start-up, so a "not in" check never fires.
        file_key = (uploaded_file.name, uploaded_file.size)
        if st.session_state.get('processed_file') != file_key:
            df = load_data(uploaded_file)
            df_clean = preprocess_data(df)

            if df_clean.empty:
                st.warning("Não há dados válidos para processar.")
            else:
                with st.spinner('Aplicando modelo de série temporal...'):
                    all_anomalies = apply_prophet(df_clean)
                    st.session_state['all_anomalies'] = all_anomalies
                # Mark as processed only after a successful run
                st.session_state['processed_file'] = file_key

with tab2:
    # Require a non-empty anomalies table before allowing questions
    anomalies_table = st.session_state.get('all_anomalies')
    if anomalies_table is not None and not anomalies_table.empty:
        # User question input
        user_question = st.text_input("Escreva sua questão aqui:", "")

        # The text box keeps its value across reruns, so answer a question
        # only when it differs from the last one answered; otherwise every
        # rerun (e.g. a button click) would append the same exchange again.
        if user_question and user_question != st.session_state.get('last_question'):
            bot_response = response(user_question, anomalies_table)
            st.session_state['history'].append(('👤', user_question))
            st.session_state['history'].append(('🤖', bot_response))
            st.session_state['last_question'] = user_question

        # Show the conversation history
        for sender, message in st.session_state['history']:
            if sender == '👤':
                st.markdown(f"**👤 {message}**")
            elif sender == '🤖':
                # Model output is rendered as plain markdown; raw HTML is not
                # enabled for generated text.
                st.markdown(f"**🤖 {message}**")

        # Button to clear the history ('last_question' is kept on purpose so
        # the question still in the text box is not re-answered right away)
        st.button("Limpar histórico", on_click=_clear_history)
    else:
        st.warning("Por favor, processe os dados no Meta Prophet primeiro.")