armanddemasson committed on
Commit
4af2472
·
2 Parent(s): c3024c3 eae39a6

Merge remote-tracking branch 'origin' into feature/talk_to_data

Browse files
Files changed (4) hide show
  1. app.py +1 -1
  2. climateqa/chat.py +2 -2
  3. climateqa/constants.py +36 -1
  4. climateqa/logging.py +172 -83
app.py CHANGED
@@ -81,7 +81,7 @@ llm = get_llm(provider="openai", max_tokens=1024, temperature=0.0)
81
  if os.environ["GRADIO_ENV"] == "local":
82
  reranker = get_reranker("nano")
83
  else:
84
- reranker = get_reranker("large")
85
 
86
  agent = make_graph_agent(
87
  llm=llm,
 
81
  if os.environ["GRADIO_ENV"] == "local":
82
  reranker = get_reranker("nano")
83
  else:
84
+ reranker = get_reranker("nano")
85
 
86
  agent = make_graph_agent(
87
  llm=llm,
climateqa/chat.py CHANGED
@@ -14,7 +14,7 @@ from .handle_stream_events import (
14
  handle_retrieved_owid_graphs,
15
  )
16
  from .logging import (
17
- log_interaction_to_huggingface
18
  )
19
 
20
  # Chat functions
@@ -189,6 +189,6 @@ async def chat_stream(
189
  raise gr.Error(str(e))
190
 
191
  # Call the function to log interaction
192
- log_interaction_to_huggingface(history, output_query, sources, docs, share_client, user_id)
193
 
194
  yield history, docs_html, output_query, output_language, related_contents, graphs_html, follow_up_examples#, vanna_data
 
14
  handle_retrieved_owid_graphs,
15
  )
16
  from .logging import (
17
+ log_interaction
18
  )
19
 
20
  # Chat functions
 
189
  raise gr.Error(str(e))
190
 
191
  # Call the function to log interaction
192
+ log_interaction(history, output_query, sources, docs, share_client, user_id)
193
 
194
  yield history, docs_html, output_query, output_language, related_contents, graphs_html, follow_up_examples#, vanna_data
climateqa/constants.py CHANGED
@@ -65,4 +65,39 @@ OWID_CATEGORIES = ['Access to Energy', 'Agricultural Production',
65
  'Oil Spills', 'Outdoor Air Pollution', 'Ozone Layer', 'Pandemics',
66
  'Pesticides', 'Plastic Pollution', 'Renewable Energy', 'Soil',
67
  'Transport', 'Urbanization', 'Waste Management', 'Water Pollution',
68
- 'Water Use & Stress', 'Wildfires']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  'Oil Spills', 'Outdoor Air Pollution', 'Ozone Layer', 'Pandemics',
66
  'Pesticides', 'Plastic Pollution', 'Renewable Energy', 'Soil',
67
  'Transport', 'Urbanization', 'Waste Management', 'Water Pollution',
68
+ 'Water Use & Stress', 'Wildfires']
69
+
70
+
71
+ DOCUMENT_METADATA_DEFAULT_VALUES = {
72
+ "chunk_type": "",
73
+ "document_id": "",
74
+ "document_number": 0.0,
75
+ "element_id": "",
76
+ "figure_code": "",
77
+ "file_size": "",
78
+ "image_path": "",
79
+ "n_pages": 0.0,
80
+ "name": "",
81
+ "num_characters": 0.0,
82
+ "num_tokens": 0.0,
83
+ "num_tokens_approx": 0.0,
84
+ "num_words": 0.0,
85
+ "page_number": 0,
86
+ "release_date": 0.0,
87
+ "report_type": "",
88
+ "section_header": "",
89
+ "short_name": "",
90
+ "source": "",
91
+ "toc_level0": "",
92
+ "toc_level1": "",
93
+ "toc_level2": "",
94
+ "toc_level3": "",
95
+ "url": "",
96
+ "similarity_score": 0.0,
97
+ "content": "",
98
+ "reranking_score": 0.0,
99
+ "query_used_for_retrieval": "",
100
+ "sources_used": [""],
101
+ "question_used": "",
102
+ "index_used": ""
103
+ }
climateqa/logging.py CHANGED
@@ -4,111 +4,157 @@ import json
4
  from huggingface_hub import HfApi
5
  import gradio as gr
6
  import csv
 
 
 
 
 
7
 
8
- def serialize_docs(docs:list)->list:
 
 
 
 
 
 
 
 
 
 
 
 
9
  new_docs = []
10
  for doc in docs:
11
- new_doc = {}
12
- new_doc["page_content"] = doc.page_content
13
- new_doc["metadata"] = doc.metadata
 
 
 
 
 
 
 
 
 
 
 
14
  new_docs.append(new_doc)
 
 
 
15
  return new_docs
16
 
17
  ## AZURE LOGGING - DEPRECATED
18
 
19
- # def log_on_azure(file, logs, share_client):
20
- # """Log data to Azure Blob Storage.
21
 
22
- # Args:
23
- # file (str): Name of the file to store logs
24
- # logs (dict): Log data to store
25
- # share_client: Azure share client instance
26
- # """
27
- # logs = json.dumps(logs)
28
- # file_client = share_client.get_file_client(file)
29
- # file_client.upload_file(logs)
30
 
31
 
32
- # def log_interaction_to_azure(history, output_query, sources, docs, share_client, user_id):
33
- # """Log chat interaction to Azure and Hugging Face.
34
 
35
- # Args:
36
- # history (list): Chat message history
37
- # output_query (str): Processed query
38
- # sources (list): Knowledge base sources used
39
- # docs (list): Retrieved documents
40
- # share_client: Azure share client instance
41
- # user_id (str): User identifier
42
- # """
43
- # try:
44
- # # Log interaction to Azure if not in local environment
45
- # if os.getenv("GRADIO_ENV") != "local":
46
- # timestamp = str(datetime.now().timestamp())
47
- # prompt = history[1]["content"]
48
- # logs = {
49
- # "user_id": str(user_id),
50
- # "prompt": prompt,
51
- # "query": prompt,
52
- # "question": output_query,
53
- # "sources": sources,
54
- # "docs": serialize_docs(docs),
55
- # "answer": history[-1].content,
56
- # "time": timestamp,
57
- # }
58
- # # Log to Azure
59
- # log_on_azure(f"{timestamp}.json", logs, share_client)
60
- # except Exception as e:
61
- # print(f"Error logging on Azure Blob Storage: {e}")
62
- # error_msg = f"ClimateQ&A Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)"
63
- # raise gr.Error(error_msg)
64
 
65
- # def log_drias_interaction_to_azure(query, sql_query, data, share_client, user_id):
66
- # """Log Drias data interaction to Azure and Hugging Face.
67
 
68
- # Args:
69
- # query (str): User query
70
- # sql_query (str): SQL query used
71
- # data: Retrieved data
72
- # share_client: Azure share client instance
73
- # user_id (str): User identifier
74
- # """
75
- # try:
76
- # # Log interaction to Azure if not in local environment
77
- # if os.getenv("GRADIO_ENV") != "local":
78
- # timestamp = str(datetime.now().timestamp())
79
- # logs = {
80
- # "user_id": str(user_id),
81
- # "query": query,
82
- # "sql_query": sql_query,
83
- # "time": timestamp,
84
- # }
85
- # log_on_azure(f"drias_{timestamp}.json", logs, share_client)
86
- # print(f"Logged Drias interaction to Azure Blob Storage: {logs}")
87
- # else:
88
- # print("share_client or user_id is None, or GRADIO_ENV is local")
89
- # except Exception as e:
90
- # print(f"Error logging Drias interaction on Azure Blob Storage: {e}")
91
- # error_msg = f"Drias Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)"
92
- # raise gr.Error(error_msg)
93
 
94
  ## HUGGING FACE LOGGING
95
 
96
- def log_on_huggingface(log_filename, logs):
97
  """Log data to Hugging Face dataset repository.
98
 
99
  Args:
100
  log_filename (str): Name of the file to store logs
101
  logs (dict): Log data to store
 
102
  """
103
  try:
104
- # Get Hugging Face token from environment
105
- hf_token = os.getenv("HF_LOGS_TOKEN")
106
- if not hf_token:
107
- print("HF_LOGS_TOKEN not found in environment variables")
108
- return
 
109
 
110
- # Get repository name from environment or use default
111
- repo_id = os.getenv("HF_DATASET_REPO", "timeki/climateqa_logs")
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  # Initialize HfApi
114
  api = HfApi(token=hf_token)
@@ -158,10 +204,13 @@ def log_interaction_to_huggingface(history, output_query, sources, docs, share_c
158
  "time": timestamp,
159
  }
160
  # Log to Hugging Face
161
- log_on_huggingface(f"chat/{timestamp}.json", logs)
 
 
 
162
  except Exception as e:
163
  print(f"Error logging to Hugging Face: {e}")
164
- error_msg = f"ClimateQ&A Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)"
165
  raise gr.Error(error_msg)
166
 
167
  def log_drias_interaction_to_huggingface(query, sql_query, user_id):
@@ -182,7 +231,7 @@ def log_drias_interaction_to_huggingface(query, sql_query, user_id):
182
  "sql_query": sql_query,
183
  "time": timestamp,
184
  }
185
- log_on_huggingface(f"drias/drias_{timestamp}.json", logs)
186
  print(f"Logged Drias interaction to Hugging Face: {logs}")
187
  else:
188
  print("share_client or user_id is None, or GRADIO_ENV is local")
@@ -191,4 +240,44 @@ def log_drias_interaction_to_huggingface(query, sql_query, user_id):
191
  error_msg = f"Drias Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)"
192
  raise gr.Error(error_msg)
193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
 
4
  from huggingface_hub import HfApi
5
  import gradio as gr
6
  import csv
7
+ import pandas as pd
8
+ import io
9
+ from typing import TypedDict, List
10
+ from climateqa.constants import DOCUMENT_METADATA_DEFAULT_VALUES
11
+ from langchain_core.documents import Document
12
 
13
+ def serialize_docs(docs:list[Document])->list:
14
+ """Convert document objects to a simplified format compatible with Hugging Face datasets.
15
+
16
+ This function processes document objects by extracting their page content and metadata,
17
+ normalizing the metadata structure to ensure consistency. It applies default values
18
+ from DOCUMENT_METADATA_DEFAULT_VALUES for any missing metadata fields.
19
+
20
+ Args:
21
+ docs (list): List of document objects, each with page_content and metadata attributes
22
+
23
+ Returns:
24
+ list: List of dictionaries with standardized "page_content" and "metadata" fields
25
+ """
26
  new_docs = []
27
  for doc in docs:
28
+ # Make sure we have a clean doc format
29
+ new_doc = {
30
+ "page_content": doc.page_content,
31
+ "metadata": {}
32
+ }
33
+
34
+ # Ensure all metadata fields exist with defaults if missing
35
+ for field, default_value in DOCUMENT_METADATA_DEFAULT_VALUES.items():
36
+ new_value = doc.metadata.get(field, default_value)
37
+ try:
38
+ new_doc["metadata"][field] = type(default_value)(new_value)
39
+ except:
40
+ new_doc["metadata"][field] = default_value
41
+
42
  new_docs.append(new_doc)
43
+
44
+ if new_docs == []:
45
+ new_docs = [{"page_content": "No documents found", "metadata": DOCUMENT_METADATA_DEFAULT_VALUES}]
46
  return new_docs
47
 
48
  ## AZURE LOGGING - DEPRECATED
49
 
50
+ def log_on_azure(file, logs, share_client):
51
+ """Log data to Azure Blob Storage.
52
 
53
+ Args:
54
+ file (str): Name of the file to store logs
55
+ logs (dict): Log data to store
56
+ share_client: Azure share client instance
57
+ """
58
+ logs = json.dumps(logs)
59
+ file_client = share_client.get_file_client(file)
60
+ file_client.upload_file(logs)
61
 
62
 
63
+ def log_interaction_to_azure(history, output_query, sources, docs, share_client, user_id):
64
+ """Log chat interaction to Azure and Hugging Face.
65
 
66
+ Args:
67
+ history (list): Chat message history
68
+ output_query (str): Processed query
69
+ sources (list): Knowledge base sources used
70
+ docs (list): Retrieved documents
71
+ share_client: Azure share client instance
72
+ user_id (str): User identifier
73
+ """
74
+ try:
75
+ # Log interaction to Azure if not in local environment
76
+ if os.getenv("GRADIO_ENV") != "local":
77
+ timestamp = str(datetime.now().timestamp())
78
+ prompt = history[1]["content"]
79
+ logs = {
80
+ "user_id": str(user_id),
81
+ "prompt": prompt,
82
+ "query": prompt,
83
+ "question": output_query,
84
+ "sources": sources,
85
+ "docs": serialize_docs(docs),
86
+ "answer": history[-1].content,
87
+ "time": timestamp,
88
+ }
89
+ # Log to Azure
90
+ log_on_azure(f"{timestamp}.json", logs, share_client)
91
+ except Exception as e:
92
+ print(f"Error logging on Azure Blob Storage: {e}")
93
+ error_msg = f"ClimateQ&A Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)"
94
+ raise gr.Error(error_msg)
95
 
96
+ def log_drias_interaction_to_azure(query, sql_query, data, share_client, user_id):
97
+ """Log Drias data interaction to Azure and Hugging Face.
98
 
99
+ Args:
100
+ query (str): User query
101
+ sql_query (str): SQL query used
102
+ data: Retrieved data
103
+ share_client: Azure share client instance
104
+ user_id (str): User identifier
105
+ """
106
+ try:
107
+ # Log interaction to Azure if not in local environment
108
+ if os.getenv("GRADIO_ENV") != "local":
109
+ timestamp = str(datetime.now().timestamp())
110
+ logs = {
111
+ "user_id": str(user_id),
112
+ "query": query,
113
+ "sql_query": sql_query,
114
+ "time": timestamp,
115
+ }
116
+ log_on_azure(f"drias_{timestamp}.json", logs, share_client)
117
+ print(f"Logged Drias interaction to Azure Blob Storage: {logs}")
118
+ else:
119
+ print("share_client or user_id is None, or GRADIO_ENV is local")
120
+ except Exception as e:
121
+ print(f"Error logging Drias interaction on Azure Blob Storage: {e}")
122
+ error_msg = f"Drias Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)"
123
+ raise gr.Error(error_msg)
124
 
125
  ## HUGGING FACE LOGGING
126
 
127
+ def log_on_huggingface(log_filename, logs, log_type="chat"):
128
  """Log data to Hugging Face dataset repository.
129
 
130
  Args:
131
  log_filename (str): Name of the file to store logs
132
  logs (dict): Log data to store
133
+ log_type (str): Type of log to store
134
  """
135
  try:
136
+ if log_type =="chat":
137
+ # Get Hugging Face token from environment
138
+ hf_token = os.getenv("HF_LOGS_TOKEN")
139
+ if not hf_token:
140
+ print("HF_LOGS_TOKEN not found in environment variables")
141
+ return
142
 
143
+ # Get repository name from environment or use default
144
+ repo_id = os.getenv("HF_DATASET_REPO", "Ekimetrics/climateqa_logs")
145
+
146
+ elif log_type =="drias":
147
+ # Get Hugging Face token from environment
148
+ hf_token = os.getenv("HF_LOGS_DRIAS_TOKEN")
149
+ if not hf_token:
150
+ print("HF_LOGS_DRIAS_TOKEN not found in environment variables")
151
+ return
152
+
153
+ # Get repository name from environment or use default
154
+ repo_id = os.getenv("HF_DATASET_REPO_DRIAS", "Ekimetrics/climateqa_logs_talk_to_data")
155
+
156
+ else:
157
+ raise ValueError(f"Invalid log type: {log_type}")
158
 
159
  # Initialize HfApi
160
  api = HfApi(token=hf_token)
 
204
  "time": timestamp,
205
  }
206
  # Log to Hugging Face
207
+ log_on_huggingface(f"chat/{timestamp}.json", logs, log_type="chat")
208
+ print(f"Logged interaction to Hugging Face")
209
+ else:
210
+ print("Did not log to Hugging Face because GRADIO_ENV is local")
211
  except Exception as e:
212
  print(f"Error logging to Hugging Face: {e}")
213
+ error_msg = f"ClimateQ&A Error: {str(e)[:100]})"
214
  raise gr.Error(error_msg)
215
 
216
  def log_drias_interaction_to_huggingface(query, sql_query, user_id):
 
231
  "sql_query": sql_query,
232
  "time": timestamp,
233
  }
234
+ log_on_huggingface(f"drias/drias_{timestamp}.json", logs, log_type="drias")
235
  print(f"Logged Drias interaction to Hugging Face: {logs}")
236
  else:
237
  print("share_client or user_id is None, or GRADIO_ENV is local")
 
240
  error_msg = f"Drias Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)"
241
  raise gr.Error(error_msg)
242
 
243
+ def log_interaction(history, output_query, sources, docs, share_client, user_id):
244
+ """Log chat interaction to Hugging Face, and fall back to Azure if that fails.
245
+
246
+ Args:
247
+ history (list): Chat message history
248
+ output_query (str): Processed query
249
+ sources (list): Knowledge base sources used
250
+ docs (list): Retrieved documents
251
+ share_client: Azure share client instance
252
+ user_id (str): User identifier
253
+ """
254
+ try:
255
+ # First try to log to Hugging Face
256
+ log_interaction_to_huggingface(history, output_query, sources, docs, share_client, user_id)
257
+ except Exception as e:
258
+ print(f"Failed to log to Hugging Face, falling back to Azure: {e}")
259
+ try:
260
+ # Fall back to Azure logging
261
+ if os.getenv("GRADIO_ENV") != "local":
262
+ timestamp = str(datetime.now().timestamp())
263
+ prompt = history[1]["content"]
264
+ logs = {
265
+ "user_id": str(user_id),
266
+ "prompt": prompt,
267
+ "query": prompt,
268
+ "question": output_query,
269
+ "sources": sources,
270
+ "docs": serialize_docs(docs),
271
+ "answer": history[-1].content,
272
+ "time": timestamp,
273
+ }
274
+ # Log to Azure
275
+ log_on_azure(f"{timestamp}.json", logs, share_client)
276
+ print("Successfully logged to Azure as fallback")
277
+ except Exception as azure_error:
278
+ print(f"Error in Azure fallback logging: {azure_error}")
279
+ error_msg = f"ClimateQ&A Logging Error: {str(azure_error)[:100]})"
280
+ # Don't raise error to avoid disrupting user experience
281
+ print(error_msg)
282
+
283