Spaces:

Ekimetrics
/

climate-question-answering

Running

App Files Files Community

armanddemasson committed on May 21

Commit

c0fd277

1 Parent(s): c25f6b1

feat: updated common talk to data for talk to ipcc and drias

Browse files

Files changed (6) hide show

climateqa/engine/talk_to_data/input_processing.py +74 -34
climateqa/engine/talk_to_data/objects/location.py +5 -0
climateqa/engine/talk_to_data/objects/plot.py +3 -2
climateqa/engine/talk_to_data/prompt.py +44 -0
climateqa/engine/talk_to_data/query.py +7 -2
style.css +43 -11

climateqa/engine/talk_to_data/input_processing.py CHANGED Viewed

@@ -1,16 +1,18 @@
-from typing import Any
 import ast
 from langchain_core.prompts import ChatPromptTemplate
 from geopy.geocoders import Nominatim
 from climateqa.engine.llm import get_llm
 import duckdb
 from climateqa.engine.talk_to_data.objects.llm_outputs import ArrayOutput
 from climateqa.engine.talk_to_data.objects.location import Location
 from climateqa.engine.talk_to_data.objects.plot import Plot
 from climateqa.engine.talk_to_data.objects.states import State
-async def detect_location_with_openai(sentence):
     """
     Detects locations in a sentence using OpenAI's API via LangChain.
     """
@@ -49,21 +51,51 @@ def loc_to_coords(location: str) -> tuple[float, float]:
     coords = geolocator.geocode(location)
     return (coords.latitude, coords.longitude)
-def nearest_neighbour_sql(location: tuple, table: str) -> tuple[str, str]:
     long = round(location[1], 3)
     lat = round(location[0], 3)
-    table = f"'hf://datasets/timeki/drias_db/{table.lower()}.parquet'"
-    results = duckdb.sql(
-        f"SELECT latitude, longitude FROM {table} WHERE latitude BETWEEN {lat - 0.3} AND {lat + 0.3} AND longitude BETWEEN {long - 0.3} AND {long + 0.3}"
-    ).fetchdf()
     if len(results) == 0:
-        return "", ""
-    # cursor.execute(f"SELECT latitude, longitude FROM {table} WHERE latitude BETWEEN {lat - 0.3} AND {lat + 0.3} AND longitude BETWEEN {long - 0.3} AND {long + 0.3}")
-    return results['latitude'].iloc[0], results['longitude'].iloc[0]
 async def detect_year_with_openai(sentence: str) -> str:
     """
@@ -136,43 +168,49 @@ async def detect_relevant_plots(user_question: str, llm, plot_list: list[Plot])
         plots_description += " - Description: " + plot["description"] + "\n"
     prompt = (
-        f"You are helping to answer a quesiton with insightful visualizations."
-        f"You are given an user question and a list of plots with their name and description."
-        f"Based on the descriptions of the plots, which plot is appropriate to answer to this question."
-        f"Write the most relevant tables to use. Answer only a python list of plot name."
         f"### Descriptions of the plots : {plots_description}"
-        f"### User question : {user_question}"
-        f"### Name of the plot : "
     )
-    # prompt = (
-    #     f"You are helping to answer a question with insightful visualizations. "
-    #     f"Given a list of plots with their name and description: "
-    #     f"{plots_description} "
-    #     f"The user question is: {user_question}. "
-    #     f"Choose the most relevant plots to answer the question. "
-    #     f"The answer must be a Python list with the names of the relevant plots, and nothing else. "
-    #     f"Ensure the response is in the exact format: ['PlotName1', 'PlotName2']."
-    # )
     plot_names = ast.literal_eval(
         (await llm.ainvoke(prompt)).content.strip("```python\n").strip()
     )
     return plot_names
-async def find_location(user_input: str, table: str) -> Location:
-    print(f"---- Find location in table {table} ----")
     location = await detect_location_with_openai(user_input)
-    output: Location = {'location' : location}
     if location:
         coords = loc_to_coords(location)
-        neighbour = nearest_neighbour_sql(coords, table)
         output.update({
             "latitude": neighbour[0],
             "longitude": neighbour[1],
         })
     return output
-async def find_year(user_input: str) -> str:
     """Extracts year information from user input using LLM.
     This function uses an LLM to identify and extract year information from the
@@ -186,6 +224,8 @@ async def find_year(user_input: str) -> str:
     """
     print(f"---- Find year ---")
     year = await detect_year_with_openai(user_input)
     return year
 async def find_relevant_plots(state: State, llm, plots: list[Plot]) -> list[str]:
@@ -198,7 +238,7 @@ async def find_relevant_tables_per_plot(state: State, plot: Plot, llm, tables: l
     relevant_tables = await detect_relevant_tables(state['user_input'], plot, llm, tables)
     return relevant_tables
-async def find_param(state: State, param_name:str, table: str) -> dict[str, Any] | None:
     """Perform the good method to retrieve the desired parameter
     Args:
@@ -210,7 +250,7 @@ async def find_param(state: State, param_name:str, table: str) -> dict[str, Any]
         dict[str, Any] | None:
     """
     if param_name == 'location':
-        location = await find_location(state['user_input'], table)
         return location
     if param_name == 'year':
         year = await find_year(state['user_input'])

+from typing import Any, Literal, Optional, cast
 import ast
 from langchain_core.prompts import ChatPromptTemplate
 from geopy.geocoders import Nominatim
 from climateqa.engine.llm import get_llm
 import duckdb
+import os
+from climateqa.engine.talk_to_data.ipcc.config import IPCC_DATASET_URL
 from climateqa.engine.talk_to_data.objects.llm_outputs import ArrayOutput
 from climateqa.engine.talk_to_data.objects.location import Location
 from climateqa.engine.talk_to_data.objects.plot import Plot
 from climateqa.engine.talk_to_data.objects.states import State
+import time
+async def detect_location_with_openai(sentence: str) -> str:
     """
     Detects locations in a sentence using OpenAI's API via LangChain.
     """
     coords = geolocator.geocode(location)
     return (coords.latitude, coords.longitude)
+def coords_to_country(coords: tuple[float, float]) -> tuple[str,str]:
+    """Converts geographic coordinates to a country name.
+    This function uses the Nominatim reverse geocoding service to convert
+    latitude and longitude coordinates to a country name.
+    Args:
+        coords (tuple[float, float]): A tuple containing (latitude, longitude)
+    Returns:
+        tuple[str,str]: A tuple containing (country_code, country_name)
+    Raises:
+        AttributeError: If the coordinates cannot be found
+    """
+    geolocator = Nominatim(user_agent="latlong_to_country")
+    location = geolocator.reverse(coords)
+    address = location.raw['address']
+    return address['country_code'].upper(), address['country']
+def nearest_neighbour_sql(location: tuple, mode: Literal['DRIAS', 'IPCC']) -> tuple[str, str, Optional[str]]:
     long = round(location[1], 3)
     lat = round(location[0], 3)
+    conn = duckdb.connect()
+    if mode == 'DRIAS':
+        table_path = f"'hf://datasets/timeki/drias_db/mean_annual_temperature.parquet'"
+        results = conn.sql(
+            f"SELECT latitude, longitude FROM {table_path} WHERE latitude BETWEEN {lat - 0.3} AND {lat + 0.3} AND longitude BETWEEN {long - 0.3} AND {long + 0.3}"
+        ).fetchdf()
+    else:
+        table_path = f"'{IPCC_DATASET_URL}/coordinates.parquet'"
+        results = conn.sql(
+            f"SELECT latitude, longitude, admin1 FROM {table_path} WHERE latitude BETWEEN {lat - 0.5} AND {lat + 0.5} AND longitude BETWEEN {long - 0.5} AND {long + 0.5}"
+        ).fetchdf()
     if len(results) == 0:
+        return "", "", ""
+    if 'admin1' in results.columns:
+        admin1 = results['admin1'].iloc[0]
+    else:
+        admin1 = None
+    return results['latitude'].iloc[0], results['longitude'].iloc[0], admin1
 async def detect_year_with_openai(sentence: str) -> str:
     """
         plots_description += " - Description: " + plot["description"] + "\n"
     prompt = (
+        "You are helping to answer a question with insightful visualizations.\n"
+        "You are given a user question and a list of plots with their name and description.\n"
+        "Based on the descriptions of the plots, select ALL plots that could provide a useful answer to this question. "
+        "Include any plot that could show relevant information, even if their perspectives (such as time series or spatial distribution) are different.\n"
+        "For example, for a question like 'What will be the total rainfall in China in 2050?', both a time series plot and a spatial map plot could be relevant.\n"
+        "Return only a Python list of plot names sorted from the most relevant one to the less relevant one.\n"
         f"### Descriptions of the plots : {plots_description}"
+        f"### User question : {user_question}\n"
+        f"### Names of the plots : "
     )
     plot_names = ast.literal_eval(
         (await llm.ainvoke(prompt)).content.strip("```python\n").strip()
     )
     return plot_names
+async def find_location(user_input: str, mode: Literal['DRIAS', 'IPCC'] = 'DRIAS') -> Location:
+    print(f"---- Find location in user input ----")
     location = await detect_location_with_openai(user_input)
+    output: Location = {
+        'location' : location,
+        'longitude' : None,
+        'latitude' : None,
+        'country_code' : None,
+        'country_name' : None,
+        'admin1' : None
+        }
     if location:
         coords = loc_to_coords(location)
+        country_code, country_name = coords_to_country(coords)
+        neighbour = nearest_neighbour_sql(coords, mode)
         output.update({
             "latitude": neighbour[0],
             "longitude": neighbour[1],
+            "country_code": country_code,
+            "country_name": country_name,
+            "admin1": neighbour[2]
         })
+    output = cast(Location, output)
     return output
+async def find_year(user_input: str) -> str| None:
     """Extracts year information from user input using LLM.
     This function uses an LLM to identify and extract year information from the
     """
     print(f"---- Find year ---")
     year = await detect_year_with_openai(user_input)
+    if year == "":
+        return None
     return year
 async def find_relevant_plots(state: State, llm, plots: list[Plot]) -> list[str]:
     relevant_tables = await detect_relevant_tables(state['user_input'], plot, llm, tables)
     return relevant_tables
+async def find_param(state: State, param_name:str, mode: Literal['DRIAS', 'IPCC'] = 'DRIAS') -> dict[str, Optional[str]] | Location | None:
     """Perform the good method to retrieve the desired parameter
     Args:
         dict[str, Any] | None:
     """
     if param_name == 'location':
+        location = await find_location(state['user_input'], mode)
         return location
     if param_name == 'year':
         year = await find_year(state['user_input'])

climateqa/engine/talk_to_data/objects/location.py CHANGED Viewed

@@ -1,7 +1,12 @@
 from typing import Optional, TypedDict
 class Location(TypedDict):
     location: str
     latitude: Optional[str]
     longitude: Optional[str]

+from token import OP
 from typing import Optional, TypedDict
 class Location(TypedDict):
     location: str
     latitude: Optional[str]
     longitude: Optional[str]
+    country_code: Optional[str]
+    country_name: Optional[str]
+    admin1: Optional[str]

climateqa/engine/talk_to_data/objects/plot.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Callable, TypedDict
 from plotly.graph_objects import Figure
 class Plot(TypedDict):
@@ -18,4 +18,5 @@ class Plot(TypedDict):
     description: str
     params: list[str]
     plot_function: Callable[..., Callable[..., Figure]]
-    sql_query: Callable[..., str]

+from typing import Callable, TypedDict, Optional
 from plotly.graph_objects import Figure
 class Plot(TypedDict):
     description: str
     params: list[str]
     plot_function: Callable[..., Callable[..., Figure]]
+    sql_query: Callable[..., str]
+    short_name: str

climateqa/engine/talk_to_data/prompt.py ADDED Viewed

	@@ -0,0 +1,44 @@

+query_prompt_template = """You are an expert SQL query generator. Given an input question, database schema, SQL dialect and relevant tables to answer the question, generate an optimized and syntactically correct SQL query which can provide useful insights to the question.
+### Instructions:
+1. **Use only relevant tables**: The following tables are relevant to answering the question: {relevant_tables}. Do not use any other tables.
+2. **Relevant columns only**: Never select `*`. Only include necessary columns based on the input question.
+3. **Schema Awareness**:
+   - Use only columns present in the given schema.
+   - **If a column name appears in multiple tables, always use the format `table_name.column_name` to avoid ambiguity.**
+   - Select only the column which are insightful for the question.
+4. **Dialect Compliance**: Follow `{dialect}` syntax rules.
+5. **Ordering**: Order the results by a relevant column if applicable (e.g., timestamp for recent records).
+6. **Valid query**: Make sure the query is syntactically and functionally correct.
+7. **Conditions** : For the common columns, the same condition should be applied to all the tables (e.g. latitude, longitude, model, year...)
+9. **Join tables** : If you need to join table, you should join them with year feature.
+10. **Model** : For each table, you need to add a condition on the model to be equal to {model}
+### Provided Database Schema:
+{table_info}
+### Relevant Tables:
+{relevant_tables}
+**Question:** {input}
+**SQL Query:**"""
+plot_prompt_template = """You are a data visualization expert. Given an input question and an SQL Query, generate an insightful plot according to the question.
+### Instructions
+1. **Use only the column names provided**. The data will be provided as a Pandas DataFrame `df` with the columns present in the SELECT.
+2. Generate the Python Plotly code to chart the results using `df` and the column names.
+3. Make as complete a graph as possible to answer the question, and make it as easy to understand as possible.
+4. **Response with only Python code**. Do not answer with any explanations -- just the code.
+5. **Specific cases** :
+- For a question about the evolution of something, it is also relevant to plot the data with also the sliding average for a period of 20 years for example.
+### SQL Query:
+{sql_query}
+**Question:** {input}
+**Python code:**
+"""

climateqa/engine/talk_to_data/query.py CHANGED Viewed

@@ -2,7 +2,7 @@ import asyncio
 from concurrent.futures import ThreadPoolExecutor
 import duckdb
 import pandas as pd
 def find_indicator_column(table: str, indicator_columns_per_table: dict[str,str]) -> str:
     """Retrieves the name of the indicator column within a table.
@@ -41,7 +41,12 @@ async def execute_sql_query(sql_query: str) -> pd.DataFrame:
     def _execute_query():
         # Execute the query
         con = duckdb.connect()
-        results = con.sql(sql_query).fetchdf()
         # return fetched data
         return results

 from concurrent.futures import ThreadPoolExecutor
 import duckdb
 import pandas as pd
+import os
 def find_indicator_column(table: str, indicator_columns_per_table: dict[str,str]) -> str:
     """Retrieves the name of the indicator column within a table.
     def _execute_query():
         # Execute the query
         con = duckdb.connect()
+        HF_TOKEN = os.getenv("HF_TOKEN")
+        con.execute(f"""CREATE SECRET hf_token (
+            TYPE huggingface,
+            TOKEN '{HF_TOKEN}'
+        );""")
+        results = con.execute(sql_query).fetchdf()
         # return fetched data
         return results

style.css CHANGED Viewed

@@ -656,12 +656,11 @@ a {
     /* overflow-y: scroll; */
 }
 #sql-query{
-    max-height: 300px;
-    overflow-y:scroll;
 }
 #sql-query textarea{
-    min-height: 100px !important;
 }
 #sql-query span{
@@ -671,8 +670,11 @@ div#tab-vanna{
     max-height: 100¨vh;
     overflow-y: hidden;
 }
 #vanna-plot{
-    max-height:500px
 }
 #pagination-display{
@@ -681,20 +683,40 @@ div#tab-vanna{
     font-size: 16px;
 }
-#table-names table{
-    overflow: hidden;
 }
-#table-names thead{
-    display: none;
 }
-#table-names tr{
-    cursor:pointer
 }
-#table-names tr:hover{
     background-color: #f0f8ff;
 }
 /* DRIAS Data Table Styles */
 #vanna-table {
     height: 400px !important;
@@ -717,3 +739,13 @@ div#tab-vanna{
     background: white;
     z-index: 1;
 }

     /* overflow-y: scroll; */
 }
 #sql-query{
+    max-height: 100%;
 }
 #sql-query textarea{
+    min-height: 200px !important;
 }
 #sql-query span{
     max-height: 100¨vh;
     overflow-y: hidden;
 }
+#details button span{
+    font-weight: bold;
+}
 #vanna-plot{
+    max-height:1000px
 }
 #pagination-display{
     font-size: 16px;
 }
+#table-names label:nth-child(odd) {
+    background-color: #f9f9f9;
 }
+#table-names label:nth-child(even) {
+    background-color: #e6f0ff;
 }
+#table-names label {
+    display: block;              /* Each option takes up the whole line */
+    width: 100%;                 /* Each option fills the horizontal space */
+    box-sizing: border-box;
+    padding: 8px 12px;
+    margin-bottom: 4px;
+    border: 1px solid #ccc;
+    border-radius: 6px;
+    background-color: white;
+    cursor: pointer;
+    text-align: center;
 }
+#table-names label:hover {
     background-color: #f0f8ff;
 }
+#table-names input[type="radio"] {
+    display: none;
+}
+#table-names input[type="radio"]:checked + label {
+    background-color: #d0eaff;
+    border-color: #2196f3;
+}
 /* DRIAS Data Table Styles */
 #vanna-table {
     height: 400px !important;
     background: white;
     z-index: 1;
 }
+.example-img{
+    height: 250px;
+    object-fit: contain;
+}
+#example-img-container {
+    flex-direction: column;
+    align-items: left;
+}