Spaces:

Ekimetrics
/

climate-question-answering

Running

App Files Files Community

timeki committed on Apr 7

Commit

161aa8c

1 Parent(s): 0bdf2f6

add documentation

Browse files

Files changed (5) hide show

climateqa/engine/talk_to_data/main.py +49 -3
climateqa/engine/talk_to_data/plot.py +71 -28
climateqa/engine/talk_to_data/sql_query.py +33 -7
climateqa/engine/talk_to_data/utils.py +75 -15
climateqa/engine/talk_to_data/workflow.py +45 -7

climateqa/engine/talk_to_data/main.py CHANGED Viewed

@@ -4,16 +4,62 @@ import ast
 llm = get_llm(provider="openai")
-def ask_llm_to_add_table_names(sql_query, llm):
     sql_with_table_names = llm.invoke(f"Make the following sql query display the source table in the rows {sql_query}. Just answer the query. The answer should not include ```sql\n").content
     return sql_with_table_names
-def ask_llm_column_names(sql_query, llm):
     columns = llm.invoke(f"From the given sql query, list the columns that are being selected. The answer should only be a python list. Just answer the list. The SQL query : {sql_query}").content
     columns_list = ast.literal_eval(columns.strip("```python\n").strip())
     return columns_list
-def ask_drias(query:str, index_state: int = 0):
     final_state = drias_workflow(query)
     sql_queries = []
     result_dataframes = []

 llm = get_llm(provider="openai")
+def ask_llm_to_add_table_names(sql_query: str, llm) -> str:
+    """Adds table names to the SQL query result rows using LLM.
+    This function modifies the SQL query to include the source table name in each row
+    of the result set, making it easier to track which data comes from which table.
+    Args:
+        sql_query (str): The original SQL query to modify
+        llm: The language model instance to use for generating the modified query
+    Returns:
+        str: The modified SQL query with table names included in the result rows
+    """
     sql_with_table_names = llm.invoke(f"Make the following sql query display the source table in the rows {sql_query}. Just answer the query. The answer should not include ```sql\n").content
     return sql_with_table_names
+def ask_llm_column_names(sql_query: str, llm) -> list[str]:
+    """Extracts column names from a SQL query using LLM.
+    This function analyzes a SQL query to identify which columns are being selected
+    in the result set.
+    Args:
+        sql_query (str): The SQL query to analyze
+        llm: The language model instance to use for column extraction
+    Returns:
+        list[str]: A list of column names being selected in the query
+    """
     columns = llm.invoke(f"From the given sql query, list the columns that are being selected. The answer should only be a python list. Just answer the list. The SQL query : {sql_query}").content
     columns_list = ast.literal_eval(columns.strip("```python\n").strip())
     return columns_list
+def ask_drias(query: str, index_state: int = 0) -> tuple:
+    """Main function to process a DRIAS query and return results.
+    This function orchestrates the DRIAS workflow, processing a user query to generate
+    SQL queries, dataframes, and visualizations. It handles multiple results and allows
+    pagination through them.
+    Args:
+        query (str): The user's question about climate data
+        index_state (int, optional): The index of the result to return. Defaults to 0.
+    Returns:
+        tuple: A tuple containing:
+            - sql_query (str): The SQL query used
+            - dataframe (pd.DataFrame): The resulting data
+            - figure (Callable): Function to generate the visualization
+            - sql_queries (list): All generated SQL queries
+            - result_dataframes (list): All resulting dataframes
+            - figures (list): All figure generation functions
+            - index_state (int): Current result index
+            - table_list (list): List of table names used
+            - error (str): Error message if any
+    """
     final_state = drias_workflow(query)
     sql_queries = []
     result_dataframes = []

climateqa/engine/talk_to_data/plot.py CHANGED Viewed

@@ -12,6 +12,18 @@ from climateqa.engine.talk_to_data.sql_query import (
 class Plot(TypedDict):
     name: str
     description: str
     params: list[str]
@@ -20,26 +32,41 @@ class Plot(TypedDict):
 def plot_indicator_evolution_at_location(params: dict) -> Callable[..., Figure]:
-    """Generate the function to plot a line plot of an indicator per year at a certain location
     Args:
-        params (dict): dictionnary with the required params : model, indicator_column, location
     Returns:
-        Callable[..., Figure]: Function which can be call to create the figure with the associated dataframe
     """
     indicator = params["indicator_column"]
     location = params["location"]
     indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
     def plot_data(df: pd.DataFrame) -> Figure:
-        """Generate the figure thanks to the dataframe
         Args:
-            df (pd.DataFrame): pandas dataframe with the required data
         Returns:
-            Figure: Plotly figure
         """
         fig = go.Figure()
         if df['model'].nunique() != 1:
@@ -118,15 +145,20 @@ indicator_evolution_at_location: Plot = {
 def plot_indicator_number_of_days_per_year_at_location(
     params: dict,
 ) -> Callable[..., Figure]:
-    """Generate the function to plot a line plot of an indicator per year at a certain location
     Args:
-        params (dict): dictionnary with the required params : model, indicator_column, location
     Returns:
-        Callable[..., Figure]: Function which can be call to create the figure with the associated dataframe
     """
     indicator = params["indicator_column"]
     location = params["location"]
@@ -194,13 +226,19 @@ indicator_number_of_days_per_year_at_location: Plot = {
 def plot_distribution_of_indicator_for_given_year(
     params: dict,
 ) -> Callable[..., Figure]:
-    """Generate an histogram of the distribution of an indicator for a given year
     Args:
-        params (dict): dictionnary with the required params : model, indicator_column, year
     Returns:
-        Callable[..., Figure]: Function which can be call to create the figure with the associated dataframe
     """
     indicator = params["indicator_column"]
     year = params["year"]
@@ -257,7 +295,7 @@ def plot_distribution_of_indicator_for_given_year(
 distribution_of_indicator_for_given_year: Plot = {
     "name": "Distribution of an indicator for a given year",
-    "description": "Plot an histogram of the distribution for a given year of the values of an indicator ",
     "params": ["indicator_column", "model", "year"],
     "plot_function": plot_distribution_of_indicator_for_given_year,
     "sql_query": indicator_for_given_year_query,
@@ -267,15 +305,20 @@ distribution_of_indicator_for_given_year: Plot = {
 def plot_map_of_france_of_indicator_for_given_year(
     params: dict,
 ) -> Callable[..., Figure]:
-    """Generate a plot of the map of France for an indicator at a given year
     Args:
-        params (dict): dictionnary with the required params : model, indicator_column, year
     Returns:
-        Callable[..., Figure]: Function which can be call to create the figure with the associated dataframe
     """
     indicator = params["indicator_column"]
     year = params["year"]
     indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])

 class Plot(TypedDict):
+    """Represents a plot configuration in the DRIAS system.
+    This class defines the structure for configuring different types of plots
+    that can be generated from climate data.
+    Attributes:
+        name (str): The name of the plot type
+        description (str): A description of what the plot shows
+        params (list[str]): List of required parameters for the plot
+        plot_function (Callable[..., Callable[..., Figure]]): Function to generate the plot
+        sql_query (Callable[..., str]): Function to generate the SQL query for the plot
+    """
     name: str
     description: str
     params: list[str]
 def plot_indicator_evolution_at_location(params: dict) -> Callable[..., Figure]:
+    """Generates a function to plot indicator evolution over time at a location.
+    This function creates a line plot showing how a climate indicator changes
+    over time at a specific location. It handles temperature, precipitation,
+    and other climate indicators.
     Args:
+        params (dict): Dictionary containing:
+            - indicator_column (str): The column name for the indicator
+            - location (str): The location to plot
+            - model (str): The climate model to use
     Returns:
+        Callable[..., Figure]: A function that takes a DataFrame and returns a plotly Figure
+    Example:
+        >>> plot_func = plot_indicator_evolution_at_location({
+        ...     'indicator_column': 'mean_temperature',
+        ...     'location': 'Paris',
+        ...     'model': 'ALL'
+        ... })
+        >>> fig = plot_func(df)
     """
     indicator = params["indicator_column"]
     location = params["location"]
     indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
     def plot_data(df: pd.DataFrame) -> Figure:
+        """Generates the actual plot from the data.
         Args:
+            df (pd.DataFrame): DataFrame containing the data to plot
         Returns:
+            Figure: A plotly Figure object showing the indicator evolution
         """
         fig = go.Figure()
         if df['model'].nunique() != 1:
 def plot_indicator_number_of_days_per_year_at_location(
     params: dict,
 ) -> Callable[..., Figure]:
+    """Generates a function to plot the number of days per year for an indicator.
+    This function creates a bar chart showing the frequency of certain climate
+    events (like days above a temperature threshold) per year at a specific location.
     Args:
+        params (dict): Dictionary containing:
+            - indicator_column (str): The column name for the indicator
+            - location (str): The location to plot
+            - model (str): The climate model to use
     Returns:
+        Callable[..., Figure]: A function that takes a DataFrame and returns a plotly Figure
     """
     indicator = params["indicator_column"]
     location = params["location"]
 def plot_distribution_of_indicator_for_given_year(
     params: dict,
 ) -> Callable[..., Figure]:
+    """Generates a function to plot the distribution of an indicator for a year.
+    This function creates a histogram showing the distribution of a climate
+    indicator across different locations for a specific year.
     Args:
+        params (dict): Dictionary containing:
+            - indicator_column (str): The column name for the indicator
+            - year (str): The year to plot
+            - model (str): The climate model to use
     Returns:
+        Callable[..., Figure]: A function that takes a DataFrame and returns a plotly Figure
     """
     indicator = params["indicator_column"]
     year = params["year"]
 distribution_of_indicator_for_given_year: Plot = {
     "name": "Distribution of an indicator for a given year",
+    "description": "Plot an histogram of the distribution for a given year of the values of an indicator",
     "params": ["indicator_column", "model", "year"],
     "plot_function": plot_distribution_of_indicator_for_given_year,
     "sql_query": indicator_for_given_year_query,
 def plot_map_of_france_of_indicator_for_given_year(
     params: dict,
 ) -> Callable[..., Figure]:
+    """Generates a function to plot a map of France for an indicator.
+    This function creates a choropleth map of France showing the spatial
+    distribution of a climate indicator for a specific year.
     Args:
+        params (dict): Dictionary containing:
+            - indicator_column (str): The column name for the indicator
+            - year (str): The year to plot
+            - model (str): The climate model to use
     Returns:
+        Callable[..., Figure]: A function that takes a DataFrame and returns a plotly Figure
     """
     indicator = params["indicator_column"]
     year = params["year"]
     indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])

climateqa/engine/talk_to_data/sql_query.py CHANGED Viewed

@@ -3,16 +3,21 @@ import duckdb
 import pandas as pd
 def execute_sql_query(sql_query: str) -> pd.DataFrame:
-    """Execute the SQL Query on the sqlite database
     Args:
-        sql_query (str): sql query to execute
     Returns:
-        SqlQueryOutput: labels of the selected column and fetched data
     """
     # Execute the query
     results = duckdb.sql(sql_query)
@@ -21,6 +26,17 @@ def execute_sql_query(sql_query: str) -> pd.DataFrame:
 class IndicatorPerYearAtLocationQueryParams(TypedDict, total=False):
     indicator_column: str
     latitude: str
     longitude: str
@@ -53,6 +69,16 @@ def indicator_per_year_at_location_query(
     return sql_query
 class IndicatorForGivenYearQueryParams(TypedDict, total=False):
     indicator_column: str
     year: str
     model: str

 import pandas as pd
 def execute_sql_query(sql_query: str) -> pd.DataFrame:
+    """Executes a SQL query on the DRIAS database and returns the results.
+    This function passes the provided SQL query to `duckdb.sql`, which runs it
+    on DuckDB's default connection; no database connection is opened explicitly
+    before the call. The results are returned as a pandas DataFrame.
     Args:
+        sql_query (str): The SQL query to execute
     Returns:
+        pd.DataFrame: A DataFrame containing the query results
+    Raises:
+        duckdb.Error: If there is an error executing the SQL query
     """
     # Execute the query
     results = duckdb.sql(sql_query)
 class IndicatorPerYearAtLocationQueryParams(TypedDict, total=False):
+    """Parameters for querying an indicator's values over time at a location.
+    This class defines the parameters needed to query climate indicator data
+    for a specific location over multiple years.
+    Attributes:
+        indicator_column (str): The column name for the climate indicator
+        latitude (str): The latitude coordinate of the location
+        longitude (str): The longitude coordinate of the location
+        model (str): The climate model to use (optional)
+    """
     indicator_column: str
     latitude: str
     longitude: str
     return sql_query
 class IndicatorForGivenYearQueryParams(TypedDict, total=False):
+    """Parameters for querying an indicator's values across locations for a year.
+    This class defines the parameters needed to query climate indicator data
+    across different locations for a specific year.
+    Attributes:
+        indicator_column (str): The column name for the climate indicator
+        year (str): The year to query
+        model (str): The climate model to use (optional)
+    """
     indicator_column: str
     year: str
     model: str

climateqa/engine/talk_to_data/utils.py CHANGED Viewed

@@ -30,9 +30,15 @@ def detect_location_with_openai(sentence):
         return ""
 class ArrayOutput(TypedDict):
-    """Generated SQL query."""
-    array: Annotated[str, ..., "Syntactically valid python array."]
 def detect_year_with_openai(sentence: str) -> str:
     """
@@ -58,19 +64,63 @@ def detect_year_with_openai(sentence: str) -> str:
         return ""
-def detectTable(sql_query):
     pattern = r'(?i)\bFROM\s+((?:`[^`]+`|"[^"]+"|\'[^\']+\'|\w+)(?:\.(?:`[^`]+`|"[^"]+"|\'[^\']+\'|\w+))*)'
     matches = re.findall(pattern, sql_query)
     return matches
-def loc2coords(location: str):
     geolocator = Nominatim(user_agent="city_to_latlong")
     coords = geolocator.geocode(location)
     return (coords.latitude, coords.longitude)
-def coords2loc(coords: tuple):
     geolocator = Nominatim(user_agent="coords_to_city")
     try:
         location = geolocator.reverse(coords)
@@ -97,17 +147,28 @@ def nearestNeighbourSQL(location: tuple, table: str) -> tuple[str, str]:
 def detect_relevant_tables(user_question: str, plot: Plot, llm) -> list[str]:
-    """Detect relevant tables regarding the plot and the user input
     Args:
-        user_question (str): initial user input
-        plot (Plot): plot object for which we wanna plot
-        llm (_type_): LLM
     Returns:
-        list[str]: list of table names
     """
     # Get all table names
     table_names_list = DRIAS_TABLES
@@ -121,7 +182,6 @@ def detect_relevant_tables(user_question: str, plot: Plot, llm) -> list[str]:
         f"### List of table name : "
     )
     table_names = ast.literal_eval(
         llm.invoke(prompt).content.strip("```python\n").strip()
     )

         return ""
 class ArrayOutput(TypedDict):
+    """Represents the output of a function that returns an array.
+    This class is used to type-hint functions that return arrays,
+    ensuring consistent return types across the codebase.
+    Attributes:
+        array (str): A syntactically valid Python array string
+    """
+    array: Annotated[str, "Syntactically valid python array."]
 def detect_year_with_openai(sentence: str) -> str:
     """
         return ""
+def detectTable(sql_query: str) -> list[str]:
+    """Extracts table names from a SQL query.
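+    This function uses a regular expression to capture the identifier (optionally
+    quoted or dot-qualified) that directly follows each FROM keyword. Tables that
+    appear only after a JOIN or a comma are not captured.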
+    Args:
+        sql_query (str): The SQL query to analyze
+    Returns:
+        list[str]: A list of table names found in the query
+    Example:
+        >>> detectTable("SELECT * FROM temperature_data WHERE year > 2000")
+        ['temperature_data']
+    """
     pattern = r'(?i)\bFROM\s+((?:`[^`]+`|"[^"]+"|\'[^\']+\'|\w+)(?:\.(?:`[^`]+`|"[^"]+"|\'[^\']+\'|\w+))*)'
     matches = re.findall(pattern, sql_query)
     return matches
+def loc2coords(location: str) -> tuple[float, float]:
+    """Converts a location name to geographic coordinates.
+    This function uses the Nominatim geocoding service to convert
+    a location name (e.g., city name) to its latitude and longitude.
+    Args:
+        location (str): The name of the location to geocode
+    Returns:
+        tuple[float, float]: A tuple containing (latitude, longitude)
+    Raises:
+        AttributeError: If the location cannot be found
+    """
     geolocator = Nominatim(user_agent="city_to_latlong")
     coords = geolocator.geocode(location)
     return (coords.latitude, coords.longitude)
+def coords2loc(coords: tuple[float, float]) -> str:
+    """Converts geographic coordinates to a location name.
+    This function uses the Nominatim reverse geocoding service to convert
+    latitude and longitude coordinates to a human-readable location name.
+    Args:
+        coords (tuple[float, float]): A tuple containing (latitude, longitude)
+    Returns:
+        str: The address of the location, or "Unknown Location" if not found
+    Example:
+        >>> coords2loc((48.8566, 2.3522))
+        'Paris, France'
+    """
     geolocator = Nominatim(user_agent="coords_to_city")
     try:
         location = geolocator.reverse(coords)
 def detect_relevant_tables(user_question: str, plot: Plot, llm) -> list[str]:
+    """Identifies relevant tables for a plot based on user input.
+    This function uses an LLM to analyze the user's question and the plot
+    description to determine which tables in the DRIAS database would be
+    most relevant for generating the requested visualization.
     Args:
+        user_question (str): The user's question about climate data
+        plot (Plot): The plot configuration object
+        llm: The language model instance to use for analysis
     Returns:
+        list[str]: A list of table names that are relevant for the plot
+    Example:
+        >>> detect_relevant_tables(
+        ...     "What will the temperature be like in Paris?",
+        ...     indicator_evolution_at_location,
+        ...     llm
+        ... )
+        ['mean_annual_temperature', 'mean_summer_temperature']
     """
     # Get all table names
     table_names_list = DRIAS_TABLES
         f"### List of table name : "
     )
     table_names = ast.literal_eval(
         llm.invoke(prompt).content.strip("```python\n").strip()
     )

climateqa/engine/talk_to_data/workflow.py CHANGED Viewed

@@ -22,6 +22,19 @@ ROOT_PATH = os.path.dirname(os.path.dirname(os.getcwd()))
 DRIAS_DB_PATH = ROOT_PATH + "/data/drias/drias.db"
 class TableState(TypedDict):
     table_name: str
     params: dict[str, Any]
     sql_query: NotRequired[str]
@@ -30,6 +43,16 @@ class TableState(TypedDict):
     status: str
 class PlotState(TypedDict):
     plot_name: str
     tables: list[str]
     table_states: dict[str, TableState]
@@ -190,22 +213,37 @@ def find_location(user_input: str, table: str) -> Location:
     return output
 def find_year(user_input: str) -> str:
     print(f"---- Find year ---")
     year = detect_year_with_openai(user_input)
     return year
 def find_indicator_column(table: str) -> str:
-    """Retrieve the name of the indicator column within the table in the database
     Args:
-        table (str): name of the table
     Returns:
-        str: name of the indicator column
     """
     print(f"---- Find indicator column in table {table} ----")
     return INDICATOR_COLUMNS_PER_TABLE[table]

 DRIAS_DB_PATH = ROOT_PATH + "/data/drias/drias.db"
 class TableState(TypedDict):
+    """Represents the state of a table in the DRIAS workflow.
+    This class defines the structure for tracking the state of a table during the
+    data processing workflow, including its name, parameters, SQL query, and results.
+    Attributes:
+        table_name (str): The name of the table in the database
+        params (dict[str, Any]): Parameters used for querying the table
+        sql_query (str, optional): The SQL query used to fetch data
+        dataframe (pd.DataFrame | None, optional): The resulting data
+        figure (Callable[..., Figure], optional): Function to generate visualization
+        status (str): The current status of the table processing ('OK' or 'ERROR')
+    """
     table_name: str
     params: dict[str, Any]
     sql_query: NotRequired[str]
     status: str
 class PlotState(TypedDict):
+    """Represents the state of a plot in the DRIAS workflow.
+    This class defines the structure for tracking the state of a plot during the
+    data processing workflow, including its name and associated tables.
+    Attributes:
+        plot_name (str): The name of the plot
+        tables (list[str]): List of tables used in the plot
+        table_states (dict[str, TableState]): States of the tables used in the plot
+    """
     plot_name: str
     tables: list[str]
     table_states: dict[str, TableState]
     return output
 def find_year(user_input: str) -> str:
+    """Extracts year information from user input using LLM.
+    This function uses an LLM to identify and extract year information from the
+    user's query, which is used to filter data in subsequent queries.
+    Args:
+        user_input (str): The user's query text
+    Returns:
+        str: The extracted year, or empty string if no year found
+    """
     print(f"---- Find year ---")
     year = detect_year_with_openai(user_input)
     return year
 def find_indicator_column(table: str) -> str:
+    """Retrieves the name of the indicator column within a table.
+    This function maps table names to their corresponding indicator columns
+    using the predefined mapping in INDICATOR_COLUMNS_PER_TABLE.
     Args:
+        table (str): Name of the table in the database
     Returns:
+        str: Name of the indicator column for the specified table
+    Raises:
+        KeyError: If the table name is not found in the mapping
     """
     print(f"---- Find indicator column in table {table} ----")
     return INDICATOR_COLUMNS_PER_TABLE[table]