Spaces:

mozilla-ai
/

surf-spot-finder

Running

App Files Community

Nathan Brake committed on Apr 7

Commit

cdd4ebc

unverified ·

1 Parent(s): cf3dfa0

Update to work with newer any-agent code (#51)

Browse files

* Update to work with newer any-agent code

* format

Files changed (25) hide show

.gitignore +1 -1
examples/langchain_single_agent_vertical.yaml +2 -1
examples/llama_index_single_agent_vertical.yaml +17 -0
examples/openai_single_agent_user_confirmation.yaml +1 -1
examples/openai_single_agent_vertical.yaml +4 -4
examples/smolagents_single_agent_vertical.yaml +2 -1
notebooks/experiment/agent_configs/langchain-4o-mini.yaml +26 -0
notebooks/experiment/agent_configs/langchain-4o.yaml +26 -0
notebooks/experiment/agent_configs/openai-4o-mini.yaml +26 -0
notebooks/experiment/agent_configs/openai-4o.yaml +26 -0
notebooks/experiment/agent_configs/openai-o1.yaml +26 -0
notebooks/experiment/agent_configs/openai-o3-mini.yaml +26 -0
notebooks/experiment/agent_configs/smolagents-4o-mini.yaml +26 -0
notebooks/experiment/agent_configs/smolagents-4o.yaml +26 -0
notebooks/experiment/agent_configs/smolagents-o1.yaml +26 -0
notebooks/experiment/agent_configs/smolagents-o3-mini.yaml +26 -0
notebooks/experiment/agent_configs/smolagents-ollama-llama3.1-8b-fp16.yaml +26 -0
notebooks/experiment/agent_configs/smolagents-ollama-llama3.1-8b-q4.yaml +26 -0
notebooks/experiment/compare.ipynb +772 -0
{src/surf_spot_finder/evaluation → notebooks/experiment}/test_cases/alpha.yaml +4 -4
pyproject.toml +3 -1
src/surf_spot_finder/cli.py +1 -1
src/surf_spot_finder/evaluation/main.py +17 -30
src/surf_spot_finder/evaluation/results_saver.py +7 -3
src/surf_spot_finder/evaluation/test_case.py +18 -1

.gitignore CHANGED Viewed

@@ -168,4 +168,4 @@ cython_debug/
 .vscode/
 output
-traces

 .vscode/
 output
+traces

examples/langchain_single_agent_vertical.yaml CHANGED Viewed

@@ -1,5 +1,5 @@
 location: Pontevedra
-date: 2025-04-02 12:00
 max_driving_hours: 2
 framework: langchain
@@ -13,4 +13,5 @@ main_agent:
   - "surf_spot_finder.tools.get_wave_forecast"
   - "surf_spot_finder.tools.get_wind_forecast"
   - "any_agent.tools.search_web"
   - "any_agent.tools.visit_webpage"

 location: Pontevedra
+date: 2025-04-10 12:00
 max_driving_hours: 2
 framework: langchain
   - "surf_spot_finder.tools.get_wave_forecast"
   - "surf_spot_finder.tools.get_wind_forecast"
   - "any_agent.tools.search_web"
+  - "any_agent.tools.show_plan"
   - "any_agent.tools.visit_webpage"

examples/llama_index_single_agent_vertical.yaml ADDED Viewed

	@@ -0,0 +1,17 @@

+location: Pontevedra
+date: 2025-04-10 12:00
+max_driving_hours: 2
+framework: llama_index
+main_agent:
+  model_id: o3-mini
+  tools:
+  - "surf_spot_finder.tools.driving_hours_to_meters"
+  - "surf_spot_finder.tools.get_area_lat_lon"
+  - "surf_spot_finder.tools.get_surfing_spots"
+  - "surf_spot_finder.tools.get_wave_forecast"
+  - "surf_spot_finder.tools.get_wind_forecast"
+  - "any_agent.tools.search_web"
+  - "any_agent.tools.show_plan"
+  - "any_agent.tools.visit_webpage"

examples/openai_single_agent_user_confirmation.yaml CHANGED Viewed

@@ -1,6 +1,6 @@
 location: Pontevedra
-date: 2025-04-05 12:00
 max_driving_hours: 2
 input_prompt_template: |
   According to the forecast, what will be the best spot to surf around {LOCATION},

 location: Pontevedra
+date: 2025-04-10 12:00
 max_driving_hours: 2
 input_prompt_template: |
   According to the forecast, what will be the best spot to surf around {LOCATION},

examples/openai_single_agent_vertical.yaml CHANGED Viewed

@@ -1,5 +1,5 @@
 location: Pontevedra
-date: 2025-04-02 12:00
 max_driving_hours: 2
 framework: openai
@@ -12,6 +12,6 @@ main_agent:
   - "surf_spot_finder.tools.get_surfing_spots"
   - "surf_spot_finder.tools.get_wave_forecast"
   - "surf_spot_finder.tools.get_wind_forecast"
-  - "surf_spot_finder.tools.search_web"
-  - "surf_spot_finder.tools.show_plan"
-  - "surf_spot_finder.tools.visit_webpage"

 location: Pontevedra
+date: 2025-04-10 12:00
 max_driving_hours: 2
 framework: openai
   - "surf_spot_finder.tools.get_surfing_spots"
   - "surf_spot_finder.tools.get_wave_forecast"
   - "surf_spot_finder.tools.get_wind_forecast"
+  - "any_agent.tools.search_web"
+  - "any_agent.tools.show_plan"
+  - "any_agent.tools.visit_webpage"

examples/smolagents_single_agent_vertical.yaml CHANGED Viewed

@@ -1,5 +1,5 @@
 location: Pontevedra
-date: 2025-04-02 12:00
 max_driving_hours: 2
 framework: smolagents
@@ -13,6 +13,7 @@ main_agent:
   - "surf_spot_finder.tools.get_wave_forecast"
   - "surf_spot_finder.tools.get_wind_forecast"
   - "any_agent.tools.search_web"
   - "any_agent.tools.visit_webpage"
   - "smolagents.PythonInterpreterTool"
   - "smolagents.FinalAnswerTool"

 location: Pontevedra
+date: 2025-04-10 12:00
 max_driving_hours: 2
 framework: smolagents
   - "surf_spot_finder.tools.get_wave_forecast"
   - "surf_spot_finder.tools.get_wind_forecast"
   - "any_agent.tools.search_web"
+  - "any_agent.tools.show_plan"
   - "any_agent.tools.visit_webpage"
   - "smolagents.PythonInterpreterTool"
   - "smolagents.FinalAnswerTool"

notebooks/experiment/agent_configs/langchain-4o-mini.yaml ADDED Viewed

	@@ -0,0 +1,26 @@

+framework: langchain
+main_agent:
+  model_id: openai/gpt-4o-mini
+  tools:
+  - "surf_spot_finder.tools.driving_hours_to_meters"
+  - "surf_spot_finder.tools.get_area_lat_lon"
+  - "surf_spot_finder.tools.get_surfing_spots"
+  - "surf_spot_finder.tools.get_wave_forecast"
+  - "surf_spot_finder.tools.get_wind_forecast"
+  - "any_agent.tools.send_console_message"
+  - command: "docker"
+    args:
+      - "run"
+      - "-i"
+      - "--rm"
+      - "--mount"
+      - "type=bind,src=/tmp/surf-spot-finder,dst=/projects"
+      - "mcp/filesystem"
+      - "/projects"
+    tools:
+      - "read_file"
+      - "write_file"
+      - "directory_tree"
+      - "list_allowed_directories"

notebooks/experiment/agent_configs/langchain-4o.yaml ADDED Viewed

	@@ -0,0 +1,26 @@

+framework: langchain
+main_agent:
+  model_id: openai/gpt-4o
+  tools:
+  - "surf_spot_finder.tools.driving_hours_to_meters"
+  - "surf_spot_finder.tools.get_area_lat_lon"
+  - "surf_spot_finder.tools.get_surfing_spots"
+  - "surf_spot_finder.tools.get_wave_forecast"
+  - "surf_spot_finder.tools.get_wind_forecast"
+  - "any_agent.tools.send_console_message"
+  - command: "docker"
+    args:
+      - "run"
+      - "-i"
+      - "--rm"
+      - "--mount"
+      - "type=bind,src=/tmp/surf-spot-finder,dst=/projects"
+      - "mcp/filesystem"
+      - "/projects"
+    tools:
+      - "read_file"
+      - "write_file"
+      - "directory_tree"
+      - "list_allowed_directories"

notebooks/experiment/agent_configs/openai-4o-mini.yaml ADDED Viewed

	@@ -0,0 +1,26 @@

+framework: openai
+main_agent:
+  model_id: gpt-4o-mini
+  tools:
+  - "surf_spot_finder.tools.driving_hours_to_meters"
+  - "surf_spot_finder.tools.get_area_lat_lon"
+  - "surf_spot_finder.tools.get_surfing_spots"
+  - "surf_spot_finder.tools.get_wave_forecast"
+  - "surf_spot_finder.tools.get_wind_forecast"
+  - "any_agent.tools.send_console_message"
+  - command: "docker"
+    args:
+      - "run"
+      - "-i"
+      - "--rm"
+      - "--mount"
+      - "type=bind,src=/tmp/surf-spot-finder,dst=/projects"
+      - "mcp/filesystem"
+      - "/projects"
+    tools:
+      - "read_file"
+      - "write_file"
+      - "directory_tree"
+      - "list_allowed_directories"

notebooks/experiment/agent_configs/openai-4o.yaml ADDED Viewed

	@@ -0,0 +1,26 @@

+framework: openai
+main_agent:
+  model_id: gpt-4o
+  tools:
+  - "surf_spot_finder.tools.driving_hours_to_meters"
+  - "surf_spot_finder.tools.get_area_lat_lon"
+  - "surf_spot_finder.tools.get_surfing_spots"
+  - "surf_spot_finder.tools.get_wave_forecast"
+  - "surf_spot_finder.tools.get_wind_forecast"
+  - "any_agent.tools.send_console_message"
+  - command: "docker"
+    args:
+      - "run"
+      - "-i"
+      - "--rm"
+      - "--mount"
+      - "type=bind,src=/tmp/surf-spot-finder,dst=/projects"
+      - "mcp/filesystem"
+      - "/projects"
+    tools:
+      - "read_file"
+      - "write_file"
+      - "directory_tree"
+      - "list_allowed_directories"

notebooks/experiment/agent_configs/openai-o1.yaml ADDED Viewed

	@@ -0,0 +1,26 @@

+framework: openai
+main_agent:
+  model_id: o1
+  tools:
+  - "surf_spot_finder.tools.driving_hours_to_meters"
+  - "surf_spot_finder.tools.get_area_lat_lon"
+  - "surf_spot_finder.tools.get_surfing_spots"
+  - "surf_spot_finder.tools.get_wave_forecast"
+  - "surf_spot_finder.tools.get_wind_forecast"
+  - "any_agent.tools.send_console_message"
+  - command: "docker"
+    args:
+      - "run"
+      - "-i"
+      - "--rm"
+      - "--mount"
+      - "type=bind,src=/tmp/surf-spot-finder,dst=/projects"
+      - "mcp/filesystem"
+      - "/projects"
+    tools:
+      - "read_file"
+      - "write_file"
+      - "directory_tree"
+      - "list_allowed_directories"

notebooks/experiment/agent_configs/openai-o3-mini.yaml ADDED Viewed

	@@ -0,0 +1,26 @@

+framework: openai
+main_agent:
+  model_id: o3-mini
+  tools:
+  - "surf_spot_finder.tools.driving_hours_to_meters"
+  - "surf_spot_finder.tools.get_area_lat_lon"
+  - "surf_spot_finder.tools.get_surfing_spots"
+  - "surf_spot_finder.tools.get_wave_forecast"
+  - "surf_spot_finder.tools.get_wind_forecast"
+  - "any_agent.tools.send_console_message"
+  - command: "docker"
+    args:
+      - "run"
+      - "-i"
+      - "--rm"
+      - "--mount"
+      - "type=bind,src=/tmp/surf-spot-finder,dst=/projects"
+      - "mcp/filesystem"
+      - "/projects"
+    tools:
+      - "read_file"
+      - "write_file"
+      - "directory_tree"
+      - "list_allowed_directories"

notebooks/experiment/agent_configs/smolagents-4o-mini.yaml ADDED Viewed

	@@ -0,0 +1,26 @@

+framework: smolagents
+main_agent:
+  model_id: openai/gpt-4o-mini
+  tools:
+  - "surf_spot_finder.tools.driving_hours_to_meters"
+  - "surf_spot_finder.tools.get_area_lat_lon"
+  - "surf_spot_finder.tools.get_surfing_spots"
+  - "surf_spot_finder.tools.get_wave_forecast"
+  - "surf_spot_finder.tools.get_wind_forecast"
+  - "any_agent.tools.send_console_message"
+  - "smolagents.FinalAnswerTool"
+  - command: "docker"
+    args:
+      - "run"
+      - "-i"
+      - "--rm"
+      - "--mount"
+      - "type=bind,src=/tmp/surf-spot-finder,dst=/projects"
+      - "mcp/filesystem"
+      - "/projects"
+    tools:
+      - "read_file"
+      - "write_file"
+      - "directory_tree"
+      - "list_allowed_directories"

notebooks/experiment/agent_configs/smolagents-4o.yaml ADDED Viewed

	@@ -0,0 +1,26 @@

+framework: smolagents
+main_agent:
+  model_id: openai/gpt-4o
+  tools:
+  - "surf_spot_finder.tools.driving_hours_to_meters"
+  - "surf_spot_finder.tools.get_area_lat_lon"
+  - "surf_spot_finder.tools.get_surfing_spots"
+  - "surf_spot_finder.tools.get_wave_forecast"
+  - "surf_spot_finder.tools.get_wind_forecast"
+  - "any_agent.tools.send_console_message"
+  - "smolagents.FinalAnswerTool"
+  - command: "docker"
+    args:
+      - "run"
+      - "-i"
+      - "--rm"
+      - "--mount"
+      - "type=bind,src=/tmp/surf-spot-finder,dst=/projects"
+      - "mcp/filesystem"
+      - "/projects"
+    tools:
+      - "read_file"
+      - "write_file"
+      - "directory_tree"
+      - "list_allowed_directories"

notebooks/experiment/agent_configs/smolagents-o1.yaml ADDED Viewed

	@@ -0,0 +1,26 @@

+framework: smolagents
+main_agent:
+  model_id: openai/o1
+  tools:
+  - "surf_spot_finder.tools.driving_hours_to_meters"
+  - "surf_spot_finder.tools.get_area_lat_lon"
+  - "surf_spot_finder.tools.get_surfing_spots"
+  - "surf_spot_finder.tools.get_wave_forecast"
+  - "surf_spot_finder.tools.get_wind_forecast"
+  - "any_agent.tools.send_console_message"
+  - "smolagents.FinalAnswerTool"
+  - command: "docker"
+    args:
+      - "run"
+      - "-i"
+      - "--rm"
+      - "--mount"
+      - "type=bind,src=/tmp/surf-spot-finder,dst=/projects"
+      - "mcp/filesystem"
+      - "/projects"
+    tools:
+      - "read_file"
+      - "write_file"
+      - "directory_tree"
+      - "list_allowed_directories"

notebooks/experiment/agent_configs/smolagents-o3-mini.yaml ADDED Viewed

	@@ -0,0 +1,26 @@

+framework: smolagents
+main_agent:
+  model_id: openai/o3-mini
+  tools:
+  - "surf_spot_finder.tools.driving_hours_to_meters"
+  - "surf_spot_finder.tools.get_area_lat_lon"
+  - "surf_spot_finder.tools.get_surfing_spots"
+  - "surf_spot_finder.tools.get_wave_forecast"
+  - "surf_spot_finder.tools.get_wind_forecast"
+  - "any_agent.tools.send_console_message"
+  - "smolagents.FinalAnswerTool"
+  - command: "docker"
+    args:
+      - "run"
+      - "-i"
+      - "--rm"
+      - "--mount"
+      - "type=bind,src=/tmp/surf-spot-finder,dst=/projects"
+      - "mcp/filesystem"
+      - "/projects"
+    tools:
+      - "read_file"
+      - "write_file"
+      - "directory_tree"
+      - "list_allowed_directories"

notebooks/experiment/agent_configs/smolagents-ollama-llama3.1-8b-fp16.yaml ADDED Viewed

	@@ -0,0 +1,26 @@

+framework: smolagents
+main_agent:
+  model_id: ollama/llama3.1:8b-instruct-fp16
+  tools:
+  - "surf_spot_finder.tools.driving_hours_to_meters"
+  - "surf_spot_finder.tools.get_area_lat_lon"
+  - "surf_spot_finder.tools.get_surfing_spots"
+  - "surf_spot_finder.tools.get_wave_forecast"
+  - "surf_spot_finder.tools.get_wind_forecast"
+  - "any_agent.tools.send_console_message"
+  - "smolagents.FinalAnswerTool"
+  - command: "docker"
+    args:
+      - "run"
+      - "-i"
+      - "--rm"
+      - "--mount"
+      - "type=bind,src=/tmp/surf-spot-finder,dst=/projects"
+      - "mcp/filesystem"
+      - "/projects"
+    tools:
+      - "read_file"
+      - "write_file"
+      - "directory_tree"
+      - "list_allowed_directories"

notebooks/experiment/agent_configs/smolagents-ollama-llama3.1-8b-q4.yaml ADDED Viewed

	@@ -0,0 +1,26 @@

+framework: smolagents
+main_agent:
+  model_id: ollama/llama3.1:latest
+  tools:
+  - "surf_spot_finder.tools.driving_hours_to_meters"
+  - "surf_spot_finder.tools.get_area_lat_lon"
+  - "surf_spot_finder.tools.get_surfing_spots"
+  - "surf_spot_finder.tools.get_wave_forecast"
+  - "surf_spot_finder.tools.get_wind_forecast"
+  - "any_agent.tools.send_console_message"
+  - "smolagents.FinalAnswerTool"
+  - command: "docker"
+    args:
+      - "run"
+      - "-i"
+      - "--rm"
+      - "--mount"
+      - "type=bind,src=/tmp/surf-spot-finder,dst=/projects"
+      - "mcp/filesystem"
+      - "/projects"
+    tools:
+      - "read_file"
+      - "write_file"
+      - "directory_tree"
+      - "list_allowed_directories"

notebooks/experiment/compare.ipynb ADDED Viewed

	@@ -0,0 +1,772 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Which Agent is Best?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from surf_spot_finder.utils.logging import get_logger\n",
+    "import pandas as pd\n",
+    "\n",
+    "logger = get_logger()\n",
+    "\n",
+    "\n",
+    "def load_results():\n",
+    "    results_path = \"./results.json\"\n",
+    "    if not os.path.exists(results_path):\n",
+    "        logger.info(\"No results found, skipping loading.\")\n",
+    "        return pd.DataFrame()\n",
+    "    df = pd.read_json(results_path, lines=True)\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Need nest_asyncio to run the evaluation in a notebook\n",
+    "from surf_spot_finder.evaluation.main import evaluate\n",
+    "import nest_asyncio\n",
+    "\n",
+    "nest_asyncio.apply()\n",
+    "\n",
+    "test_case_path = \"./test_cases/alpha.yaml\"\n",
+    "configs = [\n",
+    "    \"langchain-4o\",\n",
+    "    \"openai-4o\",\n",
+    "    \"smolagents-4o\",\n",
+    "    \"smolagents-4o-mini\",\n",
+    "    \"openai-4o-mini\",\n",
+    "    \"smolagents-o3-mini\",\n",
+    "    \"openai-o3-mini\",\n",
+    "    \"smolagents-o1\",\n",
+    "    \"openai-o1\",\n",
+    "    \"smolagents-ollama-llama3.1-8b-q4\",\n",
+    "    \"smolagents-ollama-llama3.1-8b-fp16\",\n",
+    "]\n",
+    "results_df = load_results()\n",
+    "for agent in configs:\n",
+    "    agent_config_path = f\"./agent_configs/{agent}.yaml\"\n",
+    "    # check if the agent config is already in the results\n",
+    "    if (\n",
+    "        not results_df.empty\n",
+    "        and results_df[results_df[\"agent_config_path\"] == agent_config_path].shape[0]\n",
+    "        > 0\n",
+    "    ):\n",
+    "        logger.info(f\"Already evaluated {agent}\")\n",
+    "        continue\n",
+    "    logger.info(f\"Evaluating {agent}\")\n",
+    "    evaluate(\n",
+    "        test_case_path=test_case_path,\n",
+    "        agent_config_path=agent_config_path,\n",
+    "        telemetry_path=None,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "==========================\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Summary:\n",
+      "                                    agent_config_path  score\n",
+      "                  ./agent_configs/openai-4o-mini.yaml  92.86\n",
+      "                       ./agent_configs/openai-o1.yaml  92.86\n",
+      "                       ./agent_configs/openai-4o.yaml  85.71\n",
+      "                   ./agent_configs/smolagents-4o.yaml  85.71\n",
+      "                  ./agent_configs/openai-o3-mini.yaml  85.71\n",
+      "                   ./agent_configs/smolagents-o1.yaml  85.71\n",
+      "                    ./agent_configs/langchain-4o.yaml  57.14\n",
+      "              ./agent_configs/smolagents-4o-mini.yaml  57.14\n",
+      "              ./agent_configs/smolagents-o3-mini.yaml  50.00\n",
+      "./agent_configs/smolagents-ollama-llama3.1-8b-q4.yaml   0.00\n",
+      "==========================\n",
+      "Agent config: ./agent_configs/openai-4o-mini.yaml\n",
+      "\u001b[33mHypothesis Final answer extracted: ### Surf Location: T Street Beach\n",
+      "\n",
+      "- **Coordinates:** 33.416044, -117.617257  \n",
+      "- **Wave Height:** 1.34 meters  \n",
+      "- **Wave Direction:** 256 degrees  \n",
+      "- **Wave Period:** 10.25 seconds  \n",
+      "- **Wind Speed:** 13.6 m/s from the SW (212 degrees)  \n",
+      "\n",
+      "### Weather Conditions (March 29, 2025, at 22:00):\n",
+      "- Clear skies with a gentle breeze.\n",
+      "- Ideal conditions for surfing. \n",
+      "\n",
+      "I have discussed this with David de la Iglesia Castro, and he has chosen T Street Beach.\n",
+      "\n",
+      "The details have been saved in the file `/projects/final_answer.txt`. Happy surfing!\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_surfing_spots tool and it succeeded\n",
+      "- The agent used the get_surfing_spots tool in Call 3, and it succeeded by returning a list of surfing spots.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wave_forecast tool and it succeeded\n",
+      "- The agent used the get_wave_forecast tool in Call 4 and Call 6 and both were successful, returning detailed wave forecast information.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wind_forecast tool and it succeeded\n",
+      "- The criterion was satisfied because the agent used the get_wind_forecast tool in two instances (Call 5 and Call 7) and both calls succeeded with valid output data for wind forecasts at the specified time and location.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_area_lat_lon tool and it succeeded\n",
+      "- The agent successfully called the get_area_lat_lon tool in Call 1 and received the output with the coordinates for Huntington Beach, California.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the driving_hours_to_meters tool to convert the driving hours to meters and it succeeded\n",
+      "- The agent used the driving_hours_to_meters tool to convert 1 driving hour to 70000 meters in Call 2, and the conversion succeeded as shown by the output.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent confirmed the selection with David de la Iglesia Castro\n",
+      "- The agent confirmed the selection with David de la Iglesia Castro by sending him messages to discuss the options, and David's preference for 'T Street' was indicated in the responses.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the final answer contains any description about the weather at the chosen location\n",
+      "- The final answer includes a description of the weather at the chosen location, mentioning clear skies and a gentle breeze, indicating ideal conditions for surfing.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool\n",
+      "- The final answer contains 'T Street Beach', which is one of the surf spots found by the get_surfing_spots tool.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if Surf location is approximately 'T Street, San Clemente, California'.\n",
+      "- The agent mentioned 'T Street Beach', which is a commonly known reference to T Street in San Clemente, California.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Is the answer a direct match?\n",
+      "- Partial Match (F1) score is 5.33\u001b[0m\n",
+      "\u001b[32mPassed checkpoints: 9\u001b[0m\n",
+      "\u001b[31mFailed checkpoints: 1\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\u001b[32mScore: 13/14\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\n",
+      "Score: 92.86\n",
+      "Agent config: ./agent_configs/openai-o1.yaml\n",
+      "\u001b[33mHypothesis Final answer extracted: Chosen Location: T Street Beach, San Clemente\n",
+      "\n",
+      "Date/Time: 2025-03-29 22:00:00\n",
+      "\n",
+      "Wave Conditions:\n",
+      "• Wave Height: ~1.34 m  \n",
+      "• Wave Direction: ~256°  \n",
+      "• Wave Period: ~10.25 seconds  \n",
+      "\n",
+      "Wind Conditions:\n",
+      "• Wind Direction: ~212°  \n",
+      "• Wind Speed: ~13.6 m/s  \n",
+      "\n",
+      "Weather:\n",
+      "Expect mild temperatures around the mid-60s Fahrenheit with partly cloudy skies, a gentle ocean breeze, and mostly dry conditions.\n",
+      "\n",
+      "Summary:\n",
+      "T Street Beach should offer reliable, moderate surf with a comfortable wind direction and wave height for intermediate surfers. The conditions are forecast to be stable, making this a great spot to catch some waves under moderate offshore breezes.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_surfing_spots tool and it succeeded\n",
+      "- The agent used the get_surfing_spots tool and successfully obtained surfing spot data, as evidenced by Call 3.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wave_forecast tool and it succeeded\n",
+      "- The agent used the get_wave_forecast tool successfully in Calls 4 and 5, receiving valid wave forecast data as output.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wind_forecast tool and it succeeded\n",
+      "- The agent used the get_wind_forecast tool in Call 6 and Call 7, and both calls returned successful outputs.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_area_lat_lon tool and it succeeded\n",
+      "- The agent used the get_area_lat_lon tool with the input 'Huntington Beach, California' and received a successful output with latitude and longitude coordinates.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the driving_hours_to_meters tool to convert the driving hours to meters and it succeeded\n",
+      "- The agent used the driving_hours_to_meters tool in Call 2 to convert 1 driving hour to 70000 meters, and the operation succeeded as evidenced by the output.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent confirmed the selection with David de la Iglesia Castro\n",
+      "- The agent confirmed the selection with David de la Iglesia Castro by messaging him and receiving a response indicating a preference for T Street Beach.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the final answer contains any description about the weather at the chosen location\n",
+      "- The final answer contains a description of the weather at T Street Beach, San Clemente, mentioning mild temperatures, partly cloudy skies, a gentle ocean breeze, and mostly dry conditions.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool\n",
+      "- The final answer contains 'T Street Beach', which is one of the surf spots found by the get_surfing_spots tool.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if Surf location is approximately 'T Street, San Clemente, California'.\n",
+      "- The agent's answer specifies T Street Beach, San Clemente, which matches the expected location T Street, San Clemente, California.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Is the answer a direct match?\n",
+      "- Partial Match (F1) score is 8.25\u001b[0m\n",
+      "\u001b[32mPassed checkpoints: 9\u001b[0m\n",
+      "\u001b[31mFailed checkpoints: 1\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\u001b[32mScore: 13/14\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\n",
+      "Score: 92.86\n",
+      "Agent config: ./agent_configs/openai-4o.yaml\n",
+      "\u001b[33mHypothesis Final answer extracted: Surf Location: T Street Beach\n",
+      "\n",
+      "**Wave Conditions**\n",
+      "- **Date:** 2025-03-29\n",
+      "- **Time:** 22:00\n",
+      "- **Wave Height:** 1.34 meters\n",
+      "- **Wave Direction:** 256°\n",
+      "- **Wave Period:** 10.25 seconds\n",
+      "\n",
+      "**Wind Conditions**\n",
+      "- **Wind Direction:** 212°\n",
+      "- **Wind Speed:** 13.6 m/s\n",
+      "\n",
+      "**Weather Conditions**\n",
+      "- Windy conditions expected with moderate wave height and a good wave period for surfing.\n",
+      "\n",
+      "T Street Beach offers great conditions for surfing with a comfortable wave height and ideal wave period for riding. The wind direction will be favorable, promoting smooth and rideable waves. Perfect for surfing enthusiasts looking for some thrilling action in the water!\n",
+      "\n",
+      "The details have been documented in `/projects/final_answer.txt`.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_surfing_spots tool and it succeeded\n",
+      "- The agent used the get_surfing_spots tool in Call 3 and it succeeded, providing a list of surfing spots.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wave_forecast tool and it succeeded\n",
+      "- The agent successfully used the get_wave_forecast tool in Call 4 and received a valid output with wave forecast details.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wind_forecast tool and it succeeded\n",
+      "- The agent used the get_wind_forecast tool successfully as shown in Call 5, retrieving wind data for the specified location and time.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_area_lat_lon tool and it succeeded\n",
+      "- The agent used the get_area_lat_lon tool in Call 1 and it returned successful coordinates for Huntington Beach, California.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the driving_hours_to_meters tool to convert the driving hours to meters and it succeeded\n",
+      "- The agent used the driving_hours_to_meters tool in Call 2 to convert driving hours to meters, and it succeeded with an output of 70000 meters.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the final answer contains any description about the weather at the chosen location\n",
+      "- The final answer contains a description of the weather conditions at T Street Beach, noting 'Stormy wind conditions expected with moderate wave height and good wave period for surfing.'\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool\n",
+      "- The final answer contains the surf spot 'T Street Beach', which was identified using the get_surfing_spots tool.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if Surf location is approximately 'T Street, San Clemente, California'.\n",
+      "- The agent's answer provides the surf location as 'T Street Beach', which aligns with the expected location 'T Street, San Clemente, California'.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the agent confirmed the selection with David de la Iglesia Castro\n",
+      "- There is no evidence that David de la Iglesia Castro confirmed the selection. The agent sent a message with the surf spot details but did not receive a confirmation from David.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Is the answer a direct match?\n",
+      "- Partial Match (F1) score is 4.17\u001b[0m\n",
+      "\u001b[32mPassed checkpoints: 8\u001b[0m\n",
+      "\u001b[31mFailed checkpoints: 2\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\u001b[32mScore: 12/14\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\n",
+      "Score: 85.71\n",
+      "Agent config: ./agent_configs/smolagents-4o.yaml\n",
+      "\u001b[33mHypothesis Final answer extracted: Surfing Location: T Street Beach\n",
+      "Coordinates: Latitude 33.416044049999996, Longitude -117.61725655000001\n",
+      "Date & Time: 2025-03-29 22:00\n",
+      "Wave Conditions:\n",
+      "  - Wave Height: 1.34m\n",
+      "  - Wave Direction: 256°\n",
+      "  - Wave Period: 10.25s\n",
+      "Wind Conditions:\n",
+      "  - Wind Speed: 13.6 m/s\n",
+      "  - Wind Direction: 212°\n",
+      "\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_surfing_spots tool and it succeeded\n",
+      "- The agent used the get_surfing_spots tool in Call 7, and it succeeded.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wave_forecast tool and it succeeded\n",
+      "- The criterion was satisfied because the agent successfully used the get_wave_forecast tool, as indicated by Calls 9 and 11, where the status was 'success'.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wind_forecast tool and it succeeded\n",
+      "- The agent used the get_wind_forecast tool successfully with the appropriate inputs in Calls 10 and 12.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_area_lat_lon tool and it succeeded\n",
+      "- The agent used the get_area_lat_lon tool in Call 3 and it succeeded with a 'success' status.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the driving_hours_to_meters tool to convert the driving hours to meters and it succeeded\n",
+      "- The agent used the driving_hours_to_meters tool to convert the driving hours to meters successfully, as evidenced by Call 5 showing the conversion of 1 driving hour to 70,000 meters without errors.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent confirmed the selection with David de la Iglesia Castro\n",
+      "- The agent confirmed the selection with David de la Iglesia Castro as the agent received a response indicating the preference for 'T Street Beach'.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool\n",
+      "- The final answer contained 'T Street Beach', which was one of the surfing spots found using the get_surfing_spots tool.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if Surf location is approximately 'T Street, San Clemente, California'.\n",
+      "- The agent's answer specifies the surf location as 'T Street Beach', which corresponds to 'T Street, San Clemente, California'.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the final answer contains any description about the weather at the chosen location\n",
+      "- The final answer written to the file contains wave and wind details but does not explicitly include weather conditions.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Is the answer a direct match?\n",
+      "- Partial Match (F1) score is 10.26\u001b[0m\n",
+      "\u001b[32mPassed checkpoints: 8\u001b[0m\n",
+      "\u001b[31mFailed checkpoints: 2\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\u001b[32mScore: 12/14\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\n",
+      "Score: 85.71\n",
+      "Agent config: ./agent_configs/openai-o3-mini.yaml\n",
+      "\u001b[33mHypothesis Final answer extracted: Final Surf Recommendation:\n",
+      "\n",
+      "Location: T Street Beach  \n",
+      "Coordinates: 33.41604405, -117.61725655\n",
+      "\n",
+      "Surf Conditions for 2025-03-29 at 22:00:  \n",
+      "• Wave Direction: Approximately 256°  \n",
+      "• Wave Height: Approximately 1.34 meters  \n",
+      "• Wave Period: Approximately 10.25 seconds  \n",
+      "• Sea Level Adjustment: Approximately -0.78 meters MSL\n",
+      "\n",
+      "Wind Conditions at 22:00:  \n",
+      "• Wind Direction: Approximately 212°  \n",
+      "• Wind Speed: Approximately 13.6 m/s\n",
+      "\n",
+      "Weather Overview:  \n",
+      "At T Street Beach, the forecast suggests a well-balanced and enjoyable surf session. The waves are consistent, with a slight decrease in height and a stable period, providing reliable conditions for carving and riding. The wind is moderate, with a gentle shift that will help to keep the waves well-formed without generating excessive chop, thereby preserving the overall quality of the surf. The overall weather is expected to be favorable, making it an excellent spot for intermediate and experienced surfers looking for a thrilling ride in a relaxed setting.\n",
+      "\n",
+      "This final recommendation has been reviewed and agreed upon with David de la Iglesia Castro. A detailed description also has been saved in the file /projects/final_answer.txt. Enjoy your surf session!\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_surfing_spots tool and it succeeded\n",
+      "- The agent used the get_surfing_spots tool in Call 3 and it succeeded by retrieving surf spot information.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wave_forecast tool and it succeeded\n",
+      "- The agent successfully used the get_wave_forecast tool in Call 4, with a valid output containing wave conditions, thus satisfying the criterion.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wind_forecast tool and it succeeded\n",
+      "- The criterion was satisfied as the agent used the get_wind_forecast tool in Call 5, and it succeeded in providing the wind forecast data.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_area_lat_lon tool and it succeeded\n",
+      "- The agent used the get_area_lat_lon tool in Call 1 with the input area name 'Huntington Beach, California' and received a successful output of coordinates '[\"33.6783336\", \"-118.000016\"]'.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the driving_hours_to_meters tool to convert the driving hours to meters and it succeeded\n",
+      "- The agent used the driving_hours_to_meters tool in Call 2 to convert driving hours to meters and it succeeded, as evidenced by the output of 70000 meters.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the final answer contains any description about the weather at the chosen location\n",
+      "- The final answer includes a section labeled 'Weather Overview', which describes the weather conditions at the chosen location, including surf and wind conditions, and their suitability for surfing.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool\n",
+      "- The final answer contains 'T Street Beach', which is one of the surf spots found via the get_surfing_spots tool.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if Surf location is approximately 'T Street, San Clemente, California'.\n",
+      "- The agent's answer identifies the surf location as T Street Beach, which matches the expected location 'T Street, San Clemente, California'.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the agent confirmed the selection with David de la Iglesia Castro\n",
+      "- The evidence does not show a confirmation from David de la Iglesia Castro agreeing with the choice. The message was sent, but there is no record of his response confirming the selection.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Is the answer a direct match?\n",
+      "- Partial Match (F1) score is 2.41\u001b[0m\n",
+      "\u001b[32mPassed checkpoints: 8\u001b[0m\n",
+      "\u001b[31mFailed checkpoints: 2\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\u001b[32mScore: 12/14\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\n",
+      "Score: 85.71\n",
+      "Agent config: ./agent_configs/smolagents-o1.yaml\n",
+      "\u001b[33mHypothesis Final answer extracted: We've decided on T Street Beach. On 2025-03-29 22:00, the wave height is about 1.34m with a period of ~10.25s, and wave direction around 256°. The wind is coming from ~212° at ~13.6 m/s, which is moderately strong. The sea level height MSL is ~-0.78m. Overall, it should be decent surf conditions with a moderate offshore wind.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_surfing_spots tool and it succeeded\n",
+      "- The agent used the get_surfing_spots tool in Call 6 and it reported a success status.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wave_forecast tool and it succeeded\n",
+      "- The agent used the get_wave_forecast tool in Calls 8 and 12, and both executions were successful.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wind_forecast tool and it succeeded\n",
+      "- The criterion was met because the agent used the get_wind_forecast tool successfully in calls 10 and 14.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_area_lat_lon tool and it succeeded\n",
+      "- The agent used the get_area_lat_lon tool, and it succeeded according to Call 2, where the status is 'success'.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the driving_hours_to_meters tool to convert the driving hours to meters and it succeeded\n",
+      "- The agent successfully used the driving_hours_to_meters tool to convert the driving hours to meters, as evidenced by Call 3 and Call 4 where the tool was used and returned a successful output of 70,000 meters.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent confirmed the selection with David de la Iglesia Castro\n",
+      "- The agent confirmed the selection with David de la Iglesia Castro through a console message, and David responded with his preference for T Street Beach.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the final answer contains any description about the weather at the chosen location\n",
+      "- The final answer contains a description about weather conditions at T Street Beach, including wave height, wave period, wave direction, wind direction, and wind speed.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if Surf location is approximately 'T Street, San Clemente, California'.\n",
+      "- The agent's answer specifies T Street Beach, which is approximately 'T Street, San Clemente, California'.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool\n",
+      "- The final answer does not contain one of the surf spots found by a call of the get_surfing_spots tool because there was no successful output from the get_surfing_spots call.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Is the answer a direct match?\n",
+      "- Partial Match (F1) score is 7.02\u001b[0m\n",
+      "\u001b[32mPassed checkpoints: 8\u001b[0m\n",
+      "\u001b[31mFailed checkpoints: 2\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\u001b[32mScore: 12/14\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\n",
+      "Score: 85.71\n",
+      "Agent config: ./agent_configs/langchain-4o.yaml\n",
+      "\u001b[33mHypothesis Final answer extracted: I have identified T Street Beach as the preferred surfing spot for 2025-03-29 at 22:00 based on the wave and wind conditions. To document this information in the file \"/projects/final_answer.txt\", please run the following command in your environment where file writing is supported:\n",
+      "\n",
+      "```plaintext\n",
+      "**Surfing Location:** T Street Beach\n",
+      "\n",
+      "**Date & Time:** 2025-03-29 22:00\n",
+      "\n",
+      "**Wave Conditions:**\n",
+      "- **Wave Height:** 1.34 meters\n",
+      "- **Wave Direction:** 256Â°\n",
+      "- **Wave Period:** 10.25 seconds\n",
+      "- **Sea Level Height:** -0.78 meters MSL\n",
+      "\n",
+      "**Wind Conditions:**\n",
+      "- **Wind Direction:** 212Â°\n",
+      "- **Wind Speed:** 13.6 meters/second\n",
+      "\n",
+      "**Weather Summary:**\n",
+      "T Street Beach offers excellent surfing conditions with a moderate wave height and optimal wave period for surfing. The wind is blowing from the southwest at a speed that is manageable for surfers, allowing for steady rides on the waves. The sea level is slightly lower than average, which may affect certain beach breaks. Overall, it's shaping up to be a great evening for surfing at T Street Beach.\n",
+      "```\n",
+      "\n",
+      "Please make sure to transfer this content to the specified file within your system.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wave_forecast tool and it succeeded\n",
+      "- The agent used the 'get_wave_forecast' tool, and its execution was marked as 'success' with no errors, meeting the criterion.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the final answer contains any description about the weather at the chosen location\n",
+      "- The final answer contains a detailed description of the wave and wind conditions at T Street Beach, as well as a weather summary, indicating that the weather at the chosen location is described.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool\n",
+      "- The final answer contains 'T Street Beach', which is one of the surf spots identified in the process.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if Surf location is approximately 'T Street, San Clemente, California'.\n",
+      "- The agent's answer mentions 'T Street Beach,' which is approximately the same location as 'T Street, San Clemente, California.'\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the agent used the get_surfing_spots tool and it succeeded\n",
+      "- The telemetry evidence does not show any instance of the agent using the 'get_surfing_spots' tool. The tools used in the evidence are 'get_wave_forecast', 'send_console_message', and 'write_file'.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the agent used the get_wind_forecast tool and it succeeded\n",
+      "- The agent did not use the get_wind_forecast tool; it used the get_wave_forecast tool instead.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the agent used the get_area_lat_lon tool and it succeeded\n",
+      "- The telemetry evidence does not contain any call to the 'get_area_lat_lon' tool.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the agent used the driving_hours_to_meters tool to convert the driving hours to meters and it succeeded\n",
+      "- There is no evidence in the telemetry data that the 'driving_hours_to_meters' tool was used, nor any indication of its success or failure.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the agent confirmed the selection with David de la Iglesia Castro\n",
+      "- The evidence shows a message sent to David de la Iglesia Castro about the surfing spots, but there is no confirmation from David that he agreed with the selection.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Is the answer a direct match?\n",
+      "- Partial Match (F1) score is 2.52\u001b[0m\n",
+      "\u001b[32mPassed checkpoints: 4\u001b[0m\n",
+      "\u001b[31mFailed checkpoints: 6\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\u001b[32mScore: 8/14\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\n",
+      "Score: 57.14\n",
+      "Agent config: ./agent_configs/smolagents-4o-mini.yaml\n",
+      "\u001b[33mHypothesis Final answer extracted: Final Surf Location Summary:\n",
+      "\n",
+      "Location: The Pier Beach\n",
+      "- Wave Height: 1.5 meters\n",
+      "- Wave Direction: 257 degrees\n",
+      "- Wave Period: 9.95 seconds\n",
+      "- Wind Speed: 13.6 m/s (Direction: 212 degrees)\n",
+      "Weather conditions are favorable for surfing.\n",
+      "\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_surfing_spots tool and it succeeded\n",
+      "- The agent used the 'get_surfing_spots' tool successfully in Call 4, as indicated by the 'success' status.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wave_forecast tool and it succeeded\n",
+      "- The criterion was satisfied because the agent used the get_wave_forecast tool in Calls 6 and 8, and both instances succeeded without errors.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wind_forecast tool and it succeeded\n",
+      "- The agent used the get_wind_forecast tool twice, and both instances succeeded according to the telemetry evidence.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_area_lat_lon tool and it succeeded\n",
+      "- The agent used the get_area_lat_lon tool, and the execution was successful as indicated by the 'status': 'success' in Call 2.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the driving_hours_to_meters tool to convert the driving hours to meters and it succeeded\n",
+      "- The agent successfully used the driving_hours_to_meters tool to convert 1 hour of driving into 70,000 meters.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent confirmed the selection with David de la Iglesia Castro\n",
+      "- The agent confirmed the selection with David de la Iglesia Castro by sending a message and receiving a response indicating a preference for T Street Beach.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the final answer contains any description about the weather at the chosen location\n",
+      "- The final answer contains a description of the surfing conditions, including weather conditions, at The Pier Beach.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool\n",
+      "- The final answer contains The Pier Beach, which was found by the get_surfing_spots tool.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if Surf location is approximately 'T Street, San Clemente, California'.\n",
+      "- The agent's answer provided a location of 'The Pier Beach', which does not match the expected 'T Street, San Clemente, California'.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Is the answer a direct match?\n",
+      "- Partial Match (F1) score is 0.0\u001b[0m\n",
+      "\u001b[32mPassed checkpoints: 8\u001b[0m\n",
+      "\u001b[31mFailed checkpoints: 2\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\u001b[32mScore: 8/14\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\n",
+      "Score: 57.14\n",
+      "Agent config: ./agent_configs/smolagents-o3-mini.yaml\n",
+      "\u001b[33mHypothesis Final answer extracted: Surf Location: T Street Beach near Huntington Beach, California (Coordinates: 33.41604405, -117.61725655).\n",
+      "Forecast for 2025-03-29 22:00:\n",
+      "  - Wave conditions: A wave height of 1.34 m with a period of 10.25 seconds and wave direction of 256°. Sea level is measured at -0.78 m relative to mean sea level, indicating well-formed swell.\n",
+      "  - Wind conditions: Winds blowing at 13.6 m/s from a direction of 212°, providing a robust and consistent breeze.\n",
+      "Overall, T Street Beach is set to offer clear skies, a vibrant ocean atmosphere, and excellent surfing conditions. The combination of steady, moderate swell and strong winds makes it an ideal location for surfing. Additional local weather factors such as temperature, humidity, and cloud cover are expected to further enhance the outdoor surfing experience.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_surfing_spots tool and it succeeded\n",
+      "- The agent used the get_surfing_spots tool in Call 5, and it succeeded without errors.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wave_forecast tool and it succeeded\n",
+      "- The agent used the get_wave_forecast tool and it succeeded, as indicated by Call 9 with a status of success.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_wind_forecast tool and it succeeded\n",
+      "- The criterion was satisfied because the agent successfully invoked the 'get_wind_forecast' tool (Call 10) with the specified coordinates and date, and the call was marked as successful.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the get_area_lat_lon tool and it succeeded\n",
+      "- The agent used the get_area_lat_lon tool in Call 2, and it succeeded as indicated by the status 'success'.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent used the driving_hours_to_meters tool to convert the driving hours to meters and it succeeded\n",
+      "- The agent successfully used the driving_hours_to_meters tool in Call 4 to convert 1 driving hour to 70,000 meters, as intended.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the agent confirmed the selection with David de la Iglesia Castro\n",
+      "- The agent confirmed the selection with David de la Iglesia Castro and adjusted the final choice based on his response, showing that confirmation was achieved.\u001b[0m\n",
+      "\n",
+      "\u001b[32mPassed:\n",
+      "- Check if the final answer contains any description about the weather at the chosen location\n",
+      "- The final answer includes a detailed description of the weather conditions at T Street Beach, including wave height, wind speed, and direction, as well as additional local weather factors such as clear skies, temperature, humidity, and cloud cover.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool\n",
+      "- The final answer mentions 'T Street Beach', which was not obtained from the get_surfing_spots tool as its output was 'No output found'.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if Surf location is approximately 'T Street, San Clemente, California'.\n",
+      "- The agent mentioned T Street Beach near Huntington Beach, which is incorrect as the expected location is T Street, San Clemente, California.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Is the answer a direct match?\n",
+      "- Partial Match (F1) score is 5.04\u001b[0m\n",
+      "\u001b[32mPassed checkpoints: 7\u001b[0m\n",
+      "\u001b[31mFailed checkpoints: 3\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\u001b[32mScore: 7/14\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\n",
+      "Score: 50.0\n",
+      "Agent config: ./agent_configs/smolagents-ollama-llama3.1-8b-q4.yaml\n",
+      "\u001b[33mHypothesis Final answer extracted: I'd be happy to help you review the provided code!\n",
+      "\n",
+      "**Overall Code Quality**\n",
+      "\n",
+      "The code is well-structured, readable, and follows good practices. It's easy to understand what each part of the code does. However, there are a few minor issues that can be improved:\n",
+      "\n",
+      "1.  **Error Handling**: The `_make_api_call` method doesn't handle errors properly. When an API call fails, it returns `None`, which might cause unexpected behavior downstream. Consider adding try-except blocks to handle exceptions and return meaningful error messages.\n",
+      "2.  **Code Duplication**: The `get_weather`, `search`, and `get_wave_height` methods all use the `_make_api_call` method. This is a good example of DRY (Don't Repeat Yourself) principle, but you can take it a step further by using a more generic function that takes in the API endpoint URL and parameters.\n",
+      "3.  **Magic Numbers**: The code uses magic numbers like `37.7749` for San Francisco's latitude and `-122.4194` for its longitude. Consider defining constants or enums to make these values more readable and maintainable.\n",
+      "\n",
+      "**Minor Improvements**\n",
+      "\n",
+      "1.  **Type Hints**: Add type hints for function parameters and return types to improve code readability and help catch errors early.\n",
+      "2.  **Docstrings**: Use docstrings to provide a brief description of each method's purpose, parameters, and return values.\n",
+      "3.  **Consistent Naming Conventions**: The code uses both camelCase and underscore notation for variable names. Stick to one convention throughout the code.\n",
+      "\n",
+      "**Best Practices**\n",
+      "\n",
+      "1.  **Testing**: Write unit tests to ensure that each function works correctly in isolation.\n",
+      "2.  **Code Formatting**: Use a consistent coding style and format the code with tools like Black or autopep8.\n",
+      "3.  **API Documentation**: Document your API endpoints, parameters, and responses to make it easier for others (or yourself) to understand how to use them.\n",
+      "\n",
+      "Overall, the code is well-written and easy to follow. With these minor improvements and best practices in mind, you'll be able to maintain a high-quality codebase that's easy to understand and extend.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the agent used the get_surfing_spots tool and it succeeded\n",
+      "- There is no evidence of the agent using the get_surfing_spots tool in the provided telemetry data.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the agent used the get_wave_forecast tool and it succeeded\n",
+      "- The evidence does not show any use of a tool named 'get_wave_forecast', nor does it indicate success with such a tool.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the agent used the get_wind_forecast tool and it succeeded\n",
+      "- The evidence does not indicate that the get_wind_forecast tool was used or that its execution succeeded. The evidence only shows usage of the get_weather functionality and other related code discussions.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the agent used the get_area_lat_lon tool and it succeeded\n",
+      "- There is no evidence that the get_area_lat_lon tool was used or mentioned. The focus was on weather data and API interactions.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the agent used the driving_hours_to_meters tool to convert the driving hours to meters and it succeeded\n",
+      "- The telemetry evidence does not show any usage of the driving_hours_to_meters tool for converting driving hours to meters. The interactions mainly involve weather data retrieval and code structure improvements.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the agent confirmed the selection with David de la Iglesia Castro\n",
+      "- There is no evidence in the provided telemetry data indicating that the agent confirmed the selection with David de la Iglesia Castro.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the final answer contains any description about the weather at the chosen location\n",
+      "- The provided telemetry evidence describes the steps taken to develop code that interacts with weather and surf spot APIs but does not show the final answer including any description about the weather at the chosen location.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if the final answer contains one of the surf spots found by a call of the get_surfing_spots tool\n",
+      "- There is no evidence from the telemetry data showing that the final answer includes a surf spot found using the get_surfing_spots tool.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Check if Surf location is approximately 'T Street, San Clemente, California'.\n",
+      "- The agent's answer does not mention or evaluate the surf location 'T Street, San Clemente, California'. It focuses solely on reviewing code quality and does not address the criterion.\u001b[0m\n",
+      "\n",
+      "\u001b[31mFailed:\n",
+      "- Is the answer a direct match?\n",
+      "- Partial Match (F1) score is 0.67\u001b[0m\n",
+      "\u001b[32mPassed checkpoints: 0\u001b[0m\n",
+      "\u001b[31mFailed checkpoints: 10\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\u001b[32mScore: 0/14\u001b[0m\n",
+      "\u001b[32m=====================================\u001b[0m\n",
+      "\n",
+      "Score: 0.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "df = load_results()\n",
+    "# print out the score and config path columns\n",
+    "logger.info(\"==========================\")\n",
+    "logger.info(\"Summary:\")\n",
+    "df = df.sort_values(by=\"score\", ascending=False)\n",
+    "summary = df[[\"agent_config_path\", \"score\"]]\n",
+    "logger.info(summary.to_string(index=False))\n",
+    "logger.info(\"==========================\")\n",
+    "\n",
+    "# df is already sorted by score (descending) above\n",
+    "\n",
+    "# for each row, log the agent config, the output message and the score\n",
+    "for index, row in df.iterrows():\n",
+    "    logger.info(f\"Agent config: {row['agent_config_path']}\")\n",
+    "    logger.info(row[\"output_message\"])\n",
+    "    logger.info(f\"Score: {row['score']}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

{src/surf_spot_finder/evaluation → notebooks/experiment}/test_cases/alpha.yaml RENAMED Viewed

@@ -1,10 +1,10 @@
-# Test case for surf spot finder
 # You only need this input data if you want to run the test case; if you pass in a path to a telemetry file,
 # this input is ignored
 input:
   location: "Huntington Beach, California"
-  date: "2025-03-29 22:00"
   max_driving_hours: 1
   input_prompt_template: |
     Please help find the best place to surf around {LOCATION},
@@ -17,14 +17,14 @@ input:
 ground_truth:
   - name: "Surf location"
     points: 5
-    value: "Bolsa Chica State Beach"
 # Base checkpoints for agent behavior
 # The evaluators for these checkpoints
 # will not consider the hypothesis answer or final answer in their decision-making
 checkpoints:
   - points: 1
-    criteria: "Check if the agent used the get_surfing_spots tool and it succeeded"
   - points: 1
     criteria: "Check if the agent used the get_wave_forecast tool and it succeeded"
   - points: 1

+output_path: "results.json"
 # You only need this input data if you want to run the test case; if you pass in a path to a telemetry file,
 # this input is ignored
 input:
   location: "Huntington Beach, California"
+  date: "2025-04-10 14:00"
   max_driving_hours: 1
   input_prompt_template: |
     Please help find the best place to surf around {LOCATION},
 ground_truth:
   - name: "Surf location"
     points: 5
+    value: "T Street, San Clemente, California"
 # Base checkpoints for agent behavior
 # The evaluators for these checkpoints
 # will not consider the hypothesis answer or final answer in their decision-making
 checkpoints:
   - points: 1
+    criteria: "Check if the agent used the get_surfing_spots tool and it succeeded, and that the tool was used before the get_wave_forecast and get_wind_forecast tools"
   - points: 1
     criteria: "Check if the agent used the get_wave_forecast tool and it succeeded"
   - points: 1

pyproject.toml CHANGED Viewed

@@ -37,7 +37,9 @@ tests = [
 ]
 dev = [
-    "pre-commit>=4.1.0",
 ]
 [project.urls]

 ]
 dev = [
+  "pre-commit>=4.1.0",
+  "ipykernel>=6.29.5",
+  "jupyter>=1.1.1",
 ]
 [project.urls]

src/surf_spot_finder/cli.py CHANGED Viewed

@@ -35,7 +35,7 @@ def find_surf_spot(
             config.main_agent.instructions = SINGLE_AGENT_SYSTEM_PROMPT
     logger.info("Setting up tracing")
-    tracing_path = setup_tracing(config.framework)
     logger.info(f"Loading {config.framework} agent")
     logger.info(f"{config.managed_agents}")

             config.main_agent.instructions = SINGLE_AGENT_SYSTEM_PROMPT
     logger.info("Setting up tracing")
+    tracing_path = setup_tracing(config.framework, "output")
     logger.info(f"Loading {config.framework} agent")
     logger.info(f"{config.managed_agents}")

src/surf_spot_finder/evaluation/main.py CHANGED Viewed

@@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional
 from any_agent import AnyAgent
 from any_agent.telemetry import TelemetryProcessor
-from any_agent.tracing import get_tracer_provider, setup_tracing
 from fire import Fire
 from surf_spot_finder.config import (
@@ -23,37 +23,22 @@ from surf_spot_finder.utils.logging import get_logger
 logger = get_logger()
-def run(test_case: TestCase, agent_config_path: str) -> str:
-    input_data = test_case.input
-    logger.info("Loading config")
-    config = Config.from_yaml(agent_config_path)
-    # pretty print
-    logger.info(
-        f"Overriding config with test case input:\n{json.dumps(input_data.model_dump(), indent=2)}"
-    )
-    config.location = input_data.location
-    config.date = input_data.date
-    config.max_driving_hours = input_data.max_driving_hours
-    config.input_prompt_template = input_data.input_prompt_template
     logger.info("Setting up tracing")
-    tracer_provider, tracing_path = get_tracer_provider(
-        project_name="surf-spot-finder", agent_framework=config.framework
-    )
-    setup_tracing(tracer_provider, config.framework)
-    logger.info(f"Loading {config.framework} agent")
-    logger.info(f"{config.managed_agents}")
     agent = AnyAgent.create(
-        agent_framework=config.framework,
-        agent_config=config.main_agent,
-        managed_agents=config.managed_agents,
     )
-    query = config.input_prompt_template.format(
-        LOCATION=config.location,
-        MAX_DRIVING_HOURS=config.max_driving_hours,
-        DATE=config.date,
     )
     logger.info(f"Running agent with query:\n{query}")
     agent.run(query)
@@ -148,7 +133,7 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
     # Save the evaluation results
     save_evaluation_results(
-        test_case_path=test_case.test_case_path,
         output_path=test_case.output_path,
         output_message=output_message,
         telemetry_path=telemetry_path,
@@ -171,7 +156,9 @@ def evaluate(
         telemetry_path: Optional path to an existing telemetry file. If not provided,
                         the agent will be run to generate one.
     """
-    test_case = TestCase.from_yaml(test_case_path=test_case_path)
     if telemetry_path is None:
         logger.info(
@@ -180,7 +167,7 @@ def evaluate(
         assert (
             agent_config_path is not None
         ), "Agent config path must be provided if running agent"
-        telemetry_path = run(test_case, agent_config_path)
     else:
         logger.info(f"Using provided telemetry file: {telemetry_path}")
         logger.info(

 from any_agent import AnyAgent
 from any_agent.telemetry import TelemetryProcessor
+from any_agent.tracing import setup_tracing
 from fire import Fire
 from surf_spot_finder.config import (
 logger = get_logger()
+def run(agent_config: Config) -> str:
     logger.info("Setting up tracing")
+    tracing_path = setup_tracing(agent_config.framework, "output")
+    logger.info(f"Loading {agent_config.framework} agent")
+    logger.info(f"{agent_config.managed_agents}")
     agent = AnyAgent.create(
+        agent_framework=agent_config.framework,
+        agent_config=agent_config.main_agent,
+        managed_agents=agent_config.managed_agents,
     )
+    query = agent_config.input_prompt_template.format(
+        LOCATION=agent_config.location,
+        MAX_DRIVING_HOURS=agent_config.max_driving_hours,
+        DATE=agent_config.date,
     )
     logger.info(f"Running agent with query:\n{query}")
     agent.run(query)
     # Save the evaluation results
     save_evaluation_results(
+        test_case=test_case,
         output_path=test_case.output_path,
         output_message=output_message,
         telemetry_path=telemetry_path,
         telemetry_path: Optional path to an existing telemetry file. If not provided,
                         the agent will be run to generate one.
     """
+    test_case = TestCase.from_yaml(
+        test_case_path=test_case_path, agent_config_path=agent_config_path
+    )
     if telemetry_path is None:
         logger.info(
         assert (
             agent_config_path is not None
         ), "Agent config path must be provided if running agent"
+        telemetry_path = run(test_case.agent_config)
     else:
         logger.info(f"Using provided telemetry file: {telemetry_path}")
         logger.info(

src/surf_spot_finder/evaluation/results_saver.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 import pandas as pd
 from surf_spot_finder.utils.logging import get_logger
 # Use the shared logger
@@ -8,7 +9,7 @@ logger = get_logger()
 def save_evaluation_results(
-    test_case_path: str,
     output_path: str,
     output_message: str,
     telemetry_path: str,
@@ -21,7 +22,8 @@ def save_evaluation_results(
     Save evaluation results to the specified output path.
     Args:
-        test_case_path: Path to the test case file
         output_path: Path to save the results
         output_message: Formatted output message with evaluation details
         telemetry_path: Path to the telemetry file used
@@ -44,7 +46,9 @@ def save_evaluation_results(
             pd.DataFrame(
                 [
                     {
-                        "test_case_path": test_case_path,
                         "output_message": output_message,
                         "telemetry_path": telemetry_path,
                         "hypothesis_answer": hypothesis_answer,

 import os
 import pandas as pd
+from surf_spot_finder.evaluation.test_case import TestCase
 from surf_spot_finder.utils.logging import get_logger
 # Use the shared logger
 def save_evaluation_results(
+    test_case: TestCase,
     output_path: str,
     output_message: str,
     telemetry_path: str,
     Save evaluation results to the specified output path.
     Args:
+        test_case: The test case being evaluated (a TestCase object, not a file path)
+        agent_config: Path to the agent configuration file
         output_path: Path to save the results
         output_message: Formatted output message with evaluation details
         telemetry_path: Path to the telemetry file used
             pd.DataFrame(
                 [
                     {
+                        "config": test_case.model_dump(),
+                        "agent_config_path": test_case.agent_config_path,
+                        "test_case_path": test_case.test_case_path,
                         "output_message": output_message,
                         "telemetry_path": telemetry_path,
                         "hypothesis_answer": hypothesis_answer,

src/surf_spot_finder/evaluation/test_case.py CHANGED Viewed

@@ -3,6 +3,8 @@ from pydantic import BaseModel, Field, ConfigDict
 import yaml
 from litellm import validate_environment
 class InputModel(BaseModel):
     """Input configuration for the surf spot finder test case"""
@@ -30,10 +32,12 @@ class TestCase(BaseModel):
     llm_judge: str
     final_answer_criteria: List[CheckpointCriteria] = Field(default_factory=list)
     test_case_path: str
     output_path: str = "output/results.json"
     @classmethod
-    def from_yaml(cls, test_case_path: str) -> "TestCase":
         """Load a test case from a YAML file and process it"""
         with open(test_case_path, "r") as f:
             test_case_dict = yaml.safe_load(f)
@@ -64,6 +68,19 @@ class TestCase(BaseModel):
             ]
         test_case_dict["test_case_path"] = test_case_path
         # verify that the llm_judge is a valid litellm model
         validate_environment(test_case_dict["llm_judge"])
         return cls.model_validate(test_case_dict)

 import yaml
 from litellm import validate_environment
+from surf_spot_finder.config import Config
 class InputModel(BaseModel):
     """Input configuration for the surf spot finder test case"""
     llm_judge: str
     final_answer_criteria: List[CheckpointCriteria] = Field(default_factory=list)
     test_case_path: str
+    agent_config_path: str
+    agent_config: Config
     output_path: str = "output/results.json"
     @classmethod
+    def from_yaml(cls, test_case_path: str, agent_config_path: str) -> "TestCase":
         """Load a test case from a YAML file and process it"""
         with open(test_case_path, "r") as f:
             test_case_dict = yaml.safe_load(f)
             ]
         test_case_dict["test_case_path"] = test_case_path
+        test_case_dict["agent_config_path"] = agent_config_path
+        with open(agent_config_path, "r") as f:
+            agent_config_dict = yaml.safe_load(f)
+        agent_config_dict["location"] = test_case_dict["input"]["location"]
+        agent_config_dict["date"] = test_case_dict["input"]["date"]
+        agent_config_dict["max_driving_hours"] = test_case_dict["input"][
+            "max_driving_hours"
+        ]
+        agent_config_dict["input_prompt_template"] = test_case_dict["input"][
+            "input_prompt_template"
+        ]
+        agent_config = Config.model_validate(agent_config_dict)
+        test_case_dict["agent_config"] = agent_config
         # verify that the llm_judge is a valid litellm model
         validate_environment(test_case_dict["llm_judge"])
         return cls.model_validate(test_case_dict)