gabriel-melki committed on
Commit
0e032a7
·
1 Parent(s): 68f7ba1

Reorganize tools into separate modules and add .gitignore


- Move tool functions into dedicated modules in tools/ directory
- Add comprehensive .gitignore to prevent binary files from being tracked
- Clean up old test files and logs
- Remove problematic binary files from repository
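For orientation, the import surface after this reorganization looks as follows; these paths are taken from the updated app.py further down, while the presence of an empty tools/__init__.py package marker is an assumption not shown in this diff:

# New per-module imports of the agent's tools (paths as used in app.py below;
# an empty tools/__init__.py is assumed)
from tools.wikipedia_tools import wikipedia_summary, read_wikipedia_page
from tools.file_tools import read_file_as_text
from tools.youtube_tools import download_youtube_url_images, download_youtube_url_audio
from tools.image_processing_tools import ask_question_about_image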

.gitignore ADDED
@@ -0,0 +1,80 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+ MANIFEST
23
+
24
+ # Virtual environments
25
+ agents_env/
26
+ venv/
27
+ env/
28
+ ENV/
29
+
30
+ # IDE
31
+ .vscode/
32
+ .idea/
33
+ *.swp
34
+ *.swo
35
+
36
+ # OS
37
+ .DS_Store
38
+ Thumbs.db
39
+
40
+ # Media files (for GAIA benchmark)
41
+ *.mp3
42
+ *.mp4
43
+ *.wav
44
+ *.avi
45
+ *.mov
46
+ *.mkv
47
+ *.flv
48
+ *.webm
49
+
50
+ # Image files (if not needed for the app)
51
+ *.jpg
52
+ *.jpeg
53
+ *.png
54
+ *.gif
55
+ *.bmp
56
+ *.tiff
57
+ *.svg
58
+
59
+ # Documents
60
+ *.pdf
61
+ *.doc
62
+ *.docx
63
+ *.xls
64
+ *.xlsx
65
+ *.ppt
66
+ *.pptx
67
+
68
+ # Logs and temporary files
69
+ logs/
70
+ *.log
71
+ *.tmp
72
+ *.temp
73
+
74
+ # Jupyter
75
+ .ipynb_checkpoints/
76
+ *.ipynb
77
+
78
+ # Test files
79
+ test*.py
80
+ *_test.py
README.md DELETED
@@ -1,15 +0,0 @@
1
- ---
2
- title: Template Final Assignment
3
- emoji: 🕵🏻‍♂️
4
- colorFrom: indigo
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.25.2
8
- app_file: app.py
9
- pinned: false
10
- hf_oauth: true
11
- # optional, default duration is 8 hours/480 minutes. Max duration is 30 days/43200 minutes.
12
- hf_oauth_expiration_minutes: 480
13
- ---
14
-
15
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/app.cpython-313.pyc DELETED
Binary file (10.4 kB)
 
__pycache__/wiki_extractor.cpython-313.pyc DELETED
Binary file (15.6 kB)
 
__pycache__/wikipedia_tools.cpython-313.pyc DELETED
Binary file (11.1 kB)
 
agent.py CHANGED
@@ -0,0 +1,11 @@
1
+ from smolagents import CodeAgent
2
+ from prompt import get_prompt
3
+
4
+ class QuestionAnsweringAgent(CodeAgent):
5
+ def __init__(self, *args, **kwargs):
6
+ super().__init__(*args, **kwargs)
7
+
8
+ def __call__(self, question_text, file_name) -> str:
9
+ enhanced_question = get_prompt(question_text, file_name)
10
+ response = self.run(enhanced_question, reset=True)
11
+ return response
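A minimal sketch of how this wrapper is intended to be used; the model id mirrors app.py, while the single-tool list and the sample question are illustrative only:

from smolagents import InferenceClientModel, FinalAnswerTool
from agent import QuestionAnsweringAgent

model = InferenceClientModel(provider="auto", model_id="Qwen/Qwen3-Coder-30B-A3B-Instruct")
agent = QuestionAnsweringAgent(
    name="question_answering_expert",
    model=model,
    tools=[FinalAnswerTool()],  # app.py below registers the full tool set
)
# __call__ wraps the question with get_prompt() and runs one fresh CodeAgent episode
answer = agent("What is the capital of France?", file_name="")
print(answer)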
app.py CHANGED
@@ -1,115 +1,49 @@
1
- from math import e
2
  import os
3
  import gradio as gr
4
  import requests
5
  import pandas as pd
6
- import json
7
- import csv
8
- import openpyxl
9
- import whisper
10
- from prompt import get_prompt
11
- from huggingface_hub import login
12
  from smolagents import (
13
  InferenceClientModel,
14
- FinalAnswerTool,
15
- CodeAgent
16
  )
17
 
18
- from smolagents.tools import tool, Tool
19
- from smolagents.agents import PromptTemplates
20
- from wikipedia_tools import wikipedia_summary, read_wikipedia_page
21
- #login(token=os.environ["HF_TOKEN"])
22
 
23
- # --- Constants ---
24
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
25
 
 
26
 
27
- # --- model initialization ---
28
  model = InferenceClientModel(
29
  provider="auto",
30
- model_id="Qwen/Qwen3-Coder-30B-A3B-Instruct", # Changed to larger model
31
- #timeout=30
 
 
32
  )
33
 
34
- final_answer = FinalAnswerTool()
35
-
 
 
 
 
 
36
 
37
- def _download_file(file_name: str) -> None:
38
- if not os.path.exists(file_name):
39
- url = f"{DEFAULT_API_URL}/files/{file_name.split('.')[-2]}"
40
- r = requests.get(url)
41
- with open(file_name, "wb") as f:
42
- f.write(r.content)
43
-
44
- @tool
45
- def read_file_as_text(file_name: str) -> str:
46
- """
47
- Opens a file and returns its content as readable text.
48
- Supports 'txt', 'json', 'csv', 'xlsx', and 'mp3' (for mp3, it transcribes speech to text).
49
- Args:
50
- file_name (str): The path or name of the file.
51
- Returns:
52
- str: The content of the file as text, or transcribed speech if 'mp3'.
53
- """
54
- _download_file(file_name)
55
- file_type = file_name.split(".")[-1]
56
- try:
57
- if file_type in {"txt", "py"}:
58
- with open(file_name, "r", encoding="utf-8") as f:
59
- return f.read()
60
- elif file_type == "json":
61
- with open(file_name, "r", encoding="utf-8") as f:
62
- data = json.load(f)
63
- return json.dumps(data, indent=2)
64
- elif file_type == "csv":
65
- with open(file_name, "r", encoding="utf-8") as f:
66
- reader = csv.reader(f)
67
- rows = list(reader)
68
- return "\n".join([", ".join(row) for row in rows])
69
- elif file_type == "xlsx":
70
- wb = openpyxl.load_workbook(file_name, data_only=True)
71
- sheet = wb.active
72
- content = []
73
- for row in sheet.iter_rows(values_only=True):
74
- content.append(", ".join(str(cell) if cell is not None else "" for cell in row))
75
- return "\n".join(content)
76
- elif file_type == "mp3":
77
- w = whisper.load_model("base")
78
- res = w.transcribe(file_name)
79
- return res["text"]
80
- else:
81
- return f"File type '{file_type}' not supported."
82
- except FileNotFoundError:
83
- return f"File '{file_name}' not found."
84
- except Exception as e:
85
- return f"Error opening file '{file_name}': {str(e)}"
86
-
87
- # --- Prompt templates ---
88
-
89
-
90
- class QuestionAnsweringAgent(CodeAgent):
91
- def __init__(self, *args, **kwargs):
92
- super().__init__(*args, **kwargs)
93
-
94
- def __call__(self, question_text, file_name) -> str:
95
- enhanced_question = get_prompt(question_text, file_name)
96
- response = self.run(enhanced_question, reset=True)
97
- return response
98
-
99
- # Create agent with all the tools
100
  agent = QuestionAnsweringAgent(
101
  name="question_answering_expert",
102
  model=model,
103
- tools=[final_answer, wikipedia_summary, read_wikipedia_page, read_file_as_text],
104
- add_base_tools=False, # Disable auto base tools to avoid overriding custom visit_webpage
105
  planning_interval=None, # Disable planning to ensure immediate stop after final_answer
106
  additional_authorized_imports=["bs4"],
107
- max_steps=5,
108
- verbosity_level=2, # Add for better debugging
109
- #use_structured_outputs_internally=True # Enable structured output
110
  )
111
-
112
-
113
  def run_and_submit_all( profile: gr.OAuthProfile | None):
114
  """
115
  Fetches all questions, runs the BasicAgent on them, submits all answers,
@@ -165,7 +99,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
165
  results_log = []
166
  answers_payload = []
167
  print(f"Running agent on {len(questions_data)} questions...")
168
- for item in questions_data[:1]:
169
  task_id = item.get("task_id")
170
  question_text = item.get("question")
171
  file_name = item.get("file_name")
 
 
1
  import os
2
  import gradio as gr
3
  import requests
4
  import pandas as pd
5
+
 
 
 
 
 
6
  from smolagents import (
7
  InferenceClientModel,
8
+ FinalAnswerTool
 
9
  )
10
 
11
+ from tools.wikipedia_tools import wikipedia_summary, read_wikipedia_page
12
+ from tools.file_tools import read_file_as_text
13
+ from tools.youtube_tools import download_youtube_url_images, download_youtube_url_audio
14
+ from tools.image_processing_tools import ask_question_about_image
15
 
16
+ from agent import QuestionAnsweringAgent
 
17
 
18
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
19
 
 
20
  model = InferenceClientModel(
21
  provider="auto",
22
+ model_id="Qwen/Qwen3-Coder-30B-A3B-Instruct",
23
+ temperature=0,
24
+ top_p=1.0,
25
+ seed=42,
26
  )
27
 
28
+ agent_tools = [
29
+ FinalAnswerTool(),
30
+ wikipedia_summary, read_wikipedia_page,
31
+ read_file_as_text,
32
+ download_youtube_url_images, download_youtube_url_audio,
33
+ ask_question_about_image
34
+ ]
35
 
36
  agent = QuestionAnsweringAgent(
37
  name="question_answering_expert",
38
  model=model,
39
+ tools=agent_tools,
40
+ add_base_tools=True, # Enable base tools (web_search, visit_webpage, etc.) in addition to the custom tools
41
  planning_interval=None, # Disable planning to ensure immediate stop after final_answer
42
  additional_authorized_imports=["bs4"],
43
+ max_steps=10,
44
+ verbosity_level=2, # For better debugging
 
45
  )
46
+
 
47
  def run_and_submit_all( profile: gr.OAuthProfile | None):
48
  """
49
  Fetches all questions, runs the BasicAgent on them, submits all answers,
 
99
  results_log = []
100
  answers_payload = []
101
  print(f"Running agent on {len(questions_data)} questions...")
102
+ for item in questions_data[:]:
103
  task_id = item.get("task_id")
104
  question_text = item.get("question")
105
  file_name = item.get("file_name")
logs DELETED
@@ -1,221 +0,0 @@
1
-
2
- Launching Gradio Interface for Basic Agent Evaluation...
3
- * Running on local URL: http://0.0.0.0:7860, with SSR ⚡ (experimental, to disable set `ssr_mode=False` in `launch()`)
4
-
5
- To create a public link, set `share=True` in `launch()`.
6
- User logged in: gabzer
7
- https://huggingface.co/spaces/gabzer/GAIA_benchmark_agent/tree/main
8
- Fetching questions from: https://agents-course-unit4-scoring.hf.space/questions
9
- Fetched 20 questions.
10
- Running agent on 20 questions...
11
- === Starting agent run ===
12
- ╭──────────────────── New run - question_answering_expert ─────────────────────╮
13
- │ │
14
- │ You are a highly precise question-answering agent. │
15
- │ When given a question: │
16
- │ - If necessary, perform a wikipedia search using the │
17
- │ `wikipedia_search` tool to find possible sources of information. For the │
18
- │ `query` parameter of the `wikipedia_search` tool, enter only the name of the │
19
- │ person, the place, or the event you want to search. Not something too long. │
20
- │ - If necessary, perform a web search using the `web_search` tool to │
21
- │ find possible sources of information. │
22
- │ - If the web search only returns titles and short snippets, you MUST │
23
- │ visit the actual webpage using the `visit_webpage` tool to read the full │
24
- │ content before answering. │
25
- │ - If the task requires reading, listening, or analyzing a file, you │
26
- │ must use the file specified after the question, NOT the file name mentioned │
27
- │ casually inside the question text. │
28
- │ - Comma separated lists MUST contain a single space after each │
29
- │ comma. │
30
- │ - If you are asked for a number, don't use comma to write your │
31
- │ number, nor use units such as $$ or percent sign unless specified otherwise. │
32
- │ - If you are asked for a string, don't use articles, nor │
33
- │ abbreviations (e.g. for cities), and write the digits in plain text unless │
34
- │ specified otherwise. │
35
- │ - If you are asked for a comma separated list, apply the above rules │
36
- │ depending of whether the element to be put in the list is a number or a │
37
- │ string. │
38
- │ - Only answer after you have gathered enough information by reading │
39
- │ the actual page contents. │
40
- │ - Only answer after you have printed out the final answer first. │
41
- │ - Once you have obtained the final answer, you MUST make a code call │
42
- │ as follows: │
43
- │ <code> │
44
- │ final_answer("your_answer") │
45
- │ </code> │
46
- │ to submit the final answer. │
47
- │ - Do not retry or execute anything else after calling │
48
- │ `final_answer`. STOP IMMEDIATELY. │
49
- │ - Calling `final_answer` terminates the task completely. No further │
50
- │ steps are needed. │
51
- │ - The function `final_answer` must wrap the exact printed value. │
52
- │ - Provide ONLY the precise answer requested. │
53
- │ - Do not include explanations, steps, reasoning, or additional text │
54
- │ when calling `final_answer`. │
55
- │ - Be direct and specific. The GAIA benchmark requires exactly │
56
- │ matching answers. │
57
- │ │
58
- │ Based on the above guidelines, answer the following question: │
59
- │ -- beginning of question -- │
60
- │ How many studio albums were published by Mercedes Sosa between 2000 │
61
- │ and 2009 (included)? You can use the latest 2022 version of english │
62
- │ wikipedia. │
63
- │ -- end of question -- │
64
- │ If the questions mentions the need to use a file, use the following │
65
- │ `file_name` value below as the `file_name` parameter in any function calls: │
66
- │ file_name: │
67
- │ │
68
- ╰─ InferenceClientModel - Qwen/Qwen2.5-Coder-7B-Instruct ──────────────────────╯
69
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 1 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
70
- Output message of the LLM: ─────────────────────────────────────────────────────
71
- Thought: To find how many studio albums Mercedes Sosa published between 2000 and
72
- 2009, I need to look up her discography on Wikipedia. This will give me the
73
- information I require.
74
- <code>
75
- sosa_discography = wikipedia_search("Mercedes Sosa discography")
76
- print(sosa_discography)
77
-
78
- ─ Executing parsed code: ─────────────────────────────────────────────────────
79
- sosa_discography = wikipedia_search("Mercedes Sosa discography")
80
- print(sosa_discography)
81
- ──────────────────────────────────────────────────────────────────────────────
82
- Execution logs:
83
- No Wikipedia page found for 'Mercedes Sosa discography'. Try a different query.
84
-
85
- Out: None
86
- [Step 1: Duration 2.08 seconds| Input tokens: 2,734 | Output tokens: 72]
87
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 2 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
88
- Output message of the LLM: ─────────────────────────────────────────────────────
89
- Thought: The previous query didn't yield any results. It could be due to the
90
- specificity or the lack of a recent entry. I'll broaden the query to search for
91
- "Mercedes Sosa career".
92
- Using broadened query for better results.
93
- <code>
94
- sosa_career = wikipedia_search("Mercedes Sosa career")
95
- print(sosa_career)
96
-
97
- ─ Executing parsed code: ─────────────────────────────────────────────────────
98
- sosa_career = wikipedia_search("Mercedes Sosa career")
99
- print(sosa_career)
100
- ──────────────────────────────────────────────────────────────────────────────
101
- Execution logs:
102
- No Wikipedia page found for 'Mercedes Sosa career'. Try a different query.
103
-
104
- Out: None
105
- [Step 2: Duration 1.96 seconds| Input tokens: 5,639 | Output tokens: 149]
106
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 3 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
107
- Output message of the LLM: ─────────────────────────────────────────────────────
108
- Thought: Searching for "Mercedes Sosa's album releases" may be more accurate
109
- since I am specifically looking for studio albums.
110
- <code>
111
- sosa_album_releases = wikipedia_search("Mercedes Sosa's album releases")
112
- print(sosa_album_releases)
113
-
114
- ─ Executing parsed code: ──────────────────────────────────────
115
- sosa_album_releases = wikipedia_search("Mercedes Sosa's album releases")
116
- print(sosa_album_releases)
117
- ──────────────────────────────────────────────────────────────────────────────
118
- Execution logs:
119
- No Wikipedia page found for 'Mercedes Sosa's album releases'. Try a different
120
- query.
121
-
122
- Out: None
123
- [Step 3: Duration 1.82 seconds| Input tokens: 8,720 | Output tokens: 205]
124
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 4 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
125
- Output message of the LLM: ─────────────────────────────────────────────────────
126
- Thought: Given that previous searches were unsuccessful, let me perform a
127
- general web search for "Mercedes Sosa albums between 2000 and 2009" and examine
128
- the results for relevant information.
129
- <code>
130
- album_info = web_search("Mercedes Sosa albums between 2000 and 2009")
131
- print(album_info)
132
-
133
- ─ Executing parsed code: ─────────────────────────────────────────────────────
134
- album_info = web_search("Mercedes Sosa albums between 2000 and 2009")
135
- print(album_info)
136
- ──────────────────────────────────────────────────────────────────────────────
137
- Execution logs:
138
- ## Search Results
139
-
140
- [Category: Mercedes Sosa albums -
141
- Wikipedia](https://en.wikipedia.org/wiki/Category:Mercedes_Sosa_albums)
142
- This is a set category. It should only contain pages that are Mercedes Sosa
143
- albums or lists of Mercedes Sosa albums , as well as subcategories containing
144
- those things (themselves set categories).
145
-
146
- [Mercedes Sosa Albums and
147
- Discography](https://genius.com/artists/Mercedes-sosa/albums)
148
- All Albums by Mercedes Sosa . Mercedes Sosa discography includes 45 albums .
149
-
150
- [Mercedes Sosa | Discografia |
151
- Discogs](https://www.discogs.com/it/artist/333361-Mercedes-Sosa)
152
- Mercedes Sosa , known as La Negra, (born July 9, 1935 in San Miguel de Tucuman,
153
- Argentina – Death October 4, 2009 in Buenos Aires) was an Argentine singer who
154
- was and remains immensely popular throughout Latin America and internationally.
155
-
156
- [Mercedes Sosa - Apple
157
- Music](https://music.apple.com/tc/artist/mercedes-sosa/122968)
158
- Mercedes Sosa . Latest Release. 24 NOV 2024. Mercedes Sosa en New York, 1974 ·
159
- 2024. Palabras de Mercedes IX (En Vivo NY).
160
-
161
- [Mercedes Sosa : albums , chansons, concerts |
162
- Deezer](https://www.deezer.com/fr/artist/10606)
163
- Mercedes Sosa sur Deezer : discographie de l'artiste, top albums et chansons,
164
- concerts et featurings.
165
-
166
- [Mercedes Sosa on TIDAL](https://tidal.com/browse/artist/3501549)
167
- Albums . Mercedes Sosa en New York, 1974. Mercedes Sosa . 2009 . Zona de
168
- Promesas (with Gustavo Cerati).
169
-
170
- [Mercedes Sosa : Best songs, Albums and Concerts -
171
- Mozaart](https://mozaart.com/en/a/mercedes-sosa)
172
- Haydée Mercedes Sosa (Spanish pronunciation: [meɾˈseðes ˈsosa]; 9 July 1935 – 4
173
- October 2009 ), sometimes known as La Negra (literally: The Black One), was an
174
- Argentine singer who was popular throughout Latin America and many countries
175
- outsid.
176
-
177
- [Mercedes Sosa - Albums](https://muzikum.eu/en/mercedes-sosa/discography)
178
- Mercedes Sosa . 23 Albums found.Reyes Records (2003). Mercedes Sosa - A Arte De
179
- (bradypus Series).
180
-
181
- [Cantora 1 - Mercedes Sosa | Album |
182
- AllMusic](https://www.allmusic.com/album/cantora-1-mw0000827785)
183
- Cantora 1 by Mercedes Sosa released in 2009 . Find album reviews, track lists,
184
- credits, awards and more at AllMusic.
185
-
186
- [Mercedes Sosa Albums , Songs - Discography - Album of The
187
- Year](https://www.albumoftheyear.org/artist/33792-mercedes-sosa/?type=live)
188
- Information on Mercedes Sosa . Complete discography, ratings, reviews and
189
- more.1982. Mercedes Sosa En Argentina. Live.
190
-
191
- Out: None
192
- [Step 4: Duration 3.07 seconds| Input tokens: 11,961 | Output tokens: 283]
193
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 5 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
194
- Output message of the LLM: ─────────────────────────────────────────────────────
195
- Thought: After reviewing the web search results, it is apparent that there is
196
- limited information available about Mercedes Sosa releasing albums specifically
197
- between 2000 and 2009. The most relevant result indicates a live album released
198
- in 1982 and a song release in 2009, but not studio albums in this range.
199
-
200
- As no specific information about the number ofstudio albums released by Mercedes
201
- Sosa between 2000 and 2009 was found, I cannot determine the exact count.
202
- Therefore, I will finalize my answer as "unknown."
203
- <code>
204
- final_answer("unknown")
205
-
206
- ─ Executing parsed code: ─────────────────────────────────────────────────────
207
- final_answer("unknown")
208
- ──────────────────────────────────────────────────────────────────────────────
209
- Final answer: unknown
210
- [Step 5: Duration 2.95 seconds| Input tokens: 16,092 | Output tokens: 414]
211
- >>> Agent returned: unknown
212
- Task 8e867cd7-cff9-4e6c-867a-ff5ddc2550be execution steps: 6
213
- Step 1: TaskStep
214
- Step 2: ActionStep
215
- Step 3: ActionStep
216
- Step 4: ActionStep
217
- Step 5: ActionStep
218
- Step 6: ActionStep
219
- Agent finished. Submitting 1 answers for user 'gabzer'...
220
- Submitting 1 answers to: https://agents-course-unit4-scoring.hf.space/submit
221
- Submission Failed: Server responded with status 422. Detail: [{'type': 'string_type', 'loc': ['body', 'answers', 0, 'submitted_answer', 'str'], 'msg': 'Input should be a valid string', 'input': None}, {'type': 'int_type', 'loc': ['body', 'answers', 0, 'submitted_answer', 'int'], 'msg': 'Input should be a valid integer', 'input': None}, {'type': 'float_type', 'loc': ['body', 'answers', 0, 'submitted_answer', 'float'], 'msg': 'Input should be a valid number', 'input': None}]
prompt.py CHANGED
@@ -11,6 +11,7 @@ def get_prompt(question_text, file_name):
11
  - "Battle of Hastings timeline" → use: wikipedia_summary("Battle of Hastings")
12
  - "Population of Paris in 2010" → use: wikipedia_summary("Paris")
13
  - If necessary, visit the wikipedia page listed in the wikipedia summary tool to read the full content. You will find the page url in the output of the wikipedia summary tool at the end after the **Read more:** section. Use the `read_wikipedia_page` tool to visit the page.
 
14
  - If necessary, perform a web search using the `web_search` tool to find possible sources of information.
15
  - If the web search only returns titles and short snippets, you MUST visit the actual webpage using the `read_wikipedia_page` tool to read the full content before answering.
16
  - If the task requires reading, listening, or analyzing a file, you must use the file specified after the question, NOT the file name mentioned casually inside the question text.
 
11
  - "Battle of Hastings timeline" → use: wikipedia_summary("Battle of Hastings")
12
  - "Population of Paris in 2010" → use: wikipedia_summary("Paris")
13
  - If necessary, visit the wikipedia page listed in the wikipedia summary tool to read the full content. You will find the page url in the output of the wikipedia summary tool at the end after the **Read more:** section. Use the `read_wikipedia_page` tool to visit the page.
14
+ - When using the `read_wikipedia_page` tool, you may find tables in the page. To analyze the tables, please use a code snippet to read the tables into a pandas dataframe and analyze the data.
15
  - If necessary, perform a web search using the `web_search` tool to find possible sources of information.
16
  - If the web search only returns titles and short snippets, you MUST visit the actual webpage using the `read_wikipedia_page` tool to read the full content before answering.
17
  - If the task requires reading, listening, or analyzing a file, you must use the file specified after the question, NOT the file name mentioned casually inside the question text.
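The table-handling instruction added above expects the agent to emit a code step roughly like this; the article URL and table index are placeholders, and pandas.read_html additionally requires lxml or html5lib to be installed:

import pandas as pd

# Parse every <table> on the page into a DataFrame, then inspect the relevant one
tables = pd.read_html("https://en.wikipedia.org/wiki/Mercedes_Sosa")
discography = tables[0]  # placeholder index; pick the table whose columns match the question
print(discography.columns.tolist())
print(len(discography))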
requirements.txt CHANGED
@@ -3,12 +3,12 @@ requests==2.32.5
3
  smolagents==1.21.3
4
  duckduckgo-search==8.1.1
5
  ddgs==9.5.5
6
- requests==2.32.5
7
  markdownify==0.11.0
8
  openpyxl==3.1.5
9
  wikipedia-api==0.8.1
10
- whisper==1.1.10
11
  beautifulsoup4==4.12.3
12
  langchain_community==0.3.2
13
  wikipedia==1.4.0
14
- tabulate==0.9.0
 
 
3
  smolagents==1.21.3
4
  duckduckgo-search==8.1.1
5
  ddgs==9.5.5
 
6
  markdownify==0.11.0
7
  openpyxl==3.1.5
8
  wikipedia-api==0.8.1
9
+ openai-whisper==20250625
10
  beautifulsoup4==4.12.3
11
  langchain_community==0.3.2
12
  wikipedia==1.4.0
13
+ tabulate==0.9.0
14
+ yt-dlp==2025.9.5
test.ipynb DELETED
@@ -1,262 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 4,
6
- "id": "289bbe12",
7
- "metadata": {},
8
- "outputs": [],
9
- "source": [
10
- "from smolagents import WikipediaSearchTool \n",
11
- "wikipedia_search = WikipediaSearchTool(\n",
12
- " user_agent=f\"My research agent ([email protected])\",\n",
13
- " language=\"en\",\n",
14
- " content_type=\"text\",\n",
15
- " extract_format=\"HTML\",\n",
16
- ")"
17
- ]
18
- },
19
- {
20
- "cell_type": "code",
21
- "execution_count": 2,
22
- "id": "0e69fd46",
23
- "metadata": {},
24
- "outputs": [
25
- {
26
- "data": {
27
- "text/plain": [
28
- "{'summary': '\\n The sections inside the page are Life, Awards, Death, Tributes, Discography, Filmography, References, Further reading, External links and the summary of the page is Haydée Mercedes Sosa (9 July 1935 – 4 October 2009) was an Argentine singer who was popular throughout Latin America and many countries outside the region. With her roots in Argentine folk music, Sosa became one of the preeminent exponents of El nuevo cancionero. She gave voice to songs written by many Latin American songwriters. Her music made people hail her as the \"voice of the voiceless ones\". She was often called \"the conscience of Latin America\".\\nSosa performed in venues such as the Lincoln Center in New York City, the Théâtre Mogador in Paris, the Sistine Chapel in Vatican City, as well as sold-out shows in New York\\'s Carnegie Hall and the Roman Colosseum during her final decade of life. Her career spanned four decades and she was the recipient of six Latin Grammy awards (2000, 2003, 2004, 2006, 2009, 2011), including a Latin Grammy Lifetime Achievement Award in 2004 and two posthumous Latin Grammy Award for Best Folk Album in 2009 and 2011. She won the Premio Gardel in 2000, the main musical award in Argentina. She served as an ambassador for UNICEF.\\n ',\n",
29
- " 'url': 'https://en.wikipedia.org/wiki/Mercedes_Sosa'}"
30
- ]
31
- },
32
- "execution_count": 2,
33
- "metadata": {},
34
- "output_type": "execute_result"
35
- }
36
- ],
37
- "source": [
38
- "from wikipedia_tools import wikipedia_summary, read_wikipedia_page\n",
39
- "url = \"https://en.wikipedia.org/wiki/Mercedes_Sosa\"\n",
40
- "query = \"Mercedes Sosa\"\n",
41
- "wikipedia_summary(query=query)"
42
- ]
43
- },
44
- {
45
- "cell_type": "code",
46
- "execution_count": 8,
47
- "id": "034ffd47",
48
- "metadata": {},
49
- "outputs": [
50
- {
51
- "data": {
52
- "text/plain": [
53
- "{'summary': 'Haydée Mercedes Sosa (9 July 1935 – 4 October 2009) was an Argentine singer who was popular throughout Latin America and many countries outside the region. With her roots in Argentine folk music, Sosa became one of the preeminent exponents of El nuevo cancionero. She gave voice to songs written by many Latin American songwriters. Her music made people hail her as the \"voice of the voiceless ones\". She was often called \"the conscience of Latin America\".\\nSosa performed in venues such as the Lincoln Center in New York City, the Théâtre Mogador in Paris, the Sistine Chapel in Vatican City, as well as sold-out shows in New York\\'s Carnegie Hall and the Roman Colosseum during her final decade of life. Her career spanned four decades and she was the recipient of six Latin Grammy awards (2000, 2003, 2004, 2006, 2009, 2011), including a Latin Grammy Lifetime Achievement Award in 2004 and two posthumous Latin Grammy Award for Best Folk Album in 2009 and 2011. She won the Premio Gardel in 2000, the main musical award in Argentina. She served as an ambassador for UNICEF.',\n",
54
- " 'url': 'https://en.wikipedia.org/wiki/Mercedes Sosa'}"
55
- ]
56
- },
57
- "execution_count": 8,
58
- "metadata": {},
59
- "output_type": "execute_result"
60
- }
61
- ],
62
- "source": [
63
- "import wikipediaapi\n",
64
- "import os\n",
65
- "query=\"Mercedes Sosa\"\n",
66
- "\"\"\"\n",
67
- "Search Wikipedia for a query and return a dictionary with the summary of the page and the url of the page.\n",
68
- "Args:\n",
69
- " query: The query to search for.\n",
70
- "Returns:\n",
71
- " A dictionary with the summary of the page and the url of the page.\n",
72
- "\"\"\"\n",
73
- "summary_tool = wikipediaapi.Wikipedia(\n",
74
- " user_agent=f\"My research agent ({os.getenv('USER_EMAIL')})\",\n",
75
- " extra_api_params={\"include\": \"url\"}\n",
76
- ")\n",
77
- "page = summary_tool.page(query)\n",
78
- "if not page.exists():\n",
79
- " raise ValueError(f\"No Wikipedia page found for '{query}'. Try a different query.\")\n",
80
- "{\n",
81
- " \"summary\": page.summary,\n",
82
- " \"url\": f\"https://en.wikipedia.org/wiki/{page.title}\"\n",
83
- "}"
84
- ]
85
- },
86
- {
87
- "cell_type": "code",
88
- "execution_count": 19,
89
- "id": "9992b1ec",
90
- "metadata": {},
91
- "outputs": [
92
- {
93
- "data": {
94
- "text/plain": [
95
- "Mercedes Sosa (lang: en, variant: None, id: 476992, ns: 0)"
96
- ]
97
- },
98
- "execution_count": 19,
99
- "metadata": {},
100
- "output_type": "execute_result"
101
- }
102
- ],
103
- "source": [
104
- "page"
105
- ]
106
- },
107
- {
108
- "cell_type": "code",
109
- "execution_count": 3,
110
- "id": "11d4a9f8",
111
- "metadata": {},
112
- "outputs": [
113
- {
114
- "data": {
115
- "text/plain": [
116
- "'https://en.wikipedia.org/wiki/Mercedes_Sosa'"
117
- ]
118
- },
119
- "execution_count": 3,
120
- "metadata": {},
121
- "output_type": "execute_result"
122
- }
123
- ],
124
- "source": [
125
- "import wikipedia\n",
126
- "ny = wikipedia.page(pageid=476992)\n",
127
- "ny.url\n",
128
- "# u'http://en.wikipedia.org/wiki/New_York'"
129
- ]
130
- },
131
- {
132
- "cell_type": "code",
133
- "execution_count": 11,
134
- "id": "577562bd",
135
- "metadata": {},
136
- "outputs": [
137
- {
138
- "data": {
139
- "text/plain": [
140
- "{'pageid': '476992',\n",
141
- " 'title': 'Mercedes Sosa',\n",
142
- " 'url': 'https://en.wikipedia.org/wiki/Mercedes_Sosa',\n",
143
- " '_sections': [],\n",
144
- " '_categories': ['1935 births',\n",
145
- " '2009 deaths',\n",
146
- " '20th-century Argentine women singers',\n",
147
- " '20th-century drummers',\n",
148
- " 'All articles with dead external links',\n",
149
- " 'All articles with unsourced statements',\n",
150
- " 'Argentine activists',\n",
151
- " 'Argentine people of Diaguita descent',\n",
152
- " 'Argentine people of French descent',\n",
153
- " 'Argentine people of Quechua descent',\n",
154
- " 'Argentine women activists',\n",
155
- " 'Articles with Brazilian Portuguese-language sources (pt-br)',\n",
156
- " 'Articles with German-language sources (de)',\n",
157
- " 'Articles with Spanish-language sources (es)',\n",
158
- " 'Articles with dead external links from June 2024',\n",
159
- " 'Articles with hCards',\n",
160
- " 'Articles with short description',\n",
161
- " 'Articles with unsourced statements from December 2023',\n",
162
- " 'Bombo legüero players',\n",
163
- " 'CS1 Spanish-language sources (es)',\n",
164
- " 'Commons category link is on Wikidata',\n",
165
- " 'Deaths from kidney failure in Argentina',\n",
166
- " 'Latin Grammy Award winners',\n",
167
- " 'Latin Grammy Lifetime Achievement Award winners',\n",
168
- " 'Nueva canción musicians',\n",
169
- " 'People from San Miguel de Tucumán',\n",
170
- " 'Recipients of the Order of Cultural Merit (Brazil)',\n",
171
- " 'Short description is different from Wikidata',\n",
172
- " 'Use dmy dates from July 2025',\n",
173
- " 'Webarchive template wayback links',\n",
174
- " 'Wikipedia indefinitely move-protected pages',\n",
175
- " 'Women in Latin music']}"
176
- ]
177
- },
178
- "execution_count": 11,
179
- "metadata": {},
180
- "output_type": "execute_result"
181
- }
182
- ],
183
- "source": [
184
- "ny.__dict__"
185
- ]
186
- },
187
- {
188
- "cell_type": "code",
189
- "execution_count": 20,
190
- "id": "9fa24efc",
191
- "metadata": {},
192
- "outputs": [],
193
- "source": [
194
- "import wikipediaapi\n",
195
- "import os\n",
196
- "query=\"Mercedes Sosa\"\n",
197
- "summary_tool = wikipediaapi.Wikipedia(\n",
198
- " user_agent=f\"My research agent ({os.getenv('USER_EMAIL')})\",\n",
199
- ")\n",
200
- "page = summary_tool.page(query)"
201
- ]
202
- },
203
- {
204
- "cell_type": "code",
205
- "execution_count": 24,
206
- "id": "8589fd42",
207
- "metadata": {},
208
- "outputs": [
209
- {
210
- "data": {
211
- "text/plain": [
212
- "['Life',\n",
213
- " 'Awards',\n",
214
- " 'Death',\n",
215
- " 'Tributes',\n",
216
- " 'Discography',\n",
217
- " 'Filmography',\n",
218
- " 'References',\n",
219
- " 'Further reading',\n",
220
- " 'External links']"
221
- ]
222
- },
223
- "execution_count": 24,
224
- "metadata": {},
225
- "output_type": "execute_result"
226
- }
227
- ],
228
- "source": [
229
- "[section._title for section in page.sections]"
230
- ]
231
- },
232
- {
233
- "cell_type": "code",
234
- "execution_count": null,
235
- "id": "38a8b06c",
236
- "metadata": {},
237
- "outputs": [],
238
- "source": []
239
- }
240
- ],
241
- "metadata": {
242
- "kernelspec": {
243
- "display_name": "agents_env",
244
- "language": "python",
245
- "name": "python3"
246
- },
247
- "language_info": {
248
- "codemirror_mode": {
249
- "name": "ipython",
250
- "version": 3
251
- },
252
- "file_extension": ".py",
253
- "mimetype": "text/x-python",
254
- "name": "python",
255
- "nbconvert_exporter": "python",
256
- "pygments_lexer": "ipython3",
257
- "version": "3.13.6"
258
- }
259
- },
260
- "nbformat": 4,
261
- "nbformat_minor": 5
262
- }
test_executed.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
tools.py DELETED
File without changes
tools/file_tools.py ADDED
@@ -0,0 +1,60 @@
1
+
2
+ import json
3
+ import csv
4
+ import openpyxl
5
+ import whisper
6
+ import os
7
+ import requests
8
+ from smolagents.tools import tool
9
+
10
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
11
+
12
+ def _download_file(file_name: str) -> None:
13
+ if not os.path.exists(file_name):
14
+ url = f"{DEFAULT_API_URL}/files/{file_name.split('.')[-2]}"
15
+ r = requests.get(url)
16
+ with open(file_name, "wb") as f:
17
+ f.write(r.content)
18
+
19
+ @tool
20
+ def read_file_as_text(file_name: str) -> str:
21
+ """
22
+ Opens a file and returns its content as readable text.
23
+ Supports 'txt', 'json', 'csv', 'xlsx', and 'mp3' (for mp3, it transcribes speech to text).
24
+ Args:
25
+ file_name (str): The path or name of the file.
26
+ Returns:
27
+ str: The content of the file as text, or transcribed speech if 'mp3'.
28
+ """
29
+ _download_file(file_name)
30
+ file_type = file_name.split(".")[-1]
31
+ try:
32
+ if file_type in {"txt", "py"}:
33
+ with open(file_name, "r", encoding="utf-8") as f:
34
+ return f.read()
35
+ elif file_type == "json":
36
+ with open(file_name, "r", encoding="utf-8") as f:
37
+ data = json.load(f)
38
+ return json.dumps(data, indent=2)
39
+ elif file_type == "csv":
40
+ with open(file_name, "r", encoding="utf-8") as f:
41
+ reader = csv.reader(f)
42
+ rows = list(reader)
43
+ return "\n".join([", ".join(row) for row in rows])
44
+ elif file_type == "xlsx":
45
+ wb = openpyxl.load_workbook(file_name, data_only=True)
46
+ sheet = wb.active
47
+ content = []
48
+ for row in sheet.iter_rows(values_only=True):
49
+ content.append(", ".join(str(cell) if cell is not None else "" for cell in row))
50
+ return "\n".join(content)
51
+ elif file_type == "mp3":
52
+ w = whisper.load_model("base")
53
+ res = w.transcribe(file_name)
54
+ return res["text"]
55
+ else:
56
+ return f"File type '{file_type}' not supported."
57
+ except FileNotFoundError:
58
+ return f"File '{file_name}' not found."
59
+ except Exception as e:
60
+ return f"Error opening file '{file_name}': {str(e)}"
tools/image_processing_tools.py ADDED
@@ -0,0 +1,35 @@
1
+ from PIL import Image
2
+ from transformers import pipeline
3
+ from smolagents.tools import tool
4
+
5
+ @tool
6
+ def ask_question_about_image(question: str, path_to_image: str) -> str:
7
+ """
8
+ Ask a question about an image and return the answer.
9
+ Args:
10
+ question: the question to ask about the image.
11
+ path_to_image: The path to the image to ask the question about.
12
+ Returns:
13
+ A string with the answer to the question.
14
+ """
15
+ pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
16
+
17
+ image = Image.open(fp=path_to_image)
18
+
19
+ messages = [
20
+ {
21
+ "role": "user",
22
+ "content": [
23
+ {
24
+ "type": "image",
25
+ "image": image,
26
+ },
27
+ {"type": "text", "text": question},
28
+ ],
29
+ }
30
+ ]
31
+
32
+ outputs = pipe(text=messages, max_new_tokens=60, return_full_text=False)
33
+
34
+ return outputs[0]["generated_text"]
35
+
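For reference, a minimal direct invocation; the image path and question are placeholders, and the first call downloads the llava-hf/llava-interleave-qwen-0.5b-hf checkpoint, so it needs network access and a transformers install with vision support:

from tools.image_processing_tools import ask_question_about_image

answer = ask_question_about_image(
    question="What color is the object in the center of the image?",
    path_to_image="example_frame_001.jpg",  # placeholder, e.g. a frame extracted by the YouTube tools
)
print(answer)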
wikipedia_tools.py → tools/wikipedia_tools.py RENAMED
@@ -5,6 +5,8 @@ import pandas as pd
5
  from bs4 import BeautifulSoup
6
  from smolagents.tools import tool
7
  import wikipediaapi
 
 
8
  def fetch_wikipedia_page(url: str) -> str:
9
  """Fetch raw HTML of a Wikipedia page."""
10
  headers = {
@@ -30,26 +32,68 @@ def _remove_sections_by_titles(soup: BeautifulSoup, titles: list[str]) -> None:
30
  excluded = {_normalize_title(t) for t in titles}
31
  header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
32
 
33
- for header in list(soup.find_all(header_tags)):
 
 
34
  title_text = _normalize_title(header.get_text(" ", strip=True))
35
  if title_text in excluded:
36
- level = int(header.name[1])
37
- nodes_to_remove = [header]
38
- for sibling in list(header.next_siblings):
39
- sib_name = getattr(sibling, "name", None)
40
- if sib_name in header_tags:
41
- next_level = int(sibling.name[1])
42
- if next_level <= level:
43
- break
44
- nodes_to_remove.append(sibling)
45
- for node in nodes_to_remove:
46
  try:
47
- node.decompose()
48
  except Exception:
49
- try:
50
- node.extract()
51
- except Exception:
52
- pass
53
 
54
 
55
  def _cleanup_non_content(root: BeautifulSoup) -> None:
@@ -85,20 +129,73 @@ def _cleanup_non_content(root: BeautifulSoup) -> None:
85
 
86
 
87
  def extract_text(soup: BeautifulSoup) -> str:
88
- """Extract main text (paragraphs + headers + lists) from article body only."""
 
 
 
89
  content_root = soup.select_one("div.mw-parser-output") or soup
90
 
91
  for elem in content_root(["script", "style", "sup", "aside", "nav"]):
92
  elem.decompose()
93
  _cleanup_non_content(content_root)
94
 
95
- headers = [h.get_text(" ", strip=True) for h in content_root.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])]
96
- paragraphs = [p.get_text(" ", strip=True) for p in content_root.find_all("p")]
97
- lists = [li.get_text(" ", strip=True) for li in content_root.find_all("li")]
 
 
 
 
 
 
 
98
 
99
- all_text = headers + paragraphs + lists
100
- clean = [t for t in all_text if len(t) > 30]
101
- return "\n\n".join(clean)
102
 
103
 
104
  def extract_tables(soup: BeautifulSoup) -> list[dict]:
@@ -146,13 +243,12 @@ def extract_tables(soup: BeautifulSoup) -> list[dict]:
146
  return tables
147
 
148
 
149
- def format_for_llm(text: str, tables: list[dict], spec: dict) -> str:
150
  """Combine text + tables into a single string for LLM input."""
151
  output = []
152
  output.append("=== ARTICLE TEXT ===\n")
153
  output.append(text)
154
 
155
- sections_to_exclude = spec.get("sections_to_exclude", [])
156
  excluded = {_normalize_title(s) for s in sections_to_exclude}
157
  filtered_tables = [
158
  t for t in tables if _normalize_title(t.get("name", "")) not in excluded
@@ -191,26 +287,23 @@ def wikipedia_summary(entity: str) -> dict:
191
  }
192
 
193
 
194
-
195
  @tool
196
  def read_wikipedia_page(
197
  url: str,
198
- spec: dict = {
199
- "sections_to_exclude": [
200
- "External links",
201
- "References",
202
- "Further reading",
203
- "See also",
204
- "Notes",
205
- ]
206
- }) -> str:
207
  """
208
- Read a Wikipedia page and return a dictionary with the text of the page.
209
  Args:
210
  url: The URL of the Wikipedia page to read.
211
- spec: A dictionary specifying sections to exclude from the page.
212
  Returns:
213
- A dictionary with the text of the page.
214
  """
215
  if "https://en.wikipedia.org/wiki/" not in url:
216
  raise ValueError("URL is required")
@@ -219,12 +312,12 @@ def read_wikipedia_page(
219
  # Parse the page
220
  soup = BeautifulSoup(html, "html.parser")
221
  # Remove unwanted sections
222
- _remove_sections_by_titles(soup, spec.get("sections_to_exclude", []))
223
 
224
  # Extract after pruning unwanted sections
225
  text = extract_text(soup)
226
  tables = extract_tables(soup)
227
 
228
  # Combine
229
- llm_ready = format_for_llm(text, tables, spec)
230
  return llm_ready
 
5
  from bs4 import BeautifulSoup
6
  from smolagents.tools import tool
7
  import wikipediaapi
8
+
9
+
10
  def fetch_wikipedia_page(url: str) -> str:
11
  """Fetch raw HTML of a Wikipedia page."""
12
  headers = {
 
32
  excluded = {_normalize_title(t) for t in titles}
33
  header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
34
 
35
+ # Find all headers that match excluded titles
36
+ headers_to_remove = []
37
+ for header in soup.find_all(header_tags):
38
  title_text = _normalize_title(header.get_text(" ", strip=True))
39
  if title_text in excluded:
40
+ headers_to_remove.append(header)
41
+
42
+ # Remove each matching section (header + content)
43
+ for header in headers_to_remove:
44
+ # Skip if header was already removed as part of another section
45
+ if not header.parent:
46
+ continue
47
+
48
+ level = int(header.name[1])
49
+
50
+ # Determine the container to remove - could be the header itself or its parent wrapper
51
+ header_container = header
52
+ # If header is wrapped in a heading container (like div.mw-heading), use that as the starting point
53
+ if (header.parent and
54
+ header.parent.name == 'div' and
55
+ header.parent.get('class') and
56
+ any('heading' in cls.lower() for cls in header.parent.get('class', []))):
57
+ header_container = header.parent
58
+
59
+ nodes_to_remove = [header_container]
60
+
61
+ # Collect all content after the header container until next header of same/higher level
62
+ current = header_container
63
+ while current.next_sibling:
64
+ current = current.next_sibling
65
+ sib_name = getattr(current, "name", None)
66
+
67
+ # If we hit another header (directly or within a heading container), check its level
68
+ next_header = None
69
+ if sib_name in header_tags:
70
+ next_header = current
71
+ elif (sib_name == 'div' and
72
+ current.get('class') and
73
+ any('heading' in cls.lower() for cls in current.get('class', []))):
74
+ # This is a heading container, find the header inside it
75
+ for child in current.find_all(header_tags):
76
+ next_header = child
77
+ break
78
+
79
+ if next_header:
80
+ next_level = int(next_header.name[1])
81
+ if next_level <= level:
82
+ # This is a header of same or higher level - stop here
83
+ break
84
+
85
+ # Add this node to removal list
86
+ nodes_to_remove.append(current)
87
+
88
+ # Remove all collected nodes
89
+ for node in nodes_to_remove:
90
+ try:
91
+ node.decompose()
92
+ except Exception:
93
  try:
94
+ node.extract()
95
  except Exception:
96
+ pass
 
 
 
97
 
98
 
99
  def _cleanup_non_content(root: BeautifulSoup) -> None:
 
129
 
130
 
131
  def extract_text(soup: BeautifulSoup) -> str:
132
+ """Extract main text (paragraphs + headers + lists) from article body only, preserving document order.
133
+ Excludes content that's inside tables and excludes headers that are also used as
134
+ table names (either as <caption> or the nearest previous header) to avoid duplication
135
+ with extract_tables."""
136
  content_root = soup.select_one("div.mw-parser-output") or soup
137
 
138
  for elem in content_root(["script", "style", "sup", "aside", "nav"]):
139
  elem.decompose()
140
  _cleanup_non_content(content_root)
141
 
142
+ # Identify table names (from captions or nearest previous headers) to avoid duplicating them in text
143
+ table_names_normalized = set()
144
+ for table in content_root.find_all("table"):
145
+ # Skip non-content tables (same logic as extract_tables)
146
+ classes = table.get("class", [])
147
+ if isinstance(classes, list) and any(
148
+ c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
149
+ for c in classes
150
+ ):
151
+ continue
152
 
153
+ name_text = None
154
+ caption_el = table.find("caption")
155
+ if caption_el:
156
+ caption_text = caption_el.get_text(" ", strip=True)
157
+ if caption_text:
158
+ name_text = caption_text
159
+ else:
160
+ # Empty caption: treat as no caption and fallback to previous header
161
+ prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
162
+ if prev_header:
163
+ name_text = prev_header.get_text(" ", strip=True)
164
+ else:
165
+ prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
166
+ if prev_header:
167
+ name_text = prev_header.get_text(" ", strip=True)
168
+
169
+ if not name_text and isinstance(classes, list) and any(c.lower() == "infobox" for c in classes):
170
+ name_text = "Infobox"
171
+
172
+ if name_text:
173
+ table_names_normalized.add(_normalize_title(name_text))
174
+
175
+ # Find all text elements in document order, but exclude duplicates
176
+ text_elements = []
177
+ for element in content_root.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li"]):
178
+ # Skip elements that are inside a table (to avoid duplication with extract_tables)
179
+ if element.find_parent("table"):
180
+ continue
181
+
182
+ # Skip headers that match any table name (to avoid duplication with extract_tables)
183
+ if element.name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
184
+ header_text_norm = _normalize_title(element.get_text(" ", strip=True))
185
+ if header_text_norm in table_names_normalized:
186
+ continue
187
+
188
+ # Skip list items that are exactly a table name (common for inline mini-TOCs within sections)
189
+ if element.name == "li":
190
+ li_text_norm = _normalize_title(element.get_text(" ", strip=True))
191
+ if li_text_norm in table_names_normalized:
192
+ continue
193
+
194
+ text = element.get_text(" ", strip=True)
195
+ if text: # Only include non-empty text
196
+ text_elements.append(text)
197
+
198
+ return "\n\n".join(text_elements)
199
 
200
 
201
  def extract_tables(soup: BeautifulSoup) -> list[dict]:
 
243
  return tables
244
 
245
 
246
+ def format_for_llm(text: str, tables: list[dict], sections_to_exclude: list[str]) -> str:
247
  """Combine text + tables into a single string for LLM input."""
248
  output = []
249
  output.append("=== ARTICLE TEXT ===\n")
250
  output.append(text)
251
 
 
252
  excluded = {_normalize_title(s) for s in sections_to_exclude}
253
  filtered_tables = [
254
  t for t in tables if _normalize_title(t.get("name", "")) not in excluded
 
287
  }
288
 
289
 
 
290
  @tool
291
  def read_wikipedia_page(
292
  url: str,
293
+ sections_to_exclude: list[str] = [
294
+ "External links",
295
+ "References",
296
+ "Further reading",
297
+ "See also",
298
+ "Notes",
299
+ ]) -> str:
 
 
300
  """
301
+ Read a Wikipedia page and return a string with the text of the page.
302
  Args:
303
  url: The URL of the Wikipedia page to read.
304
+ sections_to_exclude: A list of sections to exclude from the page.
305
  Returns:
306
+ A string with the text of the page.
307
  """
308
  if "https://en.wikipedia.org/wiki/" not in url:
309
  raise ValueError("URL is required")
 
312
  # Parse the page
313
  soup = BeautifulSoup(html, "html.parser")
314
  # Remove unwanted sections
315
+ _remove_sections_by_titles(soup, sections_to_exclude)
316
 
317
  # Extract after pruning unwanted sections
318
  text = extract_text(soup)
319
  tables = extract_tables(soup)
320
 
321
  # Combine
322
+ llm_ready = format_for_llm(text, tables, sections_to_exclude)
323
  return llm_ready
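A short usage sketch of the renamed module; the article is the one used in the test logs removed by this commit, and the excluded-section list is illustrative:

from tools.wikipedia_tools import wikipedia_summary, read_wikipedia_page

summary = wikipedia_summary("Mercedes Sosa")  # dict with 'summary' and 'url'
page_text = read_wikipedia_page(
    summary["url"],  # e.g. https://en.wikipedia.org/wiki/Mercedes_Sosa
    sections_to_exclude=["References", "External links", "Further reading"],
)
print(page_text[:1000])  # "=== ARTICLE TEXT ===" followed by the extracted tables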
tools/youtube_tools.py ADDED
@@ -0,0 +1,113 @@
1
+
2
+ import os
3
+ import subprocess
4
+ from yt_dlp import YoutubeDL
5
+
6
+
7
+
8
+ from smolagents.tools import tool
9
+
10
+ # Use FFmpeg to extract frames from the video
11
+ def extract_frames_with_ffmpeg(video_path: str, num_frames: int) -> list[str]:
12
+ """Extract frames from video using FFmpeg"""
13
+ if not os.path.exists(video_path):
14
+ raise FileNotFoundError(f"Video file not found: {video_path}")
15
+
16
+ # Get video duration using ffprobe
17
+ duration_cmd = [
18
+ 'ffprobe', '-v', 'quiet', '-print_format', 'json',
19
+ '-show_format', video_path
20
+ ]
21
+
22
+ try:
23
+ result = subprocess.run(duration_cmd, capture_output=True, text=True, check=True)
24
+ import json
25
+ metadata = json.loads(result.stdout)
26
+ duration = float(metadata['format']['duration'])
27
+
28
+ # Calculate time intervals for frame extraction
29
+ time_intervals = [duration * i / (num_frames + 1) for i in range(1, num_frames + 1)]
30
+
31
+ extracted_files = []
32
+ for i, time_pos in enumerate(time_intervals):
33
+ output_filename = f"{os.path.splitext(os.path.basename(video_path))[0]}_frame_{i+1:03d}.jpg"
34
+
35
+ # Extract frame at specific time
36
+ ffmpeg_cmd = [
37
+ 'ffmpeg', '-i', video_path, '-ss', str(time_pos),
38
+ '-vframes', '1', '-q:v', '2', '-y', output_filename
39
+ ]
40
+
41
+ subprocess.run(ffmpeg_cmd, capture_output=True, check=True)
42
+ extracted_files.append(output_filename)
43
+
44
+ return extracted_files
45
+
46
+ except subprocess.CalledProcessError as e:
47
+ print(f"Error running FFmpeg: {e}")
48
+ return []
49
+ except Exception as e:
50
+ print(f"Error: {e}")
51
+ return []
52
+
53
+
54
+ @tool
55
+ def download_youtube_url_audio(url: str) -> str:
56
+ """
57
+ Download a YouTube video and return the path to the downloaded file.
58
+
59
+ Args:
60
+ url (str): The URL of the YouTube video to download.
61
+
62
+ Returns:
63
+ str: The path to the downloaded file.
64
+ """
65
+ ydl_audio_opts = {
66
+ 'format': 'bestaudio/best',
67
+ 'postprocessors': [{
68
+ 'key': 'FFmpegExtractAudio',
69
+ 'preferredcodec': 'mp3',
70
+ 'preferredquality': '192',
71
+ }],
72
+ 'quiet': True,
73
+ 'no_verbose_header': True,
74
+ 'no_warnings': True,
75
+ }
76
+
77
+ with YoutubeDL(ydl_audio_opts) as ydl:
78
+ file_path = ydl.extract_info(url)
79
+ return file_path['requested_downloads'][0]['filepath']
80
+
81
+
82
+ @tool
83
+ def download_youtube_url_images(url: str, num_images: int = 3) -> str:
84
+ """
85
+ Download a YouTube video and return the path to the downloaded file.
86
+
87
+ Args:
88
+ url (str): The URL of the YouTube video to download.
89
+ num_images (int): The number of images to download.
90
+
91
+ Returns:
92
+ str: The different paths to the downloaded files, separated by newlines.
93
+ """
94
+ # First, download the video
95
+ ydl_images_opts = {
96
+ 'format': 'best[height<=720]', # Download video in reasonable quality
97
+ 'outtmpl': '%(title)s.%(ext)s', # Save with title as filename
98
+ 'quiet': True,
99
+ 'no_verbose_header': True,
100
+ 'no_warnings': True,
101
+ }
102
+
103
+ with YoutubeDL(ydl_images_opts) as ydl:
104
+ info = ydl.extract_info(url, download=True)
105
+ video_filepath = ydl.prepare_filename(info)
106
+
107
+ # Extract frames from the downloaded video
108
+ if os.path.exists(video_filepath):
109
+ extracted_frames = extract_frames_with_ffmpeg(video_filepath, num_images)
110
+ return "\n".join(extracted_frames)
111
+
112
+ return ""
113
+
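A hedged end-to-end sketch combining these tools with the image QA tool; the video URL is a placeholder, and both ffmpeg and ffprobe must be available on PATH for frame extraction and the mp3 post-processing:

from tools.youtube_tools import download_youtube_url_images, download_youtube_url_audio
from tools.image_processing_tools import ask_question_about_image

video_url = "https://www.youtube.com/watch?v=EXAMPLE_ID"  # placeholder URL

# Grab a few evenly spaced frames and ask a question about the first one
frame_paths = download_youtube_url_images(video_url, num_images=3).splitlines()
if frame_paths:
    print(ask_question_about_image("How many birds are visible?", frame_paths[0]))

# Or pull the audio track as an mp3 (e.g. to transcribe it with read_file_as_text)
audio_path = download_youtube_url_audio(video_url)
print(audio_path)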
wiki_extractor.py DELETED
@@ -1,341 +0,0 @@
1
- import re
2
- import sys
3
- import json
4
- from typing import Any, Dict, List, Optional
5
-
6
- import requests
7
- from bs4 import BeautifulSoup, Tag
8
- from markdownify import markdownify as md
9
-
10
-
11
- USER_AGENT = (
12
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
13
- "AppleWebKit/537.36 (KHTML, like Gecko) "
14
- "Chrome/126.0.0.0 Safari/537.36"
15
- )
16
-
17
-
18
- def _clean_text(text: str) -> str:
19
- if not text:
20
- return ""
21
- # Remove reference markers like [1], [a], [note 1]
22
- text = re.sub(r"\s*\[[^\]]+\]", "", text)
23
- # Collapse whitespace
24
- text = re.sub(r"\s+", " ", text).strip()
25
- return text
26
-
27
-
28
- def _previous_heading(element: Tag) -> Optional[str]:
29
- node = element
30
- while node is not None:
31
- node = node.previous_sibling
32
- if isinstance(node, Tag) and node.name in {"h2", "h3", "h4", "h5", "h6"}:
33
- return _clean_text(node.get_text(" ", strip=True))
34
- # Fallback: walk up to find any earlier heading in parent
35
- parent = element.parent
36
- while parent is not None and isinstance(parent, Tag):
37
- sib = parent.previous_sibling
38
- while sib is not None:
39
- if isinstance(sib, Tag) and sib.name in {"h2", "h3", "h4", "h5", "h6"}:
40
- return _clean_text(sib.get_text(" ", strip=True))
41
- sib = sib.previous_sibling
42
- parent = parent.parent
43
- return None
44
-
45
-
46
- def _parse_table(table: Tag) -> Dict[str, Any]:
47
- # Determine title/caption
48
- title = None
49
- caption = table.find("caption")
50
- if caption:
51
- title = _clean_text(caption.get_text(" ", strip=True))
52
- if not title:
53
- title = _previous_heading(table)
54
-
55
- # Skip navboxes or non-content tables
56
- classes = set(table.get("class", []))
57
- if any(c in {"navbox", "vertical-navbox", "metadata", "mbox"} for c in classes):
58
- return {
59
- "name": title or "",
60
- "headers": [],
61
- "rows": [],
62
- "skipped": True,
63
- }
64
-
65
- # Identify header cells
66
- headers: List[str] = []
67
- header_row = None
68
- thead = table.find("thead")
69
- if thead:
70
- header_row = thead.find("tr")
71
- if header_row is None:
72
- # Find first row that contains any <th>
73
- for tr in table.find_all("tr", recursive=True):
74
- if tr.find("th"):
75
- header_row = tr
76
- break
77
- if header_row is not None:
78
- for th in header_row.find_all(["th", "td"], recursive=False):
79
- header_text = _clean_text(th.get_text(" ", strip=True))
80
- if header_text:
81
- headers.append(header_text)
82
-
83
- # Collect rows
84
- rows: List[Any] = []
85
- for tr in table.find_all("tr", recursive=True):
86
- if tr is header_row:
87
- continue
88
- cells = tr.find_all(["td", "th"], recursive=False)
89
- if not cells:
90
- continue
91
- values = [_clean_text(c.get_text(" ", strip=True)) for c in cells]
92
- # If headers exist and lengths match, map to dict; else keep as list
93
- if headers and len(values) == len(headers):
94
- rows.append({headers[i]: values[i] for i in range(len(headers))})
95
- else:
96
- rows.append(values)
97
-
98
- return {
99
- "name": title or "",
100
- "headers": headers,
101
- "rows": rows,
102
- }
103
-
104
-
105
- def extract_wikipedia_content(url: str) -> Dict[str, Any]:
-     resp = requests.get(
-         url,
-         headers={"User-Agent": USER_AGENT, "Accept-Language": "en-US,en;q=0.9"},
-         timeout=30,
-     )
-     resp.raise_for_status()
-
-     soup = BeautifulSoup(resp.text, "html.parser")
-     title_tag = soup.find("h1", id="firstHeading")
-     title = _clean_text(title_tag.get_text(" ", strip=True)) if title_tag else ""
-
-     # Main content
-     content = soup.select_one("#mw-content-text .mw-parser-output")
-     if content is None:
-         content = soup.find("div", class_="mw-parser-output") or soup
-
-     # Remove non-content elements
-     for selector in [
-         "table.navbox",
-         "table.vertical-navbox",
-         "div.reflist",
-         "ol.references",
-         "span.mw-editsection",
-         "script",
-         "style",
-         "div.mw-authority-control",
-         "div.navbox",
-         "table.metadata",
-         "table.toccolours",
-         "div.mw-references-wrap",
-         "sup.reference",
-     ]:
-         for node in content.select(selector):
-             node.decompose()
-
-     # Extract content tables, walking all descendants so each table picks up the nearest preceding heading
-     tables: List[Dict[str, Any]] = []
-     current_heading: Optional[str] = None
-     for node in content.descendants:
-         if not isinstance(node, Tag):
-             continue
-         if node.name in {"h2", "h3", "h4", "h5", "h6"}:
-             headline = node.find("span", class_="mw-headline")
-             heading_text = headline.get_text(" ", strip=True) if headline else node.get_text(" ", strip=True)
-             current_heading = _clean_text(heading_text)
-             continue
-         if node.name == "table":
-             classes = set(node.get("class", []))
-             if not classes or any(c in {"wikitable", "infobox", "sortable", "vevent"} for c in classes):
-                 parsed = _parse_table(node)
-                 if parsed.get("rows"):
-                     if not parsed.get("name") and current_heading:
-                         parsed["name"] = current_heading
-                     tables.append({k: v for k, v in parsed.items() if k != "skipped"})
-
-     # Extract the article text as Markdown, excluding tables:
-     # clone the content by re-parsing its HTML, then drop the tables from the clone
-     content_clone = BeautifulSoup(str(content), "html.parser")
-     for tbl in content_clone.find_all("table"):
-         tbl.decompose()
-     text_markdown = md(str(content_clone), strip=["img"])
-     text_markdown = _clean_text(text_markdown)
-
-     return {
-         "title": title,
-         "url": url,
-         "text_markdown": text_markdown,
-         "tables": tables,
-     }
-
-
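For reference, a minimal usage sketch of the extractor above. The import path is a placeholder (this file is being removed in this commit and the destination module is not shown in this diff), and the URL is only an example:

# Illustrative usage sketch; import path and URL are assumptions, not taken from this repo.
from wiki_extractor import extract_wikipedia_content  # hypothetical: adjust to the module's new location

data = extract_wikipedia_content("https://en.wikipedia.org/wiki/Mercedes_Sosa")  # any article URL works
print(data["title"])                # page title from <h1 id="firstHeading">
print(len(data["tables"]))          # number of content tables kept
print(data["text_markdown"][:300])  # start of the article body rendered as Markdown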
- def _escape_markdown_cell(value: Any) -> str:
-     """Escape characters that break Markdown tables and normalize whitespace."""
-     if value is None:
-         return ""
-     text = str(value)
-     text = text.replace("|", "\\|")
-     text = re.sub(r"\s+", " ", text).strip()
-     return text
-
-
- def format_tables_as_markdown(
-     tables: List[Dict[str, Any]],
-     max_tables: Optional[int] = None,
-     max_rows_per_table: int = 25,
- ) -> str:
-     """
-     Convert extracted tables into compact Markdown tables.
-
-     Args:
-         tables: List of table dicts as returned by extract_wikipedia_content.
-         max_tables: If set, include at most this many tables (in order).
-         max_rows_per_table: Maximum number of data rows to include per table.
-
-     Returns:
-         A Markdown string representing the tables.
-     """
-     if not tables:
-         return ""
-
-     rendered_sections: List[str] = []
-     selected = tables if max_tables is None else tables[:max_tables]
-
-     for table_idx, table in enumerate(selected):
-         name = table.get("name") or f"Table {table_idx + 1}"
-         headers: List[str] = table.get("headers", [])
-         rows: List[Any] = table.get("rows", [])
-
-         if not rows:
-             continue
-
-         section_lines: List[str] = []
-         section_lines.append(f"### Table: {name}")
-
-         # If we have headers, render a Markdown table
-         if headers:
-             # Header row
-             escaped_headers = [_escape_markdown_cell(h) for h in headers]
-             section_lines.append("| " + " | ".join(escaped_headers) + " |")
-             section_lines.append("| " + " | ".join(["---"] * len(headers)) + " |")
-
-             # Data rows
-             for row in rows[:max_rows_per_table]:
-                 if isinstance(row, dict):
-                     values = [_escape_markdown_cell(row.get(h, "")) for h in headers]
-                 else:
-                     # row is a list; align it to the headers length
-                     values = [_escape_markdown_cell(row[i] if i < len(row) else "") for i in range(len(headers))]
-                 section_lines.append("| " + " | ".join(values) + " |")
-         else:
-             # No headers: render each row as a bullet-point preview
-             for row in rows[:max_rows_per_table]:
-                 if isinstance(row, dict):
-                     preview = ", ".join(f"{_escape_markdown_cell(k)}: {_escape_markdown_cell(v)}" for k, v in row.items())
-                 else:
-                     preview = ", ".join(_escape_markdown_cell(v) for v in row)
-                 section_lines.append(f"- {preview}")
-
-         # Indicate truncation if applicable
-         if len(rows) > max_rows_per_table:
-             section_lines.append(f"… ({len(rows) - max_rows_per_table} more rows omitted)")
-
-         rendered_sections.append("\n".join(section_lines))
-
-     return "\n\n".join(rendered_sections)
-
-
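For orientation, roughly what format_tables_as_markdown emits for a single parsed table. The input dict below is invented to mirror the shape produced by _parse_table; the expected output follows directly from the function above:

# Invented input; expected output shown in the trailing comments.
tables = [{
    "name": "Studio albums",
    "headers": ["Year", "Title"],
    "rows": [{"Year": "1962", "Title": "La voz de la zafra"}],
}]
print(format_tables_as_markdown(tables))
# ### Table: Studio albums
# | Year | Title |
# | --- | --- |
# | 1962 | La voz de la zafra |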
- def format_extracted_content(
-     data: Dict[str, Any],
-     include_url: bool = True,
-     max_tables: Optional[int] = None,
-     max_rows_per_table: int = 25,
- ) -> str:
-     """
-     Combine `text_markdown` and `tables` from extract_wikipedia_content into an LLM-friendly Markdown string.
-
-     Args:
-         data: Dict returned by extract_wikipedia_content.
-         include_url: Whether to include the source URL at the top.
-         max_tables: If set, include at most this many tables.
-         max_rows_per_table: Maximum number of data rows per table.
-
-     Returns:
-         Markdown string ready to feed into an LLM.
-     """
-     if not data:
-         return ""
-
-     title = data.get("title") or ""
-     url = data.get("url") or ""
-     text_md = data.get("text_markdown") or ""
-     tables = data.get("tables") or []
-
-     parts: List[str] = []
-     if title:
-         parts.append(f"# {title}")
-     if include_url and url:
-         parts.append(f"Source: {url}")
-
-     if text_md:
-         parts.append("## Article")
-         parts.append(text_md)
-
-     tables_md = format_tables_as_markdown(tables, max_tables=max_tables, max_rows_per_table=max_rows_per_table)
-     if tables_md:
-         parts.append("## Tables")
-         parts.append(tables_md)
-
-     return "\n\n".join(p for p in parts if p)
-
-
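A sketch of how the two public helpers above were presumably meant to be chained when preparing page context for the agent. The URL and the limits are illustrative values, not prescribed anywhere in this file:

# Hypothetical end-to-end call: fetch a page, then render one Markdown blob for the agent.
url = "https://en.wikipedia.org/wiki/Mercedes_Sosa"  # example article
data = extract_wikipedia_content(url)
context = format_extracted_content(data, include_url=True, max_tables=5, max_rows_per_table=25)
print(context[:500])  # starts with "# <title>", "Source: <url>", then "## Article" and "## Tables"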
- def main() -> None:
-     if len(sys.argv) < 2:
-         print("Usage: python wiki_extractor.py <wikipedia_url>")
-         sys.exit(1)
-     url = sys.argv[1]
-     data = extract_wikipedia_content(url)
-
-     print(json.dumps({
-         "title": data["title"],
-         "url": data["url"],
-         "num_tables": len(data["tables"]),
-         "table_names": [t.get("name", "") for t in data["tables"]][:20],
-     }, ensure_ascii=False, indent=2))
-
-     # Try to locate a "Studio albums" table and print its first 3 rows
-     studio_tables = [
-         t for t in data["tables"]
-         if "studio albums" in t.get("name", "").lower()
-     ]
-     if studio_tables:
-         t0 = studio_tables[0]
-         print("\nFound 'Studio albums' table. Headers:")
-         print(t0.get("headers", []))
-         print("First 3 rows:")
-         for row in t0.get("rows", [])[:3]:
-             print(row)
-     else:
-         # Heuristic fallback: print the first table whose name mentions 'albums'
-         albums_like = [
-             t for t in data["tables"] if "albums" in t.get("name", "").lower()
-         ]
-         if albums_like:
-             t0 = albums_like[0]
-             print("\nFound albums-related table. Headers:")
-             print(t0.get("headers", []))
-             print("First 3 rows:")
-             for row in t0.get("rows", [])[:3]:
-                 print(row)
-
-
- if __name__ == "__main__":
-     main()
-