gabriel-melki committed on
Commit
0e032a7
·
1 Parent(s): 68f7ba1

Reorganize tools into separate modules and add .gitignore


- Move tool functions into dedicated modules in tools/ directory
- Add comprehensive .gitignore to prevent binary files from being tracked
- Clean up old test files and logs
- Remove problematic binary files from repository
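For orientation, the import surface after this reorganization looks as follows; these paths are taken from the updated app.py further down, while the presence of an empty tools/__init__.py package marker is an assumption not shown in this diff:

# New per-module imports of the agent's tools (paths as used in app.py below;
# an empty tools/__init__.py is assumed)
from tools.wikipedia_tools import wikipedia_summary, read_wikipedia_page
from tools.file_tools import read_file_as_text
from tools.youtube_tools import download_youtube_url_images, download_youtube_url_audio
from tools.image_processing_tools import ask_question_about_image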

.gitignore ADDED
@@ -0,0 +1,80 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+ MANIFEST
23
+
24
+ # Virtual environments
25
+ agents_env/
26
+ venv/
27
+ env/
28
+ ENV/
29
+
30
+ # IDE
31
+ .vscode/
32
+ .idea/
33
+ *.swp
34
+ *.swo
35
+
36
+ # OS
37
+ .DS_Store
38
+ Thumbs.db
39
+
40
+ # Media files (for GAIA benchmark)
41
+ *.mp3
42
+ *.mp4
43
+ *.wav
44
+ *.avi
45
+ *.mov
46
+ *.mkv
47
+ *.flv
48
+ *.webm
49
+
50
+ # Image files (if not needed for the app)
51
+ *.jpg
52
+ *.jpeg
53
+ *.png
54
+ *.gif
55
+ *.bmp
56
+ *.tiff
57
+ *.svg
58
+
59
+ # Documents
60
+ *.pdf
61
+ *.doc
62
+ *.docx
63
+ *.xls
64
+ *.xlsx
65
+ *.ppt
66
+ *.pptx
67
+
68
+ # Logs and temporary files
69
+ logs/
70
+ *.log
71
+ *.tmp
72
+ *.temp
73
+
74
+ # Jupyter
75
+ .ipynb_checkpoints/
76
+ *.ipynb
77
+
78
+ # Test files
79
+ test*.py
80
+ *_test.py
README.md DELETED
@@ -1,15 +0,0 @@
1
- ---
2
- title: Template Final Assignment
3
- emoji: 🕵🏻‍♂️
4
- colorFrom: indigo
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.25.2
8
- app_file: app.py
9
- pinned: false
10
- hf_oauth: true
11
- # optional, default duration is 8 hours/480 minutes. Max duration is 30 days/43200 minutes.
12
- hf_oauth_expiration_minutes: 480
13
- ---
14
-
15
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/app.cpython-313.pyc DELETED
Binary file (10.4 kB)
 
__pycache__/wiki_extractor.cpython-313.pyc DELETED
Binary file (15.6 kB)
 
__pycache__/wikipedia_tools.cpython-313.pyc DELETED
Binary file (11.1 kB)
 
agent.py CHANGED
@@ -0,0 +1,11 @@
1
+ from smolagents import CodeAgent
2
+ from prompt import get_prompt
3
+
4
+ class QuestionAnsweringAgent(CodeAgent):
5
+ def __init__(self, *args, **kwargs):
6
+ super().__init__(*args, **kwargs)
7
+
8
+ def __call__(self, question_text, file_name) -> str:
9
+ enhanced_question = get_prompt(question_text, file_name)
10
+ response = self.run(enhanced_question, reset=True)
11
+ return response
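A minimal sketch of how this wrapper is intended to be used; the model id mirrors app.py, while the single-tool list and the sample question are illustrative only:

from smolagents import InferenceClientModel, FinalAnswerTool
from agent import QuestionAnsweringAgent

model = InferenceClientModel(provider="auto", model_id="Qwen/Qwen3-Coder-30B-A3B-Instruct")
agent = QuestionAnsweringAgent(
    name="question_answering_expert",
    model=model,
    tools=[FinalAnswerTool()],  # app.py below registers the full tool set
)
# __call__ wraps the question with get_prompt() and runs one fresh CodeAgent episode
answer = agent("What is the capital of France?", file_name="")
print(answer)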
app.py CHANGED
@@ -1,115 +1,49 @@
1
- from math import e
2
  import os
3
  import gradio as gr
4
  import requests
5
  import pandas as pd
6
- import json
7
- import csv
8
- import openpyxl
9
- import whisper
10
- from prompt import get_prompt
11
- from huggingface_hub import login
12
  from smolagents import (
13
  InferenceClientModel,
14
- FinalAnswerTool,
15
- CodeAgent
16
  )
17
 
18
- from smolagents.tools import tool, Tool
19
- from smolagents.agents import PromptTemplates
20
- from wikipedia_tools import wikipedia_summary, read_wikipedia_page
21
- #login(token=os.environ["HF_TOKEN"])
22
 
23
- # --- Constants ---
24
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
25
 
 
26
 
27
- # --- model initialization ---
28
  model = InferenceClientModel(
29
  provider="auto",
30
- model_id="Qwen/Qwen3-Coder-30B-A3B-Instruct", # Changed to larger model
31
- #timeout=30
 
 
32
  )
33
 
34
- final_answer = FinalAnswerTool()
35
-
 
 
 
 
 
36
 
37
- def _download_file(file_name: str) -> None:
38
- if not os.path.exists(file_name):
39
- url = f"{DEFAULT_API_URL}/files/{file_name.split('.')[-2]}"
40
- r = requests.get(url)
41
- with open(file_name, "wb") as f:
42
- f.write(r.content)
43
-
44
- @tool
45
- def read_file_as_text(file_name: str) -> str:
46
- """
47
- Opens a file and returns its content as readable text.
48
- Supports 'txt', 'json', 'csv', 'xlsx', and 'mp3' (for mp3, it transcribes speech to text).
49
- Args:
50
- file_name (str): The path or name of the file.
51
- Returns:
52
- str: The content of the file as text, or transcribed speech if 'mp3'.
53
- """
54
- _download_file(file_name)
55
- file_type = file_name.split(".")[-1]
56
- try:
57
- if file_type in {"txt", "py"}:
58
- with open(file_name, "r", encoding="utf-8") as f:
59
- return f.read()
60
- elif file_type == "json":
61
- with open(file_name, "r", encoding="utf-8") as f:
62
- data = json.load(f)
63
- return json.dumps(data, indent=2)
64
- elif file_type == "csv":
65
- with open(file_name, "r", encoding="utf-8") as f:
66
- reader = csv.reader(f)
67
- rows = list(reader)
68
- return "\n".join([", ".join(row) for row in rows])
69
- elif file_type == "xlsx":
70
- wb = openpyxl.load_workbook(file_name, data_only=True)
71
- sheet = wb.active
72
- content = []
73
- for row in sheet.iter_rows(values_only=True):
74
- content.append(", ".join(str(cell) if cell is not None else "" for cell in row))
75
- return "\n".join(content)
76
- elif file_type == "mp3":
77
- w = whisper.load_model("base")
78
- res = w.transcribe(file_name)
79
- return res["text"]
80
- else:
81
- return f"File type '{file_type}' not supported."
82
- except FileNotFoundError:
83
- return f"File '{file_name}' not found."
84
- except Exception as e:
85
- return f"Error opening file '{file_name}': {str(e)}"
86
-
87
- # --- Prompt templates ---
88
-
89
-
90
- class QuestionAnsweringAgent(CodeAgent):
91
- def __init__(self, *args, **kwargs):
92
- super().__init__(*args, **kwargs)
93
-
94
- def __call__(self, question_text, file_name) -> str:
95
- enhanced_question = get_prompt(question_text, file_name)
96
- response = self.run(enhanced_question, reset=True)
97
- return response
98
-
99
- # Create agent with all the tools
100
  agent = QuestionAnsweringAgent(
101
  name="question_answering_expert",
102
  model=model,
103
- tools=[final_answer, wikipedia_summary, read_wikipedia_page, read_file_as_text],
104
- add_base_tools=False, # Disable auto base tools to avoid overriding custom visit_webpage
105
  planning_interval=None, # Disable planning to ensure immediate stop after final_answer
106
  additional_authorized_imports=["bs4"],
107
- max_steps=5,
108
- verbosity_level=2, # Add for better debugging
109
- #use_structured_outputs_internally=True # Enable structured output
110
  )
111
-
112
-
113
  def run_and_submit_all( profile: gr.OAuthProfile | None):
114
  """
115
  Fetches all questions, runs the BasicAgent on them, submits all answers,
@@ -165,7 +99,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
165
  results_log = []
166
  answers_payload = []
167
  print(f"Running agent on {len(questions_data)} questions...")
168
- for item in questions_data[:1]:
169
  task_id = item.get("task_id")
170
  question_text = item.get("question")
171
  file_name = item.get("file_name")
 
 
1
  import os
2
  import gradio as gr
3
  import requests
4
  import pandas as pd
5
+
 
 
 
 
 
6
  from smolagents import (
7
  InferenceClientModel,
8
+ FinalAnswerTool
 
9
  )
10
 
11
+ from tools.wikipedia_tools import wikipedia_summary, read_wikipedia_page
12
+ from tools.file_tools import read_file_as_text
13
+ from tools.youtube_tools import download_youtube_url_images, download_youtube_url_audio
14
+ from tools.image_processing_tools import ask_question_about_image
15
 
16
+ from agent import QuestionAnsweringAgent
 
17
 
18
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
19
 
 
20
  model = InferenceClientModel(
21
  provider="auto",
22
+ model_id="Qwen/Qwen3-Coder-30B-A3B-Instruct",
23
+ temperature=0,
24
+ top_p=1.0,
25
+ seed=42,
26
  )
27
 
28
+ agent_tools = [
29
+ FinalAnswerTool(),
30
+ wikipedia_summary, read_wikipedia_page,
31
+ read_file_as_text,
32
+ download_youtube_url_images, download_youtube_url_audio,
33
+ ask_question_about_image
34
+ ]
35
 
36
  agent = QuestionAnsweringAgent(
37
  name="question_answering_expert",
38
  model=model,
39
+ tools=agent_tools,
40
+ add_base_tools=True, # Enable base tools (web_search, visit_webpage, etc.) in addition to the custom tools
41
  planning_interval=None, # Disable planning to ensure immediate stop after final_answer
42
  additional_authorized_imports=["bs4"],
43
+ max_steps=10,
44
+ verbosity_level=2, # For better debugging
 
45
  )
46
+
 
47
  def run_and_submit_all( profile: gr.OAuthProfile | None):
48
  """
49
  Fetches all questions, runs the BasicAgent on them, submits all answers,
 
99
  results_log = []
100
  answers_payload = []
101
  print(f"Running agent on {len(questions_data)} questions...")
102
+ for item in questions_data[:]:
103
  task_id = item.get("task_id")
104
  question_text = item.get("question")
105
  file_name = item.get("file_name")
logs DELETED
@@ -1,221 +0,0 @@
1
-
2
- Launching Gradio Interface for Basic Agent Evaluation...
3
- * Running on local URL: http://0.0.0.0:7860, with SSR ⚡ (experimental, to disable set `ssr_mode=False` in `launch()`)
4
-
5
- To create a public link, set `share=True` in `launch()`.
6
- User logged in: gabzer
7
- https://huggingface.co/spaces/gabzer/GAIA_benchmark_agent/tree/main
8
- Fetching questions from: https://agents-course-unit4-scoring.hf.space/questions
9
- Fetched 20 questions.
10
- Running agent on 20 questions...
11
- === Starting agent run ===
12
- ╭──────────────────── New run - question_answering_expert ─────────────────────╮
13
- │ │
14
- │ You are a highly precise question-answering agent. │
15
- │ When given a question: │
16
- │ - If necessary, perform a wikipedia search using the │
17
- │ `wikipedia_search` tool to find possible sources of information. For the │
18
- │ `query` parameter of the `wikipedia_search` tool, enter only the name of the │
19
- │ person, the place, or the event you want to search. Not something too long. │
20
- │ - If necessary, perform a web search using the `web_search` tool to │
21
- │ find possible sources of information. │
22
- │ - If the web search only returns titles and short snippets, you MUST │
23
- │ visit the actual webpage using the `visit_webpage` tool to read the full │
24
- │ content before answering. │
25
- │ - If the task requires reading, listening, or analyzing a file, you │
26
- │ must use the file specified after the question, NOT the file name mentioned │
27
- │ casually inside the question text. │
28
- │ - Comma separated lists MUST contain a single space after each │
29
- │ comma. │
30
- │ - If you are asked for a number, don't use comma to write your │
31
- │ number, nor use units such as $$ or percent sign unless specified otherwise. │
32
- │ - If you are asked for a string, don't use articles, nor │
33
- │ abbreviations (e.g. for cities), and write the digits in plain text unless │
34
- │ specified otherwise. │
35
- │ - If you are asked for a comma separated list, apply the above rules │
36
- │ depending of whether the element to be put in the list is a number or a │
37
- │ string. │
38
- │ - Only answer after you have gathered enough information by reading │
39
- │ the actual page contents. │
40
- │ - Only answer after you have printed out the final answer first. │
41
- │ - Once you have obtained the final answer, you MUST make a code call │
42
- │ as follows: │
43
- │ <code> │
44
- │ final_answer("your_answer") │
45
- │ </code> │
46
- │ to submit the final answer. │
47
- │ - Do not retry or execute anything else after calling │
48
- │ `final_answer`. STOP IMMEDIATELY. │
49
- │ - Calling `final_answer` terminates the task completely. No further │
50
- │ steps are needed. │
51
- │ - The function `final_answer` must wrap the exact printed value. │
52
- │ - Provide ONLY the precise answer requested. │
53
- │ - Do not include explanations, steps, reasoning, or additional text │
54
- │ when calling `final_answer`. │
55
- │ - Be direct and specific. The GAIA benchmark requires exactly │
56
- │ matching answers. │
57
- │ │
58
- │ Based on the above guidelines, answer the following question: │
59
- │ -- beginning of question -- │
60
- │ How many studio albums were published by Mercedes Sosa between 2000 │
61
- │ and 2009 (included)? You can use the latest 2022 version of english │
62
- │ wikipedia. │
63
- │ -- end of question -- │
64
- │ If the questions mentions the need to use a file, use the following │
65
- │ `file_name` value below as the `file_name` parameter in any function calls: │
66
- │ file_name: │
67
- │ │
68
- ╰─ InferenceClientModel - Qwen/Qwen2.5-Coder-7B-Instruct ──────────────────────╯
69
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 1 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
70
- Output message of the LLM: ─────────────────────────────────────────────────────
71
- Thought: To find how many studio albums Mercedes Sosa published between 2000 and
72
- 2009, I need to look up her discography on Wikipedia. This will give me the
73
- information I require.
74
- <code>
75
- sosa_discography = wikipedia_search("Mercedes Sosa discography")
76
- print(sosa_discography)
77
-
78
- ─ Executing parsed code: ─────────────────────────────────────────────────────
79
- sosa_discography = wikipedia_search("Mercedes Sosa discography")
80
- print(sosa_discography)
81
- ──────────────────────────────────────────────────────────────────────────────
82
- Execution logs:
83
- No Wikipedia page found for 'Mercedes Sosa discography'. Try a different query.
84
-
85
- Out: None
86
- [Step 1: Duration 2.08 seconds| Input tokens: 2,734 | Output tokens: 72]
87
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 2 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
88
- Output message of the LLM: ─────────────────────────────────────────────────────
89
- Thought: The previous query didn't yield any results. It could be due to the
90
- specificity or the lack of a recent entry. I'll broaden the query to search for
91
- "Mercedes Sosa career".
92
- Using broadened query for better results.
93
- <code>
94
- sosa_career = wikipedia_search("Mercedes Sosa career")
95
- print(sosa_career)
96
-
97
- ─ Executing parsed code: ─────────────────────────────────────────────────────
98
- sosa_career = wikipedia_search("Mercedes Sosa career")
99
- print(sosa_career)
100
- ──────────────────────────────────────────────────────────────────────────────
101
- Execution logs:
102
- No Wikipedia page found for 'Mercedes Sosa career'. Try a different query.
103
-
104
- Out: None
105
- [Step 2: Duration 1.96 seconds| Input tokens: 5,639 | Output tokens: 149]
106
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 3 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
107
- Output message of the LLM: ─────────────────────────────────────────────────────
108
- Thought: Searching for "Mercedes Sosa's album releases" may be more accurate
109
- since I am specifically looking for studio albums.
110
- <code>
111
- sosa_album_releases = wikipedia_search("Mercedes Sosa's album releases")
112
- print(sosa_album_releases)
113
-
114
- ─ Executing parsed code: ──────────────────────────────────────
115
- sosa_album_releases = wikipedia_search("Mercedes Sosa's album releases")
116
- print(sosa_album_releases)
117
- ──────────────────────────────────────────────────────────────────────────────
118
- Execution logs:
119
- No Wikipedia page found for 'Mercedes Sosa's album releases'. Try a different
120
- query.
121
-
122
- Out: None
123
- [Step 3: Duration 1.82 seconds| Input tokens: 8,720 | Output tokens: 205]
124
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 4 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
125
- Output message of the LLM: ─────────────────────────────────────────────────────
126
- Thought: Given that previous searches were unsuccessful, let me perform a
127
- general web search for "Mercedes Sosa albums between 2000 and 2009" and examine
128
- the results for relevant information.
129
- <code>
130
- album_info = web_search("Mercedes Sosa albums between 2000 and 2009")
131
- print(album_info)
132
-
133
- ─ Executing parsed code: ─────────────────────────────────────────────────────
134
- album_info = web_search("Mercedes Sosa albums between 2000 and 2009")
135
- print(album_info)
136
- ──────────────────────────────────────────────────────────────────────────────
137
- Execution logs:
138
- ## Search Results
139
-
140
- [Category: Mercedes Sosa albums -
141
- Wikipedia](https://en.wikipedia.org/wiki/Category:Mercedes_Sosa_albums)
142
- This is a set category. It should only contain pages that are Mercedes Sosa
143
- albums or lists of Mercedes Sosa albums , as well as subcategories containing
144
- those things (themselves set categories).
145
-
146
- [Mercedes Sosa Albums and
147
- Discography](https://genius.com/artists/Mercedes-sosa/albums)
148
- All Albums by Mercedes Sosa . Mercedes Sosa discography includes 45 albums .
149
-
150
- [Mercedes Sosa | Discografia |
151
- Discogs](https://www.discogs.com/it/artist/333361-Mercedes-Sosa)
152
- Mercedes Sosa , known as La Negra, (born July 9, 1935 in San Miguel de Tucuman,
153
- Argentina – Death October 4, 2009 in Buenos Aires) was an Argentine singer who
154
- was and remains immensely popular throughout Latin America and internationally.
155
-
156
- [Mercedes Sosa - Apple
157
- Music](https://music.apple.com/tc/artist/mercedes-sosa/122968)
158
- Mercedes Sosa . Latest Release. 24 NOV 2024. Mercedes Sosa en New York, 1974 ·
159
- 2024. Palabras de Mercedes IX (En Vivo NY).
160
-
161
- [Mercedes Sosa : albums , chansons, concerts |
162
- Deezer](https://www.deezer.com/fr/artist/10606)
163
- Mercedes Sosa sur Deezer : discographie de l'artiste, top albums et chansons,
164
- concerts et featurings.
165
-
166
- [Mercedes Sosa on TIDAL](https://tidal.com/browse/artist/3501549)
167
- Albums . Mercedes Sosa en New York, 1974. Mercedes Sosa . 2009 . Zona de
168
- Promesas (with Gustavo Cerati).
169
-
170
- [Mercedes Sosa : Best songs, Albums and Concerts -
171
- Mozaart](https://mozaart.com/en/a/mercedes-sosa)
172
- Haydée Mercedes Sosa (Spanish pronunciation: [meɾˈseðes ˈsosa]; 9 July 1935 – 4
173
- October 2009 ), sometimes known as La Negra (literally: The Black One), was an
174
- Argentine singer who was popular throughout Latin America and many countries
175
- outsid.
176
-
177
- [Mercedes Sosa - Albums](https://muzikum.eu/en/mercedes-sosa/discography)
178
- Mercedes Sosa . 23 Albums found.Reyes Records (2003). Mercedes Sosa - A Arte De
179
- (bradypus Series).
180
-
181
- [Cantora 1 - Mercedes Sosa | Album |
182
- AllMusic](https://www.allmusic.com/album/cantora-1-mw0000827785)
183
- Cantora 1 by Mercedes Sosa released in 2009 . Find album reviews, track lists,
184
- credits, awards and more at AllMusic.
185
-
186
- [Mercedes Sosa Albums , Songs - Discography - Album of The
187
- Year](https://www.albumoftheyear.org/artist/33792-mercedes-sosa/?type=live)
188
- Information on Mercedes Sosa . Complete discography, ratings, reviews and
189
- more.1982. Mercedes Sosa En Argentina. Live.
190
-
191
- Out: None
192
- [Step 4: Duration 3.07 seconds| Input tokens: 11,961 | Output tokens: 283]
193
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 5 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
194
- Output message of the LLM: ─────────────────────────────────────────────────────
195
- Thought: After reviewing the web search results, it is apparent that there is
196
- limited information available about Mercedes Sosa releasing albums specifically
197
- between 2000 and 2009. The most relevant result indicates a live album released
198
- in 1982 and a song release in 2009, but not studio albums in this range.
199
-
200
- As no specific information about the number ofstudio albums released by Mercedes
201
- Sosa between 2000 and 2009 was found, I cannot determine the exact count.
202
- Therefore, I will finalize my answer as "unknown."
203
- <code>
204
- final_answer("unknown")
205
-
206
- ─ Executing parsed code: ─────────────────────────────────────────────────────
207
- final_answer("unknown")
208
- ──────────────────────────────────────────────────────────────────────────────
209
- Final answer: unknown
210
- [Step 5: Duration 2.95 seconds| Input tokens: 16,092 | Output tokens: 414]
211
- >>> Agent returned: unknown
212
- Task 8e867cd7-cff9-4e6c-867a-ff5ddc2550be execution steps: 6
213
- Step 1: TaskStep
214
- Step 2: ActionStep
215
- Step 3: ActionStep
216
- Step 4: ActionStep
217
- Step 5: ActionStep
218
- Step 6: ActionStep
219
- Agent finished. Submitting 1 answers for user 'gabzer'...
220
- Submitting 1 answers to: https://agents-course-unit4-scoring.hf.space/submit
221
- Submission Failed: Server responded with status 422. Detail: [{'type': 'string_type', 'loc': ['body', 'answers', 0, 'submitted_answer', 'str'], 'msg': 'Input should be a valid string', 'input': None}, {'type': 'int_type', 'loc': ['body', 'answers', 0, 'submitted_answer', 'int'], 'msg': 'Input should be a valid integer', 'input': None}, {'type': 'float_type', 'loc': ['body', 'answers', 0, 'submitted_answer', 'float'], 'msg': 'Input should be a valid number', 'input': None}]
prompt.py CHANGED
@@ -11,6 +11,7 @@ def get_prompt(question_text, file_name):
11
  - "Battle of Hastings timeline" → use: wikipedia_summary("Battle of Hastings")
12
  - "Population of Paris in 2010" → use: wikipedia_summary("Paris")
13
  - If necessary, visit the wikipedia page listed in the wikipedia summary tool to read the full content. You will find the page url in the output of the wikipedia summary tool at the end after the **Read more:** section. Use the `read_wikipedia_page` tool to visit the page.
 
14
  - If necessary, perform a web search using the `web_search` tool to find possible sources of information.
15
  - If the web search only returns titles and short snippets, you MUST visit the actual webpage using the `read_wikipedia_page` tool to read the full content before answering.
16
  - If the task requires reading, listening, or analyzing a file, you must use the file specified after the question, NOT the file name mentioned casually inside the question text.
 
11
  - "Battle of Hastings timeline" → use: wikipedia_summary("Battle of Hastings")
12
  - "Population of Paris in 2010" → use: wikipedia_summary("Paris")
13
  - If necessary, visit the wikipedia page listed in the wikipedia summary tool to read the full content. You will find the page url in the output of the wikipedia summary tool at the end after the **Read more:** section. Use the `read_wikipedia_page` tool to visit the page.
14
+ - When using the `read_wikipedia_page` tool, you may find tables in the page. To analyze the tables, please use a code snippet to read the tables into a pandas dataframe and analyze the data.
15
  - If necessary, perform a web search using the `web_search` tool to find possible sources of information.
16
  - If the web search only returns titles and short snippets, you MUST visit the actual webpage using the `read_wikipedia_page` tool to read the full content before answering.
17
  - If the task requires reading, listening, or analyzing a file, you must use the file specified after the question, NOT the file name mentioned casually inside the question text.
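The table-handling instruction added above expects the agent to emit a code step roughly like this; the article URL and table index are placeholders, and pandas.read_html additionally requires lxml or html5lib to be installed:

import pandas as pd

# Parse every <table> on the page into a DataFrame, then inspect the relevant one
tables = pd.read_html("https://en.wikipedia.org/wiki/Mercedes_Sosa")
discography = tables[0]  # placeholder index; pick the table whose columns match the question
print(discography.columns.tolist())
print(len(discography))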
requirements.txt CHANGED
@@ -3,12 +3,12 @@ requests==2.32.5
3
  smolagents==1.21.3
4
  duckduckgo-search==8.1.1
5
  ddgs==9.5.5
6
- requests==2.32.5
7
  markdownify==0.11.0
8
  openpyxl==3.1.5
9
  wikipedia-api==0.8.1
10
- whisper==1.1.10
11
  beautifulsoup4==4.12.3
12
  langchain_community==0.3.2
13
  wikipedia==1.4.0
14
- tabulate==0.9.0
 
 
3
  smolagents==1.21.3
4
  duckduckgo-search==8.1.1
5
  ddgs==9.5.5
 
6
  markdownify==0.11.0
7
  openpyxl==3.1.5
8
  wikipedia-api==0.8.1
9
+ openai-whisper==20250625
10
  beautifulsoup4==4.12.3
11
  langchain_community==0.3.2
12
  wikipedia==1.4.0
13
+ tabulate==0.9.0
14
+ yt-dlp==2025.9.5
test.ipynb DELETED
@@ -1,262 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 4,
6
- "id": "289bbe12",
7
- "metadata": {},
8
- "outputs": [],
9
- "source": [
10
- "from smolagents import WikipediaSearchTool \n",
11
- "wikipedia_search = WikipediaSearchTool(\n",
12
- " user_agent=f\"My research agent ([email protected])\",\n",
13
- " language=\"en\",\n",
14
- " content_type=\"text\",\n",
15
- " extract_format=\"HTML\",\n",
16
- ")"
17
- ]
18
- },
19
- {
20
- "cell_type": "code",
21
- "execution_count": 2,
22
- "id": "0e69fd46",
23
- "metadata": {},
24
- "outputs": [
25
- {
26
- "data": {
27
- "text/plain": [
28
- "{'summary': '\\n The sections inside the page are Life, Awards, Death, Tributes, Discography, Filmography, References, Further reading, External links and the summary of the page is Haydée Mercedes Sosa (9 July 1935 – 4 October 2009) was an Argentine singer who was popular throughout Latin America and many countries outside the region. With her roots in Argentine folk music, Sosa became one of the preeminent exponents of El nuevo cancionero. She gave voice to songs written by many Latin American songwriters. Her music made people hail her as the \"voice of the voiceless ones\". She was often called \"the conscience of Latin America\".\\nSosa performed in venues such as the Lincoln Center in New York City, the Théâtre Mogador in Paris, the Sistine Chapel in Vatican City, as well as sold-out shows in New York\\'s Carnegie Hall and the Roman Colosseum during her final decade of life. Her career spanned four decades and she was the recipient of six Latin Grammy awards (2000, 2003, 2004, 2006, 2009, 2011), including a Latin Grammy Lifetime Achievement Award in 2004 and two posthumous Latin Grammy Award for Best Folk Album in 2009 and 2011. She won the Premio Gardel in 2000, the main musical award in Argentina. She served as an ambassador for UNICEF.\\n ',\n",
29
- " 'url': 'https://en.wikipedia.org/wiki/Mercedes_Sosa'}"
30
- ]
31
- },
32
- "execution_count": 2,
33
- "metadata": {},
34
- "output_type": "execute_result"
35
- }
36
- ],
37
- "source": [
38
- "from wikipedia_tools import wikipedia_summary, read_wikipedia_page\n",
39
- "url = \"https://en.wikipedia.org/wiki/Mercedes_Sosa\"\n",
40
- "query = \"Mercedes Sosa\"\n",
41
- "wikipedia_summary(query=query)"
42
- ]
43
- },
44
- {
45
- "cell_type": "code",
46
- "execution_count": 8,
47
- "id": "034ffd47",
48
- "metadata": {},
49
- "outputs": [
50
- {
51
- "data": {
52
- "text/plain": [
53
- "{'summary': 'Haydée Mercedes Sosa (9 July 1935 – 4 October 2009) was an Argentine singer who was popular throughout Latin America and many countries outside the region. With her roots in Argentine folk music, Sosa became one of the preeminent exponents of El nuevo cancionero. She gave voice to songs written by many Latin American songwriters. Her music made people hail her as the \"voice of the voiceless ones\". She was often called \"the conscience of Latin America\".\\nSosa performed in venues such as the Lincoln Center in New York City, the Théâtre Mogador in Paris, the Sistine Chapel in Vatican City, as well as sold-out shows in New York\\'s Carnegie Hall and the Roman Colosseum during her final decade of life. Her career spanned four decades and she was the recipient of six Latin Grammy awards (2000, 2003, 2004, 2006, 2009, 2011), including a Latin Grammy Lifetime Achievement Award in 2004 and two posthumous Latin Grammy Award for Best Folk Album in 2009 and 2011. She won the Premio Gardel in 2000, the main musical award in Argentina. She served as an ambassador for UNICEF.',\n",
54
- " 'url': 'https://en.wikipedia.org/wiki/Mercedes Sosa'}"
55
- ]
56
- },
57
- "execution_count": 8,
58
- "metadata": {},
59
- "output_type": "execute_result"
60
- }
61
- ],
62
- "source": [
63
- "import wikipediaapi\n",
64
- "import os\n",
65
- "query=\"Mercedes Sosa\"\n",
66
- "\"\"\"\n",
67
- "Search Wikipedia for a query and return a dictionary with the summary of the page and the url of the page.\n",
68
- "Args:\n",
69
- " query: The query to search for.\n",
70
- "Returns:\n",
71
- " A dictionary with the summary of the page and the url of the page.\n",
72
- "\"\"\"\n",
73
- "summary_tool = wikipediaapi.Wikipedia(\n",
74
- " user_agent=f\"My research agent ({os.getenv('USER_EMAIL')})\",\n",
75
- " extra_api_params={\"include\": \"url\"}\n",
76
- ")\n",
77
- "page = summary_tool.page(query)\n",
78
- "if not page.exists():\n",
79
- " raise ValueError(f\"No Wikipedia page found for '{query}'. Try a different query.\")\n",
80
- "{\n",
81
- " \"summary\": page.summary,\n",
82
- " \"url\": f\"https://en.wikipedia.org/wiki/{page.title}\"\n",
83
- "}"
84
- ]
85
- },
86
- {
87
- "cell_type": "code",
88
- "execution_count": 19,
89
- "id": "9992b1ec",
90
- "metadata": {},
91
- "outputs": [
92
- {
93
- "data": {
94
- "text/plain": [
95
- "Mercedes Sosa (lang: en, variant: None, id: 476992, ns: 0)"
96
- ]
97
- },
98
- "execution_count": 19,
99
- "metadata": {},
100
- "output_type": "execute_result"
101
- }
102
- ],
103
- "source": [
104
- "page"
105
- ]
106
- },
107
- {
108
- "cell_type": "code",
109
- "execution_count": 3,
110
- "id": "11d4a9f8",
111
- "metadata": {},
112
- "outputs": [
113
- {
114
- "data": {
115
- "text/plain": [
116
- "'https://en.wikipedia.org/wiki/Mercedes_Sosa'"
117
- ]
118
- },
119
- "execution_count": 3,
120
- "metadata": {},
121
- "output_type": "execute_result"
122
- }
123
- ],
124
- "source": [
125
- "import wikipedia\n",
126
- "ny = wikipedia.page(pageid=476992)\n",
127
- "ny.url\n",
128
- "# u'http://en.wikipedia.org/wiki/New_York'"
129
- ]
130
- },
131
- {
132
- "cell_type": "code",
133
- "execution_count": 11,
134
- "id": "577562bd",
135
- "metadata": {},
136
- "outputs": [
137
- {
138
- "data": {
139
- "text/plain": [
140
- "{'pageid': '476992',\n",
141
- " 'title': 'Mercedes Sosa',\n",
142
- " 'url': 'https://en.wikipedia.org/wiki/Mercedes_Sosa',\n",
143
- " '_sections': [],\n",
144
- " '_categories': ['1935 births',\n",
145
- " '2009 deaths',\n",
146
- " '20th-century Argentine women singers',\n",
147
- " '20th-century drummers',\n",
148
- " 'All articles with dead external links',\n",
149
- " 'All articles with unsourced statements',\n",
150
- " 'Argentine activists',\n",
151
- " 'Argentine people of Diaguita descent',\n",
152
- " 'Argentine people of French descent',\n",
153
- " 'Argentine people of Quechua descent',\n",
154
- " 'Argentine women activists',\n",
155
- " 'Articles with Brazilian Portuguese-language sources (pt-br)',\n",
156
- " 'Articles with German-language sources (de)',\n",
157
- " 'Articles with Spanish-language sources (es)',\n",
158
- " 'Articles with dead external links from June 2024',\n",
159
- " 'Articles with hCards',\n",
160
- " 'Articles with short description',\n",
161
- " 'Articles with unsourced statements from December 2023',\n",
162
- " 'Bombo legüero players',\n",
163
- " 'CS1 Spanish-language sources (es)',\n",
164
- " 'Commons category link is on Wikidata',\n",
165
- " 'Deaths from kidney failure in Argentina',\n",
166
- " 'Latin Grammy Award winners',\n",
167
- " 'Latin Grammy Lifetime Achievement Award winners',\n",
168
- " 'Nueva canción musicians',\n",
169
- " 'People from San Miguel de Tucumán',\n",
170
- " 'Recipients of the Order of Cultural Merit (Brazil)',\n",
171
- " 'Short description is different from Wikidata',\n",
172
- " 'Use dmy dates from July 2025',\n",
173
- " 'Webarchive template wayback links',\n",
174
- " 'Wikipedia indefinitely move-protected pages',\n",
175
- " 'Women in Latin music']}"
176
- ]
177
- },
178
- "execution_count": 11,
179
- "metadata": {},
180
- "output_type": "execute_result"
181
- }
182
- ],
183
- "source": [
184
- "ny.__dict__"
185
- ]
186
- },
187
- {
188
- "cell_type": "code",
189
- "execution_count": 20,
190
- "id": "9fa24efc",
191
- "metadata": {},
192
- "outputs": [],
193
- "source": [
194
- "import wikipediaapi\n",
195
- "import os\n",
196
- "query=\"Mercedes Sosa\"\n",
197
- "summary_tool = wikipediaapi.Wikipedia(\n",
198
- " user_agent=f\"My research agent ({os.getenv('USER_EMAIL')})\",\n",
199
- ")\n",
200
- "page = summary_tool.page(query)"
201
- ]
202
- },
203
- {
204
- "cell_type": "code",
205
- "execution_count": 24,
206
- "id": "8589fd42",
207
- "metadata": {},
208
- "outputs": [
209
- {
210
- "data": {
211
- "text/plain": [
212
- "['Life',\n",
213
- " 'Awards',\n",
214
- " 'Death',\n",
215
- " 'Tributes',\n",
216
- " 'Discography',\n",
217
- " 'Filmography',\n",
218
- " 'References',\n",
219
- " 'Further reading',\n",
220
- " 'External links']"
221
- ]
222
- },
223
- "execution_count": 24,
224
- "metadata": {},
225
- "output_type": "execute_result"
226
- }
227
- ],
228
- "source": [
229
- "[section._title for section in page.sections]"
230
- ]
231
- },
232
- {
233
- "cell_type": "code",
234
- "execution_count": null,
235
- "id": "38a8b06c",
236
- "metadata": {},
237
- "outputs": [],
238
- "source": []
239
- }
240
- ],
241
- "metadata": {
242
- "kernelspec": {
243
- "display_name": "agents_env",
244
- "language": "python",
245
- "name": "python3"
246
- },
247
- "language_info": {
248
- "codemirror_mode": {
249
- "name": "ipython",
250
- "version": 3
251
- },
252
- "file_extension": ".py",
253
- "mimetype": "text/x-python",
254
- "name": "python",
255
- "nbconvert_exporter": "python",
256
- "pygments_lexer": "ipython3",
257
- "version": "3.13.6"
258
- }
259
- },
260
- "nbformat": 4,
261
- "nbformat_minor": 5
262
- }
test_executed.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
tools.py DELETED
File without changes
tools/file_tools.py ADDED
@@ -0,0 +1,60 @@
1
+
2
+ import json
3
+ import csv
4
+ import openpyxl
5
+ import whisper
6
+ import os
7
+ import requests
8
+ from smolagents.tools import tool
9
+
10
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
11
+
12
+ def _download_file(file_name: str) -> None:
13
+ if not os.path.exists(file_name):
14
+ url = f"{DEFAULT_API_URL}/files/{file_name.split('.')[-2]}"
15
+ r = requests.get(url)
16
+ with open(file_name, "wb") as f:
17
+ f.write(r.content)
18
+
19
+ @tool
20
+ def read_file_as_text(file_name: str) -> str:
21
+ """
22
+ Opens a file and returns its content as readable text.
23
+ Supports 'txt', 'json', 'csv', 'xlsx', and 'mp3' (for mp3, it transcribes speech to text).
24
+ Args:
25
+ file_name (str): The path or name of the file.
26
+ Returns:
27
+ str: The content of the file as text, or transcribed speech if 'mp3'.
28
+ """
29
+ _download_file(file_name)
30
+ file_type = file_name.split(".")[-1]
31
+ try:
32
+ if file_type in {"txt", "py"}:
33
+ with open(file_name, "r", encoding="utf-8") as f:
34
+ return f.read()
35
+ elif file_type == "json":
36
+ with open(file_name, "r", encoding="utf-8") as f:
37
+ data = json.load(f)
38
+ return json.dumps(data, indent=2)
39
+ elif file_type == "csv":
40
+ with open(file_name, "r", encoding="utf-8") as f:
41
+ reader = csv.reader(f)
42
+ rows = list(reader)
43
+ return "\n".join([", ".join(row) for row in rows])
44
+ elif file_type == "xlsx":
45
+ wb = openpyxl.load_workbook(file_name, data_only=True)
46
+ sheet = wb.active
47
+ content = []
48
+ for row in sheet.iter_rows(values_only=True):
49
+ content.append(", ".join(str(cell) if cell is not None else "" for cell in row))
50
+ return "\n".join(content)
51
+ elif file_type == "mp3":
52
+ w = whisper.load_model("base")
53
+ res = w.transcribe(file_name)
54
+ return res["text"]
55
+ else:
56
+ return f"File type '{file_type}' not supported."
57
+ except FileNotFoundError:
58
+ return f"File '{file_name}' not found."
59
+ except Exception as e:
60
+ return f"Error opening file '{file_name}': {str(e)}"
tools/image_processing_tools.py ADDED
@@ -0,0 +1,35 @@
1
+ from PIL import Image
2
+ from transformers import pipeline
3
+ from smolagents.tools import tool
4
+
5
+ @tool
6
+ def ask_question_about_image(question: str, path_to_image: str) -> str:
7
+ """
8
+ Ask a question about an image and return the answer.
9
+ Args:
10
+ question: the question to ask about the image.
11
+ path_to_image: The path to the image to ask the question about.
12
+ Returns:
13
+ A string with the answer to the question.
14
+ """
15
+ pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
16
+
17
+ image = Image.open(fp=path_to_image)
18
+
19
+ messages = [
20
+ {
21
+ "role": "user",
22
+ "content": [
23
+ {
24
+ "type": "image",
25
+ "image": image,
26
+ },
27
+ {"type": "text", "text": question},
28
+ ],
29
+ }
30
+ ]
31
+
32
+ outputs = pipe(text=messages, max_new_tokens=60, return_full_text=False)
33
+
34
+ return outputs[0]["generated_text"]
35
+
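For reference, a minimal direct invocation; the image path and question are placeholders, and the first call downloads the llava-hf/llava-interleave-qwen-0.5b-hf checkpoint, so it needs network access and a transformers install with vision support:

from tools.image_processing_tools import ask_question_about_image

answer = ask_question_about_image(
    question="What color is the object in the center of the image?",
    path_to_image="example_frame_001.jpg",  # placeholder, e.g. a frame extracted by the YouTube tools
)
print(answer)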
wikipedia_tools.py → tools/wikipedia_tools.py RENAMED
@@ -5,6 +5,8 @@ import pandas as pd
5
  from bs4 import BeautifulSoup
6
  from smolagents.tools import tool
7
  import wikipediaapi
 
 
8
  def fetch_wikipedia_page(url: str) -> str:
9
  """Fetch raw HTML of a Wikipedia page."""
10
  headers = {
@@ -30,26 +32,68 @@ def _remove_sections_by_titles(soup: BeautifulSoup, titles: list[str]) -> None:
30
  excluded = {_normalize_title(t) for t in titles}
31
  header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
32
 
33
- for header in list(soup.find_all(header_tags)):
 
 
34
  title_text = _normalize_title(header.get_text(" ", strip=True))
35
  if title_text in excluded:
36
- level = int(header.name[1])
37
- nodes_to_remove = [header]
38
- for sibling in list(header.next_siblings):
39
- sib_name = getattr(sibling, "name", None)
40
- if sib_name in header_tags:
41
- next_level = int(sibling.name[1])
42
- if next_level <= level:
43
- break
44
- nodes_to_remove.append(sibling)
45
- for node in nodes_to_remove:
46
  try:
47
- node.decompose()
48
  except Exception:
49
- try:
50
- node.extract()
51
- except Exception:
52
- pass
53
 
54
 
55
  def _cleanup_non_content(root: BeautifulSoup) -> None:
@@ -85,20 +129,73 @@ def _cleanup_non_content(root: BeautifulSoup) -> None:
85
 
86
 
87
  def extract_text(soup: BeautifulSoup) -> str:
88
- """Extract main text (paragraphs + headers + lists) from article body only."""
 
 
 
89
  content_root = soup.select_one("div.mw-parser-output") or soup
90
 
91
  for elem in content_root(["script", "style", "sup", "aside", "nav"]):
92
  elem.decompose()
93
  _cleanup_non_content(content_root)
94
 
95
- headers = [h.get_text(" ", strip=True) for h in content_root.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])]
96
- paragraphs = [p.get_text(" ", strip=True) for p in content_root.find_all("p")]
97
- lists = [li.get_text(" ", strip=True) for li in content_root.find_all("li")]
 
 
 
 
 
 
 
98
 
99
- all_text = headers + paragraphs + lists
100
- clean = [t for t in all_text if len(t) > 30]
101
- return "\n\n".join(clean)
102
 
103
 
104
  def extract_tables(soup: BeautifulSoup) -> list[dict]:
@@ -146,13 +243,12 @@ def extract_tables(soup: BeautifulSoup) -> list[dict]:
146
  return tables
147
 
148
 
149
- def format_for_llm(text: str, tables: list[dict], spec: dict) -> str:
150
  """Combine text + tables into a single string for LLM input."""
151
  output = []
152
  output.append("=== ARTICLE TEXT ===\n")
153
  output.append(text)
154
 
155
- sections_to_exclude = spec.get("sections_to_exclude", [])
156
  excluded = {_normalize_title(s) for s in sections_to_exclude}
157
  filtered_tables = [
158
  t for t in tables if _normalize_title(t.get("name", "")) not in excluded
@@ -191,26 +287,23 @@ def wikipedia_summary(entity: str) -> dict:
191
  }
192
 
193
 
194
-
195
  @tool
196
  def read_wikipedia_page(
197
  url: str,
198
- spec: dict = {
199
- "sections_to_exclude": [
200
- "External links",
201
- "References",
202
- "Further reading",
203
- "See also",
204
- "Notes",
205
- ]
206
- }) -> str:
207
  """
208
- Read a Wikipedia page and return a dictionary with the text of the page.
209
  Args:
210
  url: The URL of the Wikipedia page to read.
211
- spec: A dictionary specifying sections to exclude from the page.
212
  Returns:
213
- A dictionary with the text of the page.
214
  """
215
  if "https://en.wikipedia.org/wiki/" not in url:
216
  raise ValueError("URL is required")
@@ -219,12 +312,12 @@ def read_wikipedia_page(
219
  # Parse the page
220
  soup = BeautifulSoup(html, "html.parser")
221
  # Remove unwanted sections
222
- _remove_sections_by_titles(soup, spec.get("sections_to_exclude", []))
223
 
224
  # Extract after pruning unwanted sections
225
  text = extract_text(soup)
226
  tables = extract_tables(soup)
227
 
228
  # Combine
229
- llm_ready = format_for_llm(text, tables, spec)
230
  return llm_ready
 
5
  from bs4 import BeautifulSoup
6
  from smolagents.tools import tool
7
  import wikipediaapi
8
+
9
+
10
  def fetch_wikipedia_page(url: str) -> str:
11
  """Fetch raw HTML of a Wikipedia page."""
12
  headers = {
 
32
  excluded = {_normalize_title(t) for t in titles}
33
  header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
34
 
35
+ # Find all headers that match excluded titles
36
+ headers_to_remove = []
37
+ for header in soup.find_all(header_tags):
38
  title_text = _normalize_title(header.get_text(" ", strip=True))
39
  if title_text in excluded:
40
+ headers_to_remove.append(header)
41
+
42
+ # Remove each matching section (header + content)
43
+ for header in headers_to_remove:
44
+ # Skip if header was already removed as part of another section
45
+ if not header.parent:
46
+ continue
47
+
48
+ level = int(header.name[1])
49
+
50
+ # Determine the container to remove - could be the header itself or its parent wrapper
51
+ header_container = header
52
+ # If header is wrapped in a heading container (like div.mw-heading), use that as the starting point
53
+ if (header.parent and
54
+ header.parent.name == 'div' and
55
+ header.parent.get('class') and
56
+ any('heading' in cls.lower() for cls in header.parent.get('class', []))):
57
+ header_container = header.parent
58
+
59
+ nodes_to_remove = [header_container]
60
+
61
+ # Collect all content after the header container until next header of same/higher level
62
+ current = header_container
63
+ while current.next_sibling:
64
+ current = current.next_sibling
65
+ sib_name = getattr(current, "name", None)
66
+
67
+ # If we hit another header (directly or within a heading container), check its level
68
+ next_header = None
69
+ if sib_name in header_tags:
70
+ next_header = current
71
+ elif (sib_name == 'div' and
72
+ current.get('class') and
73
+ any('heading' in cls.lower() for cls in current.get('class', []))):
74
+ # This is a heading container, find the header inside it
75
+ for child in current.find_all(header_tags):
76
+ next_header = child
77
+ break
78
+
79
+ if next_header:
80
+ next_level = int(next_header.name[1])
81
+ if next_level <= level:
82
+ # This is a header of same or higher level - stop here
83
+ break
84
+
85
+ # Add this node to removal list
86
+ nodes_to_remove.append(current)
87
+
88
+ # Remove all collected nodes
89
+ for node in nodes_to_remove:
90
+ try:
91
+ node.decompose()
92
+ except Exception:
93
  try:
94
+ node.extract()
95
  except Exception:
96
+ pass
 
 
 
97
 
98
 
99
  def _cleanup_non_content(root: BeautifulSoup) -> None:
 
129
 
130
 
131
  def extract_text(soup: BeautifulSoup) -> str:
132
+ """Extract main text (paragraphs + headers + lists) from article body only, preserving document order.
133
+ Excludes content that's inside tables and excludes headers that are also used as
134
+ table names (either as <caption> or the nearest previous header) to avoid duplication
135
+ with extract_tables."""
136
  content_root = soup.select_one("div.mw-parser-output") or soup
137
 
138
  for elem in content_root(["script", "style", "sup", "aside", "nav"]):
139
  elem.decompose()
140
  _cleanup_non_content(content_root)
141
 
142
+ # Identify table names (from captions or nearest previous headers) to avoid duplicating them in text
143
+ table_names_normalized = set()
144
+ for table in content_root.find_all("table"):
145
+ # Skip non-content tables (same logic as extract_tables)
146
+ classes = table.get("class", [])
147
+ if isinstance(classes, list) and any(
148
+ c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
149
+ for c in classes
150
+ ):
151
+ continue
152
 
153
+ name_text = None
154
+ caption_el = table.find("caption")
155
+ if caption_el:
156
+ caption_text = caption_el.get_text(" ", strip=True)
157
+ if caption_text:
158
+ name_text = caption_text
159
+ else:
160
+ # Empty caption: treat as no caption and fallback to previous header
161
+ prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
162
+ if prev_header:
163
+ name_text = prev_header.get_text(" ", strip=True)
164
+ else:
165
+ prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
166
+ if prev_header:
167
+ name_text = prev_header.get_text(" ", strip=True)
168
+
169
+ if not name_text and isinstance(classes, list) and any(c.lower() == "infobox" for c in classes):
170
+ name_text = "Infobox"
171
+
172
+ if name_text:
173
+ table_names_normalized.add(_normalize_title(name_text))
174
+
175
+ # Find all text elements in document order, but exclude duplicates
176
+ text_elements = []
177
+ for element in content_root.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li"]):
178
+ # Skip elements that are inside a table (to avoid duplication with extract_tables)
179
+ if element.find_parent("table"):
180
+ continue
181
+
182
+ # Skip headers that match any table name (to avoid duplication with extract_tables)
183
+ if element.name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
184
+ header_text_norm = _normalize_title(element.get_text(" ", strip=True))
185
+ if header_text_norm in table_names_normalized:
186
+ continue
187
+
188
+ # Skip list items that are exactly a table name (common for inline mini-TOCs within sections)
189
+ if element.name == "li":
190
+ li_text_norm = _normalize_title(element.get_text(" ", strip=True))
191
+ if li_text_norm in table_names_normalized:
192
+ continue
193
+
194
+ text = element.get_text(" ", strip=True)
195
+ if text: # Only include non-empty text
196
+ text_elements.append(text)
197
+
198
+ return "\n\n".join(text_elements)
199
 
200
 
201
  def extract_tables(soup: BeautifulSoup) -> list[dict]:
 
243
  return tables
244
 
245
 
246
+ def format_for_llm(text: str, tables: list[dict], sections_to_exclude: list[str]) -> str:
247
  """Combine text + tables into a single string for LLM input."""
248
  output = []
249
  output.append("=== ARTICLE TEXT ===\n")
250
  output.append(text)
251
 
 
252
  excluded = {_normalize_title(s) for s in sections_to_exclude}
253
  filtered_tables = [
254
  t for t in tables if _normalize_title(t.get("name", "")) not in excluded
 
287
  }
288
 
289
 
 
290
  @tool
291
  def read_wikipedia_page(
292
  url: str,
293
+ sections_to_exclude: list[str] = [
294
+ "External links",
295
+ "References",
296
+ "Further reading",
297
+ "See also",
298
+ "Notes",
299
+ ]) -> str:
 
 
300
  """
301
+ Read a Wikipedia page and return a string with the text of the page.
302
  Args:
303
  url: The URL of the Wikipedia page to read.
304
+ sections_to_exclude: A list of sections to exclude from the page.
305
  Returns:
306
+ A string with the text of the page.
307
  """
308
  if "https://en.wikipedia.org/wiki/" not in url:
309
  raise ValueError("URL is required")
 
312
  # Parse the page
313
  soup = BeautifulSoup(html, "html.parser")
314
  # Remove unwanted sections
315
+ _remove_sections_by_titles(soup, sections_to_exclude)
316
 
317
  # Extract after pruning unwanted sections
318
  text = extract_text(soup)
319
  tables = extract_tables(soup)
320
 
321
  # Combine
322
+ llm_ready = format_for_llm(text, tables, sections_to_exclude)
323
  return llm_ready
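A short usage sketch of the renamed module; the article is the one used in the test logs removed by this commit, and the excluded-section list is illustrative:

from tools.wikipedia_tools import wikipedia_summary, read_wikipedia_page

summary = wikipedia_summary("Mercedes Sosa")  # dict with 'summary' and 'url'
page_text = read_wikipedia_page(
    summary["url"],  # e.g. https://en.wikipedia.org/wiki/Mercedes_Sosa
    sections_to_exclude=["References", "External links", "Further reading"],
)
print(page_text[:1000])  # "=== ARTICLE TEXT ===" followed by the extracted tables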
tools/youtube_tools.py ADDED
@@ -0,0 +1,113 @@
1
+
2
+ import os
3
+ import subprocess
4
+ from yt_dlp import YoutubeDL
5
+
6
+
7
+
8
+ from smolagents.tools import tool
9
+
10
+ # Use FFmpeg to extract frames from the video
11
+ def extract_frames_with_ffmpeg(video_path: str, num_frames: int) -> list[str]:
12
+ """Extract frames from video using FFmpeg"""
13
+ if not os.path.exists(video_path):
14
+ raise FileNotFoundError(f"Video file not found: {video_path}")
15
+
16
+ # Get video duration using ffprobe
17
+ duration_cmd = [
18
+ 'ffprobe', '-v', 'quiet', '-print_format', 'json',
19
+ '-show_format', video_path
20
+ ]
21
+
22
+ try:
23
+ result = subprocess.run(duration_cmd, capture_output=True, text=True, check=True)
24
+ import json
25
+ metadata = json.loads(result.stdout)
26
+ duration = float(metadata['format']['duration'])
27
+
28
+ # Calculate time intervals for frame extraction
29
+ time_intervals = [duration * i / (num_frames + 1) for i in range(1, num_frames + 1)]
30
+
31
+ extracted_files = []
32
+ for i, time_pos in enumerate(time_intervals):
33
+ output_filename = f"{os.path.splitext(os.path.basename(video_path))[0]}_frame_{i+1:03d}.jpg"
34
+
35
+ # Extract frame at specific time
36
+ ffmpeg_cmd = [
37
+ 'ffmpeg', '-i', video_path, '-ss', str(time_pos),
38
+ '-vframes', '1', '-q:v', '2', '-y', output_filename
39
+ ]
40
+
41
+ subprocess.run(ffmpeg_cmd, capture_output=True, check=True)
42
+ extracted_files.append(output_filename)
43
+
44
+ return extracted_files
45
+
46
+ except subprocess.CalledProcessError as e:
47
+ print(f"Error running FFmpeg: {e}")
48
+ return []
49
+ except Exception as e:
50
+ print(f"Error: {e}")
51
+ return []
52
+
53
+
54
+ @tool
55
+ def download_youtube_url_audio(url: str) -> str:
56
+ """
57
+ Download a YouTube video and return the path to the downloaded file.
58
+
59
+ Args:
60
+ url (str): The URL of the YouTube video to download.
61
+
62
+ Returns:
63
+ str: The path to the downloaded file.
64
+ """
65
+ ydl_audio_opts = {
66
+ 'format': 'bestaudio/best',
67
+ 'postprocessors': [{
68
+ 'key': 'FFmpegExtractAudio',
69
+ 'preferredcodec': 'mp3',
70
+ 'preferredquality': '192',
71
+ }],
72
+ 'quiet': True,
73
+ 'no_verbose_header': True,
74
+ 'no_warnings': True,
75
+ }
76
+
77
+ with YoutubeDL(ydl_audio_opts) as ydl:
78
+ file_path = ydl.extract_info(url)
79
+ return file_path['requested_downloads'][0]['filepath']
80
+
81
+
82
+ @tool
83
+ def download_youtube_url_images(url: str, num_images: int = 3) -> str:
84
+ """
85
+ Download a YouTube video and return the path to the downloaded file.
86
+
87
+ Args:
88
+ url (str): The URL of the YouTube video to download.
89
+ num_images (int): The number of images to download.
90
+
91
+ Returns:
92
+ str: The different paths to the downloaded files, separated by newlines.
93
+ """
94
+ # First, download the video
95
+ ydl_images_opts = {
96
+ 'format': 'best[height<=720]', # Download video in reasonable quality
97
+ 'outtmpl': '%(title)s.%(ext)s', # Save with title as filename
98
+ 'quiet': True,
99
+ 'no_verbose_header': True,
100
+ 'no_warnings': True,
101
+ }
102
+
103
+ with YoutubeDL(ydl_images_opts) as ydl:
104
+ info = ydl.extract_info(url, download=True)
105
+ video_filepath = ydl.prepare_filename(info)
106
+
107
+ # Extract frames from the downloaded video
108
+ if os.path.exists(video_filepath):
109
+ extracted_frames = extract_frames_with_ffmpeg(video_filepath, num_images)
110
+ return "\n".join(extracted_frames)
111
+
112
+ return ""
113
+
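A hedged end-to-end sketch combining these tools with the image QA tool; the video URL is a placeholder, and both ffmpeg and ffprobe must be available on PATH for frame extraction and the mp3 post-processing:

from tools.youtube_tools import download_youtube_url_images, download_youtube_url_audio
from tools.image_processing_tools import ask_question_about_image

video_url = "https://www.youtube.com/watch?v=EXAMPLE_ID"  # placeholder URL

# Grab a few evenly spaced frames and ask a question about the first one
frame_paths = download_youtube_url_images(video_url, num_images=3).splitlines()
if frame_paths:
    print(ask_question_about_image("How many birds are visible?", frame_paths[0]))

# Or pull the audio track as an mp3 (e.g. to transcribe it with read_file_as_text)
audio_path = download_youtube_url_audio(video_url)
print(audio_path)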
wiki_extractor.py DELETED
@@ -1,341 +0,0 @@
1
- import re
2
- import sys
3
- import json
4
- from typing import Any, Dict, List, Optional
5
-
6
- import requests
7
- from bs4 import BeautifulSoup, Tag
8
- from markdownify import markdownify as md
9
-
10
-
11
- USER_AGENT = (
12
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
13
- "AppleWebKit/537.36 (KHTML, like Gecko) "
14
- "Chrome/126.0.0.0 Safari/537.36"
15
- )
16
-
17
-
18
- def _clean_text(text: str) -> str:
19
- if not text:
20
- return ""
21
- # Remove reference markers like [1], [a], [note 1]
22
- text = re.sub(r"\s*\[[^\]]+\]", "", text)
23
- # Collapse whitespace
24
- text = re.sub(r"\s+", " ", text).strip()
25
- return text
26
-
27
-
28
- def _previous_heading(element: Tag) -> Optional[str]:
29
- node = element
30
- while node is not None:
31
- node = node.previous_sibling
32
- if isinstance(node, Tag) and node.name in {"h2", "h3", "h4", "h5", "h6"}:
33
- return _clean_text(node.get_text(" ", strip=True))
34
- # Fallback: walk up to find any earlier heading in parent
35
- parent = element.parent
36
- while parent is not None and isinstance(parent, Tag):
37
- sib = parent.previous_sibling
38
- while sib is not None:
39
- if isinstance(sib, Tag) and sib.name in {"h2", "h3", "h4", "h5", "h6"}:
40
- return _clean_text(sib.get_text(" ", strip=True))
41
- sib = sib.previous_sibling
42
- parent = parent.parent
43
- return None
44
-
45
-
46
- def _parse_table(table: Tag) -> Dict[str, Any]:
47
- # Determine title/caption
48
- title = None
49
- caption = table.find("caption")
50
- if caption:
51
- title = _clean_text(caption.get_text(" ", strip=True))
52
- if not title:
53
- title = _previous_heading(table)
54
-
55
- # Skip navboxes or non-content tables
56
- classes = set(table.get("class", []))
57
- if any(c in {"navbox", "vertical-navbox", "metadata", "mbox"} for c in classes):
58
- return {
59
- "name": title or "",
60
- "headers": [],
61
- "rows": [],
62
- "skipped": True,
63
- }
64
-
65
- # Identify header cells
66
- headers: List[str] = []
67
- header_row = None
68
- thead = table.find("thead")
69
- if thead:
70
- header_row = thead.find("tr")
71
- if header_row is None:
72
- # Find first row that contains any <th>
73
- for tr in table.find_all("tr", recursive=True):
74
- if tr.find("th"):
75
- header_row = tr
76
- break
77
- if header_row is not None:
78
- for th in header_row.find_all(["th", "td"], recursive=False):
79
- header_text = _clean_text(th.get_text(" ", strip=True))
80
- if header_text:
81
- headers.append(header_text)
82
-
83
- # Collect rows
84
- rows: List[Any] = []
85
- for tr in table.find_all("tr", recursive=True):
86
- if tr is header_row:
87
- continue
88
- cells = tr.find_all(["td", "th"], recursive=False)
89
- if not cells:
90
- continue
91
- values = [_clean_text(c.get_text(" ", strip=True)) for c in cells]
92
- # If headers exist and lengths match, map to dict; else keep as list
93
- if headers and len(values) == len(headers):
94
- rows.append({headers[i]: values[i] for i in range(len(headers))})
95
- else:
96
- rows.append(values)
97
-
98
- return {
99
- "name": title or "",
100
- "headers": headers,
101
- "rows": rows,
102
- }
103
-
104
-
105
- def extract_wikipedia_content(url: str) -> Dict[str, Any]:
-     resp = requests.get(
-         url,
-         headers={"User-Agent": USER_AGENT, "Accept-Language": "en-US,en;q=0.9"},
-         timeout=30,
-     )
-     resp.raise_for_status()
-
-     soup = BeautifulSoup(resp.text, "html.parser")
-     title_tag = soup.find("h1", id="firstHeading")
-     title = _clean_text(title_tag.get_text(" ", strip=True)) if title_tag else ""
-
-     # Main content
-     content = soup.select_one("#mw-content-text .mw-parser-output")
-     if content is None:
-         content = soup.find("div", class_="mw-parser-output") or soup
-
-     # Remove non-content elements
-     for selector in [
-         "table.navbox",
-         "table.vertical-navbox",
-         "div.reflist",
-         "ol.references",
-         "span.mw-editsection",
-         "script",
-         "style",
-         "div.mw-authority-control",
-         "div.navbox",
-         "table.metadata",
-         "table.toccolours",
-         "div.mw-references-wrap",
-         "sup.reference",
-     ]:
-         for node in content.select(selector):
-             node.decompose()
-
-     # Extract content tables, walking all descendants so each table picks up the nearest preceding heading
-     tables: List[Dict[str, Any]] = []
-     current_heading: Optional[str] = None
-     for node in content.descendants:
-         if not isinstance(node, Tag):
-             continue
-         if node.name in {"h2", "h3", "h4", "h5", "h6"}:
-             headline = node.find("span", class_="mw-headline")
-             heading_text = headline.get_text(" ", strip=True) if headline else node.get_text(" ", strip=True)
-             current_heading = _clean_text(heading_text)
-             continue
-         if node.name == "table":
-             classes = set(node.get("class", []))
-             if not classes or any(c in {"wikitable", "infobox", "sortable", "vevent"} for c in classes):
-                 parsed = _parse_table(node)
-                 if parsed.get("rows"):
-                     if not parsed.get("name") and current_heading:
-                         parsed["name"] = current_heading
-                     tables.append({k: v for k, v in parsed.items() if k != "skipped"})
-
-     # Extract the article text as Markdown, excluding tables:
-     # clone the content by re-parsing its HTML, then drop the tables from the clone
-     content_clone = BeautifulSoup(str(content), "html.parser")
-     for tbl in content_clone.find_all("table"):
-         tbl.decompose()
-     text_markdown = md(str(content_clone), strip=["img"])
-     text_markdown = _clean_text(text_markdown)
-
-     return {
-         "title": title,
-         "url": url,
-         "text_markdown": text_markdown,
-         "tables": tables,
-     }
-
-
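For reference, a minimal usage sketch of the extractor above. The import path is a placeholder (this file is being removed in this commit and the destination module is not shown in this diff), and the URL is only an example:

# Illustrative usage sketch; import path and URL are assumptions, not taken from this repo.
from wiki_extractor import extract_wikipedia_content  # hypothetical: adjust to the module's new location

data = extract_wikipedia_content("https://en.wikipedia.org/wiki/Mercedes_Sosa")  # any article URL works
print(data["title"])                # page title from <h1 id="firstHeading">
print(len(data["tables"]))          # number of content tables kept
print(data["text_markdown"][:300])  # start of the article body rendered as Markdown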
- def _escape_markdown_cell(value: Any) -> str:
-     """Escape characters that break Markdown tables and normalize whitespace."""
-     if value is None:
-         return ""
-     text = str(value)
-     text = text.replace("|", "\\|")
-     text = re.sub(r"\s+", " ", text).strip()
-     return text
-
-
- def format_tables_as_markdown(
-     tables: List[Dict[str, Any]],
-     max_tables: Optional[int] = None,
-     max_rows_per_table: int = 25,
- ) -> str:
-     """
-     Convert extracted tables into compact Markdown tables.
-
-     Args:
-         tables: List of table dicts as returned by extract_wikipedia_content.
-         max_tables: If set, include at most this many tables (in order).
-         max_rows_per_table: Maximum number of data rows to include per table.
-
-     Returns:
-         A Markdown string representing the tables.
-     """
-     if not tables:
-         return ""
-
-     rendered_sections: List[str] = []
-     selected = tables if max_tables is None else tables[:max_tables]
-
-     for table_idx, table in enumerate(selected):
-         name = table.get("name") or f"Table {table_idx + 1}"
-         headers: List[str] = table.get("headers", [])
-         rows: List[Any] = table.get("rows", [])
-
-         if not rows:
-             continue
-
-         section_lines: List[str] = []
-         section_lines.append(f"### Table: {name}")
-
-         # If we have headers, render a Markdown table
-         if headers:
-             # Header row
-             escaped_headers = [_escape_markdown_cell(h) for h in headers]
-             section_lines.append("| " + " | ".join(escaped_headers) + " |")
-             section_lines.append("| " + " | ".join(["---"] * len(headers)) + " |")
-
-             # Data rows
-             for row in rows[:max_rows_per_table]:
-                 if isinstance(row, dict):
-                     values = [_escape_markdown_cell(row.get(h, "")) for h in headers]
-                 else:
-                     # row is a list; align it to the headers length
-                     values = [_escape_markdown_cell(row[i] if i < len(row) else "") for i in range(len(headers))]
-                 section_lines.append("| " + " | ".join(values) + " |")
-         else:
-             # No headers: render each row as a bullet-point preview
-             for row in rows[:max_rows_per_table]:
-                 if isinstance(row, dict):
-                     preview = ", ".join(f"{_escape_markdown_cell(k)}: {_escape_markdown_cell(v)}" for k, v in row.items())
-                 else:
-                     preview = ", ".join(_escape_markdown_cell(v) for v in row)
-                 section_lines.append(f"- {preview}")
-
-         # Indicate truncation if applicable
-         if len(rows) > max_rows_per_table:
-             section_lines.append(f"… ({len(rows) - max_rows_per_table} more rows omitted)")
-
-         rendered_sections.append("\n".join(section_lines))
-
-     return "\n\n".join(rendered_sections)
-
-
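For orientation, roughly what format_tables_as_markdown emits for a single parsed table. The input dict below is invented to mirror the shape produced by _parse_table; the expected output follows directly from the function above:

# Invented input; expected output shown in the trailing comments.
tables = [{
    "name": "Studio albums",
    "headers": ["Year", "Title"],
    "rows": [{"Year": "1962", "Title": "La voz de la zafra"}],
}]
print(format_tables_as_markdown(tables))
# ### Table: Studio albums
# | Year | Title |
# | --- | --- |
# | 1962 | La voz de la zafra |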
- def format_extracted_content(
-     data: Dict[str, Any],
-     include_url: bool = True,
-     max_tables: Optional[int] = None,
-     max_rows_per_table: int = 25,
- ) -> str:
-     """
-     Combine `text_markdown` and `tables` from extract_wikipedia_content into an LLM-friendly Markdown string.
-
-     Args:
-         data: Dict returned by extract_wikipedia_content.
-         include_url: Whether to include the source URL at the top.
-         max_tables: If set, include at most this many tables.
-         max_rows_per_table: Maximum number of data rows per table.
-
-     Returns:
-         Markdown string ready to feed into an LLM.
-     """
-     if not data:
-         return ""
-
-     title = data.get("title") or ""
-     url = data.get("url") or ""
-     text_md = data.get("text_markdown") or ""
-     tables = data.get("tables") or []
-
-     parts: List[str] = []
-     if title:
-         parts.append(f"# {title}")
-     if include_url and url:
-         parts.append(f"Source: {url}")
-
-     if text_md:
-         parts.append("## Article")
-         parts.append(text_md)
-
-     tables_md = format_tables_as_markdown(tables, max_tables=max_tables, max_rows_per_table=max_rows_per_table)
-     if tables_md:
-         parts.append("## Tables")
-         parts.append(tables_md)
-
-     return "\n\n".join(p for p in parts if p)
-
-
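A sketch of how the two public helpers above were presumably meant to be chained when preparing page context for the agent. The URL and the limits are illustrative values, not prescribed anywhere in this file:

# Hypothetical end-to-end call: fetch a page, then render one Markdown blob for the agent.
url = "https://en.wikipedia.org/wiki/Mercedes_Sosa"  # example article
data = extract_wikipedia_content(url)
context = format_extracted_content(data, include_url=True, max_tables=5, max_rows_per_table=25)
print(context[:500])  # starts with "# <title>", "Source: <url>", then "## Article" and "## Tables"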
- def main() -> None:
-     if len(sys.argv) < 2:
-         print("Usage: python wiki_extractor.py <wikipedia_url>")
-         sys.exit(1)
-     url = sys.argv[1]
-     data = extract_wikipedia_content(url)
-
-     print(json.dumps({
-         "title": data["title"],
-         "url": data["url"],
-         "num_tables": len(data["tables"]),
-         "table_names": [t.get("name", "") for t in data["tables"]][:20],
-     }, ensure_ascii=False, indent=2))
-
-     # Try to locate a "Studio albums" table and print its first 3 rows
-     studio_tables = [
-         t for t in data["tables"]
-         if "studio albums" in t.get("name", "").lower()
-     ]
-     if studio_tables:
-         t0 = studio_tables[0]
-         print("\nFound 'Studio albums' table. Headers:")
-         print(t0.get("headers", []))
-         print("First 3 rows:")
-         for row in t0.get("rows", [])[:3]:
-             print(row)
-     else:
-         # Heuristic fallback: print the first table whose name mentions 'albums'
-         albums_like = [
-             t for t in data["tables"] if "albums" in t.get("name", "").lower()
-         ]
-         if albums_like:
-             t0 = albums_like[0]
-             print("\nFound albums-related table. Headers:")
-             print(t0.get("headers", []))
-             print("First 3 rows:")
-             for row in t0.get("rows", [])[:3]:
-                 print(row)
-
-
- if __name__ == "__main__":
-     main()
-