|
|
|
""" |
|
Hybrid GAIA Agent combining the best features from both GAIAAgent and MultimodalGAIAAgent |
|
""" |
|
import os |
|
import re |
|
import logging |
|
from typing import List, Dict, Any, Optional, Union |
|
import requests |
|
from pathlib import Path |
|
import mimetypes |
|
|
|
|
|
from google import genai |
|
from google.genai import types |
|
import PIL.Image |
|
|
|
|
|
from search_tools import SearchTools |
|
from llm import LLMClient |
|
from code_agent import CodeInterpreter |
|
from youtube_tools import YouTubeTools |
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
class HybridGAIAAgent: |
|
"""Hybrid GAIA Agent with both universal LLM approach and multimodal capabilities""" |
|
|
|
def __init__(self): |
|
"""Initialize the hybrid agent""" |
|
self.search_tools = SearchTools() |
|
self.llm_client = LLMClient() |
|
self.code_interpreter = CodeInterpreter() |
|
self.youtube_tools = YouTubeTools() |
|
|
|
|
|
api_key = os.getenv('GOOGLE_API_KEY') |
|
if not api_key: |
|
logger.warning("GOOGLE_API_KEY not found. Multimodal features will be limited.") |
|
self.gemini_client = None |
|
else: |
|
self.gemini_client = genai.Client(api_key=api_key) |
|
logger.info("Gemini client initialized for multimodal processing") |
|
|
|
|
|
self.supported_extensions = { |
|
|
|
'.jpg': 'image', '.jpeg': 'image', '.png': 'image', '.gif': 'image', |
|
'.bmp': 'image', '.webp': 'image', '.tiff': 'image', |
|
|
|
'.mp3': 'audio', '.wav': 'audio', '.m4a': 'audio', '.aac': 'audio', |
|
'.ogg': 'audio', '.flac': 'audio', |
|
|
|
'.mp4': 'video', '.avi': 'video', '.mov': 'video', '.mkv': 'video', |
|
'.webm': 'video', '.wmv': 'video', |
|
|
|
'.pdf': 'document', '.txt': 'document', '.docx': 'document', |
|
|
|
'.xlsx': 'spreadsheet', '.xls': 'spreadsheet', '.csv': 'spreadsheet', |
|
|
|
'.py': 'code', '.js': 'code', '.html': 'code', '.css': 'code', |
|
'.java': 'code', '.cpp': 'code', '.c': 'code' |
|
} |
|
|
|
self.system_prompt = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with your final answer. Your final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. |
|
|
|
IMPORTANT: For reverse/word puzzle questions, think carefully about what is being asked: |
|
- If asked to "reverse" a string that contains words, first reverse the string literally, then understand what it says |
|
- If the reversed string says something like "'left' as the answer", the actual answer should be the opposite concept (e.g., "right") |
|
- For mathematical tables or logical puzzles, analyze the pattern carefully |
|
|
|
For factual questions with context: Use the available information to provide the best possible answer, even if the information is not perfectly complete. Try to extract useful details from the context. |
|
|
|
For music questions: When counting albums, distinguish between: |
|
- Studio albums (original recordings in a studio) |
|
- Live albums (concert recordings, often marked as "Live", "En Vivo", "Acústico") |
|
- Compilation albums (collections of existing songs, "Greatest Hits", "Best of") |
|
- Awards (Grammy awards are NOT albums) |
|
- If you see album titles with years, count them carefully for the specified time period |
|
- If an album is described as "double album" with two parts (like "Cantora 1" and "Cantora 2"), count it as ONE album, not two |
|
- Look for explicit mentions of "studio album" or context clues about recording type |
|
|
|
CRITICAL: Your response should be ONLY the final answer - no explanations, no reasoning, no additional text. Just the direct answer to the question. |
|
|
|
Do NOT use "FINAL ANSWER:" prefix in your response. Just provide the answer directly.""" |
|
|
|
def detect_file_references(self, question: str) -> List[Dict[str, str]]: |
|
"""Detect file references in the question""" |
|
files = [] |
|
|
|
|
|
if any(pattern in question.lower() for pattern in [ |
|
'given this table', 'table defining', '|*|', '|---|' |
|
]): |
|
return files |
|
|
|
|
|
patterns = [ |
|
|
|
r'(?:file|in the file|from the file)\s+([a-zA-Z0-9_/-]+/[a-zA-Z0-9_.-]+\.[a-zA-Z0-9]+)', |
|
|
|
r'(?:attached|provided|given|included)\s+(?:file|image|video|audio|document|Excel file|Python code)(?:\s+called\s+)?(?:\s+["\']?([^"\'.\s]+\.[a-zA-Z0-9]+)["\']?)?', |
|
|
|
r'([a-zA-Z0-9_/-]+/[a-zA-Z0-9_.-]+\.[a-zA-Z0-9]+)', |
|
|
|
r'([a-zA-Z0-9_-]+\.[a-zA-Z0-9]+)', |
|
|
|
r'(https?://(?:www\.)?youtube\.com/watch\?v=[\w-]+)', |
|
r'(https?://youtu\.be/[\w-]+)', |
|
|
|
r'(https?://[^\s]+\.(?:jpg|jpeg|png|gif|mp4|mp3|wav|pdf|xlsx|xls|csv))', |
|
] |
|
|
|
for pattern in patterns: |
|
matches = re.findall(pattern, question, re.IGNORECASE) |
|
for match in matches: |
|
if match: |
|
file_info = self._analyze_file_reference(match, question) |
|
if file_info: |
|
files.append(file_info) |
|
|
|
|
|
if any(keyword in question.lower() for keyword in [ |
|
'attached', 'provided', 'given', 'image', 'video', 'audio', |
|
'excel file', 'python code', 'recording', 'picture' |
|
]): |
|
|
|
if not any(indicator in question.lower() for indicator in [ |
|
'given this table', 'table defining', '|*|', '|---|' |
|
]): |
|
if not files: |
|
files.append({ |
|
'name': 'unknown_file', |
|
'type': 'unknown', |
|
'source': 'attachment', |
|
'available': False |
|
}) |
|
|
|
return files |
|
|
|
def _analyze_file_reference(self, file_ref: str, question: str) -> Optional[Dict[str, str]]: |
|
"""Analyze a file reference and determine its type""" |
|
file_ref = file_ref.strip() |
|
|
|
|
|
if 'youtube.com' in file_ref or 'youtu.be' in file_ref: |
|
return { |
|
'name': file_ref, |
|
'type': 'video', |
|
'source': 'youtube', |
|
'available': True |
|
} |
|
|
|
|
|
if '.' in file_ref: |
|
ext = '.' + file_ref.split('.')[-1].lower() |
|
file_type = self.supported_extensions.get(ext, 'unknown') |
|
|
|
return { |
|
'name': file_ref, |
|
'type': file_type, |
|
'source': 'attachment', |
|
'available': self._check_file_availability(file_ref) |
|
} |
|
|
|
return None |
|
|
|
def _check_file_availability(self, filename: str) -> bool: |
|
"""Check if a file is available locally""" |
|
|
|
if Path(filename).exists(): |
|
return True |
|
|
|
|
|
search_paths = [ |
|
Path('.'), |
|
Path('./files'), |
|
Path('./data'), |
|
Path('./attachments'), |
|
Path('./uploads'), |
|
Path('./images'), |
|
Path('./docs'), |
|
Path('./scripts'), |
|
Path('./reports') |
|
] |
|
|
|
|
|
base_filename = Path(filename).name |
|
|
|
for path in search_paths: |
|
|
|
if (path / filename).exists(): |
|
return True |
|
|
|
if (path / base_filename).exists(): |
|
return True |
|
|
|
return False |
|
|
|
def process_multimodal_content(self, question: str, files: List[Dict[str, str]]) -> Optional[str]: |
|
"""Process multimodal content using Gemini API and YouTube tools""" |
|
if not self.gemini_client: |
|
logger.warning("Gemini client not available for multimodal processing") |
|
return None |
|
|
|
try: |
|
|
|
prompt_parts = [question] |
|
|
|
for file_info in files: |
|
if file_info['available']: |
|
if file_info['source'] == 'youtube': |
|
|
|
video_url = file_info['name'] |
|
logger.info(f"Processing YouTube video: {video_url}") |
|
|
|
video_analysis = self.youtube_tools.analyze_video(video_url) |
|
video_info = self.youtube_tools.format_video_info_for_llm(video_analysis) |
|
|
|
prompt_parts.append(f"\n\nYouTube Video Information:\n{video_info}") |
|
logger.info(f"Added YouTube video info to prompt: {file_info['name']}") |
|
|
|
else: |
|
|
|
file_path = self._find_file_path(file_info['name']) |
|
if file_path: |
|
if file_info['type'] == 'image': |
|
|
|
image = PIL.Image.open(file_path) |
|
prompt_parts.append(image) |
|
logger.info(f"Added image to prompt: {file_info['name']}") |
|
|
|
elif file_info['type'] in ['audio', 'video']: |
|
|
|
uploaded_file = self.gemini_client.files.upload(file=str(file_path)) |
|
prompt_parts.append(uploaded_file) |
|
logger.info(f"Uploaded {file_info['type']} to Gemini: {file_info['name']}") |
|
|
|
elif file_info['type'] in ['document', 'code', 'spreadsheet']: |
|
|
|
content = self._read_file_content(file_path) |
|
if content: |
|
prompt_parts.append(f"\n\nFile content ({file_info['name']}):\n{content}") |
|
logger.info(f"Added file content to prompt: {file_info['name']}") |
|
|
|
|
|
if len(prompt_parts) > 1: |
|
response = self.gemini_client.models.generate_content( |
|
model='gemini-2.0-flash', |
|
contents=prompt_parts, |
|
config=types.GenerateContentConfig( |
|
system_instruction=self.system_prompt, |
|
temperature=0.1 |
|
) |
|
) |
|
return response.text |
|
|
|
except Exception as e: |
|
logger.error(f"Error processing multimodal content: {e}") |
|
return None |
|
|
|
return None |
|
|
|
def _find_file_path(self, filename: str) -> Optional[Path]: |
|
"""Find the full path of a file""" |
|
|
|
file_path = Path(filename) |
|
if file_path.exists(): |
|
return file_path |
|
|
|
|
|
search_paths = [ |
|
Path('.'), |
|
Path('./files'), |
|
Path('./data'), |
|
Path('./attachments'), |
|
Path('./uploads'), |
|
Path('./images'), |
|
Path('./docs'), |
|
Path('./scripts'), |
|
Path('./reports') |
|
] |
|
|
|
|
|
base_filename = Path(filename).name |
|
|
|
for path in search_paths: |
|
|
|
full_path = path / filename |
|
if full_path.exists(): |
|
return full_path |
|
|
|
base_path = path / base_filename |
|
if base_path.exists(): |
|
return base_path |
|
|
|
return None |
|
|
|
def _read_file_content(self, file_path: Path) -> Optional[str]: |
|
"""Read content from text-based files""" |
|
try: |
|
|
|
if file_path.suffix.lower() == '.pdf': |
|
|
|
try: |
|
import PyPDF2 |
|
with open(file_path, 'rb') as file: |
|
pdf_reader = PyPDF2.PdfReader(file) |
|
text = "" |
|
for page in pdf_reader.pages: |
|
text += page.extract_text() + "\n" |
|
return text |
|
except ImportError: |
|
return f"[PDF file: {file_path.name} - PyPDF2 not available]" |
|
except Exception as e: |
|
return f"[PDF file: {file_path.name} - error reading: {e}]" |
|
|
|
elif file_path.suffix.lower() in ['.xlsx', '.xls']: |
|
|
|
try: |
|
import pandas as pd |
|
|
|
excel_file = pd.ExcelFile(file_path) |
|
content = f"Excel file: {file_path.name}\n" |
|
content += f"Sheets: {excel_file.sheet_names}\n\n" |
|
|
|
for sheet_name in excel_file.sheet_names: |
|
df = pd.read_excel(file_path, sheet_name=sheet_name) |
|
content += f"Sheet: {sheet_name}\n" |
|
content += df.to_string(index=False) + "\n\n" |
|
|
|
return content |
|
except ImportError: |
|
return f"[Excel file: {file_path.name} - pandas not available]" |
|
except Exception as e: |
|
return f"[Excel file: {file_path.name} - error reading: {e}]" |
|
|
|
elif file_path.suffix.lower() == '.csv': |
|
|
|
try: |
|
import pandas as pd |
|
df = pd.read_csv(file_path) |
|
return f"CSV file: {file_path.name}\n{df.to_string(index=False)}" |
|
except ImportError: |
|
|
|
with open(file_path, 'r', encoding='utf-8') as f: |
|
return f.read() |
|
except Exception as e: |
|
return f"[CSV file: {file_path.name} - error reading: {e}]" |
|
|
|
else: |
|
|
|
with open(file_path, 'r', encoding='utf-8') as f: |
|
return f.read() |
|
|
|
except Exception as e: |
|
logger.error(f"Error reading file {file_path}: {e}") |
|
return None |
|
|
|
def handle_simple_question(self, question: str) -> Optional[str]: |
|
"""Handle simple questions that don't require search""" |
|
|
|
files = self.detect_file_references(question) |
|
|
|
if files: |
|
|
|
for file_info in files: |
|
if file_info['source'] != 'youtube': |
|
file_info['available'] = self._check_file_availability(file_info['name']) |
|
|
|
unavailable_files = [f for f in files if not f['available']] |
|
available_files = [f for f in files if f['available']] |
|
|
|
logger.info(f"Files status - Available: {[f['name'] for f in available_files]}, Unavailable: {[f['name'] for f in unavailable_files]}") |
|
|
|
|
|
if any(f['source'] == 'youtube' for f in files): |
|
logger.info("Found YouTube video - processing with YouTube tools") |
|
youtube_files = [f for f in files if f['source'] == 'youtube'] |
|
multimodal_response = self.process_multimodal_content(question, youtube_files) |
|
if multimodal_response: |
|
return multimodal_response |
|
|
|
|
|
if unavailable_files and not available_files: |
|
logger.info("No files available, will try search instead") |
|
return None |
|
|
|
|
|
simple_patterns = [ |
|
r'\.rewsna eht sa', |
|
r'what is \d+\s*[\+\-\*\/]\s*\d+', |
|
r'given this table.*defining.*on the set', |
|
r'what is the opposite of', |
|
r'what does.*mean', |
|
r'how do you spell', |
|
r'what color is', |
|
r'what day is', |
|
] |
|
|
|
|
|
question_lower = question.lower() |
|
|
|
|
|
if any(indicator in question_lower for indicator in [ |
|
'given this table', 'table defining', '|*|', '|---|' |
|
]): |
|
logger.info("Detected mathematical table - handling directly with LLM") |
|
return self._generate_response_without_context(question) |
|
|
|
|
|
if any(re.search(pattern, question_lower) for pattern in simple_patterns): |
|
logger.info("Detected simple question pattern - handling directly with LLM") |
|
return self._generate_response_without_context(question) |
|
|
|
|
|
if any(keyword in question_lower for keyword in [ |
|
'grocery list', 'categorizing', 'vegetables', 'fruits', 'botanical' |
|
]): |
|
logger.info("Detected categorization question - handling directly with LLM") |
|
return self._generate_response_without_context(question) |
|
|
|
return None |
|
|
|
def analyze_question_type(self, question: str) -> Dict[str, Any]: |
|
"""Analyze question type and requirements""" |
|
analysis = { |
|
'has_files': False, |
|
'file_types': [], |
|
'is_olympics': 'olympics' in question.lower() or 'olympic' in question.lower(), |
|
'is_statistics': any(word in question.lower() for word in ['how many', 'number of', 'count', 'total']), |
|
'is_comparison': any(word in question.lower() for word in ['most', 'least', 'highest', 'lowest', 'before', 'after']), |
|
'has_year': bool(re.search(r'\b(19|20)\d{2}\b', question)), |
|
'year': None, |
|
'is_country': any(word in question.lower() for word in ['country', 'nation', 'ioc']), |
|
'needs_alphabetical': 'alphabetical' in question.lower(), |
|
'is_academic': any(word in question.lower() for word in ['paper', 'journal', 'research', 'study', 'arxiv']), |
|
'is_current_events': any(word in question.lower() for word in ['recent', 'latest', 'current', '2023', '2024']), |
|
'is_sports': any(word in question.lower() for word in ['baseball', 'yankee', 'pitcher', 'athlete']), |
|
'is_data_analysis': any(word in question.lower() for word in ['table', 'data', 'calculate', 'analyze']), |
|
'is_music': any(word in question.lower() for word in ['album', 'albums', 'song', 'music', 'artist', 'singer', 'musician', 'discography']) |
|
} |
|
|
|
|
|
year_match = re.search(r'\b(19|20)\d{2}\b', question) |
|
if year_match: |
|
analysis['year'] = year_match.group() |
|
|
|
|
|
files = self.detect_file_references(question) |
|
if files: |
|
analysis['has_files'] = True |
|
analysis['file_types'] = [f['type'] for f in files] |
|
|
|
return analysis |
|
|
|
def __call__(self, question: str) -> str: |
|
"""Main method to process a question""" |
|
logger.info(f"🔍 PROCESSING QUESTION: {question}") |
|
|
|
|
|
simple_answer = self.handle_simple_question(question) |
|
if simple_answer: |
|
logger.info(f"✅ Handled as simple/multimodal question") |
|
return simple_answer |
|
|
|
|
|
analysis = self.analyze_question_type(question) |
|
files = self.detect_file_references(question) |
|
|
|
|
|
if files: |
|
for file_info in files: |
|
if file_info['source'] != 'youtube': |
|
file_info['available'] = self._check_file_availability(file_info['name']) |
|
|
|
available_files = [f for f in files if f['available']] |
|
if available_files: |
|
logger.info(f"📁 Found {len(available_files)} available files: {[f['name'] for f in available_files]}") |
|
|
|
multimodal_response = self.process_multimodal_content(question, available_files) |
|
if multimodal_response: |
|
logger.info("✅ Successfully processed with multimodal content") |
|
return multimodal_response |
|
|
|
logger.info(f"📊 Question type analysis: {analysis}") |
|
|
|
|
|
|
|
simple_question_indicators = [ |
|
'given this table', 'table defining', '|*|', '|---|', |
|
'.rewsna eht sa', |
|
'grocery list', 'categorizing', 'vegetables', 'fruits', 'botanical' |
|
] |
|
|
|
is_simple_question = any(indicator in question.lower() for indicator in simple_question_indicators) |
|
|
|
|
|
|
|
|
|
|
|
search_needed = not is_simple_question and ( |
|
not analysis['has_files'] or |
|
any(analysis[key] for key in [ |
|
'is_olympics', 'is_statistics', 'is_academic', 'is_current_events', 'is_sports', 'is_music' |
|
]) or |
|
(analysis['has_files'] and files and not any(f['available'] for f in files)) |
|
) |
|
|
|
logger.info(f"🔎 Search needed: {search_needed} (simple_question: {is_simple_question}, has_files: {analysis['has_files']})") |
|
|
|
context = "" |
|
|
|
if search_needed: |
|
|
|
if analysis['is_academic']: |
|
logger.info("📚 Academic question - trying arxiv and web") |
|
context = self._search_academic(question) |
|
elif analysis['is_olympics']: |
|
logger.info("🏅 Olympics question - trying multiple specific searches") |
|
context = self._search_olympics(question) |
|
elif analysis['is_music']: |
|
logger.info("🎵 Music question - trying web search first, then Wikipedia") |
|
context = self._search_music(question) |
|
else: |
|
logger.info("🌐 General factual question - trying multiple sources") |
|
context = self._search_general(question) |
|
|
|
|
|
if context: |
|
logger.info(f"✅ Found context using search") |
|
logger.info(f"📄 Context found ({len(context)} characters)") |
|
response = self._generate_response_with_context(question, context) |
|
else: |
|
logger.info("❌ No context found - relying on LLM knowledge") |
|
response = self._generate_response_without_context(question) |
|
|
|
return response |
|
|
|
def _search_academic(self, question: str) -> str: |
|
"""Search academic sources""" |
|
try: |
|
arxiv_results = self.search_tools.search_arxiv(question) |
|
if arxiv_results: |
|
logger.info("arxiv search found results in arxiv_results") |
|
return arxiv_results |
|
except Exception as e: |
|
logger.error(f"Arxiv search failed: {e}") |
|
|
|
|
|
return self._search_web(question) |
|
|
|
def _search_olympics(self, question: str) -> str: |
|
"""Search for Olympics-related information""" |
|
|
|
search_queries = [ |
|
question, |
|
"1928 Summer Olympics participating countries athletes count", |
|
"1928 Amsterdam Olympics countries delegation size", |
|
"1928 Olympics smallest delegation country IOC code" |
|
] |
|
|
|
for query in search_queries: |
|
try: |
|
logger.info(f"Trying Olympics search: {query}") |
|
web_results = self.search_tools.search_web(query) |
|
if web_results and len(web_results) > 100: |
|
logger.info(f"Found Olympics web results for: {query}") |
|
return web_results |
|
except Exception as e: |
|
logger.error(f"Olympics web search failed for '{query}': {e}") |
|
|
|
|
|
wiki_queries = [ |
|
"1928 Summer Olympics", |
|
"1928 Summer Olympics participating nations", |
|
"Amsterdam 1928 Olympics countries" |
|
] |
|
|
|
for query in wiki_queries: |
|
try: |
|
logger.info(f"Trying Olympics Wikipedia search: {query}") |
|
wiki_results = self.search_tools.search_wikipedia(query) |
|
if wiki_results and len(wiki_results) > 100: |
|
logger.info(f"Found Olympics Wikipedia results for: {query}") |
|
return wiki_results |
|
except Exception as e: |
|
logger.error(f"Olympics Wikipedia search failed for '{query}': {e}") |
|
|
|
return "" |
|
|
|
def _search_music(self, question: str) -> str: |
|
"""Search for music-related information using web search first, then Wikipedia""" |
|
|
|
artist_patterns = [ |
|
r'by ([A-Z][a-zA-Z\s]+?)(?:\s+between|\s+from|\s+in|\?|$)', |
|
r'([A-Z][a-zA-Z\s]+?)\s+(?:albums|songs|music)', |
|
] |
|
|
|
artist_name = None |
|
for pattern in artist_patterns: |
|
match = re.search(pattern, question) |
|
if match: |
|
artist_name = match.group(1).strip() |
|
break |
|
|
|
|
|
web_queries = [] |
|
|
|
if artist_name: |
|
web_queries = [ |
|
f"{artist_name} studio albums discography 2000-2009", |
|
f"{artist_name} complete discography studio albums", |
|
question |
|
] |
|
else: |
|
web_queries = [question] |
|
|
|
|
|
for query in web_queries: |
|
try: |
|
logger.info(f"Trying web search for music: {query}") |
|
web_results = self.search_tools.search_web(query) |
|
if web_results and len(web_results) > 100: |
|
logger.info(f"Found music web results for: {query}") |
|
return web_results |
|
except Exception as e: |
|
logger.error(f"Web music search failed for '{query}': {e}") |
|
|
|
|
|
wiki_queries = [] |
|
if artist_name: |
|
wiki_queries = [ |
|
f"{artist_name} discography", |
|
f"{artist_name} albums", |
|
f"{artist_name} studio albums", |
|
artist_name |
|
] |
|
else: |
|
wiki_queries = [question] |
|
|
|
for query in wiki_queries: |
|
try: |
|
logger.info(f"Trying Wikipedia API music search: {query}") |
|
wiki_api_results = self.search_tools.search_wikipedia_api(query) |
|
if wiki_api_results and len(wiki_api_results) > 100 and "No results found" not in wiki_api_results: |
|
logger.info(f"Found music Wikipedia API results for: {query}") |
|
return wiki_api_results |
|
except Exception as e: |
|
logger.error(f"Wikipedia API music search failed for '{query}': {e}") |
|
|
|
|
|
for query in wiki_queries: |
|
try: |
|
logger.info(f"Trying regular Wikipedia music search: {query}") |
|
wiki_results = self.search_tools.search_wikipedia(query) |
|
if wiki_results and len(wiki_results) > 100: |
|
logger.info(f"Found music Wikipedia results for: {query}") |
|
return wiki_results |
|
except Exception as e: |
|
logger.error(f"Wikipedia music search failed for '{query}': {e}") |
|
|
|
return "" |
|
|
|
def _search_general(self, question: str) -> str: |
|
"""General search strategy""" |
|
|
|
web_results = self._search_web(question) |
|
if web_results: |
|
return web_results |
|
|
|
|
|
try: |
|
wiki_results = self.search_tools.search_wikipedia(question) |
|
if wiki_results: |
|
logger.info("wikipedia search found results in wiki_results") |
|
return wiki_results |
|
except Exception as e: |
|
logger.error(f"Wikipedia search failed: {e}") |
|
|
|
return "" |
|
|
|
def _search_web(self, question: str) -> str: |
|
"""Perform web search""" |
|
try: |
|
logger.info(f"Using web search for query: {question}") |
|
web_results = self.search_tools.search_web(question) |
|
if web_results: |
|
logger.info("web search found results in web_results") |
|
return web_results |
|
except Exception as e: |
|
logger.error(f"Web search failed: {e}") |
|
|
|
return "" |
|
|
|
def _generate_response_with_context(self, question: str, context: str) -> str: |
|
"""Generate response using found context""" |
|
logger.info(f"🤖 Sending to LLM (prompt length: {len(self.system_prompt + question + context)} chars)") |
|
logger.info(f"🤖 Context preview: {context[:200]}...") |
|
|
|
try: |
|
response = self.llm_client.generate_response( |
|
question=question, |
|
context=context, |
|
system_prompt=self.system_prompt |
|
) |
|
|
|
logger.info(f"🤖 LLM raw response: {response}") |
|
|
|
|
|
formatted_response = self._ensure_final_answer_format(response) |
|
return formatted_response |
|
|
|
except Exception as e: |
|
logger.error(f"Error generating response with context: {e}") |
|
logger.warning(f"❓ Defaulting to 'I don't know'") |
|
return "FINAL ANSWER: I don't know" |
|
|
|
def _generate_response_without_context(self, question: str) -> str: |
|
"""Generate response without external context""" |
|
logger.info(f"🤖 Sending to LLM (prompt length: {len(self.system_prompt + question)} chars)") |
|
logger.info(f"🤖 No context provided") |
|
|
|
try: |
|
response = self.llm_client.generate_response( |
|
question=question, |
|
context="", |
|
system_prompt=self.system_prompt |
|
) |
|
|
|
logger.info(f"🤖 LLM raw response: {response}") |
|
|
|
|
|
formatted_response = self._ensure_final_answer_format(response) |
|
return formatted_response |
|
|
|
except Exception as e: |
|
logger.error(f"Error generating response without context: {e}") |
|
logger.warning(f"❓ Defaulting to 'I don't know'") |
|
return "FINAL ANSWER: I don't know" |
|
|
|
def _ensure_final_answer_format(self, response: str) -> str: |
|
"""Ensure response is clean and properly formatted""" |
|
if not response: |
|
return "I don't know" |
|
|
|
|
|
if "FINAL ANSWER:" in response: |
|
parts = response.split("FINAL ANSWER:") |
|
if len(parts) > 1: |
|
response = parts[-1].strip() |
|
|
|
|
|
uncertainty_phrases = [ |
|
"i don't know", "i do not know", "unknown", "i cannot answer", |
|
"cannot determine", "not enough information", "unclear", "uncertain", |
|
"this question cannot be answered" |
|
] |
|
|
|
if any(phrase in response.strip().lower() for phrase in uncertainty_phrases): |
|
return "I don't know" |
|
|
|
|
|
lines = response.strip().split('\n') |
|
if len(lines) > 1: |
|
|
|
for line in reversed(lines): |
|
line = line.strip() |
|
if line and not line.startswith(('Based on', 'According to', 'The answer is', 'From the')): |
|
|
|
if len(line.split()) <= 5 or line.replace(',', '').replace(' ', '').isalnum(): |
|
response = line |
|
break |
|
|
|
|
|
clean_response = response.strip() |
|
logger.info(f"✅ Clean response: {clean_response}") |
|
return clean_response |