|
import os |
|
import json |
|
import asyncio |
|
from typing import List, Dict, Any, Optional |
|
from langchain.prompts import ChatPromptTemplate |
|
from pathlib import Path |
|
from dotenv import load_dotenv |
|
import time |
|
from langchain_community.tools import BraveSearch |
|
from src.utils.api_key_manager import with_api_manager |
|
from src.helpers.helper import remove_markdown |
|
|
|
class SearchEngine:
    """Web search helper built on the Brave Search tool plus LLM prompts.

    The class offers three steps that the demo script chains together:

    * ``generate_optimized_query`` - ask an LLM to rewrite a user query.
    * ``search`` - page through Brave results for a query.
    * ``filter_urls`` - ask an LLM to re-rank and trim the results.

    The ``llm`` keyword argument of the decorated methods is not passed by
    the callers visible in this file, so it is presumably injected by
    ``with_api_manager`` - confirm against ``src.utils.api_key_manager``.
    """

    # Paging limits used when talking to Brave. The values are the ones the
    # original loop hard-coded (count <= 20, offset <= 9); they match the
    # limits documented for the Brave Web Search API.
    _MAX_PAGE_SIZE = 20
    _MAX_PAGE_OFFSET = 9

    # Upper bound on consecutive HTTP 429 retries for a single page, so a
    # persistently rate-limited key cannot spin the loop forever.
    _MAX_RATE_LIMIT_RETRIES = 5

    # Prompt used when prior conversation context is available.
    _QUERY_TEMPLATE_WITH_CONTEXT = \
"""Objective:
Create a search engine optimized (SEO) query that accurately reflects the user's intent by utilizing their current query and relevant past context.
The generated SEO query should enhance visibility, relevance, and ranking on search engines.

Information:
The search engine being used is semantic in nature and requires a query that aligns with the user's intent while incorporating SEO best practices.

Instructions:
1. Understand the Inputs:
- User Query: This is the current question or statement provided by the user.
- Past Context: This includes any relevant previous interactions, preferences, or information that can inform the understanding of the user's intent.
2. Analyze the User Intent:
- Determine what the user is seeking to find or achieve with their query.
- Identify keywords and phrases that are central to the user's intent.
3. Incorporate SEO Best Practices:
- Keyword Optimization: Use relevant keywords that users are likely to search for. Include both primary and secondary keywords.
- Long-Tail Keywords: Incorporate longer, more specific keyword phrases that reflect the user's intent more precisely.
- Clarity and Relevance: Ensure the query is clear, concise, and directly related to the user's needs.
- Search Intent Alignment: Align the query with the type of content the user is likely seeking (informational, navigational, transactional, or commercial investigation).
- Optimal Length: Keep the query within 5-12 words to maintain effectiveness and avoid keyword stuffing.
4. Generate the SEO-Optimized Query:
- Combine the insights from the user query and past context.
- Formulate a search query that maximizes SEO potential while staying true to the user's intent.
5. Review and Refine:
- Ensure the generated query is free from grammatical errors.
- Verify that the query does not include unnecessary or irrelevant keywords.
- Confirm that the query is tailored to improve search engine rankings for the intended content.
6. Format [IMPORTANT]:
- If the user query is a question, the SEO-optimized query should also be a question.
- If the user query is a statement, the SEO-optimized query should be a clear and concise statement.
- Unless search results would be more accurate if the optimized query was a question.

Example 1:
- User Query:
'Best vegan restaurants in New York'
- Past Context:
'User has previously shown interest in healthy eating and sustainability.'
- SEO-Optimized Search Query:
'Top Vegan Restaurants in New York City for Healthy Dining'

Example 2:
- User Query:
'Give me a list of the best sci-fi movies'
- Past Context:
'User has a preference for classic science fiction films. Previous searches include "Blade Runner" and "2001: A Space Odyssey."'
- SEO-Optimized Search Query:
'What are the top classic science fiction movies to watch that are similar to Blade Runner and 2001: A Space Odyssey?'

Input:
- User Query:
{user_query}

- Past Context:
{context}

Output:
(The generated SEO-friendly query based on the inputs in plain text format without any markdown)"""

    # Prompt used when there is no prior context; it has no {context} slot.
    _QUERY_TEMPLATE_NO_CONTEXT = \
"""Objective:
Create a search engine optimized (SEO) query that accurately reflects the user's intent by utilizing their current query.
The generated SEO query should enhance visibility, relevance, and ranking on search engines.

Information:
The search engine being used is semantic in nature and requires a query that aligns with the user's intent while incorporating SEO best practices.

Instructions:
1. Understand the Input:
- User Query: This is the current question or statement provided by the user.
2. Analyze the User Intent:
- Determine what the user is seeking to find or achieve with their query.
- Identify keywords and phrases that are central to the user's intent.
3. Incorporate SEO Best Practices:
- Keyword Optimization: Use relevant keywords that users are likely to search for. Include both primary and secondary keywords.
- Long-Tail Keywords: Incorporate longer, more specific keyword phrases that reflect the user's intent more precisely.
- Clarity and Relevance: Ensure the query is clear, concise, and directly related to the user's needs.
- Search Intent Alignment: Align the query with the type of content the user is likely seeking (informational, navigational, transactional, or commercial investigation).
- Optimal Length: Keep the query within 5-12 words to maintain effectiveness and avoid keyword stuffing.
4. Generate the SEO-Optimized Query:
- Utilize the insights from the user query.
- Formulate a search query that maximizes SEO potential while staying true to the user's intent.
5. Review and Refine:
- Ensure the generated query is free from grammatical errors.
- Verify that the query does not include unnecessary or irrelevant keywords.
- Confirm that the query is tailored to improve search engine rankings for the intended content.
6. Format [IMPORTANT]:
- If the user query is a question, the SEO-optimized query should also be a question.
- If the user query is a statement, the SEO-optimized query should be a clear and concise statement.
- Unless search results would be more accurate if the optimized query was a question.

Example 1:
- User Query:
'Best vegan restaurants in New York'

- SEO-Optimized Search Query:
'Top Vegan Restaurants in New York City for Healthy Dining'

Example 2:
- User Query:
'Give me a list of the best sci-fi movies'

- SEO-Optimized Search Query:
'What are the top science fiction movies to watch?'

Input:
- User Query:
{user_query}

Output:
(The generated SEO-friendly query based on the input in plain text format without any markdown)"""

    # Prompt used by filter_urls to re-rank and trim search results.
    _RERANK_TEMPLATE = \
"""[IMPORTANT]
This is a very important task.
Please take a deep breath, read the instructions VERY carefully, and think step-by-step before responding.

[PROMPT]
You are an expert at determining the relevance of search results to a given query.
Your task is to re-rank the given search results based on their relevance to the original query.
Use a hybrid of semantic and keyword matching to determine relevance

Consider factors such as:
1. How well the title and snippet match the query intent
3. The credibility and authority of the source
4. The recency of the information (if applicable)

Rules:
1. Rerank the URLs based on their relevance to the query according to the criteria listed above, from best match to worst match.
2. Once reranked, select the top best matched results according to the category of the query as defined below:
- Advanced: Select upto 3 top best matched results
- Pro: Select upto 4 top best matched results
- Super: Select upto 5 top best matched results
- Ultra: Select upto 6 top best matched results
3. [IMPORTANT] Select the MINIMUM number of results (based on the categories above) that are required to answer the query.
4. The response should only contain a JSON array of objects, each containing 'link', 'title' and 'snippet' keys after reranking and filtering.

Note: Do not include ANY markdown in your response.

[INPUT]
Query Category:
{category}

Query:
{query}

Dictionary Containing Link, Titles and Snippets:
{link_info}

Ranked URLs (JSON array of objects):"""

    def __init__(
        self,
        brave_api_key: Optional[str] = None,
    ):
        """Store the Brave API key, falling back to the environment.

        Args:
            brave_api_key: Explicit key. When ``None`` the ``BRAVE_API_KEY``
                environment variable is used instead.

        Raises:
            ValueError: If no key is passed and ``BRAVE_API_KEY`` is unset.
        """
        if brave_api_key is None:
            brave_api_key = os.getenv("BRAVE_API_KEY")
            if brave_api_key is None:
                raise ValueError("BRAVE_API_KEY is not set")
        self.brave_api_key = brave_api_key

    @with_api_manager()
    async def generate_optimized_query(self, user_query: str, context: Optional[str] = None, *, llm) -> str:
        """Ask the LLM to rewrite ``user_query`` as a search-friendly query.

        Args:
            user_query: The question or statement typed by the user.
            context: Optional past context. When truthy, the prompt variant
                that includes a "Past Context" section is used.
            llm: Chat model exposing ``ainvoke``; the returned object must
                have a ``content`` attribute.

        Returns:
            The model's reply with surrounding whitespace stripped.
        """
        if context:
            template = self._QUERY_TEMPLATE_WITH_CONTEXT
            variables = {"user_query": user_query, "context": context}
        else:
            # The no-context template has no {context} placeholder, so the
            # variable is not passed at all.
            template = self._QUERY_TEMPLATE_NO_CONTEXT
            variables = {"user_query": user_query}

        prompt = ChatPromptTemplate.from_template(template)
        # format_messages keeps the chat-message structure (same call style
        # as filter_urls) instead of flattening the prompt to one string.
        optimized_query = await llm.ainvoke(prompt.format_messages(**variables))
        return optimized_query.content.strip()

    async def search(
        self,
        query: str,
        num_results: int = 10,
        gl: str = 'us',
        hl: str = 'en',
        safe: str = 'off',
        exclude_filetypes: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """Fetch up to ``num_results`` Brave search results for ``query``.

        Results are requested page by page with a one second pause between
        requests. Paging stops when enough results were collected, a page
        comes back empty, or the maximum page offset is reached.

        Args:
            query: The search query.
            num_results: Maximum number of results to return.
            gl: Sent to Brave as ``country``.
            hl: Sent to Brave as ``search_lang``.
            safe: Sent to Brave as ``safesearch``.
            exclude_filetypes: File extensions to exclude; each one is
                appended to the query as ``NOT filetype:<ext>``.

        Returns:
            The parsed result entries, at most ``num_results`` of them.

        Raises:
            Exception: Whatever the Brave tool raises for non rate-limit
                failures, or for a rate limit that persists after
                ``_MAX_RATE_LIMIT_RETRIES`` consecutive retries.
            json.JSONDecodeError: If the tool output is not valid JSON.
        """
        exclusion = ''
        if exclude_filetypes:
            exclusion = ' ' + ' '.join(f"NOT filetype:{ft}" for ft in exclude_filetypes)

        modified_query = f"{query}{exclusion}"
        print(f"Performing search with query: '{modified_query}', num_results: {num_results}, gl: {gl}, hl: {hl}, safe: {safe}")

        all_results: List[Dict[str, Any]] = []
        # Brave's offset counts pages of `count` results, so the page size
        # must stay constant across requests. Shrinking `count` on later
        # pages would make them overlap with results already fetched.
        page_size = min(num_results, self._MAX_PAGE_SIZE)
        offset = 0
        rate_limit_hits = 0

        while len(all_results) < num_results and offset <= self._MAX_PAGE_OFFSET:
            # A new tool instance per page because the offset is part of the
            # search kwargs fixed at construction time.
            brave_search = BraveSearch.from_api_key(
                api_key=self.brave_api_key,
                search_kwargs={
                    "count": page_size,
                    "offset": offset,
                    "country": gl,
                    "search_lang": hl,
                    "safesearch": safe
                }
            )
            try:
                # The tool's run() is blocking; keep the event loop free.
                results_str = await asyncio.to_thread(brave_search.run, modified_query)
            except Exception as e:
                if "429" in str(e) and rate_limit_hits < self._MAX_RATE_LIMIT_RETRIES:
                    rate_limit_hits += 1
                    print("Brave API rate limit hit, waiting 1 second...")
                    await asyncio.sleep(1)
                    continue
                raise
            rate_limit_hits = 0

            # SECURITY: this used to be eval(results_str). The string comes
            # from a network response and must never be executed; the tool
            # serialises its results as JSON, so parse it as JSON.
            page_results = json.loads(results_str)
            if not page_results:
                break

            all_results.extend(page_results)
            offset += 1

            # Pause only when another request is actually going to follow.
            if len(all_results) < num_results and offset <= self._MAX_PAGE_OFFSET:
                await asyncio.sleep(1)

        print(f"Total results fetched: {len(all_results)}")
        return all_results[:num_results]

    @with_api_manager()
    async def filter_urls(
        self,
        query: str,
        category: str,
        search_results: List[Dict[str, Any]],
        num_results: int = 3,
        *,
        llm
    ) -> List[Dict[str, str]]:
        """Ask the LLM to re-rank ``search_results`` and keep the best few.

        Only results that have a non-empty ``link``, ``title`` and
        ``snippet`` are considered.

        Args:
            query: The query the results were fetched for.
            category: Query category inserted into the prompt (the prompt
                names ``Advanced``, ``Pro``, ``Super`` and ``Ultra``).
            search_results: Result dicts as returned by ``search``.
            num_results: Size of the fallback list used when the LLM reply
                cannot be used.
            llm: Chat model exposing ``ainvoke``; the returned object must
                have a ``content`` attribute.

        Returns:
            The list decoded from the LLM reply. If the reply is not valid
            JSON or is not a JSON array, the first ``num_results`` usable
            results in their original order. An empty list when there are
            no usable results at all.
        """
        link_info = {}
        for result in search_results:
            link = result.get("link")
            title = result.get("title")
            snippet = result.get("snippet")
            if link and title and snippet:
                link_info[link] = {"title": title, "snippet": snippet}

        # Nothing to rank: skip the LLM call instead of prompting it with an
        # empty dictionary.
        if not link_info:
            return []

        prompt = ChatPromptTemplate.from_template(self._RERANK_TEMPLATE)
        response = await llm.ainvoke(prompt.format_messages(category=category, query=query, link_info=link_info))
        cleaned_response = remove_markdown(response.content.strip())

        try:
            ranked_links = json.loads(cleaned_response)
        except json.JSONDecodeError:
            ranked_links = None

        # Valid JSON that is not an array (e.g. a single object) does not
        # match the declared return type, so it takes the fallback as well.
        if not isinstance(ranked_links, list):
            print("Error decoding JSON response from LLM")
            return [
                {"link": link, "title": info["title"], "snippet": info["snippet"]}
                for link, info in list(link_info.items())[:num_results]
            ]

        print(f"Number of search results after reranking and filtering: {len(ranked_links)}")
        return ranked_links
|
|
|
if __name__ == "__main__":
    # Manual smoke test: optimise a few sample queries, run a small Brave
    # search for each and print what came back.

    # Load variables from a local .env file before checking the environment.
    load_dotenv()

    required_env_vars = ["BRAVE_API_KEY"]
    missing_vars = [var for var in required_env_vars if os.getenv(var) is None]
    if missing_vars:
        print(f"Environment variables are not set: {missing_vars}")
        # Exit with a non-zero status so a missing key is reported as a
        # failure; SystemExit also works where the site-provided exit()
        # helper is unavailable.
        raise SystemExit(1)

    print("All environment variables are set!")
    search_engine = SearchEngine()

    queries = [
        "Compare the benefits and drawbacks of AI in healthcare",
        "What is the impact of AI on healthcare?",
        "How is AI used in healthcare?",
        "What are the ethical considerations of AI in healthcare?",
        "What are the economic and social impacts of artificial intelligence on the job market?",
        "How can cold fusion be achieved without violating the laws of thermodynamics? And how can AGI help with that?",
        "What are the major obstacles to achieving carbon neutrality in heavy industries like steel and cement? What are the potential solutions?",
    ]

    async def main(queries: List[str]):
        """Optimise, search and print the results for each query in turn."""
        for query in queries:
            optimized_query = await search_engine.generate_optimized_query(query)
            print(f"\nOriginal Query: {query}")
            print(f"Optimized Query: {optimized_query}\n")

            # Time only the search call itself.
            start = time.perf_counter()
            search_results = await search_engine.search(optimized_query, num_results=2, exclude_filetypes=["pdf"])
            end = time.perf_counter()
            print(f"Time taken to fetch search results: {end - start:.2f} seconds")

            print("Search Results:")
            for result in search_results:
                print(f"- {result['title']}: {result['link']}: {result['snippet']}")

            print("-"*20)

    asyncio.run(main(queries))