Spaces:

tony10010
/

cohex

Paused

cohex / src / search / search_engine.py

Hemang Thakur

deploy

d5c104e 9 days ago

14.5 kB

	import os
	import json
	import asyncio
	from typing import List, Dict, Any, Optional
	from langchain.prompts import ChatPromptTemplate
	from pathlib import Path
	from dotenv import load_dotenv
	import time
	from langchain_community.tools import BraveSearch
	from src.utils.api_key_manager import with_api_manager
	from src.helpers.helper import remove_markdown

	class SearchEngine:
	    """Brave Search client with LLM-assisted query rewriting and result re-ranking.

	    Methods:
	        generate_optimized_query: ask an LLM to rewrite a user query for search.
	        search: fetch paginated results from Brave Search.
	        filter_urls: ask an LLM to re-rank and trim a list of search results.
	    """

	    # Highest page offset requested from Brave (the original loop's "Max offset is 9").
	    _MAX_OFFSET = 9
	    # Largest number of results requested per page ("max 20 per request").
	    _MAX_PAGE_SIZE = 20
	    # Upper bound on consecutive rate-limit retries so a persistent 429 cannot
	    # spin the pagination loop forever.
	    _MAX_RATE_LIMIT_RETRIES = 5

	    def __init__(
	        self,
	        brave_api_key: Optional[str] = None,
	    ):
	        """Store the Brave API key.

	        Args:
	            brave_api_key: Explicit key. When None, the BRAVE_API_KEY
	                environment variable is used instead.

	        Raises:
	            ValueError: If no key is given and BRAVE_API_KEY is not set.
	        """
	        if brave_api_key is None:
	            brave_api_key = os.getenv("BRAVE_API_KEY")
	        if brave_api_key is None:
	            raise ValueError("BRAVE_API_KEY is not set")
	        self.brave_api_key = brave_api_key

	    @with_api_manager()
	    async def generate_optimized_query(self, user_query: str, context: Optional[str] = None, *, llm) -> str:
	        """Rewrite a user query into a search-engine-friendly query via an LLM.

	        Args:
	            user_query: The question or statement typed by the user.
	            context: Optional past context; when truthy, the prompt variant
	                that includes a "Past Context" section is used.
	            llm: Object exposing an async ``ainvoke`` whose result has a
	                ``.content`` string. Presumably injected by the
	                ``with_api_manager`` decorator -- verify against that module.

	        Returns:
	            The LLM's response text with surrounding whitespace stripped.
	        """
	        if context:
	            template = \
	"""Objective:
	Create a search engine optimized (SEO) query that accurately reflects the user's intent by utilizing their current query and relevant past context.
	The generated SEO query should enhance visibility, relevance, and ranking on search engines.

	Information:
	The search engine being used is semantic in nature and requires a query that aligns with the user's intent while incorporating SEO best practices.

	Instructions:
	1. Understand the Inputs:
	- User Query: This is the current question or statement provided by the user.
	- Past Context: This includes any relevant previous interactions, preferences, or information that can inform the understanding of the user's intent.
	2. Analyze the User Intent:
	- Determine what the user is seeking to find or achieve with their query.
	- Identify keywords and phrases that are central to the user's intent.
	3. Incorporate SEO Best Practices:
	- Keyword Optimization: Use relevant keywords that users are likely to search for. Include both primary and secondary keywords.
	- Long-Tail Keywords: Incorporate longer, more specific keyword phrases that reflect the user's intent more precisely.
	- Clarity and Relevance: Ensure the query is clear, concise, and directly related to the user's needs.
	- Search Intent Alignment: Align the query with the type of content the user is likely seeking (informational, navigational, transactional, or commercial investigation).
	- Optimal Length: Keep the query within 5-12 words to maintain effectiveness and avoid keyword stuffing.
	4. Generate the SEO-Optimized Query:
	- Combine the insights from the user query and past context.
	- Formulate a search query that maximizes SEO potential while staying true to the user's intent.
	5. Review and Refine:
	- Ensure the generated query is free from grammatical errors.
	- Verify that the query does not include unnecessary or irrelevant keywords.
	- Confirm that the query is tailored to improve search engine rankings for the intended content.
	6. Format [IMPORTANT]:
	- If the user query is a question, the SEO-optimized query should also be a question.
	- If the user query is a statement, the SEO-optimized query should be a clear and concise statement.
	- Unless search results would be more accurate if the optimized query was a question.

	Example 1:
	- User Query:
	'Best vegan restaurants in New York'
	- Past Context:
	'User has previously shown interest in healthy eating and sustainability.'
	- SEO-Optimized Search Query:
	'Top Vegan Restaurants in New York City for Healthy Dining'

	Example 2:
	- User Query:
	'Give me a list of the best sci-fi movies'
	- Past Context:
	'User has a preference for classic science fiction films. Previous searches include "Blade Runner" and "2001: A Space Odyssey."'
	- SEO-Optimized Search Query:
	'What are the top classic science fiction movies to watch that are similar to Blade Runner and 2001: A Space Odyssey?'

	Input:
	- User Query:
	{user_query}

	- Past Context:
	{context}

	Output:
	(The generated SEO-friendly query based on the inputs in plain text format without any markdown)"""
	        else:
	            template = \
	"""Objective:
	Create a search engine optimized (SEO) query that accurately reflects the user's intent by utilizing their current query.
	The generated SEO query should enhance visibility, relevance, and ranking on search engines.

	Information:
	The search engine being used is semantic in nature and requires a query that aligns with the user's intent while incorporating SEO best practices.

	Instructions:
	1. Understand the Input:
	- User Query: This is the current question or statement provided by the user.
	2. Analyze the User Intent:
	- Determine what the user is seeking to find or achieve with their query.
	- Identify keywords and phrases that are central to the user's intent.
	3. Incorporate SEO Best Practices:
	- Keyword Optimization: Use relevant keywords that users are likely to search for. Include both primary and secondary keywords.
	- Long-Tail Keywords: Incorporate longer, more specific keyword phrases that reflect the user's intent more precisely.
	- Clarity and Relevance: Ensure the query is clear, concise, and directly related to the user's needs.
	- Search Intent Alignment: Align the query with the type of content the user is likely seeking (informational, navigational, transactional, or commercial investigation).
	- Optimal Length: Keep the query within 5-12 words to maintain effectiveness and avoid keyword stuffing.
	4. Generate the SEO-Optimized Query:
	- Utilize the insights from the user query.
	- Formulate a search query that maximizes SEO potential while staying true to the user's intent.
	5. Review and Refine:
	- Ensure the generated query is free from grammatical errors.
	- Verify that the query does not include unnecessary or irrelevant keywords.
	- Confirm that the query is tailored to improve search engine rankings for the intended content.
	6. Format [IMPORTANT]:
	- If the user query is a question, the SEO-optimized query should also be a question.
	- If the user query is a statement, the SEO-optimized query should be a clear and concise statement.
	- Unless search results would be more accurate if the optimized query was a question.

	Example 1:
	- User Query:
	'Best vegan restaurants in New York'

	- SEO-Optimized Search Query:
	'Top Vegan Restaurants in New York City for Healthy Dining'

	Example 2:
	- User Query:
	'Give me a list of the best sci-fi movies'

	- SEO-Optimized Search Query:
	'What are the top science fiction movies to watch?'

	Input:
	- User Query:
	{user_query}

	Output:
	(The generated SEO-friendly query based on the input in plain text format without any markdown)"""

	        prompt_template = ChatPromptTemplate.from_template(template)
	        # NOTE(review): this passes the result of .format() to the LLM, while
	        # filter_urls passes .format_messages(); confirm which form is intended.
	        prompt = prompt_template.format(context=context, user_query=user_query)

	        optimized_query = await llm.ainvoke(prompt)
	        return optimized_query.content.strip()

	    @staticmethod
	    def _parse_results(results_str: str) -> List[Dict[str, Any]]:
	        """Decode the string returned by the Brave tool into a list of results.

	        JSON is tried first; a Python-literal payload is still accepted through
	        ``ast.literal_eval``, which never executes code (unlike ``eval``).

	        Raises:
	            ValueError: If the payload is not a list, or (from
	                ``ast.literal_eval``) is not a valid literal.
	            SyntaxError: From ``ast.literal_eval`` when the payload is neither
	                JSON nor parseable Python syntax.
	        """
	        try:
	            parsed = json.loads(results_str)
	        except json.JSONDecodeError:
	            import ast  # local import: only needed for the legacy fallback

	            parsed = ast.literal_eval(results_str)
	        if not isinstance(parsed, list):
	            raise ValueError(
	                f"Unexpected Brave search payload type: {type(parsed).__name__}"
	            )
	        return parsed

	    async def search(
	        self,
	        query: str,
	        num_results: int = 10,
	        gl: str = 'us',
	        hl: str = 'en',
	        safe: str = 'off',
	        exclude_filetypes: Optional[List[str]] = None
	    ) -> List[Dict[str, Any]]:
	        """Fetch up to ``num_results`` results from Brave Search, page by page.

	        Args:
	            query: Search query text.
	            num_results: Maximum number of results to return.
	            gl: Sent to Brave as the ``country`` search parameter.
	            hl: Sent to Brave as the ``search_lang`` search parameter.
	            safe: Sent to Brave as the ``safesearch`` search parameter.
	            exclude_filetypes: File extensions to exclude; each one is appended
	                to the query as ``NOT filetype:<ext>``.

	        Returns:
	            The collected result entries, truncated to ``num_results``.

	        Raises:
	            Exception: Any error from the Brave tool other than a rate limit,
	                or a rate-limit error that persists after the retry budget.
	            ValueError: If a response cannot be decoded into a list.
	        """
	        # Construct exclusion string for filetypes (maintaining compatibility)
	        exclusion = ''
	        if exclude_filetypes:
	            exclusion = ' ' + ' '.join(f"NOT filetype:{ft}" for ft in exclude_filetypes)

	        modified_query = f"{query}{exclusion}"
	        print(f"Performing search with query: '{modified_query}', num_results: {num_results}, gl: {gl}, hl: {hl}, safe: {safe}")

	        all_results: List[Dict[str, Any]] = []
	        remaining_results = num_results
	        offset = 0
	        rate_limit_retries = 0

	        while remaining_results > 0 and offset <= self._MAX_OFFSET:
	            # Calculate count for this page (capped per request)
	            count = min(remaining_results, self._MAX_PAGE_SIZE)

	            # The tool is rebuilt per page because count/offset are fixed at
	            # construction time through search_kwargs.
	            brave_search = BraveSearch.from_api_key(
	                api_key=self.brave_api_key,
	                search_kwargs={
	                    "count": count,
	                    "offset": offset,
	                    "country": gl,
	                    "search_lang": hl,
	                    "safesearch": safe
	                }
	            )
	            try:
	                # The tool call is blocking, so run it off the event loop.
	                results_str = await asyncio.to_thread(brave_search.run, modified_query)
	            except Exception as e:
	                # Retry the same page on a rate limit, but only a bounded number
	                # of times; anything else (or an exhausted budget) propagates.
	                if "429" in str(e) and rate_limit_retries < self._MAX_RATE_LIMIT_RETRIES:
	                    rate_limit_retries += 1
	                    print("Brave API rate limit hit, waiting 1 second...")
	                    await asyncio.sleep(1)
	                    continue
	                raise
	            rate_limit_retries = 0

	            # Parsing happens outside the try block so that a decoding failure
	            # is never mistaken for a rate-limit error.
	            page_results = self._parse_results(results_str)
	            if not page_results:  # No more results available
	                break

	            all_results.extend(page_results)
	            remaining_results -= len(page_results)
	            offset += 1

	            # Pause between pages to avoid hitting the rate limit, but skip the
	            # wait when no further request will be made.
	            if remaining_results > 0 and offset <= self._MAX_OFFSET:
	                await asyncio.sleep(1)

	        print(f"Total results fetched: {len(all_results)}")
	        return all_results[:num_results]  # Ensure we don't return more than requested

	    @with_api_manager()
	    async def filter_urls(
	        self,
	        query: str,
	        category: str,
	        search_results: List[Dict[str, Any]],
	        num_results: int = 3,
	        *,
	        llm
	    ) -> List[Dict[str, str]]:
	        """Ask an LLM to re-rank search results and keep the most relevant ones.

	        Args:
	            query: The query the results should be relevant to.
	            category: Query category inserted into the prompt; the prompt
	                defines limits for Advanced, Pro, Super and Ultra.
	            search_results: Entries with "link", "title" and "snippet" keys;
	                entries missing any of the three are ignored.
	            num_results: Number of entries returned by the fallback path when
	                the LLM response cannot be used.
	            llm: Object exposing an async ``ainvoke`` whose result has a
	                ``.content`` string. Presumably injected by the
	                ``with_api_manager`` decorator -- verify against that module.

	        Returns:
	            The LLM's ranked list of objects that carry a "link", or the first
	            ``num_results`` usable input entries if the response is unusable.
	            An empty list when no input entry is usable.
	        """
	        link_info = {}
	        for result in search_results:
	            link = result.get("link")
	            title = result.get("title")
	            snippet = result.get("snippet")

	            if link and title and snippet:
	                link_info[link] = {"title": title, "snippet": snippet}

	        # Nothing to rank: skip the LLM call entirely.
	        if not link_info:
	            return []

	        # Original ordering, used whenever the LLM output cannot be trusted.
	        fallback = [
	            {"link": link, "title": info["title"], "snippet": info["snippet"]}
	            for link, info in list(link_info.items())[:num_results]
	        ]

	        template = \
	"""[IMPORTANT]
	This is a very important task.
	Please take a deep breath, read the instructions VERY carefully, and think step-by-step before responding.

	[PROMPT]
	You are an expert at determining the relevance of search results to a given query.
	Your task is to re-rank the given search results based on their relevance to the original query.
	Use a hybrid of semantic and keyword matching to determine relevance

	Consider factors such as:
	1. How well the title and snippet match the query intent
	3. The credibility and authority of the source
	4. The recency of the information (if applicable)

	Rules:
	1. Rerank the URLs based on their relevance to the query according to the criteria listed above, from best match to worst match.
	2. Once reranked, select the top best matched results according to the category of the query as defined below:
	- Advanced: Select upto 3 top best matched results
	- Pro: Select upto 4 top best matched results
	- Super: Select upto 5 top best matched results
	- Ultra: Select upto 6 top best matched results
	3. [IMPORTANT] Select the MINIMUM number of results (based on the categories above) that are required to answer the query.
	4. The response should only contain a JSON array of objects, each containing 'link', 'title' and 'snippet' keys after reranking and filtering.

	Note: Do not include ANY markdown in your response.

	[INPUT]
	Query Category:
	{category}

	Query:
	{query}

	Dictionary Containing Link, Titles and Snippets:
	{link_info}

	Ranked URLs (JSON array of objects):"""
	        prompt = ChatPromptTemplate.from_template(template)
	        response = await llm.ainvoke(prompt.format_messages(category=category, query=query, link_info=link_info))
	        cleaned_response = remove_markdown(response.content.strip())

	        try:
	            ranked_links = json.loads(cleaned_response)
	        except json.JSONDecodeError:
	            print("Error decoding JSON response from LLM")
	            return fallback

	        # Valid JSON is not enough: callers index each entry by "link".
	        if not isinstance(ranked_links, list):
	            print("LLM response was not a JSON array; using original result order")
	            return fallback

	        usable = [
	            item for item in ranked_links
	            if isinstance(item, dict) and item.get("link")
	        ]
	        if ranked_links and not usable:
	            print("LLM response contained no usable entries; using original result order")
	            return fallback

	        print(f"Number of search results after reranking and filtering: {len(usable)}")
	        return usable

	if __name__ == "__main__":
	    # Manual smoke test: optimise each sample query, run a small Brave search
	    # for it, and print the results together with the fetch time.

	    # Load environment variables
	    load_dotenv()

	    required_env_vars = ["BRAVE_API_KEY"]
	    missing_vars = [var for var in required_env_vars if os.getenv(var) is None]
	    if missing_vars:
	        print(f"Environment variables are not set: {missing_vars}")
	        # Exit with a failure status; the bare exit() helper is provided by
	        # the site module for interactive use and reports success (0).
	        raise SystemExit(1)
	    print("All environment variables are set!")

	    search_engine = SearchEngine()

	    queries = [
	        "Compare the benefits and drawbacks of AI in healthcare",
	        "What is the impact of AI on healthcare?",
	        "How is AI used in healthcare?",
	        "What are the ethical considerations of AI in healthcare?",
	        "What are the economic and social impacts of artificial intelligence on the job market?",
	        "How can cold fusion be achieved without violating the laws of thermodynamics? And how can AGI help with that?",
	        "What are the major obstacles to achieving carbon neutrality in heavy industries like steel and cement? What are the potential solutions?"
	    ]

	    async def main(queries: List[str]):
	        """Run the optimise-then-search flow for every query and print results."""
	        for query in queries:
	            optimized_query = await search_engine.generate_optimized_query(query)
	            print(f"\nOriginal Query: {query}")
	            print(f"Optimized Query: {optimized_query}\n")

	            start = time.perf_counter()
	            search_results = await search_engine.search(optimized_query, num_results=2, exclude_filetypes=["pdf"])
	            end = time.perf_counter()
	            print(f"Time taken to fetch search results: {end - start:.2f} seconds")

	            print("Search Results:")
	            for result in search_results:
	                print(f"- {result['title']}: {result['link']}: {result['snippet']}")

	            print("-"*20)

	    asyncio.run(main(queries))