Spaces:

tony10010
/

cohex

Paused

Hemang Thakur

deploy

d5c104e 9 days ago

17.7 kB

	from datetime import datetime, timezone
	from langchain.prompts import ChatPromptTemplate
	from langchain_core.prompts import ChatPromptTemplate
	from src.utils.api_key_manager import APIKeyManager, with_api_manager
	from src.query_processing.late_chunking.late_chunker import LateChunker

	class Reasoner:
	    """LLM-backed helper that answers queries, summarizes documents and extracts citations.

	    Every public coroutine takes a keyword-only ``llm`` argument.
	    NOTE(review): ``llm`` is presumably injected by the ``with_api_manager``
	    decorator rather than passed by callers -- confirm against
	    ``src.utils.api_key_manager``.
	    """

	    def __init__(self):
	        """Create the API key manager and obtain a default LLM handle from it."""
	        self.manager = APIKeyManager()
	        # Kept on the instance because ``summarize`` uses it for token counting.
	        self.model = self.manager.get_llm()

	    @with_api_manager(streaming=True)
	    async def answer(
	        self,
	        query,
	        context=None,
	        query_type="general",
	        *,
	        llm
	    ):
	        """Stream an answer to ``query`` as an async generator of text chunks.

	        One of three prompt templates is selected:

	        * ``context is None``: answer without any context section.
	        * ``query_type == "basic"`` and ``context`` contains the marker
	          ``"[USER PROVIDED"``: context is included, no citation rules.
	        * otherwise: context is included and inline ``[N]`` citations are required.

	        Args:
	            query: The user's question; substituted into the prompt verbatim.
	            context: Optional source material. Must support ``in`` with a string
	                (presumably a ``str`` -- verify against callers).
	            query_type: Only the value ``"basic"`` is treated specially here.
	            llm: Chat model exposing ``astream(messages)``.

	        Yields:
	            ``chunk.content`` for every chunk produced by ``llm.astream``.

	        Raises:
	            Any exception raised by the underlying stream propagates unchanged.
	        """
	        if context is None:
	            template = """You are an AI model skilled in web search and crafting detailed, engaging, and well-structured answers.
	You excel at summarizing web pages and extracting relevant information to create professional, blog-style responses.

	Your task is to provide answers that are:
	- Informative and relevant: Thoroughly address the user's query.
	- Well-structured: Include clear headings and subheadings, and use a professional tone to present information concisely and logically.
	- Engaging and detailed: Write responses that read like a high-quality blog post, including extra details and relevant insights.
	- Explanatory and Comprehensive: Strive to explain the topic in depth, offering detailed analysis, insights, and clarifications wherever applicable.

	### Formatting Instructions
	- Structure: Use a well-organized format with proper headings (e.g., "## Example heading 1" or "## Example heading 2").
	Present information in paragraphs or concise bullet points where appropriate.
	- Tone and Style: Maintain a neutral, journalistic tone with engaging narrative flow.
	Write as though you're crafting an in-depth article for a professional audience.
	- Markdown Usage: Format your response with Markdown for clarity. Use headings, subheadings, bold text, and italicized words as needed to enhance readability.
	- Length and Depth: Provide comprehensive coverage of the topic. Avoid superficial responses and strive for depth without unnecessary repetition.
	Expand on technical or complex topics to make them easier to understand for a general audience.
	- No main heading/title: Start your response directly with the introduction unless asked to provide a specific title.
	- Conclusion or Summary: Include a concluding paragraph that synthesizes the provided information or suggests potential next steps, where appropriate.

	### Special Instructions
	- If the query involves technical, historical, or complex topics, provide detailed background and explanatory sections to ensure clarity.
	- If the user provides vague input or if relevant information is missing, explain what additional details might help refine the search.
	- If no relevant information is found, say:
	"Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?"
	Be transparent about limitations and suggest alternatives or ways to reframe the query.

	### User instructions
	- These instructions are shared to you by the user as part of the query itself.
	- You will have to follow them and give them higher priority than the above instructions.
	- If the user has provided specific instructions or preferences, incorporate them into your response while adhering to the overall guidelines.
	- If no instructions are provided, follow the general guidelines and instructions above.

	### Example Output
	- Begin with a brief introduction summarizing the event or query topic.
	- Follow with detailed sections under clear headings, covering all aspects of the query if possible.
	- Provide explanations or historical context as needed to enhance understanding.
	- End with a conclusion or overall perspective if relevant.

	Query:
	{query}

	Current date & time in ISO format (UTC timezone): {date}"""

	        elif query_type == "basic" and "[USER PROVIDED" in context:
	            template = """You are an AI model skilled in web search and crafting detailed, engaging, and well-structured answers.
	You excel at summarizing web pages and extracting relevant information to create professional, blog-style responses.

	Your task is to provide answers that are:
	- Informative and relevant: Thoroughly address the user's query.
	- Well-structured: Include clear headings and subheadings, and use a professional tone to present information concisely and logically.
	- Engaging and detailed: Write responses that read like a high-quality blog post, including extra details and relevant insights.
	- Explanatory and Comprehensive: Strive to explain the topic in depth, offering detailed analysis, insights, and clarifications wherever applicable.

	### Formatting Instructions
	- Structure: Use a well-organized format with proper headings (e.g., "## Example heading 1" or "## Example heading 2").
	Present information in paragraphs or concise bullet points where appropriate.
	- Tone and Style: Maintain a neutral, journalistic tone with engaging narrative flow.
	Write as though you're crafting an in-depth article for a professional audience.
	- Markdown Usage: Format your response with Markdown for clarity. Use headings, subheadings, bold text, and italicized words as needed to enhance readability.
	- Length and Depth: Provide comprehensive coverage of the topic. Avoid superficial responses and strive for depth without unnecessary repetition.
	Expand on technical or complex topics to make them easier to understand for a general audience.
	- No main heading/title: Start your response directly with the introduction unless asked to provide a specific title.
	- Conclusion or Summary: Include a concluding paragraph that synthesizes the provided information or suggests potential next steps, where appropriate.

	### Special Instructions
	- If the query involves technical, historical, or complex topics, provide detailed background and explanatory sections to ensure clarity.
	- If the user provides vague input or if relevant information is missing, explain what additional details might help refine the search.
	- All user-provided files and/or links must be given higher priority to those sources when crafting the response.
	- If no relevant information is found, say:
	"Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?"
	Be transparent about limitations and suggest alternatives or ways to reframe the query.

	### User instructions
	- These instructions are shared to you by the user as part of the query itself.
	- You will have to follow them and give them higher priority than the above instructions.
	- If the user has provided specific instructions or preferences, incorporate them into your response while adhering to the overall guidelines.
	- If no instructions are provided, follow the general guidelines and instructions above.

	### Example Output
	- Begin with a brief introduction summarizing the event or query topic.
	- Follow with detailed sections under clear headings, covering all aspects of the query if possible.
	- Provide explanations or historical context as needed to enhance understanding.
	- End with a conclusion or overall perspective if relevant.

	Context:
	{context}

	Query:
	{query}

	Current date & time in ISO format (UTC timezone): {date}"""

	        else:
	            template = """You are an AI model skilled in web search and crafting detailed, engaging, and well-structured answers.
	You excel at summarizing web pages and extracting relevant information to create professional, blog-style responses.

	Your task is to provide answers that are:
	- Informative and relevant: Thoroughly address the user's query using the given context.
	- Well-structured: Include clear headings and subheadings, and use a professional tone to present information concisely and logically.
	- Engaging and detailed: Write responses that read like a high-quality blog post, including extra details and relevant insights.
	- Cited and credible: Use inline citations with [number] notation to refer to the context source(s) for each fact or detail included.
	- Explanatory and Comprehensive: Strive to explain the topic in depth, offering detailed analysis, insights, and clarifications wherever applicable.

	### Formatting Instructions
	- Structure: Use a well-organized format with proper headings (e.g., "## Example heading 1" or "## Example heading 2").
	Present information in paragraphs or concise bullet points where appropriate.
	- Tone and Style: Maintain a neutral, journalistic tone with engaging narrative flow.
	Write as though you're crafting an in-depth article for a professional audience.
	- Markdown Usage: Format your response with Markdown for clarity. Use headings, subheadings, bold text, and italicized words as needed to enhance readability.
	- Length and Depth: Provide comprehensive coverage of the topic. Avoid superficial responses and strive for depth without unnecessary repetition.
	Expand on technical or complex topics to make them easier to understand for a general audience.
	- No main heading/title: Start your response directly with the introduction unless asked to provide a specific title.
	- Conclusion or Summary: Include a concluding paragraph that synthesizes the provided information or suggests potential next steps, where appropriate.

	### [IMPORTANT] Citation Requirements
	- Cite every single fact, statement, or sentence using [number] notation corresponding to the source from the provided `context`.
	Each source in the `context` will be in the following format, where N is the source number:-
	[SOURCE N START]
	source content...
	[SOURCE N END]
	- Integrate citations naturally at the end of sentences or clauses as appropriate.
	For example, "The Eiffel Tower is one of the most visited landmarks in the world[1]."
	- [IMPORTANT] If applicable, use multiple sources for a single detail, such as, "Paris is a cultural hub, attracting millions of visitors annually[1][2]."
	DO NOT use two numbers in the same citation marker, e.g., [1,2] is NOT valid.
	- Always prioritize credibility and accuracy by linking all statements back to their respective context sources.
	- Avoid citing unsupported assumptions or personal interpretations; if no source supports a statement, clearly indicate the limitation.

	### Special Instructions
	- If the query involves technical, historical, or complex topics, provide detailed background and explanatory sections to ensure clarity.
	- If the user provides vague input or if relevant information is missing, explain what additional details might help refine the search.
	- If the context contains any user-provided files and/or links, ensure to give higher priority to those sources when crafting the response.
	- If no relevant information is found, say:
	"Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?"
	Be transparent about limitations and suggest alternatives or ways to reframe the query.

	### User instructions
	- These instructions are shared to you by the user as part of the query itself.
	- You will have to follow them and give them higher priority than the above instructions.
	- If the user has provided specific instructions or preferences, incorporate them into your response while adhering to the overall guidelines.
	- If no instructions are provided, follow the general guidelines and instructions above.

	### Example Output
	- Begin with a brief introduction summarizing the event or query topic.
	- Follow with detailed sections under clear headings, covering all aspects of the query if possible.
	- Provide explanations or historical context as needed to enhance understanding.
	- End with a conclusion or overall perspective if relevant.

	Context:
	{context}

	Query:
	{query}

	Current date & time in ISO format (UTC timezone): {date}"""

	        # Fill the placeholders once, after the template is chosen. Only the
	        # context-free template lacks a {context} slot, so the variable is
	        # supplied only when a context was actually given.
	        variables = {
	            "query": query,
	            "date": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
	        }
	        if context is not None:
	            variables["context"] = context

	        prompt = ChatPromptTemplate.from_template(template)
	        messages = prompt.format_messages(**variables)

	        # No try/except here: the previous handler only re-raised the same
	        # exception, so letting it propagate directly is equivalent.
	        async for chunk in llm.astream(messages):
	            yield chunk.content

	    @with_api_manager()
	    async def summarize(
	        self,
	        query,
	        content,
	        model_name="minishlab/potion-base-8M",
	        max_chunk_length=1000,
	        max_tokens_allowed=None,
	        overlap=200,
	        *,
	        llm
	    ):
	        """Summarize ``content`` with respect to ``query`` and return plain text.

	        When ``max_tokens_allowed`` is truthy and ``content`` exceeds it (as
	        counted by ``self.model.get_num_tokens``), the content is first reduced
	        with ``LateChunker.chunker`` before being sent to the model.

	        Args:
	            query: The question the summary should serve.
	            content: Document text to summarize.
	            model_name: Model identifier passed to ``LateChunker``.
	            max_chunk_length: Forwarded to ``LateChunker.chunker``.
	            max_tokens_allowed: Token budget; falsy disables the length check.
	            overlap: Forwarded to ``LateChunker.chunker``.
	            llm: Chat model exposing ``ainvoke(messages)``.

	        Returns:
	            The model response's ``content`` with surrounding whitespace stripped.
	        """
	        if max_tokens_allowed:
	            content_tokens = self.model.get_num_tokens(content)

	            if content_tokens > max_tokens_allowed:
	                print("Content is too long, applying late chunking...")
	                # Build the chunker only when it is needed so short documents
	                # do not pay for constructing it.
	                late_chunker = LateChunker(model_name=model_name)
	                content = await late_chunker.chunker(
	                    text=content,
	                    query=query,
	                    max_chunk_length=max_chunk_length,
	                    max_tokens=max_tokens_allowed,
	                    overlap=overlap
	                )

	        template = """You are an expert at summarizing long documents.
	Your task is to create a concise but detailed summary of documents that ultimately lead to detailed and precise answers to the queries.

	Rules:
	1. The summary should be concise but detailed, precise and accurate.
	2. Focus on extracting key information, facts, and data that are directly relevant to the query.
	3. Include specific details, numbers, and quotes when they are important.
	4. Ensure that your summary preserves the original meaning and context of the information.

	Your response should ONLY be the detailed summary of documents in plain text without any formatting.

	Query:
	{query}

	Document:
	{content}"""
	        prompt = ChatPromptTemplate.from_template(template)
	        messages = prompt.format_messages(content=content, query=query)
	        response = await llm.ainvoke(messages)
	        return response.content.strip()

	    @with_api_manager()
	    async def get_excerpts(
	        self,
	        answer_text,
	        source_docs,
	        *,
	        llm
	    ):
	        """Ask the model for verbatim source excerpts backing each cited statement.

	        Args:
	            answer_text: Generated answer containing ``[N]`` citation markers.
	            source_docs: Source text delimited by ``[SOURCE N START]`` /
	                ``[SOURCE N END]`` markers, as described in the prompt.
	            llm: Chat model exposing ``ainvoke(messages)``.

	        Returns:
	            The raw, stripped model output. The prompt requests a Python-list
	            literal, but the text is returned unparsed and unvalidated.
	        """
	        # Doubled braces ({{ }}) are literal braces in the prompt template;
	        # single braces mark the two substitution slots at the end.
	        template = """You are an expert at generating excerpts from long documents.
	Your task is to find and extract the most relevant, contiguous sentence(s) or short passage from the Source Documents that directly supports the Answer Text.

	The Source Documents are formatted with markers like [SOURCE N START] and [SOURCE N END], where N is the source number.
	The Answer Text uses citation markers like [N], where N directly corresponds to the source number N in the Source Documents.
	In case of multiple citations, the Answer Text's citation markers will be like [N][M][...etc] (or in some cases, [N, M, ...etc]).

	[IMPORTANT] Rules:
	1. You must carefully read and analyse the Answer Text and the Source Documents.
	2. The excerpts should be concise but detailed, precise and accurate.
	3. Focus on extracting key information, facts, and data that are directly relevant to the answer.
	4. Include specific details, numbers, and quotes when they are important.
	5. Ensure the excerpts are verbatim and extracted directly from the context without any paraphrasing or alteration.
	6. Your output should be a valid python list as shown in the output format below.
	7. If you cannot find any relevant excerpts, say "Excerpt not found".

	Output Format:
	[
	{{<statement 1>: {{<source number>: <extracted excerpt 1>,
	<source number>: <extracted excerpt 2>,
	and so on...}}
	}},
	{{<statement 2>: {{<source number>: <extracted excerpt 1>,
	<source number>: <extracted excerpt 2>,
	and so on...}}
	}},
	...and so on
	]

	Example Output:
	[
	{{"The Treaty of Waitangi is a foundational document in New Zealand's history.": {{
	1: "The Treaty of Waitangi, signed in 1840, is considered the founding document of New Zealand."
	}}
	}},
	{{"Signed in 1840, the principles of the Treaty are often debated.": {{
	1: "The Treaty of Waitangi, signed in 1840, is considered the founding document of New Zealand.",
	2: "The principles of the Treaty are often debated in legal and political contexts."
	}}
	}},
	{{"The Treaty can arguably lead to a civil war in New Zealand.": {{
	"NA": "Excerpt not found"
	}}
	}}
	]

	Source Documents:
	{source_docs}

	Answer Text:
	{answer_text}"""

	        prompt = ChatPromptTemplate.from_template(template)
	        messages = prompt.format_messages(answer_text=answer_text, source_docs=source_docs)
	        response = await llm.ainvoke(messages)
	        return response.content.strip()

	if __name__ == "__main__":
	    # Manual smoke test: crawl one page and dump the returned HTML to stdout.
	    import asyncio
	    from src.crawl.crawler import Crawler

	    reasoner = Reasoner()
	    crawler = Crawler()

	    target_url = "https://www.parliament.nz/en/pb/sc/make-a-submission/document/54SCJUST_SCF_227E6D0B-E632-42EB-CFFE-08DCFEB826C6/principles-of-the-treaty-of-waitangi-bill"
	    crawl_session = crawler.create_session()

	    # crawl_with_retry is a coroutine, so it is driven to completion here.
	    crawl_job = crawler.crawl_with_retry(
	        target_url,
	        session_id=crawl_session,
	        rotate_proxy=False,
	        return_html=True,
	    )
	    page_html = asyncio.run(crawl_job)
	    print(page_html)