|
from datetime import datetime, timezone |
|
from langchain.prompts import ChatPromptTemplate |
|
from langchain_core.prompts import ChatPromptTemplate |
|
from src.utils.api_key_manager import APIKeyManager, with_api_manager |
|
from src.query_processing.late_chunking.late_chunker import LateChunker |
|
|
|
class Reasoner:
    """LLM-backed helper that answers queries, summarizes documents and
    extracts supporting excerpts.

    Every public coroutine takes a keyword-only ``llm`` argument and is wrapped
    by ``with_api_manager``.
    NOTE(review): ``llm`` is presumably injected by that decorator, so callers
    do not pass it themselves -- confirm against ``src.utils.api_key_manager``.
    """

    # strftime pattern used for the ``{date}`` placeholder of the answer prompts.
    _DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

    # Prompt used by ``answer`` when no context is supplied (query only).
    _ANSWER_TEMPLATE_NO_CONTEXT = \
"""You are an AI model skilled in web search and crafting detailed, engaging, and well-structured answers.
You excel at summarizing web pages and extracting relevant information to create professional, blog-style responses.

Your task is to provide answers that are:
- **Informative and relevant**: Thoroughly address the user's query.
- **Well-structured**: Include clear headings and subheadings, and use a professional tone to present information concisely and logically.
- **Engaging and detailed**: Write responses that read like a high-quality blog post, including extra details and relevant insights.
- **Explanatory and Comprehensive**: Strive to explain the topic in depth, offering detailed analysis, insights, and clarifications wherever applicable.

### Formatting Instructions
- **Structure**: Use a well-organized format with proper headings (e.g., "## Example heading 1" or "## Example heading 2").
Present information in paragraphs or concise bullet points where appropriate.
- **Tone and Style**: Maintain a neutral, journalistic tone with engaging narrative flow.
Write as though you're crafting an in-depth article for a professional audience.
- **Markdown Usage**: Format your response with Markdown for clarity. Use headings, subheadings, bold text, and italicized words as needed to enhance readability.
- **Length and Depth**: Provide comprehensive coverage of the topic. Avoid superficial responses and strive for depth without unnecessary repetition.
Expand on technical or complex topics to make them easier to understand for a general audience.
- **No main heading/title**: Start your response directly with the introduction unless asked to provide a specific title.
- **Conclusion or Summary**: Include a concluding paragraph that synthesizes the provided information or suggests potential next steps, where appropriate.

### Special Instructions
- If the query involves technical, historical, or complex topics, provide detailed background and explanatory sections to ensure clarity.
- If the user provides vague input or if relevant information is missing, explain what additional details might help refine the search.
- If no relevant information is found, say:
"Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?"
Be transparent about limitations and suggest alternatives or ways to reframe the query.

### User instructions
- These instructions are shared to you by the user as part of the query itself.
- You will have to follow them and give them higher priority than the above instructions.
- If the user has provided specific instructions or preferences, incorporate them into your response while adhering to the overall guidelines.
- If no instructions are provided, follow the general guidelines and instructions above.

### Example Output
- Begin with a brief introduction summarizing the event or query topic.
- Follow with detailed sections under clear headings, covering all aspects of the query if possible.
- Provide explanations or historical context as needed to enhance understanding.
- End with a conclusion or overall perspective if relevant.

Query:
{query}

Current date & time in ISO format (UTC timezone): {date}"""

    # Prompt used by ``answer`` for "basic" queries whose context carries the
    # "[USER PROVIDED" marker: context is included, citations are not required.
    _ANSWER_TEMPLATE_USER_CONTEXT = \
"""You are an AI model skilled in web search and crafting detailed, engaging, and well-structured answers.
You excel at summarizing web pages and extracting relevant information to create professional, blog-style responses.

Your task is to provide answers that are:
- **Informative and relevant**: Thoroughly address the user's query.
- **Well-structured**: Include clear headings and subheadings, and use a professional tone to present information concisely and logically.
- **Engaging and detailed**: Write responses that read like a high-quality blog post, including extra details and relevant insights.
- **Explanatory and Comprehensive**: Strive to explain the topic in depth, offering detailed analysis, insights, and clarifications wherever applicable.

### Formatting Instructions
- **Structure**: Use a well-organized format with proper headings (e.g., "## Example heading 1" or "## Example heading 2").
Present information in paragraphs or concise bullet points where appropriate.
- **Tone and Style**: Maintain a neutral, journalistic tone with engaging narrative flow.
Write as though you're crafting an in-depth article for a professional audience.
- **Markdown Usage**: Format your response with Markdown for clarity. Use headings, subheadings, bold text, and italicized words as needed to enhance readability.
- **Length and Depth**: Provide comprehensive coverage of the topic. Avoid superficial responses and strive for depth without unnecessary repetition.
Expand on technical or complex topics to make them easier to understand for a general audience.
- **No main heading/title**: Start your response directly with the introduction unless asked to provide a specific title.
- **Conclusion or Summary**: Include a concluding paragraph that synthesizes the provided information or suggests potential next steps, where appropriate.

### Special Instructions
- If the query involves technical, historical, or complex topics, provide detailed background and explanatory sections to ensure clarity.
- If the user provides vague input or if relevant information is missing, explain what additional details might help refine the search.
- All user-provided files and/or links must be given higher priority to those sources when crafting the response.
- If no relevant information is found, say:
"Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?"
Be transparent about limitations and suggest alternatives or ways to reframe the query.

### User instructions
- These instructions are shared to you by the user as part of the query itself.
- You will have to follow them and give them higher priority than the above instructions.
- If the user has provided specific instructions or preferences, incorporate them into your response while adhering to the overall guidelines.
- If no instructions are provided, follow the general guidelines and instructions above.

### Example Output
- Begin with a brief introduction summarizing the event or query topic.
- Follow with detailed sections under clear headings, covering all aspects of the query if possible.
- Provide explanations or historical context as needed to enhance understanding.
- End with a conclusion or overall perspective if relevant.

Context:
{context}

Query:
{query}

Current date & time in ISO format (UTC timezone): {date}"""

    # Prompt used by ``answer`` in every other case: context is included and
    # each statement must carry [number] citations into that context.
    _ANSWER_TEMPLATE_CITED = \
"""You are an AI model skilled in web search and crafting detailed, engaging, and well-structured answers.
You excel at summarizing web pages and extracting relevant information to create professional, blog-style responses.

Your task is to provide answers that are:
- **Informative and relevant**: Thoroughly address the user's query using the given context.
- **Well-structured**: Include clear headings and subheadings, and use a professional tone to present information concisely and logically.
- **Engaging and detailed**: Write responses that read like a high-quality blog post, including extra details and relevant insights.
- **Cited and credible**: Use inline citations with [number] notation to refer to the context source(s) for each fact or detail included.
- **Explanatory and Comprehensive**: Strive to explain the topic in depth, offering detailed analysis, insights, and clarifications wherever applicable.

### Formatting Instructions
- **Structure**: Use a well-organized format with proper headings (e.g., "## Example heading 1" or "## Example heading 2").
Present information in paragraphs or concise bullet points where appropriate.
- **Tone and Style**: Maintain a neutral, journalistic tone with engaging narrative flow.
Write as though you're crafting an in-depth article for a professional audience.
- **Markdown Usage**: Format your response with Markdown for clarity. Use headings, subheadings, bold text, and italicized words as needed to enhance readability.
- **Length and Depth**: Provide comprehensive coverage of the topic. Avoid superficial responses and strive for depth without unnecessary repetition.
Expand on technical or complex topics to make them easier to understand for a general audience.
- **No main heading/title**: Start your response directly with the introduction unless asked to provide a specific title.
- **Conclusion or Summary**: Include a concluding paragraph that synthesizes the provided information or suggests potential next steps, where appropriate.

### [IMPORTANT] Citation Requirements
- Cite every single fact, statement, or sentence using [number] notation corresponding to the source from the provided `context`.
Each source in the `context` will be in the following format, where N is the source number:-
[SOURCE N START]
source content...
[SOURCE N END]
- Integrate citations naturally at the end of sentences or clauses as appropriate.
For example, "The Eiffel Tower is one of the most visited landmarks in the world[1]."
- [IMPORTANT] If applicable, use multiple sources for a single detail, such as, "Paris is a cultural hub, attracting millions of visitors annually[1][2]."
*DO NOT* use two numbers in the same citation marker, e.g., [1,2] is *NOT* valid.
- Always prioritize credibility and accuracy by linking all statements back to their respective context sources.
- Avoid citing unsupported assumptions or personal interpretations; if no source supports a statement, clearly indicate the limitation.

### Special Instructions
- If the query involves technical, historical, or complex topics, provide detailed background and explanatory sections to ensure clarity.
- If the user provides vague input or if relevant information is missing, explain what additional details might help refine the search.
- If the context contains any user-provided files and/or links, ensure to give higher priority to those sources when crafting the response.
- If no relevant information is found, say:
"Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?"
Be transparent about limitations and suggest alternatives or ways to reframe the query.

### User instructions
- These instructions are shared to you by the user as part of the query itself.
- You will have to follow them and give them higher priority than the above instructions.
- If the user has provided specific instructions or preferences, incorporate them into your response while adhering to the overall guidelines.
- If no instructions are provided, follow the general guidelines and instructions above.

### Example Output
- Begin with a brief introduction summarizing the event or query topic.
- Follow with detailed sections under clear headings, covering all aspects of the query if possible.
- Provide explanations or historical context as needed to enhance understanding.
- End with a conclusion or overall perspective if relevant.

Context:
{context}

Query:
{query}

Current date & time in ISO format (UTC timezone): {date}"""

    # Prompt used by ``summarize``.
    _SUMMARY_TEMPLATE = \
"""You are an expert at summarizing long documents.
Your task is to create a concise but detailed summary of documents that ultimately lead to detailed and precise answers to the queries.

Rules:
1. The summary should be concise but detailed, precise and accurate.
2. Focus on extracting key information, facts, and data that are directly relevant to the query.
3. Include specific details, numbers, and quotes when they are important.
4. Ensure that your summary preserves the original meaning and context of the information.

Your response should ONLY be the detailed summary of documents in plain text without any formatting.

Query:
{query}

Document:
{content}"""

    # Prompt used by ``get_excerpts``. Doubled braces are literal braces once
    # the template is formatted; only {source_docs}/{answer_text} are filled in.
    _EXCERPTS_TEMPLATE = \
"""You are an expert at generating excerpts from long documents.
Your task is to find and extract the most relevant, contiguous sentence(s) or short passage from the Source Documents that directly supports the Answer Text.

The Source Documents are formatted with markers like [SOURCE N START] and [SOURCE N END], where N is the source number.
The Answer Text uses citation markers like [N], where N directly corresponds to the source number N in the Source Documents.
In case of multiple citations, the Answer Text's citation markers will be like [N][M][...etc] (or in some cases, [N, M, ...etc]).

[IMPORTANT] Rules:
1. You must carefully read and analyse the Answer Text and the Source Documents.
2. The excerpts should be concise but detailed, precise and accurate.
3. Focus on extracting key information, facts, and data that are directly relevant to the answer.
4. Include specific details, numbers, and quotes when they are important.
5. Ensure the excerpts are verbatim and extracted directly from the context without any paraphrasing or alteration.
6. Your output should be a valid python list as shown in the output format below.
7. If you cannot find any relevant excerpts, say "Excerpt not found".

Output Format:
[
{{<statement 1>: {{<source number>: <extracted excerpt 1>,
<source number>: <extracted excerpt 2>,
and so on...}}
}},
{{<statement 2>: {{<source number>: <extracted excerpt 1>,
<source number>: <extracted excerpt 2>,
and so on...}}
}},
...and so on
]

Example Output:
[
{{"The Treaty of Waitangi is a foundational document in New Zealand's history.": {{
1: "The Treaty of Waitangi, signed in 1840, is considered the founding document of New Zealand."
}}
}},
{{"Signed in 1840, the principles of the Treaty are often debated.": {{
1: "The Treaty of Waitangi, signed in 1840, is considered the founding document of New Zealand.",
2: "The principles of the Treaty are often debated in legal and political contexts."
}}
}},
{{"The Treaty can arguably lead to a civil war in New Zealand.": {{
"NA": "Excerpt not found"
}}
}}
]

Source Documents:
{source_docs}

Answer Text:
{answer_text}"""

    def __init__(self):
        """Create the API key manager and obtain a model handle from it."""
        self.manager = APIKeyManager()
        # Kept on the instance because ``summarize`` uses it to count tokens.
        self.model = self.manager.get_llm()

    @with_api_manager(streaming=True)
    async def answer(
        self,
        query,
        context=None,
        query_type="general",
        *,
        llm
    ):
        """Stream an answer to ``query`` as an async generator of text pieces.

        The prompt is chosen as follows:

        * ``context is None``: query-only prompt.
        * ``query_type == "basic"`` and ``"[USER PROVIDED"`` occurs in
          ``context``: prompt with context but without citation requirements.
        * otherwise: prompt with context and mandatory ``[number]`` citations.

        Args:
            query: The user's query, inserted into the prompt verbatim.
            context: Optional source material; ``None`` selects the query-only
                prompt.
            query_type: Only the value ``"basic"`` changes the prompt choice.
            llm: Chat model exposing ``astream``; keyword-only.

        Yields:
            The ``content`` attribute of each chunk produced by
            ``llm.astream``.

        Raises:
            Whatever ``llm.astream`` raises; nothing is caught here.
        """
        if context is None:
            template = self._ANSWER_TEMPLATE_NO_CONTEXT
            variables = {"query": query}
        elif query_type == "basic" and "[USER PROVIDED" in context:
            template = self._ANSWER_TEMPLATE_USER_CONTEXT
            variables = {"context": context, "query": query}
        else:
            template = self._ANSWER_TEMPLATE_CITED
            variables = {"context": context, "query": query}

        # Formatting is identical for every branch, so it is done once here.
        prompt = ChatPromptTemplate.from_template(template)
        messages = prompt.format_messages(
            date=datetime.now(timezone.utc).strftime(self._DATE_FORMAT),
            **variables
        )

        # No try/except: the previous handler only re-raised the same
        # exception, so letting errors propagate directly is equivalent.
        async for chunk in llm.astream(messages):
            yield chunk.content

    @with_api_manager()
    async def summarize(
        self,
        query,
        content,
        model_name="minishlab/potion-base-8M",
        max_chunk_length=1000,
        max_tokens_allowed=None,
        overlap=200,
        *,
        llm
    ):
        """Summarize ``content`` with respect to ``query``.

        When ``max_tokens_allowed`` is truthy and ``content`` has more tokens
        than that (as counted by ``self.model.get_num_tokens``), the content is
        first replaced by the result of ``LateChunker.chunker`` before being
        sent to the model.

        Args:
            query: The query the summary should serve.
            content: Document text to summarize.
            model_name: Model name handed to ``LateChunker``.
            max_chunk_length: Forwarded to ``LateChunker.chunker``.
            max_tokens_allowed: Token budget; falsy values disable chunking.
            overlap: Forwarded to ``LateChunker.chunker``.
            llm: Chat model exposing ``ainvoke``; keyword-only.

        Returns:
            The model's response content with surrounding whitespace stripped.
        """
        if max_tokens_allowed:
            content_tokens = self.model.get_num_tokens(content)

            if content_tokens > max_tokens_allowed:
                print("Content is too long, applying late chunking...")
                # Build the chunker only on the path that needs it, so content
                # that already fits never pays for constructing it.
                late_chunker = LateChunker(model_name=model_name)
                content = await late_chunker.chunker(
                    text=content,
                    query=query,
                    max_chunk_length=max_chunk_length,
                    max_tokens=max_tokens_allowed,
                    overlap=overlap
                )

        prompt = ChatPromptTemplate.from_template(self._SUMMARY_TEMPLATE)
        messages = prompt.format_messages(content=content, query=query)
        response = await llm.ainvoke(messages)
        return response.content.strip()

    @with_api_manager()
    async def get_excerpts(
        self,
        answer_text,
        source_docs,
        *,
        llm
    ):
        """Ask the model for verbatim source excerpts backing ``answer_text``.

        Args:
            answer_text: Answer containing ``[N]`` citation markers.
            source_docs: Sources delimited by ``[SOURCE N START]`` /
                ``[SOURCE N END]`` markers, as described in the prompt.
            llm: Chat model exposing ``ainvoke``; keyword-only.

        Returns:
            The model's response content with surrounding whitespace stripped.
            The prompt requests a Python-list literal, but the text is returned
            as-is: it is neither parsed nor validated here.
        """
        prompt = ChatPromptTemplate.from_template(self._EXCERPTS_TEMPLATE)
        messages = prompt.format_messages(answer_text=answer_text, source_docs=source_docs)
        response = await llm.ainvoke(messages)
        return response.content.strip()
|
|
|
if __name__ == "__main__":
    # Manual smoke run: crawl one fixed page and print whatever comes back.
    import asyncio

    from src.crawl.crawler import Crawler

    TARGET_URL = "https://www.parliament.nz/en/pb/sc/make-a-submission/document/54SCJUST_SCF_227E6D0B-E632-42EB-CFFE-08DCFEB826C6/principles-of-the-treaty-of-waitangi-bill"

    # The Reasoner is constructed but not used further in this script.
    reasoner = Reasoner()
    crawler = Crawler()

    session_id = crawler.create_session()

    # Create the crawl coroutine first, then drive it to completion.
    crawl_job = crawler.crawl_with_retry(
        TARGET_URL,
        session_id=session_id,
        rotate_proxy=False,
        return_html=True,
    )
    crawl_result = asyncio.run(crawl_job)
    print(crawl_result)
|
|
|
|