Spaces:
Sleeping
Sleeping
import logging
import re
import time
from queue import Empty, SimpleQueue
from threading import Thread

from dotenv import load_dotenv
from langchain.callbacks.base import BaseCallbackHandler
# Sentinel pushed onto the token queue to signal that generation has finished.
job_done = object()


class StreamingGradioCallbackHandler(BaseCallbackHandler):
    """Callback handler that forwards streamed LLM tokens to a queue.

    Only works with LLMs that support streaming. Each new token is put on
    ``q``; the module-level ``job_done`` sentinel is put on ``q`` when the
    run ends or errors, so a consumer knows when to stop reading.
    """

    def __init__(self, q):
        """Store the queue that will receive tokens and the end sentinel."""
        self.q = q

    def on_llm_start(self, serialized, prompts, **kwargs) -> None:
        """Run when LLM starts running: discard anything still queued."""
        while True:
            try:
                self.q.get(block=False)
            except Empty:
                # A non-blocking get on an empty queue raises ``queue.Empty``;
                # that means the queue is drained.
                break

    def on_llm_new_token(self, token, **kwargs) -> None:
        """Run on new LLM token. Only available when streaming is enabled."""
        self.q.put(token)

    def on_llm_end(self, response, **kwargs) -> None:
        """Run when LLM ends running: signal completion to the consumer."""
        self.q.put(job_done)

    def on_llm_error(self, error, **kwargs) -> None:
        """Run when LLM errors: signal completion so the consumer stops."""
        self.q.put(job_done)
def add_gradio_streaming(llm):
    """Attach a streaming callback handler to ``llm``.

    A new ``SimpleQueue`` is created and wrapped in a
    ``StreamingGradioCallbackHandler``, which replaces ``llm.callbacks``.

    Returns:
        A ``(llm, q)`` tuple: the same ``llm`` object and the queue that the
        handler writes tokens to.
    """
    q = SimpleQueue()
    # NOTE: this overwrites any callbacks previously set on ``llm``.
    llm.callbacks = [StreamingGradioCallbackHandler(q)]
    return llm, q
def _find_stream_queue(llm):
    """Return the token queue of the streaming handler attached to ``llm``."""
    for handler in getattr(llm, "callbacks", None) or []:
        if isinstance(handler, StreamingGradioCallbackHandler):
            return handler.q
    # Legacy fallback: earlier versions read a module-level ``q``.
    legacy_queue = globals().get("q")
    if legacy_queue is not None:
        return legacy_queue
    raise ValueError(
        "No streaming queue available: pass `q` explicitly or call "
        "add_gradio_streaming(llm) first."
    )


def gradio_stream(llm, prompt, q=None):
    """Run ``llm.predict`` in a background thread and yield the growing text.

    Args:
        llm: Object exposing ``predict(text=...)``; its streaming handler is
            expected to push tokens and then ``job_done`` onto the queue.
        prompt: Prompt passed to ``llm.predict`` as ``text``.
        q: Queue to read tokens from. When omitted, the queue of the
            ``StreamingGradioCallbackHandler`` found in ``llm.callbacks``
            is used.

    Yields:
        The text accumulated so far, once per received token.

    Raises:
        ValueError: If no queue is given and none can be found on ``llm``.
    """
    if q is None:
        q = _find_stream_queue(llm)

    thread = Thread(target=llm.predict, kwargs={"text": prompt})
    thread.start()

    text = ""
    while True:
        next_token = q.get(block=True)  # Blocks until an input is available
        if next_token is job_done:
            break
        text += next_token
        # Short pause before each yield (presumably to pace UI refreshes).
        time.sleep(0.03)
        yield text
    thread.join()
def get_source_link(metadata):
    """Build a link to the source page: ``file_url`` plus ``#page=N``.

    ``N`` is ``metadata["content_page_number"] + 1``.
    """
    base_url = metadata["file_url"]
    page_number = metadata["content_page_number"] + 1
    return base_url + "#page={}".format(page_number)
def _render_source_card(i, heading, passage, source_type, score, link=None):
    """Render one source card; the link icon is added only when ``link`` is set."""
    link_html = ""
    if link is not None:
        # The href value is always quoted so URLs containing spaces stay intact.
        link_html = f"""
        <a href="{link}" target="_blank">
            <span role="img" aria-label="Open PDF">🔗</span>
        </a>"""
    return f"""
<div class="card" id="doc{i}">
    <div class="card-content">
        <h2>Doc {i} - {heading}</h2>
        <p>{passage}</p>
    </div>
    <div class="card-footer">
        <span>{source_type}</span>
        <span>Relevance Score : {round(100*score,1)}%</span>{link_html}
    </div>
</div>
"""


def make_html_source(source, i, score, config):
    """Return the HTML card describing a retrieved source document.

    Args:
        source: Object with ``metadata`` (a mapping) and ``page_content``.
        i: Document number, used in the heading and the ``doc{i}`` element id.
        score: Relevance score; displayed as ``round(100 * score, 1)`` percent.
        config: Mapping; ``config["passage_preprompt"]`` is stripped from the
            passage for sources that are neither "AFP" nor "Presse".

    The layout depends on ``metadata["file_source_type"]``:
      * "AFP": heading shows the file type; never linked.
      * "Presse": heading shows the publisher; linked unless ``file_url`` is
        the string "none".
      * anything else: heading shows the 1-based page number; linked (via
        ``get_source_link``) when ``file_url`` is truthy.
    """
    meta = source.metadata
    source_type = meta["file_source_type"]
    passage = source.page_content
    link = None

    if source_type == "AFP":
        subtitle = f"{meta['file_type']} AFP"
    elif source_type == "Presse":
        subtitle = f"{meta['file_publisher']}"
        if meta["file_url"] != "none":
            link = meta["file_url"]
    else:
        if meta["file_url"]:
            link = get_source_link(meta)
        subtitle = f"Page {meta['content_page_number'] + 1}"
        passage = passage.replace(config["passage_preprompt"], "")

    heading = f"{meta['file_title']} - {subtitle}"
    return _render_source_card(i, heading, passage, source_type, score, link)
# Matches a document citation such as "[Doc 1]" or "(doc 2, doc 3)". The
# opening bracket is optional; the closing character must be "]", ")" or "?".
# NOTE(review): the literal "?" inside the final character class looks
# unintended (an optional closer may have been meant) - confirm before changing.
_DOC_REF_PATTERN = re.compile(
    r"[\[(]?(Doc\s?\d+(?:,\s?Doc\s?\d+)*|doc\s?\d+(?:,\s?doc\s?\d+)*|Doc\s\d+)[\])?]"
)


def parse_output_llm_with_sources(output):
    """Replace document citations in ``output`` with superscript anchor links.

    Each cited number ``N`` becomes a link to ``#docN``; a citation listing
    several documents produces one link per document. Text outside citations
    is returned unchanged.

    Args:
        output: Text that may contain citations like ``[Doc 1, Doc 2]``.

    Returns:
        The text with every matched citation replaced by HTML links.
    """
    # With one capturing group, ``split`` alternates plain text (even indices)
    # and captured citations (odd indices). Relying on the position rather
    # than on the text keeps ordinary words starting with "Doc" untouched.
    pieces = _DOC_REF_PATTERN.split(output)
    rendered = []
    for index, piece in enumerate(pieces):
        if index % 2 == 0:
            rendered.append(piece)
            continue
        numbers = [ref.lower().replace("doc", "").strip() for ref in piece.split(",")]
        rendered.append(
            "".join(
                f"""<a href="#doc{number}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{number}</sup></span></a>"""
                for number in numbers
            )
        )
    return "".join(rendered)
def clear_text_box(textbox):
    """Return an empty string, whatever ``textbox`` currently holds."""
    emptied = ""
    return emptied
def add_text(chatbot, text):
    """Append ``text`` to the chat history as a message with no reply yet.

    Returns:
        A ``(history, text)`` tuple where ``history`` is a new list: the
        previous entries followed by ``(text, None)``. The input list is
        not modified.
    """
    pending_turn = (text, None)
    return chatbot + [pending_turn], text
def init_env():
    """Load environment variables via ``load_dotenv``, on a best-effort basis.

    A failure is logged as a warning instead of being raised, so callers can
    continue with whatever environment is already set.
    """
    try:
        load_dotenv()
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        logging.getLogger(__name__).warning(
            "Could not load environment from .env file", exc_info=True
        )