File size: 1,755 Bytes
3af157b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
import streamlit as st
import pathlib
from huggingface_hub import hf_hub_download
from langchain_community.llms import LlamaCpp
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
@st.cache_resource()
def load_llm(repo_id, filename):
    """Download a model file from the Hugging Face Hub and load it with LlamaCpp.

    The file is fetched into a local ``models`` directory (created on demand)
    and the resulting path is handed to ``LlamaCpp``. The function is wrapped
    in ``st.cache_resource`` so Streamlit can reuse the loaded model.

    Args:
        repo_id: Hugging Face Hub repository identifier holding the model.
        filename: Name of the model file inside that repository.

    Returns:
        The constructed ``LlamaCpp`` instance.
    """
    # Make sure the download target exists before fetching anything.
    download_dir = pathlib.Path("models")
    download_dir.mkdir(exist_ok=True)

    local_model_file = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        local_dir=download_dir,
    )

    # NOTE(review): repo_id, filename and n_threads_batch are forwarded exactly
    # as in the original call; confirm how LlamaCpp treats keyword arguments
    # that are not among its declared fields.
    llama_settings = {
        "model_path": local_model_file,
        "repo_id": repo_id,
        "filename": filename,
        "verbose": False,
        "use_mmap": True,
        "use_mlock": True,
        "n_threads": 4,
        "n_threads_batch": 4,
        "n_ctx": 8000,
    }
    model = LlamaCpp(**llama_settings)

    print(f"{repo_id} loaded successfully. ✅")
    return model
# Answer a question with a retrieval-augmented (RAG) chain; the whole result is returned at once, not streamed
def response_generator(llm, messages, question, retriever):
    """Run a retrieval-augmented question-answering chain for one question.

    A system prompt containing a ``{context}`` placeholder and a user turn
    containing ``{input}`` are combined into a chat prompt, wired into a
    stuff-documents chain on top of ``llm``, and wrapped in a retrieval chain
    that uses ``retriever``. The chain is invoked once with ``question``.

    Args:
        llm: Language model passed to ``create_stuff_documents_chain``.
        messages: Not used by this function; kept for the existing interface.
        question: Text supplied to the chain under the ``"input"`` key.
        retriever: Retriever passed to ``create_retrieval_chain``.

    Returns:
        Whatever ``rag_chain.invoke`` returns for the given question.
    """
    instructions = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the "
        "answer concise."
    )
    # The retrieved documents are substituted into the {context} placeholder.
    system_prompt = instructions + "\n\n" + "{context}"

    chat_turns = [
        ("system", system_prompt),
        ("user", "{input}"),
    ]
    prompt = ChatPromptTemplate.from_messages(chat_turns)

    answer_chain = create_stuff_documents_chain(llm, prompt)
    rag_chain = create_retrieval_chain(retriever, answer_chain)
    return rag_chain.invoke({"input": question})
|