Spaces:

mobinln
/

pdf_qa

Sleeping

File size: 1,755 Bytes

3af157b

import streamlit as st
import pathlib

from huggingface_hub import hf_hub_download
from langchain_community.llms import LlamaCpp
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


@st.cache_resource()
def load_llm(repo_id, filename):
    """Download a model file from the Hugging Face Hub and load it with LlamaCpp.

    The function is wrapped in ``st.cache_resource``; per Streamlit's
    documentation the returned object should be reused for repeated calls
    with the same arguments instead of being rebuilt.

    Args:
        repo_id: Hub repository identifier passed to ``hf_hub_download``.
        filename: Name of the file inside that repository to download.

    Returns:
        The constructed ``LlamaCpp`` instance.
    """
    # Downloads land in a local "models" directory, created on first use.
    download_dir = pathlib.Path("models")
    download_dir.mkdir(exist_ok=True)

    local_model_file = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        local_dir=download_dir,
    )

    # All constructor settings gathered in one place; repo_id and filename
    # are forwarded alongside the local path exactly as before.
    llama_options = {
        "model_path": local_model_file,
        "repo_id": repo_id,
        "filename": filename,
        "verbose": False,
        "use_mmap": True,
        "use_mlock": True,
        "n_threads": 4,
        "n_threads_batch": 4,
        "n_ctx": 8000,
    }
    model = LlamaCpp(**llama_options)

    print(f"{repo_id} loaded successfully. ✅")
    return model


# Answer a question through a retrieval chain (single blocking call, no streaming)
def response_generator(llm, messages, question, retriever):
    """Build a retrieval + stuff-documents chain and run it once for *question*.

    Args:
        llm: Language model handed to ``create_stuff_documents_chain``.
        messages: Accepted for interface compatibility; not used in this
            function.
        question: Text supplied to the chain under the ``"input"`` key.
        retriever: Retriever handed to ``create_retrieval_chain``.

    Returns:
        Whatever ``rag_chain.invoke`` returns, unmodified (per the LangChain
        docs this is expected to be a dict that includes the answer and the
        retrieved context -- confirm against the installed version).
    """
    instructions = (
        "You are an assistant for question-answering tasks. Use the "
        "following pieces of retrieved context to answer the question. "
        "If you don't know the answer, say that you don't know. "
        "Use three sentences maximum and keep the answer concise."
    )
    # The {context} placeholder follows the instructions after a blank line.
    system_template = instructions + "\n\n" + "{context}"

    chat_prompt = ChatPromptTemplate.from_messages(
        [("system", system_template), ("user", "{input}")]
    )

    rag_chain = create_retrieval_chain(
        retriever,
        create_stuff_documents_chain(llm, chat_prompt),
    )

    return rag_chain.invoke({"input": question})