Spaces:

ritampatra
/

Document_chatbot

Sleeping

App Files Files Community

ritampatra committed on Sep 22, 2024

Commit

5ab0b92

verified ·

1 Parent(s): c3aa5e6

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -8

app.py CHANGED Viewed

@@ -1,10 +1,10 @@
 import gradio as gr
-from transformers import pipeline
-from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.document_loaders import PyPDFLoader
 from langchain.chains.question_answering import load_qa_chain
 from langchain.llms import HuggingFaceHub
 # Function to load and process the document (PDF)
 def load_document(file):
@@ -12,16 +12,32 @@ def load_document(file):
     documents = loader.load()
     return documents
-# Function to embed the documents using sentence-transformers and store them in FAISS
 def embed_documents(documents):
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
-    vector_store = FAISS.from_documents(documents, embeddings)
     return vector_store
-# Function to handle the chatbot's conversation by querying the document embeddings
 def chat_with_document(query, vector_store):
     retriever = vector_store.as_retriever()
-    llm = HuggingFaceHub(repo_id="google/flan-t5-large", model_kwargs={"temperature":0.2})
     chain = load_qa_chain(llm, chain_type="stuff")
     results = retriever.get_relevant_documents(query)
     answer = chain.run(input_documents=results, question=query)
@@ -61,7 +77,7 @@ def chatbot_interface():
         with gr.Row():
             question.render()
             answer.render()
     # Launch the Gradio app
     demo.launch()

 import gradio as gr
+from transformers import AutoTokenizer, AutoModel, pipeline
 from langchain.vectorstores import FAISS
 from langchain.document_loaders import PyPDFLoader
 from langchain.chains.question_answering import load_qa_chain
 from langchain.llms import HuggingFaceHub
+import torch
 # Function to load and process the document (PDF)
 def load_document(file):
     documents = loader.load()
     return documents
+# Function to embed documents using Hugging Face model directly
 def embed_documents(documents):
     """Embed the documents with all-mpnet-base-v2 and index them in FAISS.

     Args:
         documents: Non-empty list of LangChain documents; each one's
             ``page_content`` is embedded and its ``metadata`` is kept.

     Returns:
         A FAISS vector store holding one mean-pooled vector per document.
         The store also carries the embedder, so ``as_retriever()`` can
         embed incoming queries with the same model.

     Raises:
         ValueError: If ``documents`` is empty (an index cannot be built
             from zero vectors).
     """
     # Local import keeps this function self-contained; the module's
     # top-level imports do not provide the Embeddings base class.
     from langchain.embeddings.base import Embeddings
     if not documents:
         raise ValueError("No documents to embed.")
     # Load tokenizer and model once; the same pair serves documents and queries.
     model_name = "sentence-transformers/all-mpnet-base-v2"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModel.from_pretrained(model_name)
     def encode(text):
         # truncation=True drops tokens beyond the tokenizer's maximum length.
         inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
         with torch.no_grad():
             model_output = model(**inputs)
         # Mean pool the token embeddings into a single vector for the text.
         pooled = model_output.last_hidden_state.mean(dim=1)
         return pooled.squeeze(0).tolist()
     class _MeanPooledEmbeddings(Embeddings):
         # Adapter exposing `encode` through the interface FAISS expects.
         def embed_documents(self, texts):
             return [encode(text) for text in texts]
         def embed_query(self, text):
             return encode(text)
     embedder = _MeanPooledEmbeddings()
     # Get document texts and metadata
     document_texts = [doc.page_content for doc in documents]
     metadatas = [doc.metadata for doc in documents]
     # FAISS.from_embeddings takes (text, vector) pairs plus the embedder,
     # not a bare list of vectors followed by the documents.
     text_embeddings = list(zip(document_texts, embedder.embed_documents(document_texts)))
     # Store embeddings in FAISS vector store
     vector_store = FAISS.from_embeddings(text_embeddings, embedder, metadatas=metadatas)
     return vector_store
+# Function to handle chatbot queries
 def chat_with_document(query, vector_store):
     retriever = vector_store.as_retriever()
+    llm = HuggingFaceHub(repo_id="google/flan-t5-large", model_kwargs={"temperature": 0.2})
     chain = load_qa_chain(llm, chain_type="stuff")
     results = retriever.get_relevant_documents(query)
     answer = chain.run(input_documents=results, question=query)
         with gr.Row():
             question.render()
             answer.render()
     # Launch the Gradio app
     demo.launch()