Spaces:

ritampatra
/

Document_chatbot

Sleeping

App Files Files Community

ritampatra committed on Sep 22, 2024

Commit

879e1ad

verified ·

1 Parent(s): 5ab0b92

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -49

app.py CHANGED Viewed

@@ -1,84 +1,96 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModel, pipeline
-from langchain.vectorstores import FAISS
-from langchain.document_loaders import PyPDFLoader
-from langchain.chains.question_answering import load_qa_chain
-from langchain.llms import HuggingFaceHub
 import torch
-# Function to load and process the document (PDF)
 def load_document(file):
-    loader = PyPDFLoader(file.name)
-    documents = loader.load()
-    return documents
-# Function to embed documents using Hugging Face model directly
-def embed_documents(documents):
-    # Load tokenizer and model
     tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
     model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")
-    # Get document texts
-    document_texts = [doc.page_content for doc in documents]
-    # Create embeddings for each document
-    embeddings = []
-    for text in document_texts:
-        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
-        with torch.no_grad():
-            model_output = model(**inputs)
-        embedding = model_output.last_hidden_state.mean(dim=1)  # Mean pool the embeddings
-        embeddings.append(embedding.squeeze().numpy())
-    # Store embeddings in FAISS vector store
-    vector_store = FAISS.from_embeddings(embeddings, documents)
-    return vector_store
-# Function to handle chatbot queries
-def chat_with_document(query, vector_store):
-    retriever = vector_store.as_retriever()
-    llm = HuggingFaceHub(repo_id="google/flan-t5-large", model_kwargs={"temperature": 0.2})
-    chain = load_qa_chain(llm, chain_type="stuff")
-    results = retriever.get_relevant_documents(query)
-    answer = chain.run(input_documents=results, question=query)
-    return answer
-# Function to build the Gradio interface
 def chatbot_interface():
-    vector_store = None
-    # Function to handle file upload and document embedding
     def upload_file(file):
-        nonlocal vector_store
-        documents = load_document(file)
-        vector_store = embed_documents(documents)
-        return "Document uploaded and processed. You can now ask questions."
     # Function to handle user queries
     def ask_question(query):
-        if vector_store:
-            return chat_with_document(query, vector_store)
         return "Please upload a document first."
-    # Gradio interface components
     upload = gr.File(label="Upload a PDF document")
     question = gr.Textbox(label="Ask a question about the document")
     answer = gr.Textbox(label="Answer", readonly=True)
-    # Linking the functions to Gradio interface
-    upload_button = gr.Interface(fn=upload_file, inputs=upload, outputs="text")
-    chat_box = gr.Interface(fn=ask_question, inputs=question, outputs=answer)
     # Gradio app layout
     with gr.Blocks() as demo:
         gr.Markdown("# Document Chatbot")
         with gr.Row():
-            upload_button.render()
         with gr.Row():
             question.render()
             answer.render()
-    # Launch the Gradio app
     demo.launch()
 # Start the chatbot interface

 import gradio as gr
+from transformers import AutoTokenizer, AutoModel
+import faiss
+import numpy as np
 import torch
+from PyPDF2 import PdfReader
def load_document(file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file: The PDF source; it is handed to ``PdfReader`` unchanged.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    pdf = PdfReader(file)
    # ``or ""`` keeps the join safe when a page yields nothing (a falsy
    # result) instead of a string, which would break plain concatenation.
    return "".join(page.extract_text() or "" for page in pdf.pages)
# Hugging Face checkpoint used for every embedding in this app.
_EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# Filled on first use so the tokenizer/model are loaded once per process
# instead of once per chunk and once per query.
_EMBEDDING_BACKEND = {}


def _get_embedding_backend():
    """Return the cached ``(tokenizer, model)`` pair, loading it on first use."""
    if not _EMBEDDING_BACKEND:
        tokenizer = AutoTokenizer.from_pretrained(_EMBEDDING_MODEL_NAME)
        model = AutoModel.from_pretrained(_EMBEDDING_MODEL_NAME)
        # Store both together so a failed load never leaves a half-filled cache.
        _EMBEDDING_BACKEND.update(tokenizer=tokenizer, model=model)
    return _EMBEDDING_BACKEND["tokenizer"], _EMBEDDING_BACKEND["model"]


def embed_text(text):
    """Embed *text* as a single mean-pooled vector.

    Args:
        text: The string to embed. It is tokenized with ``truncation=True``,
            so input beyond the tokenizer's limit is cut off.

    Returns:
        A NumPy array: the model's last hidden state averaged over the token
        axis (``dim=1``) with size-1 dimensions squeezed out.
    """
    tokenizer, model = _get_embedding_backend()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    pooled = outputs.last_hidden_state.mean(dim=1)  # mean pooling over tokens
    return pooled.squeeze().numpy()
def initialize_faiss(embedding_size):
    """Create an empty flat L2 FAISS index.

    Args:
        embedding_size: Dimensionality of the vectors the index will hold.

    Returns:
        A new ``faiss.IndexFlatL2`` built for ``embedding_size`` dimensions.
    """
    return faiss.IndexFlatL2(embedding_size)
def add_to_index(index, embeddings):
    """Append embedding vectors to a FAISS index.

    Args:
        index: The index to extend; only its ``add`` method is used.
        embeddings: Array-like of vectors, one vector per row.
    """
    # FAISS works on contiguous float32 data; coercing here lets callers pass
    # lists or float64 arrays without tripping the index's input checks.
    index.add(np.ascontiguousarray(embeddings, dtype=np.float32))
def search_index(index, query_embedding, texts, top_k=3):
    """Return the stored texts closest to a query embedding.

    Args:
        index: The FAISS index to query; only its ``search`` method is used.
        query_embedding: A single query vector.
        texts: Sequence of texts, where position ``i`` corresponds to the
            ``i``-th vector added to ``index``.
        top_k: Maximum number of matches to return.

    Returns:
        A list of at most ``top_k`` entries from ``texts``, in the order the
        index ranked them.
    """
    query = np.asarray([query_embedding], dtype=np.float32)
    _distances, indices = index.search(query, top_k)
    # FAISS pads the result with -1 when the index holds fewer than ``top_k``
    # vectors; indexing ``texts`` with -1 would silently return the last
    # chunk as a bogus match, so out-of-range ids are dropped.
    return [texts[i] for i in indices[0] if 0 <= i < len(texts)]
def process_document(file, chunk_size=512):
    """Load a PDF, split its text into chunks, and index the chunk embeddings.

    Args:
        file: The PDF source, forwarded to ``load_document``.
        chunk_size: Number of characters per chunk (the last chunk may be
            shorter). Defaults to 512, the value previously hard-coded.

    Returns:
        A ``(faiss_index, chunks)`` tuple where ``chunks[i]`` is the text
        whose embedding was added to the index at position ``i``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if no text could be
            extracted from the document (there would be nothing to index).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")

    text = load_document(file)
    chunks = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
    if not chunks:
        # Without this guard np.vstack([]) fails with an opaque
        # "need at least one array to concatenate" error.
        raise ValueError("No extractable text was found in the document.")

    embeddings = np.vstack([embed_text(chunk) for chunk in chunks])
    faiss_index = initialize_faiss(embeddings.shape[1])
    add_to_index(faiss_index, embeddings)
    return faiss_index, chunks
def query_document(query, faiss_index, document_chunks):
    """Answer a query with the document chunks that best match it.

    Args:
        query: The user's question.
        faiss_index: Index holding the chunk embeddings.
        document_chunks: The chunk texts, aligned with the index.

    Returns:
        The matching chunks joined into one string, separated by a blank line.
    """
    matches = search_index(faiss_index, embed_text(query), document_chunks)
    separator = "\n\n"
    return separator.join(matches)
def chatbot_interface():
    """Build and launch the Gradio UI for uploading a PDF and querying it.

    The index and chunk list live in this function's closure, so they are
    shared by every request served by the launched app, and uploading a new
    document replaces the previous one.
    """
    faiss_index = None
    document_chunks = None

    def upload_file(file):
        """Index the uploaded document and report success."""
        nonlocal faiss_index, document_chunks
        faiss_index, document_chunks = process_document(file)
        return "Document uploaded and indexed. You can now ask questions."

    def ask_question(query):
        """Answer from the indexed document, or ask for an upload first."""
        # Compare against None explicitly rather than relying on the
        # truthiness of the index object.
        if faiss_index is None or not document_chunks:
            return "Please upload a document first."
        return query_document(query, faiss_index, document_chunks)

    with gr.Blocks() as demo:
        gr.Markdown("# Document Chatbot")
        with gr.Row():
            upload = gr.File(label="Upload a PDF document")
            # Shows the message returned by upload_file.
            status = gr.Textbox(label="Status", interactive=False)
        with gr.Row():
            question = gr.Textbox(label="Ask a question about the document")
            # ``interactive=False`` is Gradio's way to make a Textbox
            # read-only; ``readonly`` is not a Textbox parameter.
            answer = gr.Textbox(label="Answer", interactive=False)

        # Event listeners must be registered inside the Blocks context, and
        # need explicit inputs/outputs so the handler receives the file and
        # its return value is displayed.
        upload.upload(upload_file, inputs=upload, outputs=status)
        question.submit(ask_question, inputs=question, outputs=answer)

    demo.launch()
 # Start the chatbot interface