import gradio as gr
from transformers import AutoTokenizer, AutoModel
import faiss
import numpy as np
import torch
from PyPDF2 import PdfReader
# Load a PDF and extract its text
def load_document(file):
    # gr.File may hand over a path string or a tempfile-like object depending
    # on the Gradio version, so accept both
    pdf = PdfReader(file.name if hasattr(file, "name") else file)
    text = ''
    for page in pdf.pages:
        # extract_text() can return None for image-only pages
        text += page.extract_text() or ''
    return text
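
# Note: this concatenates text across pages with no separator; appending "\n"
# per page is a simple tweak if chunk boundaries end up mattering.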
# Load tokenizer and model once at import time so each embedding call reuses
# them instead of re-downloading the weights
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")
model.eval()

# Embed text using the Hugging Face model
def embed_text(text):
    # Tokenize and embed text (truncated to the model's maximum length)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling over tokens
    return embeddings.squeeze().numpy()
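
# Sketch: all-mpnet-base-v2 has a hidden size of 768, so a single string embeds
# to a (768,) float32 vector, e.g.:
#   vec = embed_text("hello world")  # vec.shape == (768,)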
# Initialize a FAISS index for exact L2 (Euclidean) search
def initialize_faiss(embedding_size):
    index = faiss.IndexFlatL2(embedding_size)
    return index
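
# Sketch: IndexFlatL2 does brute-force exact search, which is fine at this
# scale; for large corpora an approximate index such as faiss.IndexIVFFlat
# would be a common swap.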
# Add document embeddings to the FAISS index
def add_to_index(index, embeddings):
    # FAISS expects a contiguous float32 array of shape (n, embedding_size)
    index.add(np.ascontiguousarray(embeddings, dtype=np.float32))
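
# Sketch: index.ntotal reports how many vectors the index holds, e.g.
#   add_to_index(index, embs)  # index.ntotal == embs.shape[0]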
# Search the FAISS index for the best matching chunks
def search_index(index, query_embedding, texts, top_k=3):
    query = np.ascontiguousarray([query_embedding], dtype=np.float32)
    distances, indices = index.search(query, top_k)
    # FAISS pads the result with -1 when fewer than top_k vectors are indexed
    return [texts[i] for i in indices[0] if i != -1]
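
# Sketch: with top_k=3 this returns up to three chunk strings ranked by
# ascending L2 distance; distances[0] holds the matching scores if needed.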
# Process the document and build the FAISS index
def process_document(file):
    text = load_document(file)
    # Split text into fixed-size chunks (512 characters, not tokens; the
    # tokenizer truncates anything beyond the model's limit)
    chunks = [text[i:i + 512] for i in range(0, len(text), 512)]
    embeddings = np.vstack([embed_text(chunk) for chunk in chunks])  # One embedding per chunk
    faiss_index = initialize_faiss(embeddings.shape[1])  # Dimension taken from the embeddings
    add_to_index(faiss_index, embeddings)  # Add embeddings to the FAISS index
    return faiss_index, chunks
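
# Sketch: overlapping chunks (e.g. stepping by 384 instead of 512) are a common
# tweak so answers spanning a chunk boundary are not lost; kept simple here.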
# Answer a query by searching the FAISS index
def query_document(query, faiss_index, document_chunks):
    query_embedding = embed_text(query)  # Embed the query with the same model
    results = search_index(faiss_index, query_embedding, document_chunks)  # Best matching chunks
    return "\n\n".join(results)  # Return the matching document parts
# Gradio interface
def chatbot_interface():
    faiss_index = None
    document_chunks = None

    # Handle document upload
    def upload_file(file):
        nonlocal faiss_index, document_chunks
        faiss_index, document_chunks = process_document(file)
        return "Document uploaded and indexed. You can now ask questions."

    # Handle user queries
    def ask_question(query):
        if faiss_index is not None and document_chunks:
            return query_document(query, faiss_index, document_chunks)
        return "Please upload a document first."

    # Gradio UI components (created unrendered, placed in the layout below)
    upload = gr.File(label="Upload a PDF document")
    question = gr.Textbox(label="Ask a question about the document")
    answer = gr.Textbox(label="Answer", interactive=False)  # gr.Textbox has no readonly argument

    # Gradio app layout
    with gr.Blocks() as demo:
        gr.Markdown("# Document Chatbot")
        with gr.Row():
            upload.render()
        with gr.Row():
            question.render()
            answer.render()

        # Bind upload and question functionality; the upload status message is
        # shown in the answer box
        upload.upload(upload_file, inputs=upload, outputs=answer)
        question.submit(ask_question, inputs=question, outputs=answer)

    demo.launch()
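
# Sketch: launch() also accepts options such as share=True (temporary public
# URL) or server_port=7860; the defaults are used here.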
# Start the chatbot interface
if __name__ == "__main__":
    chatbot_interface()