"""Small Gradio app that indexes an uploaded PDF with FAISS and retrieves
the text chunks closest to a user's question."""

from functools import lru_cache

import faiss
import gradio as gr
import numpy as np
import torch
from PyPDF2 import PdfReader
from transformers import AutoModel, AutoTokenizer

# Hugging Face checkpoint used for both document chunks and queries.
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# Default number of characters per document chunk.
DEFAULT_CHUNK_SIZE = 512


def load_document(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: Anything ``PyPDF2.PdfReader`` accepts (it is passed through
            unchanged).

    Returns:
        The text of all pages joined together. Pages for which
        ``extract_text()`` returns a falsy value contribute an empty string
        instead of raising.
    """
    pdf = PdfReader(file)
    return "".join(page.extract_text() or "" for page in pdf.pages)


@lru_cache(maxsize=1)
def _load_encoder():
    """Load the tokenizer and model once and reuse them on later calls."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME)
    return tokenizer, model


def embed_text(text):
    """Embed text by mean-pooling the model's last hidden state.

    Args:
        text: The input handed to the tokenizer (a single string in every
            call made from this file).

    Returns:
        A NumPy array holding the pooled embedding with singleton
        dimensions squeezed out, i.e. a 1-D vector for a single string.
    """
    tokenizer, model = _load_encoder()

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Weight by the attention mask so padding tokens never dilute the mean.
    # For a single string the mask is all ones, so this is the plain mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return embeddings.squeeze().numpy()


def initialize_faiss(embedding_size):
    """Create an empty exact L2 FAISS index for vectors of the given size."""
    return faiss.IndexFlatL2(embedding_size)


def add_to_index(index, embeddings):
    """Add a 2-D array of embeddings (one row per vector) to ``index``."""
    index.add(embeddings)


def search_index(index, query_embedding, texts, top_k=3):
    """Return the texts whose embeddings are nearest to the query.

    Args:
        index: FAISS index whose row ``i`` corresponds to ``texts[i]``.
        query_embedding: 1-D embedding of the query.
        texts: Sequence of chunks aligned with the index rows.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` entries of ``texts``, in the order FAISS reports
        them.
    """
    # Never ask for more neighbours than there are chunks.
    k = min(top_k, len(texts))
    if k <= 0:
        return []

    query = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)
    _distances, indices = index.search(query, k)
    # FAISS reports -1 for slots it could not fill; a negative index would
    # otherwise silently wrap around to the end of ``texts``.
    return [texts[i] for i in indices[0] if 0 <= i < len(texts)]


def process_document(file, chunk_size=DEFAULT_CHUNK_SIZE):
    """Read a PDF, split it into chunks and build a FAISS index over them.

    Args:
        file: Passed to ``load_document``.
        chunk_size: Number of characters per chunk (the last chunk may be
            shorter).

    Returns:
        A ``(faiss_index, chunks)`` tuple where row ``i`` of the index is
        the embedding of ``chunks[i]``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 or the document
            yields no text to index.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer.")

    text = load_document(file)
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    if not chunks:
        # np.vstack([]) would fail with an unhelpful message.
        raise ValueError("No extractable text found in the document.")

    embeddings = np.ascontiguousarray(
        np.vstack([embed_text(chunk) for chunk in chunks]),
        dtype=np.float32,
    )
    faiss_index = initialize_faiss(embeddings.shape[1])
    add_to_index(faiss_index, embeddings)
    return faiss_index, chunks


def query_document(query, faiss_index, document_chunks):
    """Return the best-matching chunks for ``query`` joined by blank lines."""
    query_embedding = embed_text(query)
    results = search_index(faiss_index, query_embedding, document_chunks)
    return "\n\n".join(results)


def chatbot_interface():
    """Build the Gradio UI, wire up its callbacks and launch the app."""
    # Held in the enclosing scope, so every callback invocation of this app
    # instance sees the same, most recently uploaded document.
    faiss_index = None
    document_chunks = None

    def upload_file(file):
        """Index the uploaded PDF and return a status message."""
        nonlocal faiss_index, document_chunks
        if file is None:
            return "Please upload a document first."
        # The upload may arrive as a path string or as a temp-file wrapper
        # exposing the path as ``.name``; pass the path on in both cases.
        path = getattr(file, "name", file)
        try:
            faiss_index, document_chunks = process_document(path)
        except ValueError as exc:
            faiss_index, document_chunks = None, None
            return f"Could not index the document: {exc}"
        return "Document uploaded and indexed. You can now ask questions."

    def ask_question(query):
        """Answer a question from the indexed document, if there is one."""
        if faiss_index is None or not document_chunks:
            return "Please upload a document first."
        return query_document(query, faiss_index, document_chunks)

    with gr.Blocks() as demo:
        gr.Markdown("# Document Chatbot")
        with gr.Row():
            upload = gr.File(label="Upload a PDF document")
            status = gr.Textbox(label="Status", interactive=False)
        with gr.Row():
            question = gr.Textbox(label="Ask a question about the document")
            answer = gr.Textbox(label="Answer", interactive=False)

        # The callbacks only receive/emit values for the components listed
        # in ``inputs``/``outputs``.
        upload.upload(upload_file, inputs=upload, outputs=status)
        question.submit(ask_question, inputs=question, outputs=answer)

    demo.launch()


if __name__ == "__main__":
    chatbot_interface()