Spaces:

ritampatra
/

Document_chatbot

Sleeping

App Files Files Community

Document_chatbot / app.py

ritampatra

Update app.py

879e1ad verified 5 months ago

raw

history blame

3.51 kB

	import gradio as gr
	from transformers import AutoTokenizer, AutoModel
	import faiss
	import numpy as np
	import torch
	from PyPDF2 import PdfReader

	# Load PDF and extract text from it
	def load_document(file):
	    """Extract the text of every page of a PDF and return it as one string.

	    Args:
	        file: The PDF to read; it is passed straight to ``PdfReader``, so it
	            may be anything that constructor accepts.

	    Returns:
	        The text of all pages concatenated in page order, with no separator
	        inserted between pages.
	    """
	    pdf = PdfReader(file)
	    # Join once instead of growing a string with ``+=`` per page. ``or ""``
	    # keeps a page that yields no text (e.g. an image-only page) from
	    # breaking the concatenation.
	    return "".join(page.extract_text() or "" for page in pdf.pages)

	# Embed text using a Hugging Face model
	_EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

	# Holds the tokenizer/model pair after the first load so that embedding many
	# chunks does not re-read the model weights for every single call.
	_embedding_backend = {}


	def _get_embedding_backend():
	    """Return the ``(tokenizer, model)`` pair, loading it only on first use."""
	    if not _embedding_backend:
	        # Load both into locals first so a failure on the second load does
	        # not leave a half-populated cache behind.
	        tokenizer = AutoTokenizer.from_pretrained(_EMBEDDING_MODEL_NAME)
	        model = AutoModel.from_pretrained(_EMBEDDING_MODEL_NAME)
	        _embedding_backend["tokenizer"] = tokenizer
	        _embedding_backend["model"] = model
	    return _embedding_backend["tokenizer"], _embedding_backend["model"]


	def embed_text(text):
	    """Embed ``text`` with the ``all-mpnet-base-v2`` sentence-transformers model.

	    The embedding is the mean of the model's last hidden states over the
	    tokens of the input. Padding positions are excluded from the mean, so
	    the result for a single string is unchanged while a padded batch of
	    strings is pooled correctly as well.

	    Args:
	        text: The string to embed (the callers in this file pass one string
	            at a time). Input longer than the model's limit is truncated by
	            the tokenizer.

	    Returns:
	        A NumPy array with the pooled embedding, with singleton dimensions
	        squeezed out (a 1-D vector for a single input string).
	    """
	    tokenizer, model = _get_embedding_backend()

	    # Tokenize and run the model without tracking gradients (inference only).
	    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
	    with torch.no_grad():
	        outputs = model(**encoded)

	    # Mean pooling weighted by the attention mask: padded positions carry a
	    # zero weight and therefore do not dilute the average.
	    hidden = outputs.last_hidden_state
	    mask = encoded["attention_mask"].unsqueeze(-1).to(hidden.dtype)
	    token_counts = mask.sum(dim=1).clamp(min=1e-9)
	    embeddings = (hidden * mask).sum(dim=1) / token_counts
	    return embeddings.squeeze().numpy()

	# Initialize FAISS index
	def initialize_faiss(embedding_size):
	    """Create an empty ``faiss.IndexFlatL2`` for ``embedding_size``-dimensional vectors.

	    Args:
	        embedding_size: Dimensionality of the vectors the index will store.

	    Returns:
	        The newly constructed, still empty, index.
	    """
	    return faiss.IndexFlatL2(embedding_size)

	# Add document embeddings to FAISS index
	def add_to_index(index, embeddings):
	    """Add embedding vectors to ``index``.

	    Args:
	        index: Object exposing ``add(matrix)`` (a FAISS index in this file).
	        embeddings: One vector or a sequence/array of vectors. The data is
	            converted to a C-contiguous float32 matrix with one vector per
	            row before being handed to the index, so float64 input or a
	            single 1-D vector is accepted too.

	    Returns:
	        None. The index is modified in place.
	    """
	    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
	    if vectors.ndim == 1:
	        # A lone vector becomes a one-row matrix.
	        vectors = vectors.reshape(1, -1)
	    index.add(vectors)

	# Search the FAISS index for the best matching text
	def search_index(index, query_embedding, texts, top_k=3):
	    """Return the texts whose vectors are closest to ``query_embedding``.

	    Args:
	        index: Object exposing ``search(matrix, k)`` that returns a
	            ``(distances, indices)`` pair (a FAISS index in this file).
	        query_embedding: The query vector; it is converted to a float32
	            matrix with a single row before searching.
	        texts: Sequence of texts, where position ``i`` corresponds to the
	            vector stored at position ``i`` of the index.
	        top_k: Number of neighbours requested from the index.

	    Returns:
	        A list with at most ``top_k`` texts, in the order reported by the
	        index.
	    """
	    query = np.ascontiguousarray(query_embedding, dtype=np.float32).reshape(1, -1)
	    _, indices = index.search(query, top_k)
	    # When the index holds fewer than ``top_k`` vectors the missing slots are
	    # reported as -1. Used as a Python index that would silently pick the
	    # last text, so anything outside the valid range is dropped instead.
	    return [texts[i] for i in indices[0] if 0 <= i < len(texts)]

	# Process the document and build the FAISS index
	def process_document(file, chunk_size=512):
	    """Read a PDF, split its text into chunks and index the chunk embeddings.

	    Args:
	        file: The PDF to process; forwarded unchanged to ``load_document``.
	        chunk_size: Number of characters per chunk. The last chunk may be
	            shorter. Defaults to 512.

	    Returns:
	        A ``(faiss_index, chunks)`` tuple: the index holding one embedding
	        per chunk, and the list of chunk strings in the same order.

	    Raises:
	        ValueError: If ``chunk_size`` is not positive, or if the document
	            contains no extractable text (there would be nothing to index).
	    """
	    if chunk_size <= 0:
	        raise ValueError("chunk_size must be a positive integer")

	    text = load_document(file)
	    if not text.strip():
	        # Without this guard an empty document reaches np.vstack([]) and
	        # fails with an unrelated-looking error message.
	        raise ValueError("No text could be extracted from the document.")

	    # Fixed-width character windows; position i of ``chunks`` matches row i
	    # of the embedding matrix and therefore entry i of the index.
	    chunks = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
	    embeddings = np.vstack([embed_text(chunk) for chunk in chunks])
	    faiss_index = initialize_faiss(embeddings.shape[1])
	    add_to_index(faiss_index, embeddings)
	    return faiss_index, chunks

	# Answer query by searching FAISS index
	def query_document(query, faiss_index, document_chunks):
	    """Look up the chunks that best match ``query`` and return them as text.

	    Args:
	        query: The user's question.
	        faiss_index: Index holding the chunk embeddings.
	        document_chunks: The chunk strings, in index order.

	    Returns:
	        The matching chunks joined by a blank line.
	    """
	    matches = search_index(faiss_index, embed_text(query), document_chunks)
	    separator = "\n\n"
	    return separator.join(matches)

	# Gradio interface
	def chatbot_interface():
	    """Build the Gradio UI for the document chatbot and launch it.

	    The UI offers a PDF upload, a status box reporting the indexing result,
	    a question box and a read-only answer box. Uploading a file indexes it;
	    submitting a question returns the best matching parts of the document.
	    """
	    # State shared by the two callbacks below. NOTE: it lives in this
	    # closure, so a newly uploaded document replaces the previous one for
	    # every caller of these callbacks.
	    faiss_index = None
	    document_chunks = None

	    def upload_file(file):
	        """Index the uploaded PDF and return a status message for the UI."""
	        nonlocal faiss_index, document_chunks
	        if file is None:
	            return "Please upload a document first."
	        # The upload value may be a path string or a temp-file wrapper that
	        # exposes the path as ``.name`` (this differs between Gradio
	        # versions); normalise to a path either way.
	        path = getattr(file, "name", file)
	        try:
	            new_index, new_chunks = process_document(path)
	        except ValueError as err:
	            # E.g. a PDF without extractable text: report it in the UI and
	            # keep whatever document was indexed before.
	            return f"Could not index the document: {err}"
	        faiss_index, document_chunks = new_index, new_chunks
	        return "Document uploaded and indexed. You can now ask questions."

	    def ask_question(query):
	        """Answer ``query`` from the indexed document, if there is one."""
	        # Explicit None check: do not rely on the truthiness of the index
	        # object to decide whether a document has been loaded.
	        if faiss_index is None or not document_chunks:
	            return "Please upload a document first."
	        return query_document(query, faiss_index, document_chunks)

	    # Gradio app layout
	    with gr.Blocks() as demo:
	        gr.Markdown("# Document Chatbot")
	        with gr.Row():
	            upload = gr.File(label="Upload a PDF document")
	            status = gr.Textbox(label="Status", interactive=False)
	        with gr.Row():
	            question = gr.Textbox(label="Ask a question about the document")
	            # ``interactive=False`` is how Gradio marks a read-only field.
	            answer = gr.Textbox(label="Answer", interactive=False)

	        # Event listeners need explicit inputs/outputs: without them the
	        # callback receives no file and its status message is discarded.
	        upload.upload(upload_file, inputs=upload, outputs=status)
	        question.submit(ask_question, inputs=question, outputs=answer)

	    demo.launch()

	# Start the chatbot interface only when this file is run as a script, so
	# that importing the module does not build and launch the UI.
	if __name__ == "__main__":
	    chatbot_interface()