import gradio as gr
from transformers import AutoTokenizer, AutoModel
import faiss
import numpy as np
import torch
from PyPDF2 import PdfReader

# Load PDF and extract text from it
def load_document(file):
    """Return the text of every page of a PDF as one string.

    Args:
        file: Whatever ``PdfReader`` accepts as its first argument.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    pdf = PdfReader(file)
    # Defensive: a page whose extraction yields nothing (None or "") is
    # treated as empty text instead of breaking the concatenation.
    # join() also avoids repeated string re-allocation on large documents.
    return ''.join(page.extract_text() or '' for page in pdf.pages)

# Tokenizer/model pairs that have already been loaded, keyed by model name.
_EMBEDDING_MODEL_CACHE = {}


def _load_embedding_model(model_name="sentence-transformers/all-mpnet-base-v2"):
    """Return the (tokenizer, model) pair for *model_name*, loading it once."""
    if model_name not in _EMBEDDING_MODEL_CACHE:
        _EMBEDDING_MODEL_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    return _EMBEDDING_MODEL_CACHE[model_name]


# Embed the document using Hugging Face model
def embed_text(text):
    """Embed *text* with the all-mpnet-base-v2 model using mean pooling.

    The tokenizer and model are loaded on the first call and reused
    afterwards; this function is called once per document chunk and once per
    query, so reloading them every time would dominate the run time.

    Args:
        text: The text to embed. It is tokenized with truncation enabled, so
            input longer than the model's limit is cut off.

    Returns:
        A NumPy array holding the pooled embedding, with size-1 dimensions
        (such as the batch dimension of a single input) squeezed away.
    """
    tokenizer, model = _load_embedding_model()

    # Tokenize and embed text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # Mean pooling weighted by the attention mask, so that padding positions
    # (present when several texts of different length are batched) do not
    # dilute the average. For a single unpadded input this is a plain mean.
    token_states = outputs.last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
    embeddings = summed / counts
    return embeddings.squeeze().numpy()

# Create an empty FAISS index
def initialize_faiss(embedding_size):
    """Build a new ``faiss.IndexFlatL2`` for vectors of ``embedding_size`` dims.

    Args:
        embedding_size: Dimensionality passed to the index constructor.

    Returns:
        The freshly constructed index; nothing has been added to it yet.
    """
    return faiss.IndexFlatL2(embedding_size)

# Add document embeddings to FAISS index
def add_to_index(index, embeddings):
    """Add one or more embedding vectors to *index*.

    Args:
        index: Object exposing ``add(vectors)`` (a FAISS index here).
        embeddings: A single vector or a 2-D collection of vectors; any
            array-like that NumPy can convert.

    The input is normalized to a C-contiguous 2-D float32 array before it is
    handed over, because FAISS only accepts that layout; a lone 1-D vector is
    treated as a batch of one.
    """
    vectors = np.ascontiguousarray(np.atleast_2d(embeddings), dtype=np.float32)
    index.add(vectors)

# Search the FAISS index for the best matching text
def search_index(index, query_embedding, texts, top_k=3):
    """Return the texts whose vectors are closest to *query_embedding*.

    Args:
        index: Object exposing ``search(queries, k)`` that returns a
            ``(distances, indices)`` pair (a FAISS index here).
        query_embedding: The query vector; any array-like of numbers.
        texts: Sequence of texts, positionally aligned with the vectors that
            were added to *index*.
        top_k: Maximum number of matches to return.

    Returns:
        Up to ``top_k`` entries of *texts*, in the order the index reported
        them. Fewer are returned when fewer texts are available.
    """
    # Never ask for more neighbours than there are texts.
    k = min(top_k, len(texts))
    if k <= 0:
        return []

    # FAISS expects a 2-D float32 batch of queries; send a batch of one.
    query = np.ascontiguousarray(query_embedding, dtype=np.float32).reshape(1, -1)
    _distances, indices = index.search(query, k)

    # FAISS pads missing results with -1; indexing texts[-1] would silently
    # return the last chunk, so drop anything outside the valid range.
    return [texts[int(i)] for i in indices[0] if 0 <= i < len(texts)]

# Process the document and build the FAISS index
def process_document(file, chunk_size=512):
    """Extract a PDF's text, split it into chunks and index their embeddings.

    Args:
        file: The PDF to read; passed straight to ``load_document``.
        chunk_size: Number of characters per chunk. The last chunk may be
            shorter. Defaults to 512.

    Returns:
        A ``(faiss_index, chunks)`` tuple where ``chunks[i]`` is the text
        whose embedding was added to the index at position ``i``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if no text could be
            extracted from the document (there would be nothing to index).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")

    text = load_document(file)
    # Split text into fixed-size character chunks
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    if not chunks:
        # Without this guard np.vstack([]) fails with an unrelated-looking
        # "need at least one array" error.
        raise ValueError("No extractable text was found in the document.")

    # One embedding row per chunk, in chunk order
    embeddings = np.vstack([embed_text(chunk) for chunk in chunks])
    faiss_index = initialize_faiss(embeddings.shape[1])
    add_to_index(faiss_index, embeddings)
    return faiss_index, chunks

# Look up the document parts that best match a query
def query_document(query, faiss_index, document_chunks):
    """Return the document chunks most relevant to *query* as one string.

    Args:
        query: The question text; it is embedded with ``embed_text``.
        faiss_index: The index to search.
        document_chunks: The chunk texts that correspond to the index entries.

    Returns:
        The matching chunks joined by a blank line.
    """
    matches = search_index(faiss_index, embed_text(query), document_chunks)
    return "\n\n".join(matches)

# Gradio interface
def chatbot_interface():
    """Build the Gradio UI for the document chatbot and launch it.

    The UI offers a PDF upload, a status box that reports the outcome of
    indexing, a question box and a read-only answer box.

    NOTE: the index and chunks live in this function's closure, so every
    caller of the handlers shares the most recently uploaded document.
    """
    faiss_index = None
    document_chunks = None

    # Function to handle document upload
    def upload_file(file):
        nonlocal faiss_index, document_chunks
        # The File component may hand over either a path or an object that
        # exposes the path as ``.name``; accept both.
        path = getattr(file, "name", file)
        try:
            new_index, new_chunks = process_document(path)
        except ValueError as err:
            # Raised when there is nothing to index (e.g. no extractable
            # text); keep any previously indexed document usable.
            return f"Could not index the document: {err}"
        faiss_index, document_chunks = new_index, new_chunks
        return "Document uploaded and indexed. You can now ask questions."

    # Function to handle user queries
    def ask_question(query):
        # Compare against None explicitly instead of relying on the
        # truthiness of the FAISS index object.
        if faiss_index is None or not document_chunks:
            return "Please upload a document first."
        return query_document(query, faiss_index, document_chunks)

    # Gradio app layout. Components are created and events are bound inside
    # the Blocks context, which is where Gradio requires listeners to be
    # attached.
    with gr.Blocks() as demo:
        gr.Markdown("# Document Chatbot")
        with gr.Row():
            upload = gr.File(label="Upload a PDF document")
            status = gr.Textbox(label="Status", interactive=False)
        with gr.Row():
            question = gr.Textbox(label="Ask a question about the document")
            # interactive=False is Gradio's way of making a box read-only.
            answer = gr.Textbox(label="Answer", interactive=False)

        # Bind upload and question functionality. The upload handler needs
        # the file as input and somewhere to show its status message.
        upload.upload(upload_file, inputs=upload, outputs=status)
        question.submit(ask_question, inputs=question, outputs=answer)

    demo.launch()

# Start the chatbot interface only when this file is executed as a script,
# so importing the module does not launch the UI.
if __name__ == "__main__":
    chatbot_interface()