import functools
import os
import pickle

import numpy as np
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Load environment variables from .env file
load_dotenv()


def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """Split *text* into fixed-size, overlapping character windows.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters shared by consecutive chunks;
            must be smaller than ``chunk_size``.

    Returns:
        A list of substrings in document order. Empty when *text* is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if
            ``chunk_overlap >= chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    step = chunk_size - chunk_overlap
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), step)
    ]


@functools.lru_cache(maxsize=4)
def _load_embedding_model(model_name):
    """Return a memoised SentenceTransformer for *model_name*."""
    return SentenceTransformer(model_name)


@functools.lru_cache(maxsize=1)
def _load_qa_pipeline():
    """Return a memoised extractive question-answering pipeline."""
    return pipeline(
        "question-answering",
        model="distilbert-base-uncased-distilled-squad",
    )


def generate_embeddings(text_chunks, model_name='all-MiniLM-L6-v2'):
    """Encode *text_chunks* with a sentence-transformers model.

    Args:
        text_chunks: A list of strings to embed.
        model_name: Name of the sentence-transformers model to use.

    Returns:
        Whatever ``model.encode(..., convert_to_tensor=False)`` returns for
        the given chunks (one embedding per input string).
    """
    # The model is memoised so that embedding the document and then the
    # query does not construct the model twice.
    model = _load_embedding_model(model_name)
    return model.encode(text_chunks, convert_to_tensor=False)


def find_best_chunk(query_embedding, text_embeddings):
    """Find the embedding most similar to the query by cosine similarity.

    Args:
        query_embedding: 1-D vector for the query.
        text_embeddings: 2-D collection with one row per chunk embedding.

    Returns:
        A ``(best_index, similarity)`` tuple: the row index with the highest
        cosine similarity and that similarity value.

    Raises:
        ValueError: If *text_embeddings* contains no embeddings.
    """
    text_embeddings = np.asarray(text_embeddings)
    query_embedding = np.asarray(query_embedding)
    if text_embeddings.size == 0:
        raise ValueError("text_embeddings must contain at least one embedding")

    dot_products = np.dot(text_embeddings, query_embedding)
    norm_products = (
        np.linalg.norm(text_embeddings, axis=1)
        * np.linalg.norm(query_embedding)
    )

    # A zero-norm vector makes the ratio undefined (NaN), and argmax would
    # pick the first NaN. Score such rows as 0.0 instead.
    with np.errstate(divide="ignore", invalid="ignore"):
        raw_similarities = dot_products / norm_products
    cosine_similarities = np.where(norm_products > 0, raw_similarities, 0.0)

    best_index = int(np.argmax(cosine_similarities))
    return best_index, cosine_similarities[best_index]


def _load_cached_embeddings(cache_path, expected_count):
    """Load pickled embeddings, or return None if they cannot be used.

    The cache is rejected when the file is missing or unreadable, or when
    the number of cached embeddings differs from *expected_count* (the cache
    is keyed only by file name, so it may belong to a different document).
    """
    try:
        with open(cache_path, "rb") as cache_file:
            # NOTE(security): pickle.load can execute arbitrary code. Only
            # keep cache files in a directory that untrusted users cannot
            # write to.
            cached = pickle.load(cache_file)
    except (FileNotFoundError, pickle.UnpicklingError, EOFError):
        return None

    try:
        if len(cached) != expected_count:
            return None
    except TypeError:
        # Not a sized collection, so it cannot be a set of embeddings.
        return None
    return cached


# Main Streamlit app function
def main():
    """Run the Streamlit page: upload a PDF, embed it, answer questions."""
    st.header("LLM-powered PDF Chatbot 💬")

    # Upload a PDF file
    pdf = st.file_uploader("Upload your PDF", type='pdf')
    if pdf is None:
        return

    pdf_reader = PdfReader(pdf)
    # `or ""` guards against a page yielding no text at all.
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Split text into chunks
    chunks = chunk_text(text)

    # Derive the cache name from the upload's base name without extension;
    # basename() keeps the cache file inside the working directory.
    store_name = os.path.splitext(os.path.basename(pdf.name))[0]
    st.write(f'{store_name}')

    if not chunks:
        # Nothing to embed or search (e.g. an image-only PDF).
        st.warning("No extractable text was found in this PDF.")
        return

    # Generate embeddings for the chunks, reusing a matching on-disk cache
    cache_path = f"{store_name}.pkl"
    text_embeddings = _load_cached_embeddings(cache_path, len(chunks))
    if text_embeddings is not None:
        st.write('Embeddings Loaded from the Disk')
    else:
        text_embeddings = generate_embeddings(chunks)
        with open(cache_path, "wb") as cache_file:
            pickle.dump(text_embeddings, cache_file)

    # Accept user questions/query
    query = st.text_input("Ask questions about your PDF file:")
    if not query:
        return

    # Generate embeddings for the query
    query_embedding = generate_embeddings([query])[0]

    # Find the best chunk for the query
    best_index, _similarity = find_best_chunk(query_embedding, text_embeddings)
    best_chunk = chunks[best_index]

    # Use Hugging Face pipeline for question answering
    qa_pipeline = _load_qa_pipeline()
    result = qa_pipeline(question=query, context=best_chunk)
    st.write(result['answer'])


def set_bg_from_url(url, opacity=1):
    """Emit the page's footer and background markup via ``st.markdown``.

    Args:
        url: Background image URL.
        opacity: Background opacity.
    """
    # NOTE(review): both markup strings below are blank in this file, so
    # `url` and `opacity` are currently unused and nothing visible is
    # rendered. The original HTML/CSS appears to be missing; restore it.
    footer = """ """
    st.markdown(footer, unsafe_allow_html=True)

    # Set background image using HTML and CSS
    st.markdown(
        f""" """,
        unsafe_allow_html=True
    )


# Set background image from URL
set_bg_from_url("https://www.1access.com/wp-content/uploads/2019/10/GettyImages-1180389186.jpg", opacity=0.5)

if __name__ == '__main__':
    main()