from dotenv import load_dotenv
import streamlit as st
import pickle
from PyPDF2 import PdfReader
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import os
import numpy as np
# Load environment variables from .env file.
# NOTE(review): nothing in the visible code reads these variables directly;
# presumably they configure the imported libraries (e.g. access tokens) — confirm.
load_dotenv()
# Define a function to manually chunk text
def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """Split *text* into overlapping, fixed-size character windows.

    Consecutive chunks start ``chunk_size - chunk_overlap`` characters apart,
    so each chunk repeats the last ``chunk_overlap`` characters of the
    previous one. The final chunk(s) may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        A list of substrings of *text* in document order; empty when *text*
        is empty.

    Raises:
        ValueError: If ``chunk_overlap >= chunk_size``. The window would
            never advance, which previously resulted in an endless loop.
    """
    step = chunk_size - chunk_overlap
    if step <= 0:
        raise ValueError(
            "chunk_overlap must be smaller than chunk_size "
            f"(got chunk_size={chunk_size}, chunk_overlap={chunk_overlap})"
        )
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
# Function to generate embeddings using sentence-transformers
@st.cache_resource
def _load_embedding_model(model_name):
    """Load the SentenceTransformer *model_name* once and reuse it across reruns."""
    return SentenceTransformer(model_name)


def generate_embeddings(text_chunks, model_name='all-MiniLM-L6-v2'):
    """Encode a list of strings into sentence embeddings.

    The model is obtained through a Streamlit resource cache instead of being
    constructed on every call: Streamlit re-executes the script on each user
    interaction, so an uncached model would be reloaded for every document
    and every query.

    Args:
        text_chunks: List of strings to embed.
        model_name: Name of the sentence-transformers model to use.

    Returns:
        The result of ``SentenceTransformer.encode`` called with
        ``convert_to_tensor=False``, one embedding per input string
        (presumably a NumPy array with the library defaults — the caller
        indexes it and feeds it to NumPy routines).
    """
    model = _load_embedding_model(model_name)
    return model.encode(text_chunks, convert_to_tensor=False)
# Function to find the most relevant chunk based on the cosine similarity
def find_best_chunk(query_embedding, text_embeddings):
    """Return the row of *text_embeddings* most similar to *query_embedding*.

    Similarity is the cosine of the angle between the query vector and each
    row. Rows (or a query) with zero norm get a similarity of 0 instead of
    NaN; otherwise a NaN entry would be selected by ``np.argmax`` and hide
    the genuinely best match.

    Args:
        query_embedding: 1-D vector for the query.
        text_embeddings: 2-D array-like with one embedding per row.

    Returns:
        A tuple ``(best_index, similarity)``: the index of the best-matching
        row and its cosine similarity to the query.

    Raises:
        ValueError: If *text_embeddings* contains no embeddings.
    """
    text_embeddings = np.asarray(text_embeddings)
    query_embedding = np.asarray(query_embedding)
    if text_embeddings.size == 0:
        raise ValueError("text_embeddings must contain at least one embedding")

    dot_products = np.dot(text_embeddings, query_embedding)
    norm_products = (
        np.linalg.norm(text_embeddings, axis=1) * np.linalg.norm(query_embedding)
    )
    # Divide only where the denominator is non-zero; the remaining entries
    # keep the 0.0 they were initialised with in `out`.
    cosine_similarities = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(norm_products),
        where=norm_products > 0,
    )
    best_index = np.argmax(cosine_similarities)
    return best_index, cosine_similarities[best_index]
# Main Streamlit app function
def main():
    """Run the Streamlit PDF question-answering page.

    Flow: the user uploads a PDF; its text is extracted and split into
    overlapping chunks; chunk embeddings are computed (or loaded from an
    on-disk pickle cache); for each question the most similar chunk is
    selected and handed to an extractive question-answering model, whose
    answer is written to the page.

    The function returns early when no file has been uploaded, when the PDF
    yields no text, or when no question has been entered.
    """
    # Function-scope import: only needed for the cache key below.
    import hashlib

    st.header("LLM-powered PDF Chatbot 💬")
    # Upload a PDF file
    pdf = st.file_uploader("Upload your PDF", type='pdf')
    if pdf is None:
        return

    pdf_reader = PdfReader(pdf)
    # extract_text() may give None/"" for pages without a text layer
    # (e.g. scanned images), so fall back to an empty string per page.
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Split text into chunks
    chunks = chunk_text(text)
    if not chunks:
        # Nothing to embed or search; the steps below would fail on empty input.
        st.warning("No extractable text was found in this PDF.")
        return

    # The uploaded name comes from the client: drop any directory part and
    # the extension before using it in a file path.
    store_name = os.path.splitext(os.path.basename(pdf.name))[0]
    st.write(f'{store_name}')

    # Key the cache on the extracted text as well as the name, so that a
    # different document uploaded under the same file name does not pick up
    # stale embeddings whose rows no longer line up with `chunks`.
    digest = hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()[:16]
    cache_path = f"{store_name}-{digest}.pkl"

    if os.path.exists(cache_path):
        # SECURITY NOTE: pickle.load can execute arbitrary code if the file
        # was tampered with. This only reads files the app wrote itself into
        # the working directory; do not point it at untrusted locations.
        with open(cache_path, "rb") as f:
            text_embeddings = pickle.load(f)
        st.write('Embeddings Loaded from the Disk')
    else:
        text_embeddings = generate_embeddings(chunks)
        with open(cache_path, "wb") as f:
            pickle.dump(text_embeddings, f)

    # Accept user questions/query
    query = st.text_input("Ask questions about your PDF file:")
    if not query:
        return

    # Embed the query and find the chunk closest to it
    query_embedding = generate_embeddings([query])[0]
    best_index, _similarity = find_best_chunk(query_embedding, text_embeddings)
    best_chunk = chunks[best_index]

    # Use Hugging Face pipeline for question answering
    # NOTE(review): the pipeline is rebuilt for every query; consider caching
    # it across reruns if the installed Streamlit version supports it.
    qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
    result = qa_pipeline(question=query, context=best_chunk)
    st.write(result['answer'])
def set_bg_from_url(url, opacity=1):
    """Write the footer and background markup blocks to the page.

    Two markup strings are rendered through ``st.markdown`` with
    ``unsafe_allow_html=True``: first the footer, then the background block.

    NOTE(review): both strings currently consist of a single newline and
    neither ``url`` nor ``opacity`` is referenced — presumably the HTML/CSS
    that used them is missing from this file; confirm against the intended
    markup.

    Args:
        url: Address of the background image (currently unused).
        opacity: Background opacity (currently unused).
    """
    footer_markup = "\n"
    # Set background image using HTML and CSS
    background_markup = "\n"
    for markup in (footer_markup, background_markup):
        st.markdown(markup, unsafe_allow_html=True)
# Set background image from URL
# NOTE(review): this call appears to sit at module level (it precedes the
# __main__ guard), so it would run before main() renders anything — confirm
# the intended placement.
set_bg_from_url("https://www.1access.com/wp-content/uploads/2019/10/GettyImages-1180389186.jpg", opacity=0.5)
# Start the app only when this file is executed as a script.
if __name__ == '__main__':
    main()