Spaces:

Alimubariz124
/

RAG

Sleeping

App Files Files Community

Alimubariz124 committed on Mar 18

Commit

4a5279c

verified ·

1 Parent(s): c8f6c2d

Create app.py

Browse files

Files changed (1) hide show

app.py +136 -0

app.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import os
+import pickle
+import PyPDF2
+import numpy as np
+import faiss
+import torch
+import streamlit as st
+from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
+from datasets import Dataset
+from sentence_transformers import SentenceTransformer
+from peft import LoraConfig, get_peft_model
+# Load embedding model
@st.cache_resource
def load_embedding_model(model_name="all-MiniLM-L6-v2"):
    """Load the sentence-embedding model used to vectorise text.

    The function is wrapped in ``st.cache_resource`` so the model object is
    built once and reused across Streamlit reruns instead of being reloaded
    on every interaction.

    Args:
        model_name: Name or path handed to ``SentenceTransformer``. Defaults
            to ``"all-MiniLM-L6-v2"``, the model this app originally
            hard-coded.

    Returns:
        The ``SentenceTransformer`` instance for ``model_name``.
    """
    return SentenceTransformer(model_name)
+# Parse PDF file
def parse_pdf(file):
    """Extract the text of every page of a PDF as a single string.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts; the app passes the
            file object returned by ``st.file_uploader``.

    Returns:
        The text of all pages concatenated in page order with no separator.
        A page that yields no text contributes nothing.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # Defensive `or ""`: a page with no extractable text must not break the
    # concatenation if the extractor hands back None instead of "".
    # `"".join` also avoids rebuilding the string once per page.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
+# Split text into chunks
def split_text(text, chunk_size=500, overlap=0):
    """Split ``text`` into consecutive character chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk. Must be positive.
        overlap: Number of characters shared between neighbouring chunks.
            The default of 0 gives back-to-back, non-overlapping chunks.
            Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks in document order; the last chunk may be shorter
        than ``chunk_size``. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is out
            of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, step):
        chunks.append(text[start:start + chunk_size])
        # Once a chunk reaches the end of the text, any further window would
        # only repeat a tail already covered (relevant when overlap > 0).
        if start + chunk_size >= text_length:
            break
    return chunks
+# Create FAISS index
def create_faiss_index(embeddings):
    """Build an exact L2-distance FAISS index over ``embeddings``.

    Args:
        embeddings: Array-like of shape ``(n_vectors, dimension)``.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional, for example the
            empty 1-D array produced when there was nothing to embed.
    """
    # FAISS expects float32, C-contiguous input; coerce once here so callers
    # can pass lists or float64 arrays.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dimension); "
            f"got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index
+# Fine-tune the model
def fine_tune_model(dataset, model_name, output_dir="./fine-tuned-model"):
    """Fine-tune a causal language model on question/answer pairs.

    Each example is rendered as ``"Question: {q} Answer: {a}"``, tokenised to
    a fixed length of 512 tokens, and trained with a causal-LM objective
    (``mlm=False``). The resulting model and tokenizer are saved to
    ``output_dir``.

    Args:
        dataset: A ``datasets.Dataset`` with ``"question"`` and ``"answer"``
            columns.
        model_name: Hub id or local path of the base model to fine-tune.
        output_dir: Directory for checkpoints and the final model/tokenizer.

    Returns:
        ``output_dir``, so the caller can reload the fine-tuned model.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Some causal-LM tokenizers (Llama-2 among them) ship without a pad
    # token, and padding="max_length" below fails without one. Reusing EOS
    # is the usual workaround.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_name)

    def preprocess_function(examples):
        inputs = [
            f"Question: {q} Answer: {a}"
            for q, a in zip(examples["question"], examples["answer"])
        ]
        return tokenizer(inputs, truncation=True, padding="max_length", max_length=512)

    # Drop the raw text columns so only model inputs reach the collator.
    tokenized_dataset = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=dataset.column_names,
    )
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        num_train_epochs=3,
        save_steps=10_000,
        save_total_limit=2,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    trainer.train()
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    return output_dir
+# Generate response from the model
def generate_response(prompt, model, tokenizer, max_new_tokens=256):
    """Generate the model's continuation of ``prompt``.

    Args:
        prompt: Full prompt text, including any retrieved context.
        model: A causal language model exposing ``generate`` and ``device``.
        tokenizer: The tokenizer matching ``model``.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt. Unlike ``max_length``, this budget is not consumed
            by the prompt, so long prompts still leave room for an answer.

    Returns:
        The decoded continuation only, stripped of surrounding whitespace.
        The prompt is not repeated in the returned text.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
    )
    # A decoder-only model returns prompt + continuation in one sequence;
    # slice off the prompt so the caller does not display its own context.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
+# Main Streamlit app
def main():
    """Run the Streamlit app: upload a PDF, fine-tune a model, then chat.

    Streamlit re-executes this function from the top on every widget
    interaction, and ``st.button`` is only ``True`` on the run in which it
    was clicked. Anything that must outlive a single run (the parsed chunks,
    the FAISS index, and the fine-tuned model and tokenizer) is therefore
    kept in ``st.session_state`` rather than in local variables.
    """
    st.title("Chat with PDF using Fine-Tuned Llama Model")

    # Step 1: Upload PDF file
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        return
    st.write("File uploaded successfully!")

    state = st.session_state
    embedding_model = load_embedding_model()

    # Re-parse and re-embed only when a different file is uploaded, not on
    # every rerun triggered by a button click or a typed question.
    pdf_key = (uploaded_file.name, uploaded_file.size)
    if state.get("pdf_key") != pdf_key:
        with st.spinner("Processing PDF..."):
            pdf_text = parse_pdf(uploaded_file)
            chunks = [chunk for chunk in split_text(pdf_text) if chunk.strip()]
            if not chunks:
                # Scanned / image-only PDFs yield no text; nothing to index.
                st.error("No extractable text was found in this PDF.")
                return
            chunk_embeddings = embedding_model.encode(chunks)
            state["faiss_index"] = create_faiss_index(
                np.asarray(chunk_embeddings, dtype=np.float32)
            )
            state["chunks"] = chunks
            state["pdf_key"] = pdf_key
    chunks = state["chunks"]
    faiss_index = state["faiss_index"]
    st.success("PDF processed! Proceed to fine-tuning.")

    # Step 2: Fine-tune the model
    if st.button("Fine-Tune Model"):
        with st.spinner("Fine-tuning the model..."):
            # Simplified example dataset: one generic question per chunk,
            # answered by the first 100 characters of that chunk.
            dataset = Dataset.from_dict({
                "question": ["What is this about?"] * len(chunks),
                "answer": [chunk[:100] for chunk in chunks],
            })
            model_name = "meta-llama/Llama-2-7b-chat-hf"  # Replace with your local path
            fine_tuned_model_path = fine_tune_model(dataset, model_name)
        st.success(f"Model fine-tuned! Saved at: {fine_tuned_model_path}")

        # Load the fine-tuned model and keep it for later reruns.
        state["tokenizer"] = AutoTokenizer.from_pretrained(fine_tuned_model_path)
        state["model"] = AutoModelForCausalLM.from_pretrained(
            fine_tuned_model_path,
            device_map="auto",
            torch_dtype=torch.float16,
        )
        st.success("Fine-tuned model loaded! You can now ask questions.")

    # Step 3: Chat interface
    user_input = st.text_input("Ask a question about the PDF:")
    if not user_input:
        return

    model = state.get("model")
    tokenizer = state.get("tokenizer")
    if model is None or tokenizer is None:
        st.warning("Please fine-tune the model before asking questions.")
        return

    with st.spinner("Generating response..."):
        # Retrieve the single most similar chunk to the question.
        query_embedding = np.asarray(
            embedding_model.encode([user_input]), dtype=np.float32
        )
        _, indices = faiss_index.search(query_embedding, k=1)
        relevant_chunk = chunks[int(indices[0][0])]
        # Generate response
        prompt = f"Context: {relevant_chunk}\nQuestion: {user_input}\nAnswer:"
        response = generate_response(prompt, model, tokenizer)
    st.write(f"**Response:** {response}")
+if __name__ == "__main__":
+    main()