Alimubariz124 committed
Commit 4a5279c (verified) · Parent: c8f6c2d

Create app.py

Files changed (1)
  1. app.py +136 -0
app.py ADDED
import os
import pickle
import PyPDF2
import numpy as np
import faiss
import torch
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from peft import LoraConfig, get_peft_model

# Load embedding model (cached so it is only initialised once per session)
@st.cache_resource
def load_embedding_model():
    return SentenceTransformer("all-MiniLM-L6-v2")

# Parse PDF file into a single text string
def parse_pdf(file):
    pdf_reader = PyPDF2.PdfReader(file)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text() or ""  # extract_text() can return None for image-only pages
    return text

# Split text into fixed-size character chunks
def split_text(text, chunk_size=500):
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

# Create a FAISS index (exact L2 search) over the chunk embeddings
def create_faiss_index(embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

# Fine-tune the model on question-answer pairs built from the PDF
def fine_tune_model(dataset, model_name, output_dir="./fine-tuned-model"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Llama tokenizers ship without a padding token; reuse EOS so padding="max_length" works
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def preprocess_function(examples):
        inputs = [f"Question: {q} Answer: {a}" for q, a in zip(examples["question"], examples["answer"])]
        return tokenizer(inputs, truncation=True, padding="max_length", max_length=512)

    tokenized_dataset = dataset.map(preprocess_function, batched=True)

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        num_train_epochs=3,
        save_steps=10_000,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        tokenizer=tokenizer,
        # Causal-LM objective: the collator copies input_ids into labels (mlm=False)
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    trainer.train()
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    return output_dir

# Generate a response from the fine-tuned model
def generate_response(prompt, model, tokenizer):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Cap new tokens rather than total length so long prompts are not cut off
    outputs = model.generate(**inputs, max_new_tokens=256, num_return_sequences=1)
    # Decode only the newly generated tokens, not the prompt that was fed in
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return response

# Main Streamlit app
def main():
    st.title("Chat with PDF using Fine-Tuned Llama Model")

    # Step 1: Upload PDF file
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is not None:
        st.write("File uploaded successfully!")

        # Process PDF
        with st.spinner("Processing PDF..."):
            pdf_text = parse_pdf(uploaded_file)
            chunks = split_text(pdf_text)
            embedding_model = load_embedding_model()
            chunk_embeddings = embedding_model.encode(chunks)
            faiss_index = create_faiss_index(np.array(chunk_embeddings))

        st.success("PDF processed! Proceed to fine-tuning.")

        # Step 2: Fine-tune the model
        if st.button("Fine-Tune Model"):
            with st.spinner("Fine-tuning the model..."):
                # Create a dataset of question-answer pairs
                qa_pairs = []
                for chunk in chunks:
                    qa_pairs.append({"question": "What is this about?", "answer": chunk[:100]})  # Simplified example

                dataset = Dataset.from_dict({
                    "question": [pair["question"] for pair in qa_pairs],
                    "answer": [pair["answer"] for pair in qa_pairs],
                })

                # Fine-tune the model
                model_name = "meta-llama/Llama-2-7b-chat-hf"  # Replace with your local path
                fine_tuned_model_path = fine_tune_model(dataset, model_name)

            st.success(f"Model fine-tuned! Saved at: {fine_tuned_model_path}")

            # Load the fine-tuned model; keep it in session state so it survives Streamlit reruns
            st.session_state.tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)
            st.session_state.model = AutoModelForCausalLM.from_pretrained(
                fine_tuned_model_path, device_map="auto", torch_dtype=torch.float16
            )
            st.success("Fine-tuned model loaded! You can now ask questions.")

        # Step 3: Chat interface (enabled once a fine-tuned model is available)
        if "model" in st.session_state:
            user_input = st.text_input("Ask a question about the PDF:")
            if user_input:
                with st.spinner("Generating response..."):
                    # Retrieve the most relevant chunk
                    query_embedding = embedding_model.encode([user_input])
                    _, indices = faiss_index.search(query_embedding, k=1)
                    relevant_chunk = chunks[indices[0][0]]

                    # Generate response
                    prompt = f"Context: {relevant_chunk}\nQuestion: {user_input}\nAnswer:"
                    response = generate_response(prompt, st.session_state.model, st.session_state.tokenizer)

                st.write(f"**Response:** {response}")

if __name__ == "__main__":
    main()
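
Two notes on the file above. First, app.py imports LoraConfig and get_peft_model from peft but never applies them, and fine_tune_model() trains every parameter of the 7B checkpoint, which needs far more GPU memory than parameter-efficient tuning. The sketch below shows how the freshly loaded model could be wrapped in LoRA adapters before the Trainer is built; the helper name add_lora_adapters and the rank, alpha, and target_modules values are illustrative assumptions, not part of the commit.

# Sketch (assumption): wrap the base model in LoRA adapters inside fine_tune_model(),
# immediately after AutoModelForCausalLM.from_pretrained(model_name).
def add_lora_adapters(model):
    lora_config = LoraConfig(
        r=8,                                  # low-rank update dimension (illustrative)
        lora_alpha=16,                        # scaling factor for the adapter updates
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj"],  # attention projections in Llama-style blocks
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()  # reports that only a small fraction of weights train
    return model

Second, meta-llama/Llama-2-7b-chat-hf is a gated checkpoint, so the app needs an authenticated Hugging Face token or a local copy of the weights (as the "Replace with your local path" comment suggests), and it is launched with streamlit run app.py.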