Alimubariz124 committed on
Commit
c1f6043
·
verified ·
1 Parent(s): 3b5783e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -59
app.py CHANGED
@@ -1,73 +1,71 @@
1
  import gradio as gr
2
- from model_loader import load_embedding_model, load_llm
3
- from transcript_handler import chunk_text, embed_chunks, create_faiss_index
4
- from qa_engine import query_faiss, build_prompt
 
5
 
6
  # Load models
7
- embedder = load_embedding_model()
8
- llm = load_llm()
 
 
 
9
 
10
- # Main Gradio app
11
- with gr.Blocks() as demo:
12
- gr.Markdown("# 📄 Chat with a Transcript (Open Source + Free!)")
13
-
14
- # State variables for storing index and chunks per session
15
- index_state = gr.State(None)
16
- chunks_state = gr.State([])
17
 
18
- transcript_input = gr.File(label="Upload Transcript (.txt)")
19
- upload_button = gr.Button("Upload and Process")
20
- query_input = gr.Textbox(label="Ask a question about the transcript")
21
- answer_output = gr.Textbox(label="Answer")
22
 
23
- def upload_transcript(file, chunks_state):
24
- try:
25
- # Read and decode the file
26
- text = file.read().decode("utf-8")
27
- if not text.strip():
28
- return "Error: Uploaded file is empty.", None, []
29
-
30
- # Preprocess the transcript
31
- text = preprocess_transcript(text)
32
- if not text.strip():
33
- return "Error: Transcript is empty after preprocessing.", None, []
34
-
35
- # Chunk the text
36
- chunks = chunk_text(text)
37
- if not chunks:
38
- return "Error: No chunks generated from the transcript.", None, []
39
-
40
- # Generate embeddings
41
- embeddings, chunks = embed_chunks(chunks, embedder)
42
- if embeddings.size == 0:
43
- return "Error: Failed to generate embeddings.", None, []
44
-
45
- # Create FAISS index
46
- index = create_faiss_index(embeddings)
47
- return "Transcript uploaded and indexed successfully!", index, chunks
48
- except Exception as e:
49
- return f"Error processing transcript: {str(e)}", None, []
50
 
51
-
52
 
53
- def chat_with_transcript(query, index_state, chunks_state):
54
- if index_state is None:
55
- return "Please upload a transcript first."
56
- context = query_faiss(query, index_state, embedder, chunks_state)
57
- prompt = build_prompt(context, query)
58
- response = llm(prompt)[0]['generated_text']
59
- if "Answer:" not in response:
60
- return "Error: Unable to parse the model's response."
61
- return response.split("Answer:")[-1].strip()
62
 
63
- upload_button.click(
64
- upload_transcript,
65
- inputs=[transcript_input, chunks_state],
66
- outputs=[answer_output, index_state, chunks_state]
67
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  query_input.submit(
69
  chat_with_transcript,
70
- inputs=[query_input, index_state, chunks_state],
71
  outputs=[answer_output]
72
  )
73
 
 
1
  import gradio as gr
2
+ from sentence_transformers import SentenceTransformer
3
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
4
+ import faiss
5
+ import numpy as np
6
 
7
  # Load models
8
+ embedder = SentenceTransformer("all-MiniLM-L6-v2")
9
+ model_name = "google/flan-t5-base"
10
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
11
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
12
+ llm = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
13
 
14
+ # Hardcoded sample transcript (3 lines)
15
+ transcript = """
16
+ The meeting started at 10 AM. The team discussed the new project timeline.
17
+ John mentioned that the deadline is tight but achievable. Sarah suggested adding more resources.
18
+ The team agreed to meet again tomorrow to finalize the plan.
19
+ """
 
20
 
21
+ # Preprocess and chunk the transcript
22
def preprocess_transcript(text):
    """Return *text* with every run of whitespace collapsed to one space.

    Leading and trailing whitespace is removed as a side effect of
    splitting on whitespace and re-joining.
    """
    return ' '.join(text.split())


def chunk_text(text, chunk_size=300, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size`` so the window always advances.

    Returns:
        A list of chunk strings (empty when *text* contains no words).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A zero step makes range() raise, and a negative step would
        # silently yield no chunks at all; reject both up front.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Stop once a window reaches the end of the text: any later window
        # would only repeat a suffix of this chunk and add duplicate
        # entries to the index.
        if start + chunk_size >= len(words):
            break
    return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
+ chunks = chunk_text(preprocess_transcript(transcript))
34
 
35
+ # Generate embeddings and create FAISS index
36
+ embeddings = embedder.encode(chunks)
37
+ index = faiss.IndexFlatL2(embeddings.shape[1])
38
+ index.add(np.array(embeddings))
 
 
 
 
 
39
 
40
+ # Query the FAISS index
41
def query_faiss(query, index, embedder, chunks, top_k=2):
    """Return the chunks most similar to *query*, joined by blank lines.

    Args:
        query: The question text to embed and search for.
        index: A FAISS-style index exposing ``search(vectors, k)``.
        embedder: An object exposing ``encode(list_of_texts)``.
        chunks: The text chunks, in the same order they were added to
            *index*.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        The retrieved chunks separated by two newlines, or an empty string
        when there is nothing to retrieve.
    """
    if not chunks or top_k <= 0:
        return ""

    # Never ask for more neighbours than there are chunks: FAISS pads the
    # missing results with -1, which would index the *last* chunk.
    k = min(top_k, len(chunks))
    # FAISS indexes operate on float32 vectors.
    query_vector = np.asarray(embedder.encode([query]), dtype="float32")
    _, neighbour_ids = index.search(query_vector, k)

    # Still drop any id outside the chunk list, in case the index holds
    # fewer (or more) vectors than *chunks* has entries.
    retrieved_chunks = [
        chunks[i] for i in neighbour_ids[0] if 0 <= i < len(chunks)
    ]
    return "\n\n".join(retrieved_chunks)
46
+
47
+ # Build prompt and generate answer
48
def chat_with_transcript(query):
    """Answer *query* using the indexed transcript as context.

    Retrieves the most relevant transcript chunks through ``query_faiss``,
    builds a prompt around them and passes it to the module-level ``llm``
    pipeline.

    Args:
        query: The user's question.

    Returns:
        The model's answer, or a short message when the question is blank
        or the model produced no text.
    """
    if not query or not query.strip():
        return "Please enter a question."

    context = query_faiss(query, index, embedder, chunks)
    prompt = (
        "You are an AI assistant. Use the following context to answer the question.\n"
        "Context:\n"
        f"{context}\n"
        f"Question: {query}\n"
        "Answer:"
    )
    response = llm(prompt)[0]['generated_text']

    # A text2text-generation pipeline returns only the generated text, not
    # the prompt, so "Answer:" is normally absent. Requiring it turned
    # nearly every reply into an error. rpartition keeps the text after
    # the last marker when present and the whole response otherwise.
    answer = response.rpartition("Answer:")[2].strip()
    if not answer:
        return "Error: Unable to parse the model's response."
    return answer
59
+
60
+ # Gradio interface
61
+ with gr.Blocks() as demo:
62
+ gr.Markdown("# 📄 Chat with a Transcript")
63
+ query_input = gr.Textbox(label="Ask a question about the transcript")
64
+ answer_output = gr.Textbox(label="Answer")
65
+
66
  query_input.submit(
67
  chat_with_transcript,
68
+ inputs=[query_input],
69
  outputs=[answer_output]
70
  )
71