Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
CHANGED
@@ -1,73 +1,71 @@
|
|
1 |
import gradio as gr
|
2 |
-
from
|
3 |
-
from
|
4 |
-
|
|
|
5 |
|
6 |
# Load models
|
7 |
-
embedder =
|
8 |
-
|
|
|
|
|
|
|
9 |
|
10 |
-
#
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
chunks_state = gr.State([])
|
17 |
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
answer_output = gr.Textbox(label="Answer")
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
# Preprocess the transcript
|
31 |
-
text = preprocess_transcript(text)
|
32 |
-
if not text.strip():
|
33 |
-
return "Error: Transcript is empty after preprocessing.", None, []
|
34 |
-
|
35 |
-
# Chunk the text
|
36 |
-
chunks = chunk_text(text)
|
37 |
-
if not chunks:
|
38 |
-
return "Error: No chunks generated from the transcript.", None, []
|
39 |
-
|
40 |
-
# Generate embeddings
|
41 |
-
embeddings, chunks = embed_chunks(chunks, embedder)
|
42 |
-
if embeddings.size == 0:
|
43 |
-
return "Error: Failed to generate embeddings.", None, []
|
44 |
-
|
45 |
-
# Create FAISS index
|
46 |
-
index = create_faiss_index(embeddings)
|
47 |
-
return "Transcript uploaded and indexed successfully!", index, chunks
|
48 |
-
except Exception as e:
|
49 |
-
return f"Error processing transcript: {str(e)}", None, []
|
50 |
|
51 |
-
|
52 |
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
prompt = build_prompt(context, query)
|
58 |
-
response = llm(prompt)[0]['generated_text']
|
59 |
-
if "Answer:" not in response:
|
60 |
-
return "Error: Unable to parse the model's response."
|
61 |
-
return response.split("Answer:")[-1].strip()
|
62 |
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
query_input.submit(
|
69 |
chat_with_transcript,
|
70 |
-
inputs=[query_input
|
71 |
outputs=[answer_output]
|
72 |
)
|
73 |
|
|
|
1 |
import gradio as gr
|
2 |
+
from sentence_transformers import SentenceTransformer
|
3 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
4 |
+
import faiss
|
5 |
+
import numpy as np
|
6 |
|
7 |
# Load models
# Sentence-embedding model used to vectorise transcript chunks and queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Seq2seq model that generates the answer from the retrieved context.
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Text-to-text pipeline; `max_new_tokens` is passed through as given here.
llm = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
|
13 |
|
14 |
+
# Hardcoded sample transcript (a few lines) that is indexed at import time.
transcript = """
The meeting started at 10 AM. The team discussed the new project timeline.
John mentioned that the deadline is tight but achievable. Sarah suggested adding more resources.
The team agreed to meet again tomorrow to finalize the plan.
"""
|
|
|
20 |
|
21 |
+
# Preprocess and chunk the transcript
|
22 |
+
def preprocess_transcript(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is removed as a side effect of
    splitting and re-joining.

    Args:
        text: Raw transcript text. ``None`` or an empty string is accepted.

    Returns:
        The normalised single-line string, or ``""`` when there is no text.
    """
    if not text:
        return ""
    return ' '.join(text.split())
|
|
|
24 |
|
25 |
+
def chunk_text(text, chunk_size=300, overlap=50):
    """Split *text* into overlapping word-based chunks.

    Args:
        text: Whitespace-separated text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size`` so the window always advances.

    Returns:
        A list of chunk strings (empty when *text* contains no words).

    Raises:
        ValueError: If *chunk_size* or *overlap* is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window would
        # contain only words already covered by this one, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
+
chunks = chunk_text(preprocess_transcript(transcript))
if not chunks:
    # Fail fast with a clear message instead of an IndexError on `.shape[1]`.
    raise ValueError("Transcript produced no chunks to index.")

# Generate embeddings and create FAISS index
# FAISS operates on float32 matrices, so the dtype is made explicit.
embeddings = np.asarray(embedder.encode(chunks), dtype="float32")
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
+
# Query the FAISS index
|
41 |
+
def query_faiss(query, index, embedder, chunks, top_k=2):
    """Return the chunks nearest to *query*, joined by blank lines.

    Args:
        query: The user's question.
        index: Object exposing ``search(vectors, k)`` that returns a
            ``(distances, ids)`` pair, as the FAISS index built in this file does.
        embedder: Object exposing ``encode(list_of_str)``.
        chunks: Chunk strings, positionally aligned with the index ids.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        The retrieved chunks separated by ``"\\n\\n"``, or ``""`` when there
        is nothing to retrieve.
    """
    if not chunks or top_k <= 0:
        return ""

    # Never ask for more neighbours than there are chunks.
    k = min(top_k, len(chunks))
    query_vector = np.asarray(embedder.encode([query]), dtype="float32")
    _, neighbor_ids = index.search(query_vector, k)

    # FAISS pads missing results with -1; indexing a list with -1 would
    # silently return the last chunk, so drop any out-of-range id.
    retrieved_chunks = [
        chunks[i] for i in neighbor_ids[0] if 0 <= i < len(chunks)
    ]
    return "\n\n".join(retrieved_chunks)
|
46 |
+
|
47 |
+
# Build prompt and generate answer
|
48 |
+
def chat_with_transcript(query):
    """Answer *query* using the indexed transcript as context.

    Retrieves the most relevant chunks via ``query_faiss``, builds a prompt
    around them and returns the text generated by ``llm``.

    Args:
        query: The user's question.

    Returns:
        The generated answer, or a short message when the question is blank
        or the model produced no text.
    """
    if not query or not query.strip():
        return "Please enter a question."

    context = query_faiss(query, index, embedder, chunks)
    prompt = "\n".join([
        "You are an AI assistant. Use the following context to answer the question.",
        "Context:",
        context,
        f"Question: {query}",
        "Answer:",
    ])
    response = llm(prompt)[0]['generated_text']

    # A text2text pipeline returns only the generated text, not the prompt,
    # so "Answer:" is normally absent. Keep whatever follows the marker when
    # it is present, otherwise use the whole response.
    answer = response.split("Answer:")[-1].strip()
    if not answer:
        return "Error: Unable to parse the model's response."
    return answer
|
59 |
+
|
60 |
+
# Gradio interface
|
61 |
+
# Single-page UI: one question box wired to one answer box.
# NOTE(review): no `demo.launch()` call is visible in this view — confirm the
# app is launched elsewhere.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 Chat with a Transcript")
    query_input = gr.Textbox(label="Ask a question about the transcript")
    answer_output = gr.Textbox(label="Answer")

    # Submitting the question box runs the retrieval + generation pipeline.
    query_input.submit(
        chat_with_transcript,
        inputs=[query_input],
        outputs=[answer_output]
    )
|
71 |
|