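"""Minimal RAG demo: embed a hardcoded meeting transcript with
sentence-transformers, index the chunks in FAISS, retrieve the most
relevant ones for a question, and answer with FLAN-T5 via a Gradio UI."""
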
import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import faiss
import numpy as np

# Load models
embedder = SentenceTransformer("all-MiniLM-L6-v2")
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
llm = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)

# Hardcoded sample transcript (a few lines) used as the document corpus
transcript = """
The meeting started at 10 AM. The team discussed the new project timeline.
John mentioned that the deadline is tight but achievable. Sarah suggested adding more resources.
The team agreed to meet again tomorrow to finalize the plan.
"""

# Preprocess and chunk the transcript
def preprocess_transcript(text):
    return ' '.join(text.split())  # Remove extra whitespace

def chunk_text(text, chunk_size=300, overlap=50):
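    """Split the text into overlapping word-level chunks so that context
    spanning a chunk boundary is not lost at retrieval time."""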
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

chunks = chunk_text(preprocess_transcript(transcript))

# Generate embeddings and create FAISS index
embeddings = embedder.encode(chunks)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))
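# Note: IndexFlatL2 performs exact L2-distance search. For cosine similarity,
# one option is to L2-normalize the embeddings and use faiss.IndexFlatIP instead.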

# Query the FAISS index
def query_faiss(query, index, embedder, chunks, top_k=2):
    query_vector = embedder.encode([query])
    # FAISS pads results with -1 indices when top_k exceeds the number of
    # indexed vectors (e.g., a short transcript yields a single chunk), so
    # clamp top_k and skip any padding entries.
    distances, indices = index.search(np.array(query_vector), min(top_k, len(chunks)))
    retrieved_chunks = [chunks[i] for i in indices[0] if i != -1]
    return "\n\n".join(retrieved_chunks)

# Build prompt and generate answer
def chat_with_transcript(query):
    context = query_faiss(query, index, embedder, chunks)
    prompt = f"""You are an AI assistant. Use the following context to answer the question.
Context:
{context}
Question: {query}
Provide your answer below:
"""
    response = llm(prompt)[0]['generated_text']
    print("Raw model response:", response)  # Debug statement
    return response.strip()
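
# Example interaction (output is illustrative; the actual wording depends on the model):
#   chat_with_transcript("What did Sarah suggest?")
#   -> "adding more resources"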

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 📄 Chat with a Transcript")
    query_input = gr.Textbox(label="Ask a question about the transcript")
    answer_output = gr.Textbox(label="Answer")
    
    query_input.submit(
        chat_with_transcript,
        inputs=[query_input],
        outputs=[answer_output]
    )

demo.launch()
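
# Tip: demo.launch(share=True) additionally serves the app on a temporary public URL.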