chat-with-data / transcript_handler.py
Alimubariz124's picture
Update transcript_handler.py
eae9a85 verified
import faiss
import numpy as np
import re
def preprocess_transcript(text):
    """
    Strip bracketed annotations from a transcript and normalise whitespace.

    Every ``[...]`` span that opens and closes on the same line is deleted
    (the match is non-greedy and ``.`` does not cross newlines), which covers
    speaker/timestamp labels such as ``[Speaker Guest-1 - 00:13]`` but also
    any other bracketed text. Afterwards all runs of whitespace, including
    newlines, are collapsed into single spaces and the ends are trimmed.

    Example input: "[Speaker Guest-1 - 00:13] This is a test."
    Example output: "This is a test."
    """
    # Drop the bracketed labels first so they cannot leave stray tokens.
    without_labels = re.sub(r'\[.*?\]', '', text)
    # split() with no arguments discards every kind of whitespace run.
    tokens = without_labels.split()
    return ' '.join(tokens)
def chunk_text(text, chunk_size=300, overlap=50):
    """
    Split a transcript into overlapping word-based chunks.

    The text is first cleaned with ``preprocess_transcript`` and then split
    on whitespace. Consecutive chunks start ``chunk_size - overlap`` words
    apart, so each chunk repeats the last ``overlap`` words of the previous
    one.

    Args:
        text: Raw transcript text.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, in transcript order. The final chunk may be
        shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size``/``overlap`` are out of range, or if the
            transcript is empty after preprocessing.
    """
    # Validate up front: a zero step makes range() raise an opaque error, a
    # negative step silently yields no chunks, and a negative overlap would
    # silently skip words between chunks.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError(
            "overlap must be non-negative and smaller than chunk_size."
        )

    # Preprocess the text to remove timestamps and speaker labels
    text = preprocess_transcript(text)
    words = text.split()
    if not words:
        raise ValueError("Transcript is empty after preprocessing.")

    step = chunk_size - overlap
    total = len(words)
    chunks = []
    for start in range(0, total, step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would
        # contain only words already present in this chunk.
        if start + chunk_size >= total:
            break
    return chunks
def embed_chunks(chunks, embedder):
    """
    Encode text chunks into an embedding array.

    Args:
        chunks: Sequence of chunk strings to embed.
        embedder: Any object exposing ``encode(chunks)``
            (presumably a sentence-embedding model; confirm against caller).

    Returns:
        A ``(embeddings, chunks)`` tuple: the encoder output wrapped in a
        NumPy array, and the same ``chunks`` object that was passed in.
    """
    print(f"Embedding {len(chunks)} chunks...")
    vectors = np.array(embedder.encode(chunks))
    return vectors, chunks
def create_faiss_index(embeddings):
    """
    Build a flat L2 FAISS index from a matrix of embeddings.

    Args:
        embeddings: Array-like of shape ``(n, dim)``, one embedding per row.
            It is converted to a C-contiguous float32 array before indexing,
            so float64 arrays and nested lists are accepted.

    Returns:
        A ``faiss.IndexFlatL2`` of dimension ``dim`` containing all rows.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional.
    """
    # FAISS operates on contiguous float32 matrices; coerce here so callers
    # do not have to care about the dtype their encoder returned.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n, dim); "
            f"got {vectors.ndim}-D input."
        )

    print(f"Creating FAISS index with {vectors.shape[0]} embeddings...")
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    print("FAISS index created successfully.")
    return index