import faiss import numpy as np import re def preprocess_transcript(text): """ Preprocess the transcript by removing timestamps and speaker labels. Example input: "[Speaker Guest-1 - 00:13] This is a test." Example output: "This is a test." """ # Remove patterns like [Speaker Guest-X - HH:MM] cleaned_text = re.sub(r'\[.*?\]', '', text) # Remove extra whitespace cleaned_text = ' '.join(cleaned_text.split()) return cleaned_text def chunk_text(text, chunk_size=300, overlap=50): # Preprocess the text to remove timestamps and speaker labels text = preprocess_transcript(text) if not text.strip(): raise ValueError("Transcript is empty after preprocessing.") words = text.split() chunks = [] for i in range(0, len(words), chunk_size - overlap): chunk = ' '.join(words[i:i + chunk_size]) chunks.append(chunk) return chunks def embed_chunks(chunks, embedder): print(f"Embedding {len(chunks)} chunks...") embeddings = embedder.encode(chunks) return np.array(embeddings), chunks def create_faiss_index(embeddings): print(f"Creating FAISS index with {embeddings.shape[0]} embeddings...") index = faiss.IndexFlatL2(embeddings.shape[1]) index.add(embeddings) print("FAISS index created successfully.") return index