Spaces:
Sleeping
Sleeping
import faiss | |
import numpy as np | |
import re | |
def preprocess_transcript(text):
    """
    Strip bracketed annotations from a transcript and normalise whitespace.

    Every ``[...]`` span that sits on a single line (for example a
    speaker/timestamp tag) is removed, then all runs of whitespace are
    collapsed to single spaces and leading/trailing whitespace is dropped.

    Example input: "[Speaker Guest-1 - 00:13] This is a test."
    Example output: "This is a test."
    """
    # Non-greedy match so each bracket pair is removed on its own.
    without_tags = re.sub(r'\[.*?\]', '', text)
    # split() with no argument discards every kind of whitespace run.
    tokens = without_tags.split()
    return ' '.join(tokens)
def chunk_text(text, chunk_size=300, overlap=50):
    """
    Split a transcript into overlapping word-based chunks.

    The text is first cleaned with ``preprocess_transcript`` and then split
    on whitespace. Consecutive chunks start ``chunk_size - overlap`` words
    apart and each holds at most ``chunk_size`` words.

    Args:
        text: Raw transcript text.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, in document order.

    Raises:
        ValueError: If ``chunk_size``/``overlap`` are invalid, or if the
            transcript is empty after preprocessing.
    """
    # A zero step makes range() fail with an unrelated message, a negative
    # step silently yields no chunks, and a negative overlap skips words
    # between chunks -- reject all of these up front.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            "overlap must be non-negative and smaller than chunk_size."
        )

    # Preprocess the text to remove timestamps and speaker labels
    text = preprocess_transcript(text)
    if not text.strip():
        raise ValueError("Transcript is empty after preprocessing.")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window
        # would only repeat words already contained in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks
def embed_chunks(chunks, embedder):
    """
    Encode text chunks with the supplied embedder.

    Args:
        chunks: Sequence of chunk strings to embed.
        embedder: Object exposing ``encode(chunks)``; its result is wrapped
            with ``np.array``.

    Returns:
        A ``(embeddings, chunks)`` tuple: the encoded vectors as a NumPy
        array, and the same ``chunks`` object that was passed in.
    """
    print(f"Embedding {len(chunks)} chunks...")
    vectors = np.array(embedder.encode(chunks))
    return vectors, chunks
def create_faiss_index(embeddings):
    """
    Build a flat L2 FAISS index holding the given embedding vectors.

    Args:
        embeddings: Array-like of shape ``(n_vectors, dim)``. It is
            converted to a C-contiguous float32 array before indexing,
            which is the layout FAISS works with.

    Returns:
        A ``faiss.IndexFlatL2`` of dimension ``dim`` with all vectors added.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional (for example
            an empty list of embeddings, or a single un-batched vector).
    """
    # No copy is made when the input is already float32 and contiguous.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        # Without this check shape[1] below fails with an opaque IndexError.
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dim); "
            f"got shape {vectors.shape}."
        )

    print(f"Creating FAISS index with {vectors.shape[0]} embeddings...")
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    print("FAISS index created successfully.")
    return index