# Scraped page header (Hugging Face Space, status: Sleeping); file size: 1,351 bytes.
# Revisions touching this file: 61e33cd, eae9a85, 564aac1.
import faiss
import numpy as np
import re
def preprocess_transcript(text):
    """
    Strip bracketed annotations from a transcript and normalise its spacing.

    Every ``[...]`` span that opens and closes on the same line is removed
    (shortest match first), which is how timestamp / speaker labels are dropped.
    All runs of whitespace are then collapsed to single spaces and the ends
    are trimmed.
    Example input: "[Speaker Guest-1 - 00:13] This is a test."
    Example output: "This is a test."
    """
    # Drop labels such as [Speaker Guest-X - HH:MM].
    without_labels = re.compile(r'\[.*?\]').sub('', text)
    # split() with no argument breaks on any whitespace run and discards
    # empty pieces, so re-joining yields single-spaced, trimmed text.
    tokens = without_labels.split()
    return ' '.join(tokens)
def chunk_text(text, chunk_size=300, overlap=50):
    """
    Split a transcript into overlapping, word-based chunks.

    The text is cleaned with ``preprocess_transcript`` and split on
    whitespace. A window of up to ``chunk_size`` words is then taken every
    ``chunk_size - overlap`` words, so consecutive chunks share ``overlap``
    words. Chunking stops as soon as a window reaches the last word, so no
    trailing chunk made only of already-covered overlap words is produced.

    Args:
        text: Raw transcript text.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, each a single-space-joined run of words.

    Raises:
        ValueError: If ``chunk_size`` / ``overlap`` are out of range, or if
            the transcript contains no words after preprocessing.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}.")
    # A step of zero makes range() raise, a negative step silently yields no
    # chunks, and a negative overlap would skip words between chunks.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}."
        )

    # Preprocess the text to remove timestamps and speaker labels.
    words = preprocess_transcript(text).split()
    if not words:
        raise ValueError("Transcript is empty after preprocessing.")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # This window already reached the final word; a further window would
        # contain only words that are part of this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks
def embed_chunks(chunks, embedder):
    """
    Encode text chunks and return the embeddings alongside the chunks.

    Args:
        chunks: Sized collection of text chunks; passed unchanged to the
            embedder and returned as-is.
        embedder: Any object exposing ``encode(chunks)``
            (presumably a sentence-embedding model -- confirm with caller).

    Returns:
        A ``(embeddings, chunks)`` tuple where ``embeddings`` is the result
        of ``embedder.encode(chunks)`` converted with ``np.array``.
    """
    print(f"Embedding {len(chunks)} chunks...")
    encoded = embedder.encode(chunks)
    vectors = np.array(encoded)
    return vectors, chunks
def create_faiss_index(embeddings):
    """
    Build a flat (exact-search) L2 FAISS index over embedding vectors.

    Args:
        embeddings: Array-like of shape ``(n_vectors, dim)``. It is converted
            to a C-contiguous float32 array before being handed to FAISS.

    Returns:
        A ``faiss.IndexFlatL2`` of dimension ``dim`` containing all vectors.

    Raises:
        ValueError: If ``embeddings`` is not 2-D or has zero-width vectors
            (e.g. the result of embedding an empty list of chunks).
    """
    # FAISS works on row-major float32 matrices; coerce up front so float64
    # arrays, non-contiguous views and plain nested lists are all accepted.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[1] == 0:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dim) "
            f"with dim > 0; got shape {vectors.shape}."
        )

    print(f"Creating FAISS index with {vectors.shape[0]} embeddings...")
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    print("FAISS index created successfully.")
    return index