File size: 1,351 Bytes
61e33cd
 
 
eae9a85
 
 
 
 
 
 
 
 
 
 
 
 
 
61e33cd
eae9a85
 
 
564aac1
eae9a85
564aac1
61e33cd
 
 
564aac1
61e33cd
 
 
eae9a85
61e33cd
564aac1
61e33cd
 
 
 
564aac1
61e33cd
 
564aac1
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import faiss
import numpy as np

import re

def preprocess_transcript(text):
    """
    Strip bracketed annotations from a transcript and normalize whitespace.

    Every ``[...]`` span on a single line (for instance a speaker label
    with a timestamp) is deleted using a non-greedy match; a bracket pair
    split across lines is left alone because ``.`` does not match newlines.
    Afterwards, all runs of whitespace collapse to one space and the ends
    are trimmed.

    Example input: "[Speaker Guest-1 - 00:13] This is a test."
    Example output: "This is a test."
    """
    # Drop the bracketed labels first ...
    without_labels = re.sub(r'\[.*?\]', '', text)
    # ... then rebuild the string from its whitespace-separated tokens.
    tokens = without_labels.split()
    return ' '.join(tokens)
    
def chunk_text(text, chunk_size=300, overlap=50):
    """
    Split a transcript into overlapping word-based chunks.

    The text is first cleaned with ``preprocess_transcript`` and then split
    on whitespace. Consecutive chunks start ``chunk_size - overlap`` words
    apart, so each chunk shares its first ``overlap`` words with the end of
    the previous one.

    Args:
        text: Raw transcript text.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, in transcript order. The last chunk may be
        shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size``/``overlap`` are out of range, or if
            the transcript is empty after preprocessing.
    """
    # A step of zero makes range() fail with an unrelated message, and a
    # negative step yields an empty range (silently dropping all text), so
    # reject bad parameters up front with a clear error.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size.")

    # Preprocess the text to remove timestamps and speaker labels
    text = preprocess_transcript(text)

    if not text.strip():
        raise ValueError("Transcript is empty after preprocessing.")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the final word, any later window would be
        # fully contained in this one; stop to avoid a redundant chunk.
        if start + chunk_size >= len(words):
            break
    return chunks


def embed_chunks(chunks, embedder):
    """
    Encode text chunks with the given embedder.

    Prints a progress line with the number of chunks, calls
    ``embedder.encode(chunks)`` once, and wraps the result in a NumPy array.

    Args:
        chunks: Sequence of text chunks to embed.
        embedder: Object exposing an ``encode(chunks)`` method.

    Returns:
        A tuple ``(embeddings, chunks)`` where ``embeddings`` is the
        ``np.array`` of the encoder output and ``chunks`` is the input
        sequence, returned unchanged.
    """
    chunk_count = len(chunks)
    print(f"Embedding {chunk_count} chunks...")
    vectors = np.array(embedder.encode(chunks))
    return vectors, chunks

def create_faiss_index(embeddings):
    """
    Build a flat L2 FAISS index over the given embeddings.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dim)``.

    Returns:
        A ``faiss.IndexFlatL2`` of dimension ``dim`` containing all vectors.
    """
    # FAISS operates on C-contiguous float32 matrices; normalise the input
    # so float64 or non-contiguous arrays (e.g. built from Python lists or
    # slices) are accepted instead of failing inside index.add().
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    print(f"Creating FAISS index with {embeddings.shape[0]} embeddings...")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    print("FAISS index created successfully.")
    return index