Spaces:
Sleeping
Sleeping
Update transcript_handler.py
Browse files- transcript_handler.py +19 -1
transcript_handler.py
CHANGED
@@ -1,9 +1,26 @@
|
|
1 |
import faiss
|
2 |
import numpy as np
|
3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
def chunk_text(text, chunk_size=300, overlap=50):
|
|
|
|
|
|
|
5 |
if not text.strip():
|
6 |
-
raise ValueError("Transcript is empty.")
|
7 |
|
8 |
words = text.split()
|
9 |
chunks = []
|
@@ -12,6 +29,7 @@ def chunk_text(text, chunk_size=300, overlap=50):
|
|
12 |
chunks.append(chunk)
|
13 |
return chunks
|
14 |
|
|
|
15 |
def embed_chunks(chunks, embedder):
|
16 |
print(f"Embedding {len(chunks)} chunks...")
|
17 |
embeddings = embedder.encode(chunks)
|
|
|
1 |
import faiss
|
2 |
import numpy as np
|
3 |
|
4 |
+
import re
|
5 |
+
|
6 |
+
def preprocess_transcript(text):
    """Strip bracketed tags from a transcript and normalise its whitespace.

    Every ``[...]`` span that opens and closes on the same line is deleted;
    this covers speaker/timestamp tags such as ``[Speaker Guest-1 - 00:13]``
    but also any other bracketed text. Afterwards all runs of whitespace
    (including newlines) are collapsed to single spaces and the ends are
    trimmed.

    Example:
        "[Speaker Guest-1 - 00:13] This is a test."  ->  "This is a test."

    Args:
        text: Raw transcript string.

    Returns:
        The transcript with bracketed spans removed and spacing collapsed.
    """
    # Non-greedy match so adjacent tags are removed one at a time rather than
    # swallowing the text between the first "[" and the last "]".
    # split() with no arguments drops leading/trailing whitespace and treats
    # any whitespace run as one separator, so re-joining yields clean spacing.
    return ' '.join(re.sub(r'\[.*?\]', '', text).split())
|
17 |
+
|
18 |
def chunk_text(text, chunk_size=300, overlap=50):
|
19 |
+
# Preprocess the text to remove timestamps and speaker labels
|
20 |
+
text = preprocess_transcript(text)
|
21 |
+
|
22 |
if not text.strip():
|
23 |
+
raise ValueError("Transcript is empty after preprocessing.")
|
24 |
|
25 |
words = text.split()
|
26 |
chunks = []
|
|
|
29 |
chunks.append(chunk)
|
30 |
return chunks
|
31 |
|
32 |
+
|
33 |
def embed_chunks(chunks, embedder):
|
34 |
print(f"Embedding {len(chunks)} chunks...")
|
35 |
embeddings = embedder.encode(chunks)
|