Alimubariz124 committed on
Commit
eae9a85
·
verified ·
1 Parent(s): 7cd4bfc

Update transcript_handler.py

Browse files
Files changed (1) hide show
  1. transcript_handler.py +19 -1
transcript_handler.py CHANGED
@@ -1,9 +1,26 @@
1
  import faiss
2
  import numpy as np
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  def chunk_text(text, chunk_size=300, overlap=50):
 
 
 
5
  if not text.strip():
6
- raise ValueError("Transcript is empty.")
7
 
8
  words = text.split()
9
  chunks = []
@@ -12,6 +29,7 @@ def chunk_text(text, chunk_size=300, overlap=50):
12
  chunks.append(chunk)
13
  return chunks
14
 
 
15
  def embed_chunks(chunks, embedder):
16
  print(f"Embedding {len(chunks)} chunks...")
17
  embeddings = embedder.encode(chunks)
 
1
  import faiss
2
  import numpy as np
3
 
4
+ import re
5
+
6
def preprocess_transcript(text):
    """Strip bracketed labels from a transcript and normalize whitespace.

    Every square-bracketed span is removed (speaker/timestamp labels such
    as "[Speaker Guest-1 - 00:13]", but also any other bracketed text),
    then all runs of whitespace, including newlines, are collapsed to a
    single space and leading/trailing whitespace is dropped.

    Example input: "[Speaker Guest-1 - 00:13] This is a test."
    Example output: "This is a test."

    Args:
        text: Raw transcript text.

    Returns:
        The cleaned transcript as a single-line string; an empty string
        when nothing but labels and whitespace was present.
    """
    # Use a negated character class instead of a lazy dot: '.' does not
    # match a newline, so a label wrapped across a line break would
    # otherwise be left in the output. On single-line labels this matches
    # exactly the same spans as a lazy dot would.
    without_labels = re.sub(r'\[[^\]]*\]', '', text)
    # split() with no arguments splits on any whitespace run and discards
    # empty pieces, so joining with one space normalizes the spacing.
    return ' '.join(without_labels.split())
17
+
18
  def chunk_text(text, chunk_size=300, overlap=50):
19
+ # Preprocess the text to remove timestamps and speaker labels
20
+ text = preprocess_transcript(text)
21
+
22
  if not text.strip():
23
+ raise ValueError("Transcript is empty after preprocessing.")
24
 
25
  words = text.split()
26
  chunks = []
 
29
  chunks.append(chunk)
30
  return chunks
31
 
32
+
33
  def embed_chunks(chunks, embedder):
34
  print(f"Embedding {len(chunks)} chunks...")
35
  embeddings = embedder.encode(chunks)