Spaces:

Alimubariz124
/

chat-with-data

Sleeping

Alimubariz124 committed on Apr 24

Commit

eae9a85

verified ·

1 Parent(s): 7cd4bfc

Update transcript_handler.py

Files changed (1) hide show

transcript_handler.py CHANGED Viewed

@@ -1,9 +1,26 @@
 import faiss
 import numpy as np
 def chunk_text(text, chunk_size=300, overlap=50):
     if not text.strip():
-        raise ValueError("Transcript is empty.")
     words = text.split()
     chunks = []
@@ -12,6 +29,7 @@ def chunk_text(text, chunk_size=300, overlap=50):
         chunks.append(chunk)
     return chunks
 def embed_chunks(chunks, embedder):
     print(f"Embedding {len(chunks)} chunks...")
     embeddings = embedder.encode(chunks)

 import faiss
 import numpy as np
+import re
+def preprocess_transcript(text):
+    """
+    Preprocess the transcript by removing timestamps and speaker labels.
+
+    Every bracketed ``[...]`` span that opens and closes on the same line is
+    deleted, then all runs of whitespace (including newlines) are collapsed
+    to single spaces and leading/trailing whitespace is dropped, so the
+    returned string contains no line breaks.
+
+    Example input: "[Speaker Guest-1 - 00:13] This is a test."
+    Example output: "This is a test."
+    """
+    # Remove patterns like [Speaker Guest-X - HH:MM]
+    # NOTE(review): the pattern is not label-specific -- it strips ANY
+    # bracketed text (non-greedy, shortest match); `.` does not match
+    # newlines because no DOTALL flag is passed. Confirm this is intended.
+    cleaned_text = re.sub(r'\[.*?\]', '', text)
+    # Remove extra whitespace: split() with no arguments splits on any
+    # whitespace run, so joining with ' ' normalizes spacing and trims ends.
+    cleaned_text = ' '.join(cleaned_text.split())
+    return cleaned_text
 def chunk_text(text, chunk_size=300, overlap=50):
+    # Preprocess the text to remove timestamps and speaker labels
+    text = preprocess_transcript(text)
     if not text.strip():
+        raise ValueError("Transcript is empty after preprocessing.")
     words = text.split()
     chunks = []
         chunks.append(chunk)
     return chunks
 def embed_chunks(chunks, embedder):
     print(f"Embedding {len(chunks)} chunks...")
     embeddings = embedder.encode(chunks)