Spaces:
Sleeping
Sleeping
Update transcript_handler.py
Browse files- transcript_handler.py +8 -2
transcript_handler.py
CHANGED
@@ -2,18 +2,24 @@ import faiss
|
|
2 |
import numpy as np
|
3 |
|
4 |
def chunk_text(text, chunk_size=300, overlap=50):
|
|
|
|
|
|
|
5 |
words = text.split()
|
6 |
chunks = []
|
7 |
for i in range(0, len(words), chunk_size - overlap):
|
8 |
-
chunk = ' '.join(words[i:i+chunk_size])
|
9 |
chunks.append(chunk)
|
10 |
return chunks
|
11 |
|
12 |
def embed_chunks(chunks, embedder):
|
|
|
13 |
embeddings = embedder.encode(chunks)
|
14 |
return np.array(embeddings), chunks
|
15 |
|
16 |
def create_faiss_index(embeddings):
|
|
|
17 |
index = faiss.IndexFlatL2(embeddings.shape[1])
|
18 |
index.add(embeddings)
|
19 |
-
|
|
|
|
2 |
import numpy as np
|
3 |
|
4 |
def chunk_text(text, chunk_size=300, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: The transcript to split. Words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of strings, each holding at most ``chunk_size`` words joined
        by single spaces. Consecutive chunks start ``chunk_size - overlap``
        words apart.

    Raises:
        ValueError: If *text* is empty or whitespace-only, or if
            ``chunk_size`` / ``overlap`` would give a non-positive step or
            skip words between chunks.
    """
    if not text.strip():
        raise ValueError("Transcript is empty.")
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    # overlap >= chunk_size would make the step zero or negative; a negative
    # overlap would make the step exceed chunk_size and silently drop words.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size.")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would be a
        # strict subset of this one, so stop instead of emitting a duplicate
        # tail chunk.
        if start + chunk_size >= len(words):
            break
    return chunks
|
14 |
|
15 |
def embed_chunks(chunks, embedder):
    """Encode text chunks into an embedding matrix.

    Args:
        chunks: Sequence of text chunks to embed.
        embedder: Object exposing ``encode(chunks)`` that returns one vector
            per chunk (presumably a sentence-embedding model; only the
            ``encode`` call is relied on here).

    Returns:
        A ``(embeddings, chunks)`` tuple where ``embeddings`` is a float32
        NumPy array built from the encoder output and ``chunks`` is the
        input passed back unchanged.
    """
    print(f"Embedding {len(chunks)} chunks...")
    embeddings = embedder.encode(chunks)
    # The FAISS index built from these vectors works on float32 data, so
    # normalise the dtype here rather than keeping whatever the embedder
    # returned (e.g. float64 or nested Python lists).
    return np.asarray(embeddings, dtype=np.float32), chunks
|
19 |
|
20 |
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index containing *embeddings*.

    Args:
        embeddings: 2-D array-like with one embedding vector per row.

    Returns:
        A ``faiss.IndexFlatL2`` whose dimensionality is the number of
        columns in *embeddings*, with every row added to it.

    Raises:
        ValueError: If *embeddings* is not two-dimensional (for example the
            1-D array produced when there were no chunks to embed).
    """
    # FAISS expects C-contiguous float32 input; this is a no-op copy-wise
    # when the array already has that layout.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    # Validate before indexing shape[1] so callers get a clear error instead
    # of a bare IndexError.
    if embeddings.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dim), "
            f"got shape {embeddings.shape}."
        )

    print(f"Creating FAISS index with {embeddings.shape[0]} embeddings...")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    print("FAISS index created successfully.")
    return index
|