Spaces:
Sleeping
Sleeping
Create transcript_handler.py
Browse files- transcript_handler.py +19 -0
transcript_handler.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import faiss
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
def chunk_text(text, chunk_size=300, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: The string to split. Words are whatever ``str.split()`` yields.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words each chunk shares with the previous one.
            Must be smaller than ``chunk_size``.

    Returns:
        A list of chunk strings, each with its words joined by single
        spaces. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``overlap`` is
            not smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any later window would only
        # repeat a suffix of this chunk, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks
|
11 |
+
|
12 |
+
def embed_chunks(chunks, embedder):
    """Encode *chunks* with *embedder* and return the vectors with the chunks.

    Args:
        chunks: The text chunks to embed; passed to ``embedder.encode`` as-is.
        embedder: Any object exposing an ``encode(chunks)`` method.
            NOTE(review): presumably a sentence-embedding model -- confirm
            against the caller.

    Returns:
        A ``(vectors, chunks)`` tuple: ``vectors`` is the result of
        ``embedder.encode`` converted with ``np.array``, and ``chunks`` is
        the same object that was passed in.
    """
    vectors = np.array(embedder.encode(chunks))
    return vectors, chunks
|
15 |
+
|
16 |
+
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index containing every row of *embeddings*.

    Args:
        embeddings: 2-D array-like with one embedding vector per row.

    Returns:
        A ``faiss.IndexFlatL2`` whose dimensionality equals the row length
        and to which all rows have been added.

    Raises:
        ValueError: If ``embeddings`` is not 2-D (for example, the result of
            embedding an empty list of chunks).
    """
    # FAISS works on C-contiguous float32 matrices; coerce up front so that
    # float64 or non-contiguous input is not rejected by index.add(). This is
    # a no-op (no copy) when the array is already in that layout.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dim), "
            f"got shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index
|