Upload 2 files

- app.py +41 -41
- requirements.txt +1 -0
app.py
CHANGED
@@ -49,45 +49,45 @@ with open(data_path, "r") as f:
 # Pre-compute corpus embeddings
 import re
 
-def split_into_sentences(text):
-    """Splits a paragraph into sentences based on capitalization and punctuation."""
-    # This regex looks for a capital letter, followed by anything that's not a period,
-    # exclamation mark, or question mark, and then ends with one of those punctuation marks.
-    sentences = re.findall(r'([A-Z][^.!?]*[.!?])', text)
-    return sentences
-
-def create_overlapped_chunks(corpus_documents, chunk_size=2, overlap=1):
-    chunked_corpus = []
-    for doc_idx, doc_text in enumerate(corpus_documents):
-        sentences = split_into_sentences(doc_text)
-        if not sentences:
-            continue
-
-        # If there are fewer sentences than chunk_size, just use the whole document as one chunk
-        if len(sentences) < chunk_size:
-            chunked_corpus.append({
-                "text": doc_text,
-                "original_doc_idx": doc_idx,
-                "start_sentence_idx": 0,
-                "end_sentence_idx": len(sentences) - 1
-            })
-            continue
-
-        for i in range(0, len(sentences) - chunk_size + 1, chunk_size - overlap):
-            chunk_sentences = sentences[i : i + chunk_size]
-            chunk_text = " ".join(chunk_sentences)
-            chunked_corpus.append({
-                "text": chunk_text,
-                "original_doc_idx": doc_idx,
-                "start_sentence_idx": i,
-                "end_sentence_idx": i + chunk_size - 1
-            })
-    return chunked_corpus
-
-def process_documents_for_chunking(documents):
-    chunked_corpus_data = create_overlapped_chunks(documents)
-    flat_corpus_chunks = [item["text"] for item in chunked_corpus_data]
-    return chunked_corpus_data, flat_corpus_chunks
+# def split_into_sentences(text):
+#     """Splits a paragraph into sentences based on capitalization and punctuation."""
+#     # This regex looks for a capital letter, followed by anything that's not a period,
+#     # exclamation mark, or question mark, and then ends with one of those punctuation marks.
+#     sentences = re.findall(r'([A-Z][^.!?]*[.!?])', text)
+#     return sentences
+
+# def create_overlapped_chunks(corpus_documents, chunk_size=2, overlap=1):
+#     chunked_corpus = []
+#     for doc_idx, doc_text in enumerate(corpus_documents):
+#         sentences = split_into_sentences(doc_text)
+#         if not sentences:
+#             continue
+
+#         # If there are fewer sentences than chunk_size, just use the whole document as one chunk
+#         if len(sentences) < chunk_size:
+#             chunked_corpus.append({
+#                 "text": doc_text,
+#                 "original_doc_idx": doc_idx,
+#                 "start_sentence_idx": 0,
+#                 "end_sentence_idx": len(sentences) - 1
+#             })
+#             continue
+
+#         for i in range(0, len(sentences) - chunk_size + 1, chunk_size - overlap):
+#             chunk_sentences = sentences[i : i + chunk_size]
+#             chunk_text = " ".join(chunk_sentences)
+#             chunked_corpus.append({
+#                 "text": chunk_text,
+#                 "original_doc_idx": doc_idx,
+#                 "start_sentence_idx": i,
+#                 "end_sentence_idx": i + chunk_size - 1
+#             })
+#     return chunked_corpus
+
+# def process_documents_for_chunking(documents):
+#     chunked_corpus_data = create_overlapped_chunks(documents)
+#     flat_corpus_chunks = [item["text"] for item in chunked_corpus_data]
+#     return chunked_corpus_data, flat_corpus_chunks
 
 # Pre-compute corpus embeddings
 original_corpus = [item["positive"] for item in dataset]
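Note on the block commented out above: split_into_sentences is a regex-based splitter (it only catches sentences that start with a capital letter and end in ., !, or ?), and create_overlapped_chunks slides a chunk_size-sentence window that advances by chunk_size - overlap sentences, so consecutive chunks share overlap sentences. A minimal sketch of the behavior if these helpers were uncommented; the sample corpus is illustrative:

corpus = ["First sentence here. Second one follows. Third wraps up."]
chunked_corpus_data, flat_corpus_chunks = process_documents_for_chunking(corpus)
# With the defaults chunk_size=2, overlap=1, the window advances one sentence
# at a time, so consecutive chunks share one sentence.
for chunk in chunked_corpus_data:
    print(chunk["start_sentence_idx"], chunk["end_sentence_idx"], "->", chunk["text"])
# 0 1 -> First sentence here. Second one follows.
# 1 2 -> Second one follows. Third wraps up.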
@@ -147,8 +147,8 @@ def find_similar(prompt, top_k):
     end_time = time.time()
 
     results = []
-
-    for doc, score in doc_score_pairs:
+    for doc, score in doc_score_pairs[:top_k]:
+    # for doc, score in doc_score_pairs:
         results.append((score, doc))
 
     return results, f"{(end_time - start_time) * 1000:.2f} ms"
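This second hunk is the behavioral fix: find_similar previously iterated over every ranked pair, so the top_k argument was ignored and all results came back. Assuming doc_score_pairs is already sorted by descending score (the usual sentence-transformers semantic-search shape), slicing before the loop is sufficient; a small illustration with made-up scores:

doc_score_pairs = [("doc a", 0.91), ("doc b", 0.58), ("doc c", 0.12)]
top_k = 2
results = [(score, doc) for doc, score in doc_score_pairs[:top_k]]
# -> [(0.91, 'doc a'), (0.58, 'doc b')]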
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
 gradio
 transformers
+sentence_transformers
 torch
 huggingface_hub
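The sentence_transformers line is the only requirements change; the embedding and scoring code in app.py presumably imports it, and a dependency missing from requirements.txt would crash the Space at startup. A minimal sketch of the typical encode-and-score pattern (the model name and texts here are illustrative, not taken from the Space):

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative model choice
corpus_embeddings = model.encode(["First doc.", "Second doc."], convert_to_tensor=True)
query_embedding = model.encode("example query", convert_to_tensor=True)
scores = util.cos_sim(query_embedding, corpus_embeddings)  # 1x2 tensor of cosine similarities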