philtoms committed on
Commit
45384e6
·
verified ·
1 Parent(s): 520f5a2

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +41 -41
  2. requirements.txt +1 -0
app.py CHANGED
@@ -49,45 +49,45 @@ with open(data_path, "r") as f:
49
  # Pre-compute corpus embeddings
50
  import re
51
 
52
def split_into_sentences(text):
    """Break *text* into sentence-like spans.

    A "sentence" here is any run that begins with an uppercase ASCII
    letter and continues up to (and including) the first '.', '!' or
    '?'. Text outside such spans is silently dropped.

    Returns a list of matched sentence strings (empty if none match).
    """
    # NOTE(review): this heuristic misses sentences that do not start
    # with a capital letter and will split on abbreviations ("Dr.") —
    # likely acceptable for rough chunking, but confirm for this corpus.
    sentence_pattern = r'([A-Z][^.!?]*[.!?])'
    return re.findall(sentence_pattern, text)
58
-
59
def create_overlapped_chunks(corpus_documents, chunk_size=2, overlap=1):
    """Slice each document into overlapping windows of sentences.

    Each chunk is a dict with keys "text", "original_doc_idx",
    "start_sentence_idx" and "end_sentence_idx". Documents with no
    detectable sentences are skipped; documents shorter than
    *chunk_size* sentences are kept whole as a single chunk.

    NOTE(review): with non-default arguments, trailing sentences that
    do not fill a whole step are dropped, and overlap == chunk_size
    makes range() raise (step 0) — preserved as-is to keep behavior
    identical; confirm before widening usage.
    """
    chunks = []
    step = chunk_size - overlap
    for doc_idx, doc_text in enumerate(corpus_documents):
        sentences = split_into_sentences(doc_text)
        if not sentences:
            continue

        if len(sentences) < chunk_size:
            # Too short to window: emit the whole document as one chunk.
            chunks.append({
                "text": doc_text,
                "original_doc_idx": doc_idx,
                "start_sentence_idx": 0,
                "end_sentence_idx": len(sentences) - 1,
            })
            continue

        last_start = len(sentences) - chunk_size
        for start in range(0, last_start + 1, step):
            window = sentences[start:start + chunk_size]
            chunks.append({
                "text": " ".join(window),
                "original_doc_idx": doc_idx,
                "start_sentence_idx": start,
                "end_sentence_idx": start + chunk_size - 1,
            })
    return chunks
86
-
87
def process_documents_for_chunking(documents):
    """Chunk *documents* and return (chunk_metadata, chunk_texts).

    The first element is the full list of chunk dicts from
    create_overlapped_chunks; the second is the parallel list of just
    their "text" fields, convenient for batch embedding.
    """
    chunk_records = create_overlapped_chunks(documents)
    texts_only = [record["text"] for record in chunk_records]
    return chunk_records, texts_only
91
 
92
  # Pre-compute corpus embeddings
93
  original_corpus = [item["positive"] for item in dataset]
@@ -147,8 +147,8 @@ def find_similar(prompt, top_k):
147
  end_time = time.time()
148
 
149
  results = []
150
- # for doc, score in doc_score_pairs[:top_k]:
151
- for doc, score in doc_score_pairs:
152
  results.append((score, doc))
153
 
154
  return results, f"{(end_time - start_time) * 1000:.2f} ms"
 
49
  # Pre-compute corpus embeddings
50
  import re
51
 
52
+ # def split_into_sentences(text):
53
+ # """Splits a paragraph into sentences based on capitalization and punctuation."""
54
+ # # This regex looks for a capital letter, followed by anything that's not a period,
55
+ # # exclamation mark, or question mark, and then ends with one of those punctuation marks.
56
+ # sentences = re.findall(r'([A-Z][^.!?]*[.!?])', text)
57
+ # return sentences
58
+
59
+ # def create_overlapped_chunks(corpus_documents, chunk_size=2, overlap=1):
60
+ # chunked_corpus = []
61
+ # for doc_idx, doc_text in enumerate(corpus_documents):
62
+ # sentences = split_into_sentences(doc_text)
63
+ # if not sentences:
64
+ # continue
65
+
66
+ # # If there are fewer sentences than chunk_size, just use the whole document as one chunk
67
+ # if len(sentences) < chunk_size:
68
+ # chunked_corpus.append({
69
+ # "text": doc_text,
70
+ # "original_doc_idx": doc_idx,
71
+ # "start_sentence_idx": 0,
72
+ # "end_sentence_idx": len(sentences) - 1
73
+ # })
74
+ # continue
75
+
76
+ # for i in range(0, len(sentences) - chunk_size + 1, chunk_size - overlap):
77
+ # chunk_sentences = sentences[i : i + chunk_size]
78
+ # chunk_text = " ".join(chunk_sentences)
79
+ # chunked_corpus.append({
80
+ # "text": chunk_text,
81
+ # "original_doc_idx": doc_idx,
82
+ # "start_sentence_idx": i,
83
+ # "end_sentence_idx": i + chunk_size - 1
84
+ # })
85
+ # return chunked_corpus
86
+
87
+ # def process_documents_for_chunking(documents):
88
+ # chunked_corpus_data = create_overlapped_chunks(documents)
89
+ # flat_corpus_chunks = [item["text"] for item in chunked_corpus_data]
90
+ # return chunked_corpus_data, flat_corpus_chunks
91
 
92
  # Pre-compute corpus embeddings
93
  original_corpus = [item["positive"] for item in dataset]
 
147
  end_time = time.time()
148
 
149
  results = []
150
+ for doc, score in doc_score_pairs[:top_k]:
151
+ # for doc, score in doc_score_pairs:
152
  results.append((score, doc))
153
 
154
  return results, f"{(end_time - start_time) * 1000:.2f} ms"
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  gradio
2
  transformers
 
3
  torch
4
  huggingface_hub
 
1
  gradio
2
  transformers
3
+ sentence_transformers
4
  torch
5
  huggingface_hub