VyLala committed on
Commit
218ff19
·
verified ·
1 Parent(s): 86d55b0

Update data_preprocess.py

Files changed (1)
  1. data_preprocess.py +746 -745
data_preprocess.py CHANGED
@@ -1,746 +1,747 @@
- text = p.extractTextWithPDFReader()
1
+ import re
2
+ import os
3
+ #import streamlit as st
4
+ import subprocess
5
+ import re
6
+ from Bio import Entrez
7
+ from docx import Document
8
+ import fitz
9
+ import spacy
10
+ from spacy.cli import download
11
+ from NER.PDF import pdf
12
+ from NER.WordDoc import wordDoc
13
+ from NER.html import extractHTML
14
+ from NER.word2Vec import word2vec
15
+ #from transformers import pipeline
16
+ import urllib.parse, requests
17
+ from pathlib import Path
18
+ import pandas as pd
19
+ import model
20
+ import pipeline
21
+ import tempfile
22
+ import nltk
23
+ nltk.download('punkt_tab')
24
+ def download_excel_file(url, save_path="temp.xlsx"):
25
+ if "view.officeapps.live.com" in url:
26
+ parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
27
+ real_url = urllib.parse.unquote(parsed_url["src"][0])
28
+ response = requests.get(real_url)
29
+ with open(save_path, "wb") as f:
30
+ f.write(response.content)
31
+ return save_path
32
+ elif url.startswith("http") and (url.endswith(".xls") or url.endswith(".xlsx")):
33
+ response = requests.get(url)
34
+ response.raise_for_status() # Raises error if download fails
35
+ with open(save_path, "wb") as f:
36
+ f.write(response.content)
37
+ print(len(response.content))
38
+ return save_path
39
+ else:
40
+ print("URL must point directly to an .xls or .xlsx file\n or it already downloaded.")
41
+ return url
42
+ def extract_text(link,saveFolder):
43
+ try:
44
+ text = ""
45
+ name = link.split("/")[-1]
46
+ print("name: ", name)
47
+ #file_path = Path(saveFolder) / name
48
+ local_temp_path = os.path.join(tempfile.gettempdir(), name)
49
+ print("this is local temp path: ", local_temp_path)
50
+ if os.path.exists(local_temp_path):
51
+ input_to_class = local_temp_path
52
+ print("exist")
53
+ else:
54
+ #input_to_class = link # Let the class handle downloading
55
+ # 1. Check if file exists in shared Google Drive folder
56
+ file_id = pipeline.find_drive_file(name, saveFolder)
57
+ if file_id:
58
+ print("πŸ“₯ Downloading from Google Drive...")
59
+ pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
60
+ else:
61
+ print("🌐 Downloading from web link...")
62
+ response = requests.get(link)
63
+ with open(local_temp_path, 'wb') as f:
64
+ f.write(response.content)
65
+ print("βœ… Saved locally.")
66
+
67
+ # 2. Upload to Drive so it's available for later
68
+ pipeline.upload_file_to_drive(local_temp_path, name, saveFolder)
69
+
70
+ input_to_class = local_temp_path
71
+ print(input_to_class)
72
+ # pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
73
+ # pdf
74
+ if link.endswith(".pdf"):
75
+ # if file_path.is_file():
76
+ # link = saveFolder + "/" + name
77
+ # print("File exists.")
78
+ #p = pdf.PDF(local_temp_path, saveFolder)
79
+ print("inside pdf and input to class: ", input_to_class)
80
+ print("save folder in extract text: ", saveFolder)
81
+ p = pdf.PDF(input_to_class, saveFolder)
82
+ #p = pdf.PDF(link,saveFolder)
83
+ #text = p.extractTextWithPDFReader()
84
+ text = p.extractText()
85
+ print("text from pdf:")
86
+ print(text)
87
+ #text_exclude_table = p.extract_text_excluding_tables()
88
+ # worddoc
89
+ elif link.endswith(".doc") or link.endswith(".docx"):
90
+ #d = wordDoc.wordDoc(local_temp_path,saveFolder)
91
+ d = wordDoc.wordDoc(input_to_class,saveFolder)
92
+ text = d.extractTextByPage()
93
+ # html
94
+ else:
95
+ if link.split(".")[-1].lower() not in "xlsx":
96
+ if "http" in link or "html" in link:
97
+ print("html link: ", link)
98
+ html = extractHTML.HTML("",link)
99
+ text = html.getListSection() # the text is already cleaned
100
+ print("text html: ")
101
+ print(text)
102
+ # Cleanup: delete the local temp file
103
+ if name:
104
+ if os.path.exists(local_temp_path):
105
+ os.remove(local_temp_path)
106
+ print(f"🧹 Deleted local temp file: {local_temp_path}")
107
+ print("done extract text")
108
+ except:
109
+ text = ""
110
+ return text
111
+
112
+ def extract_table(link,saveFolder):
113
+ try:
114
+ table = []
115
+ name = link.split("/")[-1]
116
+ #file_path = Path(saveFolder) / name
117
+ local_temp_path = os.path.join(tempfile.gettempdir(), name)
118
+ if os.path.exists(local_temp_path):
119
+ input_to_class = local_temp_path
120
+ print("exist")
121
+ else:
122
+ #input_to_class = link # Let the class handle downloading
123
+ # 1. Check if file exists in shared Google Drive folder
124
+ file_id = pipeline.find_drive_file(name, saveFolder)
125
+ if file_id:
126
+ print("πŸ“₯ Downloading from Google Drive...")
127
+ pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
128
+ else:
129
+ print("🌐 Downloading from web link...")
130
+ response = requests.get(link)
131
+ with open(local_temp_path, 'wb') as f:
132
+ f.write(response.content)
133
+ print("βœ… Saved locally.")
134
+
135
+ # 2. Upload to Drive so it's available for later
136
+ pipeline.upload_file_to_drive(local_temp_path, name, saveFolder)
137
+
138
+ input_to_class = local_temp_path
139
+ print(input_to_class)
140
+ #pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
141
+ # pdf
142
+ if link.endswith(".pdf"):
143
+ # if file_path.is_file():
144
+ # link = saveFolder + "/" + name
145
+ # print("File exists.")
146
+ #p = pdf.PDF(local_temp_path,saveFolder)
147
+ p = pdf.PDF(input_to_class,saveFolder)
148
+ table = p.extractTable()
149
+ # worddoc
150
+ elif link.endswith(".doc") or link.endswith(".docx"):
151
+ #d = wordDoc.wordDoc(local_temp_path,saveFolder)
152
+ d = wordDoc.wordDoc(input_to_class,saveFolder)
153
+ table = d.extractTableAsList()
154
+ # excel
155
+ elif link.split(".")[-1].lower() in "xlsx":
156
+ # download the Excel file if it has not been downloaded yet
157
+ savePath = saveFolder +"/"+ link.split("/")[-1]
158
+ excelPath = download_excel_file(link, savePath)
159
+ try:
160
+ #xls = pd.ExcelFile(excelPath)
161
+ xls = pd.ExcelFile(local_temp_path)
162
+ table_list = []
163
+ for sheet_name in xls.sheet_names:
164
+ df = pd.read_excel(xls, sheet_name=sheet_name)
165
+ cleaned_table = df.fillna("").astype(str).values.tolist()
166
+ table_list.append(cleaned_table)
167
+ table = table_list
168
+ except Exception as e:
169
+ print("❌ Failed to extract tables from Excel:", e)
170
+ # html
171
+ elif "http" in link or "html" in link:
172
+ html = extractHTML.HTML("",link)
173
+ table = html.extractTable() # table is a list
174
+ table = clean_tables_format(table)
175
+ # Cleanup: delete the local temp file
176
+ if os.path.exists(local_temp_path):
177
+ os.remove(local_temp_path)
178
+ print(f"🧹 Deleted local temp file: {local_temp_path}")
179
+ except:
180
+ table = []
181
+ return table
182
+
183
+ def clean_tables_format(tables):
184
+ """
185
+ Ensures all tables are in consistent format: List[List[List[str]]]
186
+ Cleans by:
187
+ - Removing empty strings and rows
188
+ - Converting all cells to strings
189
+ - Handling DataFrames and list-of-lists
190
+ """
191
+ cleaned = []
192
+ if tables:
193
+ for table in tables:
194
+ standardized = []
195
+
196
+ # Case 1: Pandas DataFrame
197
+ if isinstance(table, pd.DataFrame):
198
+ table = table.fillna("").astype(str).values.tolist()
199
+
200
+ # Case 2: List of Lists
201
+ if isinstance(table, list) and all(isinstance(row, list) for row in table):
202
+ for row in table:
203
+ filtered_row = [str(cell).strip() for cell in row if str(cell).strip()]
204
+ if filtered_row:
205
+ standardized.append(filtered_row)
206
+
207
+ if standardized:
208
+ cleaned.append(standardized)
209
+
210
+ return cleaned
211
+
212
+ import json
213
+ def normalize_text_for_comparison(s: str) -> str:
214
+ """
215
+ Normalizes text for robust comparison by:
216
+ 1. Converting to lowercase.
217
+ 2. Replacing all types of newlines with a single consistent newline (\n).
218
+ 3. Removing extra spaces (e.g., multiple spaces, leading/trailing spaces on lines).
219
+ 4. Stripping leading/trailing whitespace from the entire string.
220
+ """
221
+ s = s.lower()
222
+ s = s.replace('\r\n', '\n') # Handle Windows newlines
223
+ s = s.replace('\r', '\n') # Handle Mac classic newlines
224
+
225
+ # Replace sequences of whitespace (including multiple newlines) with a single space
226
+ # This might be too aggressive if you need to preserve paragraph breaks,
227
+ # but good for exact word-sequence matching.
228
+ s = re.sub(r'\s+', ' ', s)
229
+
230
+ return s.strip()
231
+ def merge_text_and_tables(text, tables, max_tokens=12000, keep_tables=True, tokenizer="cl100k_base", accession_id=None, isolate=None):
232
+ """
233
+ Merge cleaned text and table into one string for LLM input.
234
+ - Avoids duplicating tables already in text
235
+ - Extracts only relevant rows from large tables
236
+ - Skips or saves oversized tables
237
+ """
238
+ import importlib
239
+ json = importlib.import_module("json")
240
+
241
+ def estimate_tokens(text_str):
242
+ try:
243
+ enc = tiktoken.get_encoding(tokenizer)
244
+ return len(enc.encode(text_str))
245
+ except:
246
+ return len(text_str) // 4 # Fallback estimate
247
+
248
+ def is_table_relevant(table, keywords, accession_id=None):
249
+ flat = " ".join(" ".join(row).lower() for row in table)
250
+ if accession_id and accession_id.lower() in flat:
251
+ return True
252
+ return any(kw.lower() in flat for kw in keywords)
253
+ preview, preview1 = "",""
254
+ llm_input = "## Document Text\n" + text.strip() + "\n"
255
+ clean_text = normalize_text_for_comparison(text)
256
+
257
+ if tables:
258
+ for idx, table in enumerate(tables):
259
+ keywords = ["province","district","region","village","location", "country", "region", "origin", "ancient", "modern"]
260
+ if accession_id: keywords += [accession_id.lower()]
261
+ if isolate: keywords += [isolate.lower()]
262
+ if is_table_relevant(table, keywords, accession_id):
263
+ if len(table) > 0:
264
+ for tab in table:
265
+ preview = " ".join(tab) if tab else ""
266
+ preview1 = "\n".join(tab) if tab else ""
267
+ clean_preview = normalize_text_for_comparison(preview)
268
+ clean_preview1 = normalize_text_for_comparison(preview1)
269
+ if clean_preview not in clean_text:
270
+ if clean_preview1 not in clean_text:
271
+ table_str = json.dumps([tab], indent=2)
272
+ llm_input += f"## Table {idx+1}\n{table_str}\n"
273
+ return llm_input.strip()
274
+
275
+ def preprocess_document(link, saveFolder, accession=None, isolate=None):
276
+ try:
277
+ text = extract_text(link, saveFolder)
278
+ print("text and link")
279
+ print(link)
280
+ print(text)
281
+ except: text = ""
282
+ try:
283
+ tables = extract_table(link, saveFolder)
284
+ except: tables = []
285
+ if accession: accession = accession
286
+ if isolate: isolate = isolate
287
+ try:
288
+ final_input = merge_text_and_tables(text, tables, max_tokens=12000, accession_id=accession, isolate=isolate)
289
+ except: final_input = ""
290
+ return text, tables, final_input
291
+
292
+ def extract_sentences(text):
293
+ sentences = re.split(r'(?<=[.!?])\s+', text)
294
+ return [s.strip() for s in sentences if s.strip()]
295
+
296
+ def is_irrelevant_number_sequence(text):
297
+ if re.search(r'\b[A-Z]{2,}\d+\b|\b[A-Za-z]+\s+\d+\b', text, re.IGNORECASE):
298
+ return False
299
+ word_count = len(re.findall(r'\b[A-Za-z]{2,}\b', text))
300
+ number_count = len(re.findall(r'\b\d[\d\.]*\b', text))
301
+ total_tokens = len(re.findall(r'\S+', text))
302
+ if total_tokens > 0 and (word_count / total_tokens < 0.2) and (number_count / total_tokens > 0.5):
303
+ return True
304
+ elif re.fullmatch(r'(\d+(\.\d+)?\s*)+', text.strip()):
305
+ return True
306
+ return False
307
+
308
+ def remove_isolated_single_digits(sentence):
309
+ tokens = sentence.split()
310
+ filtered_tokens = []
311
+ for token in tokens:
312
+ if token == '0' or token == '1':
313
+ pass
314
+ else:
315
+ filtered_tokens.append(token)
316
+ return ' '.join(filtered_tokens).strip()
317
+
318
+ def get_contextual_sentences_BFS(text_content, keyword, depth=2):
319
+ def extract_codes(sentence):
320
+ # Match codes like 'A1YU101', 'KM1', 'MO6' - at least 2 letters + numbers
321
+ return [code for code in re.findall(r'\b[A-Z]{2,}[0-9]+\b', sentence, re.IGNORECASE)]
322
+ sentences = extract_sentences(text_content)
323
+ relevant_sentences = set()
324
+ initial_keywords = set()
325
+
326
+ # Define a regex to capture codes like A1YU101 or KM1
327
+ # This pattern looks for an alphanumeric sequence followed by digits at the end of the string
328
+ code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
329
+
330
+ # Attempt to parse the keyword into its prefix and numerical part using re.search
331
+ keyword_match = code_pattern.search(keyword)
332
+
333
+ keyword_prefix = None
334
+ keyword_num = None
335
+
336
+ if keyword_match:
337
+ keyword_prefix = keyword_match.group(1).lower()
338
+ keyword_num = int(keyword_match.group(2))
339
+
340
+ for sentence in sentences:
341
+ sentence_added = False
342
+
343
+ # 1. Check for exact match of the keyword
344
+ if re.search(r'\b' + re.escape(keyword) + r'\b', sentence, re.IGNORECASE):
345
+ relevant_sentences.add(sentence.strip())
346
+ initial_keywords.add(keyword.lower())
347
+ sentence_added = True
348
+
349
+ # 2. Check for range patterns (e.g., A1YU101-A1YU137)
350
+ # The range pattern should be broad enough to capture the full code string within the range.
351
+ range_matches = re.finditer(r'([A-Z0-9]+-\d+)', sentence, re.IGNORECASE) # More specific range pattern if needed, or rely on full code pattern below
352
+ range_matches = re.finditer(r'([A-Z0-9]+\d+)-([A-Z0-9]+\d+)', sentence, re.IGNORECASE) # This is the more robust range pattern
353
+
354
+ for r_match in range_matches:
355
+ start_code_str = r_match.group(1)
356
+ end_code_str = r_match.group(2)
357
+
358
+ # CRITICAL FIX: Use code_pattern.search for start_match and end_match
359
+ start_match = code_pattern.search(start_code_str)
360
+ end_match = code_pattern.search(end_code_str)
361
+
362
+ if keyword_prefix and keyword_num is not None and start_match and end_match:
363
+ start_prefix = start_match.group(1).lower()
364
+ end_prefix = end_match.group(1).lower()
365
+ start_num = int(start_match.group(2))
366
+ end_num = int(end_match.group(2))
367
+
368
+ # Check if the keyword's prefix matches and its number is within the range
369
+ if keyword_prefix == start_prefix and \
370
+ keyword_prefix == end_prefix and \
371
+ start_num <= keyword_num <= end_num:
372
+ relevant_sentences.add(sentence.strip())
373
+ initial_keywords.add(start_code_str.lower())
374
+ initial_keywords.add(end_code_str.lower())
375
+ sentence_added = True
376
+ break # Only need to find one matching range per sentence
377
+
378
+ # 3. If the sentence was added due to exact match or range, add all its alphanumeric codes
379
+ # to initial_keywords to ensure graph traversal from related terms.
380
+ if sentence_added:
381
+ for word in extract_codes(sentence):
382
+ initial_keywords.add(word.lower())
383
+
384
+
385
+ # Build word_to_sentences mapping for all sentences
386
+ word_to_sentences = {}
387
+ for sent in sentences:
388
+ codes_in_sent = set(extract_codes(sent))
389
+ for code in codes_in_sent:
390
+ word_to_sentences.setdefault(code.lower(), set()).add(sent.strip())
391
+
392
+
393
+ # Build the graph
394
+ graph = {}
395
+ for sent in sentences:
396
+ codes = set(extract_codes(sent))
397
+ for word1 in codes:
398
+ word1_lower = word1.lower()
399
+ graph.setdefault(word1_lower, set())
400
+ for word2 in codes:
401
+ word2_lower = word2.lower()
402
+ if word1_lower != word2_lower:
403
+ graph[word1_lower].add(word2_lower)
404
+
405
+
406
+ # Perform BFS/graph traversal
407
+ queue = [(k, 0) for k in initial_keywords if k in word_to_sentences]
408
+ visited_words = set(initial_keywords)
409
+
410
+ while queue:
411
+ current_word, level = queue.pop(0)
412
+ if level >= depth:
413
+ continue
414
+
415
+ relevant_sentences.update(word_to_sentences.get(current_word, []))
416
+
417
+ for neighbor in graph.get(current_word, []):
418
+ if neighbor not in visited_words:
419
+ visited_words.add(neighbor)
420
+ queue.append((neighbor, level + 1))
421
+
422
+ final_sentences = set()
423
+ for sentence in relevant_sentences:
424
+ if not is_irrelevant_number_sequence(sentence):
425
+ processed_sentence = remove_isolated_single_digits(sentence)
426
+ if processed_sentence:
427
+ final_sentences.add(processed_sentence)
428
+
429
+ return "\n".join(sorted(list(final_sentences)))
430
+
431
+
432
+
433
+ def get_contextual_sentences_DFS(text_content, keyword, depth=2):
434
+ sentences = extract_sentences(text_content)
435
+
436
+ # Build word-to-sentences mapping
437
+ word_to_sentences = {}
438
+ for sent in sentences:
439
+ words_in_sent = set(re.findall(r'\b[A-Za-z0-9\-_\/]+\b', sent))
440
+ for word in words_in_sent:
441
+ word_to_sentences.setdefault(word.lower(), set()).add(sent.strip())
442
+
443
+ # Function to extract codes in a sentence
444
+ def extract_codes(sentence):
445
+ # Only codes like 'KSK1', 'MG272794', not pure numbers
446
+ return [code for code in re.findall(r'\b[A-Z]{2,}[0-9]+\b', sentence, re.IGNORECASE)]
447
+
448
+ # DFS with priority based on distance to keyword and early stop if country found
449
+ def dfs_traverse(current_word, current_depth, max_depth, visited_words, collected_sentences, parent_sentence=None):
450
+ country = "unknown"
451
+ if current_depth > max_depth:
452
+ return country, False
453
+
454
+ if current_word not in word_to_sentences:
455
+ return country, False
456
+
457
+ for sentence in word_to_sentences[current_word]:
458
+ if sentence == parent_sentence:
459
+ continue # avoid reusing the same sentence
460
+
461
+ collected_sentences.add(sentence)
462
+
463
+ #print("current_word:", current_word)
464
+ small_sen = extract_context(sentence, current_word, int(len(sentence) / 4))
465
+ #print(small_sen)
466
+ country = model.get_country_from_text(small_sen)
467
+ #print("small context country:", country)
468
+ if country.lower() != "unknown":
469
+ return country, True
470
+ else:
471
+ country = model.get_country_from_text(sentence)
472
+ #print("full sentence country:", country)
473
+ if country.lower() != "unknown":
474
+ return country, True
475
+
476
+ codes_in_sentence = extract_codes(sentence)
477
+ idx = next((i for i, code in enumerate(codes_in_sentence) if code.lower() == current_word.lower()), None)
478
+ if idx is None:
479
+ continue
480
+
481
+ sorted_children = sorted(
482
+ [code for code in codes_in_sentence if code.lower() not in visited_words],
483
+ key=lambda x: (abs(codes_in_sentence.index(x) - idx),
484
+ 0 if codes_in_sentence.index(x) > idx else 1)
485
+ )
486
+
487
+ #print("sorted_children:", sorted_children)
488
+ for child in sorted_children:
489
+ child_lower = child.lower()
490
+ if child_lower not in visited_words:
491
+ visited_words.add(child_lower)
492
+ country, should_stop = dfs_traverse(
493
+ child_lower, current_depth + 1, max_depth,
494
+ visited_words, collected_sentences, parent_sentence=sentence
495
+ )
496
+ if should_stop:
497
+ return country, True
498
+
499
+ return country, False
500
+
501
+ # Begin DFS
502
+ collected_sentences = set()
503
+ visited_words = set([keyword.lower()])
504
+ country, status = dfs_traverse(keyword.lower(), 0, depth, visited_words, collected_sentences)
505
+
506
+ # Filter irrelevant sentences
507
+ final_sentences = set()
508
+ for sentence in collected_sentences:
509
+ if not is_irrelevant_number_sequence(sentence):
510
+ processed = remove_isolated_single_digits(sentence)
511
+ if processed:
512
+ final_sentences.add(processed)
513
+ if not final_sentences:
514
+ return country, text_content
515
+ return country, "\n".join(sorted(list(final_sentences)))
516
+
517
+ # Helper function for normalizing text for overlap comparison
518
+ def normalize_for_overlap(s: str) -> str:
519
+ s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s).lower()
520
+ s = re.sub(r'\s+', ' ', s).strip()
521
+ return s
522
+
523
+ def merge_texts_skipping_overlap(text1: str, text2: str) -> str:
524
+ if not text1: return text2
525
+ if not text2: return text1
526
+
527
+ # Case 1: text2 is fully contained in text1 or vice-versa
528
+ if text2 in text1:
529
+ return text1
530
+ if text1 in text2:
531
+ return text2
532
+
533
+ # --- Option 1: Original behavior (suffix of text1, prefix of text2) ---
534
+ # This is what your function was primarily designed for.
535
+ # It looks for the overlap at the "junction" of text1 and text2.
536
+
537
+ max_junction_overlap = 0
538
+ for i in range(min(len(text1), len(text2)), 0, -1):
539
+ suffix1 = text1[-i:]
540
+ prefix2 = text2[:i]
541
+ # Prioritize exact match, then normalized match
542
+ if suffix1 == prefix2:
543
+ max_junction_overlap = i
544
+ break
545
+ elif normalize_for_overlap(suffix1) == normalize_for_overlap(prefix2):
546
+ max_junction_overlap = i
547
+ break # Take the first (longest) normalized match
548
+
549
+ if max_junction_overlap > 0:
550
+ merged_text = text1 + text2[max_junction_overlap:]
551
+ return re.sub(r'\s+', ' ', merged_text).strip()
552
+
553
+ # --- Option 2: Longest Common Prefix (for cases like "Hi, I am Vy.") ---
554
+ # This addresses your specific test case where the overlap is at the very beginning of both strings.
555
+ # This is often used when trying to deduplicate content that shares a common start.
556
+
557
+ longest_common_prefix_len = 0
558
+ min_len = min(len(text1), len(text2))
559
+ for i in range(min_len):
560
+ if text1[i] == text2[i]:
561
+ longest_common_prefix_len = i + 1
562
+ else:
563
+ break
564
+
565
+ # If a common prefix is found AND it's a significant portion (e.g., more than a few chars)
566
+ # AND the remaining parts are distinct, then apply this merge.
567
+ # This is a heuristic and might need fine-tuning.
568
+ if longest_common_prefix_len > 0 and \
569
+ text1[longest_common_prefix_len:].strip() and \
570
+ text2[longest_common_prefix_len:].strip():
571
+
572
+ # Only merge this way if the remaining parts are not empty (i.e., not exact duplicates)
573
+ # For "Hi, I am Vy. Nice to meet you." and "Hi, I am Vy. Goodbye Vy."
574
+ # common prefix is "Hi, I am Vy."
575
+ # Remaining text1: " Nice to meet you."
576
+ # Remaining text2: " Goodbye Vy."
577
+ # So we merge common_prefix + remaining_text1 + remaining_text2
578
+
579
+ common_prefix_str = text1[:longest_common_prefix_len]
580
+ remainder_text1 = text1[longest_common_prefix_len:]
581
+ remainder_text2 = text2[longest_common_prefix_len:]
582
+
583
+ merged_text = common_prefix_str + remainder_text1 + remainder_text2
584
+ return re.sub(r'\s+', ' ', merged_text).strip()
585
+
586
+
587
+ # If neither specific overlap type is found, just concatenate
588
+ merged_text = text1 + text2
589
+ return re.sub(r'\s+', ' ', merged_text).strip()
590
+
591
+ from docx import Document
592
+ from pipeline import upload_file_to_drive
593
+ # def save_text_to_docx(text_content: str, file_path: str):
594
+ # """
595
+ # Saves a given text string into a .docx file.
596
+
597
+ # Args:
598
+ # text_content (str): The text string to save.
599
+ # file_path (str): The full path including the filename where the .docx file will be saved.
600
+ # Example: '/content/drive/MyDrive/CollectData/Examples/test/SEA_1234/merged_document.docx'
601
+ # """
602
+ # try:
603
+ # document = Document()
604
+
605
+ # # Add the entire text as a single paragraph, or split by newlines for multiple paragraphs
606
+ # for paragraph_text in text_content.split('\n'):
607
+ # document.add_paragraph(paragraph_text)
608
+
609
+ # document.save(file_path)
610
+ # print(f"Text successfully saved to '{file_path}'")
611
+ # except Exception as e:
612
+ # print(f"Error saving text to docx file: {e}")
613
+ # def save_text_to_docx(text_content: str, filename: str, drive_folder_id: str):
614
+ # """
615
+ # Saves a given text string into a .docx file locally, then uploads to Google Drive.
616
+
617
+ # Args:
618
+ # text_content (str): The text string to save.
619
+ # filename (str): The target .docx file name, e.g. 'BRU18_merged_document.docx'.
620
+ # drive_folder_id (str): Google Drive folder ID where to upload the file.
621
+ # """
622
+ # try:
623
+ # # ✅ Save to temporary local path first
624
+ # print("file name: ", filename)
625
+ # print("length text content: ", len(text_content))
626
+ # local_path = os.path.join(tempfile.gettempdir(), filename)
627
+ # document = Document()
628
+ # for paragraph_text in text_content.split('\n'):
629
+ # document.add_paragraph(paragraph_text)
630
+ # document.save(local_path)
631
+ # print(f"βœ… Text saved locally to: {local_path}")
632
+
633
+ # # ✅ Upload to Drive
634
+ # pipeline.upload_file_to_drive(local_path, filename, drive_folder_id)
635
+ # print(f"βœ… Uploaded '{filename}' to Google Drive folder ID: {drive_folder_id}")
636
+
637
+ # except Exception as e:
638
+ # print(f"❌ Error saving or uploading DOCX: {e}")
639
+ def save_text_to_docx(text_content: str, full_local_path: str):
640
+ document = Document()
641
+ for paragraph_text in text_content.split('\n'):
642
+ document.add_paragraph(paragraph_text)
643
+ document.save(full_local_path)
644
+ print(f"βœ… Saved DOCX locally: {full_local_path}")
645
+
646
+
647
+
648
+ '''2 scenarios:
649
+ - quick look, then if found, deep-dive and directly get the location, then stop
650
+ - quick look, then if found, deep-dive but no location is found; hold the related words, then
651
+ look through the other files iteratively for each related word, find the location, and stop'''
652
+ def extract_context(text, keyword, window=500):
653
+ # firstly try accession number
654
+ code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
655
+
656
+ # Attempt to parse the keyword into its prefix and numerical part using re.search
657
+ keyword_match = code_pattern.search(keyword)
658
+
659
+ keyword_prefix = None
660
+ keyword_num = None
661
+
662
+ if keyword_match:
663
+ keyword_prefix = keyword_match.group(1).lower()
664
+ keyword_num = int(keyword_match.group(2))
665
+ text = text.lower()
666
+ idx = text.find(keyword.lower())
667
+ if idx == -1:
668
+ if keyword_prefix:
669
+ idx = text.find(keyword_prefix)
670
+ if idx == -1:
671
+ return "Sample ID not found."
672
+ return text[max(0, idx-window): idx+window]
673
+ return text[max(0, idx-window): idx+window]
674
+ def process_inputToken(filePaths, saveLinkFolder,accession=None, isolate=None):
675
+ cache = {}
676
+ country = "unknown"
677
+ output = ""
678
+ tem_output, small_output = "",""
679
+ keyword_appear = (False,"")
680
+ keywords = []
681
+ if isolate: keywords.append(isolate)
682
+ if accession: keywords.append(accession)
683
+ for f in filePaths:
684
+ # scenario 1: direct location: truncate the context and then use the QA model?
685
+ if keywords:
686
+ for keyword in keywords:
687
+ text, tables, final_input = preprocess_document(f,saveLinkFolder, isolate=keyword)
688
+ if keyword in final_input:
689
+ context = extract_context(final_input, keyword)
690
+ # quick look if country already in context and if yes then return
691
+ country = model.get_country_from_text(context)
692
+ if country != "unknown":
693
+ return country, context, final_input
694
+ else:
695
+ country = model.get_country_from_text(final_input)
696
+ if country != "unknown":
697
+ return country, context, final_input
698
+ else: # might be cross-ref
699
+ keyword_appear = (True, f)
700
+ cache[f] = context
701
+ small_output = merge_texts_skipping_overlap(output, context) + "\n"
702
+ chunkBFS = get_contextual_sentences_BFS(small_output, keyword)
703
+ countryBFS = model.get_country_from_text(chunkBFS)
704
+ countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
705
+ output = merge_texts_skipping_overlap(output, final_input)
706
+ if countryDFS != "unknown" and countryBFS != "unknown":
707
+ if len(chunkDFS) <= len(chunkBFS):
708
+ return countryDFS, chunkDFS, output
709
+ else:
710
+ return countryBFS, chunkBFS, output
711
+ else:
712
+ if countryDFS != "unknown":
713
+ return countryDFS, chunkDFS, output
714
+ if countryBFS != "unknown":
715
+ return countryBFS, chunkBFS, output
716
+ else:
717
+ # scenario 2:
718
+ '''cross-ref: e.g. the A1YU101 keyword is in file 2, which includes KM1, but KM1 is in file 1;
719
+ but if we look at file 1 first then maybe we can build a lookup dict by country,
720
+ such as Thailand as the key and its re'''
721
+ cache[f] = final_input
722
+ if keyword_appear[0] == True:
723
+ for c in cache:
724
+ if c!=keyword_appear[1]:
725
+ if cache[c].lower() not in output.lower():
726
+ output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
727
+ chunkBFS = get_contextual_sentences_BFS(output, keyword)
728
+ countryBFS = model.get_country_from_text(chunkBFS)
729
+ countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
730
+ if countryDFS != "unknown" and countryBFS != "unknown":
731
+ if len(chunkDFS) <= len(chunkBFS):
732
+ return countryDFS, chunkDFS, output
733
+ else:
734
+ return countryBFS, chunkBFS, output
735
+ else:
736
+ if countryDFS != "unknown":
737
+ return countryDFS, chunkDFS, output
738
+ if countryBFS != "unknown":
739
+ return countryBFS, chunkBFS, output
740
+ else:
741
+ if cache[f].lower() not in output.lower():
742
+ output = merge_texts_skipping_overlap(output, cache[f]) + "\n"
743
+ if len(output) == 0 or keyword_appear[0]==False:
744
+ for c in cache:
745
+ if cache[c].lower() not in output.lower():
746
+ output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
747
  return country, "", output
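
The only functional change in this commit is in the PDF branch of extract_text(): the call to p.extractTextWithPDFReader() is commented out and replaced with p.extractText(). A minimal sketch of that branch in isolation, assuming the NER.PDF.pdf.PDF interface shown above and using hypothetical local paths:

# Sketch only: exercises the updated PDF branch of extract_text(),
# assuming NER.PDF.pdf.PDF exposes extractText() as used in this commit.
from NER.PDF import pdf

local_temp_path = "/tmp/example_paper.pdf"  # hypothetical downloaded PDF
save_folder = "drive-folder-id"             # hypothetical Drive folder id used by the pipeline
p = pdf.PDF(local_temp_path, save_folder)
text = p.extractText()                      # replaces p.extractTextWithPDFReader()
print(text)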