update new codes
- DefaultPackages/__pycache__/__init__.cpython-310.pyc +0 -0
- DefaultPackages/__pycache__/__init__.cpython-311.pyc +0 -0
- DefaultPackages/__pycache__/openFile.cpython-310.pyc +0 -0
- DefaultPackages/__pycache__/openFile.cpython-311.pyc +0 -0
- DefaultPackages/__pycache__/saveFile.cpython-310.pyc +0 -0
- DefaultPackages/__pycache__/saveFile.cpython-311.pyc +0 -0
- NER/PDF/__pycache__/pdf.cpython-310.pyc +0 -0
- NER/PDF/__pycache__/pdf.cpython-311.pyc +0 -0
- NER/WordDoc/__pycache__/wordDoc.cpython-310.pyc +0 -0
- NER/WordDoc/__pycache__/wordDoc.cpython-311.pyc +0 -0
- NER/__pycache__/cleanText.cpython-310.pyc +0 -0
- NER/__pycache__/cleanText.cpython-311.pyc +0 -0
- NER/html/__pycache__/extractHTML.cpython-310.pyc +0 -0
- NER/html/__pycache__/extractHTML.cpython-311.pyc +0 -0
- NER/word2Vec/__pycache__/word2vec.cpython-310.pyc +0 -0
- NER/word2Vec/__pycache__/word2vec.cpython-311.pyc +0 -0
- NER/word2Vec/heuristic.py +52 -0
- NER/word2Vec/testModel/test_model.model +3 -0
- NER/word2Vec/testModel/test_model.txt +25 -0
- NER/word2Vec/testModel/test_model_updated.model +3 -0
- NER/word2Vec/word2vec.py +539 -101
- README.md +74 -15
- accessions.csv +6 -0
- accessions.xlsx +0 -0
- app.py +396 -36
- data/user_fb/feedback_mtdna.xlsx +0 -0
- env.yaml +8 -0
- installedAndUsedRequirements.txt +637 -0
- mtdna_backend.py +252 -0
- mtdna_classifier.py +519 -322
- mtdna_ui.py +210 -0
- output.json +276 -0
- output.txt +176 -0
- requirements.txt +29 -24
- setup.sh +8 -8
- standardize_location.py +74 -0
- upgradeClassify.py +276 -0
DefaultPackages/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/DefaultPackages/__pycache__/__init__.cpython-310.pyc and b/DefaultPackages/__pycache__/__init__.cpython-310.pyc differ
DefaultPackages/__pycache__/__init__.cpython-311.pyc
CHANGED
Binary files a/DefaultPackages/__pycache__/__init__.cpython-311.pyc and b/DefaultPackages/__pycache__/__init__.cpython-311.pyc differ
DefaultPackages/__pycache__/openFile.cpython-310.pyc
CHANGED
Binary files a/DefaultPackages/__pycache__/openFile.cpython-310.pyc and b/DefaultPackages/__pycache__/openFile.cpython-310.pyc differ
DefaultPackages/__pycache__/openFile.cpython-311.pyc
CHANGED
Binary files a/DefaultPackages/__pycache__/openFile.cpython-311.pyc and b/DefaultPackages/__pycache__/openFile.cpython-311.pyc differ
DefaultPackages/__pycache__/saveFile.cpython-310.pyc
CHANGED
Binary files a/DefaultPackages/__pycache__/saveFile.cpython-310.pyc and b/DefaultPackages/__pycache__/saveFile.cpython-310.pyc differ
DefaultPackages/__pycache__/saveFile.cpython-311.pyc
CHANGED
Binary files a/DefaultPackages/__pycache__/saveFile.cpython-311.pyc and b/DefaultPackages/__pycache__/saveFile.cpython-311.pyc differ
NER/PDF/__pycache__/pdf.cpython-310.pyc
ADDED
Binary file (4.27 kB).
NER/PDF/__pycache__/pdf.cpython-311.pyc
CHANGED
Binary files a/NER/PDF/__pycache__/pdf.cpython-311.pyc and b/NER/PDF/__pycache__/pdf.cpython-311.pyc differ
NER/WordDoc/__pycache__/wordDoc.cpython-310.pyc
ADDED
Binary file (3.76 kB).
NER/WordDoc/__pycache__/wordDoc.cpython-311.pyc
CHANGED
Binary files a/NER/WordDoc/__pycache__/wordDoc.cpython-311.pyc and b/NER/WordDoc/__pycache__/wordDoc.cpython-311.pyc differ
NER/__pycache__/cleanText.cpython-310.pyc
ADDED
Binary file (3.42 kB).
NER/__pycache__/cleanText.cpython-311.pyc
CHANGED
Binary files a/NER/__pycache__/cleanText.cpython-311.pyc and b/NER/__pycache__/cleanText.cpython-311.pyc differ
NER/html/__pycache__/extractHTML.cpython-310.pyc
ADDED
Binary file (5.07 kB).
NER/html/__pycache__/extractHTML.cpython-311.pyc
CHANGED
Binary files a/NER/html/__pycache__/extractHTML.cpython-311.pyc and b/NER/html/__pycache__/extractHTML.cpython-311.pyc differ
NER/word2Vec/__pycache__/word2vec.cpython-310.pyc
ADDED
Binary file (7.81 kB).
NER/word2Vec/__pycache__/word2vec.cpython-311.pyc
CHANGED
Binary files a/NER/word2Vec/__pycache__/word2vec.cpython-311.pyc and b/NER/word2Vec/__pycache__/word2vec.cpython-311.pyc differ
NER/word2Vec/heuristic.py
ADDED
@@ -0,0 +1,52 @@
import logging
from datetime import datetime

class HeuristicManager:
    def __init__(self, model, log_file="heuristic_log.txt", min_similarity_threshold=0.5, min_new_data_len=50):
        self.model = model
        self.min_similarity_threshold = min_similarity_threshold
        self.min_new_data_len = min_new_data_len
        self.log_file = log_file
        logging.basicConfig(filename=self.log_file, level=logging.INFO)

    def log(self, message):
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        logging.info(f"[{timestamp}] {message}")
        print(f"[{timestamp}] {message}")

    def check_similarity(self, test_terms):
        triggers = []
        for term in test_terms:
            try:
                sim = self.model.wv.most_similar(term)[0][1]
                if sim < self.min_similarity_threshold:
                    triggers.append(f"Low similarity for '{term}': {sim}")
            except KeyError:
                triggers.append(f"'{term}' not in vocabulary")
        return triggers

    def check_metadata(self, metadata):
        triggers = []
        if any(keyword in str(metadata).lower() for keyword in ["haplogroup b", "eastasia", "asian"]):
            triggers.append("Detected new haplogroup or regional bias: 'Asian' or 'B'")
        return triggers

    def check_new_data_volume(self, new_data):
        if len(new_data) < self.min_new_data_len:
            return ["Not enough new data to justify retraining"]
        return []

    def should_retrain(self, test_terms, new_data, metadata):
        triggers = []
        triggers += self.check_similarity(test_terms)
        triggers += self.check_metadata(metadata)
        triggers += self.check_new_data_volume(new_data)

        if triggers:
            self.log("Retraining triggered due to:")
            for trigger in triggers:
                self.log(f" - {trigger}")
            return True
        else:
            self.log("No retraining needed.")
            return False
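A minimal usage sketch of the new heuristic module (not part of the commit): it assumes the repository layout shown above, that the LFS model objects have been pulled, and uses illustrative probe terms, sentences, and metadata.

# Hypothetical example: decide whether the test model should be retrained.
from gensim.models.word2vec import Word2Vec
from NER.word2Vec.heuristic import HeuristicManager

w2v_model = Word2Vec.load("NER/word2Vec/testModel/test_model.model")
manager = HeuristicManager(w2v_model, log_file="heuristic_log.txt")

new_sentences = [["mtdna", "sample", "collected", "vietnam"]] * 60   # pretend 60 new token lists arrived
sample_metadata = {"origin": "EastAsia", "haplogroup": "B"}          # free-text sample metadata

if manager.should_retrain(test_terms=["mtdna", "haplogroup"],
                          new_data=new_sentences,
                          metadata=sample_metadata):
    # continue training the existing model on the new corpus
    w2v_model.build_vocab(new_sentences, update=True)
    w2v_model.train(new_sentences, total_examples=len(new_sentences), epochs=5)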
NER/word2Vec/testModel/test_model.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:734185116a1d2099dba0d04efc0eb1b7e0e8213fe1259b57bbcb7aaac3cd46ea
size 133
NER/word2Vec/testModel/test_model.txt
ADDED
@@ -0,0 +1,25 @@
24 100
dna -0.0005385255 0.0002430238 0.005111818 0.009016951 -0.009293036 -0.007109866 0.0064572324 0.008987154 -0.0050192317 -0.0037659889 0.0073785 -0.0015431087 -0.0045221853 0.006557529 -0.004854595 -0.0018278129 0.002881375 0.0010002495 -0.00829578 -0.009462763 0.007312361 0.0050688535 0.0067577288 0.0007685764 0.006347226 -0.003397316 -0.0009421973 0.0057741464 -0.007532499 -0.0039303782 -0.0075064874 -0.0009439946 0.009533595 -0.0073319245 -0.002333888 -0.0019326513 0.0080786925 -0.005930193 3.549824e-05 -0.00475331 -0.0095964745 0.005000012 -0.008770563 -0.0043735923 -2.9246534e-05 -0.00030931013 -0.007669701 0.009599569 0.004982613 0.009233704 -0.008148657 0.004488859 -0.0041414667 0.00081141765 0.008487031 -0.00446156 0.0045125154 -0.006793622 -0.0035560841 0.009394251 -0.0015774865 0.00032431752 -0.004129968 -0.0076763057 -0.0015165819 0.0024841889 -0.00088440755 0.0055526863 -0.0027446826 0.002259023 0.0054701897 0.008356409 -0.0014508999 -0.009201209 0.004375452 0.00058271736 0.0074576377 -0.00080706284 -0.0026372937 -0.008752899 -0.00087625836 0.00282087 0.005398569 0.0070530027 -0.0057170955 0.0018605916 0.006099475 -0.0048024287 -0.003104349 0.0067992285 0.0016360026 0.00019302641 0.00348545 0.00021818833 0.009630539 0.0050670514 -0.008908632 -0.007042295 0.0009007676 0.0063867364
from -0.00861988 0.0036778022 0.005193427 0.005744547 0.0074751326 -0.0061739217 0.0011082628 0.0060625207 -0.0028567386 -0.006184132 -0.00041290926 -0.008384168 -0.0055893976 0.007104685 0.003362318 0.007228353 0.0068033817 0.007533677 -0.003792071 -0.000581891 0.0023577819 -0.0045196284 0.008395244 -0.009858517 0.006761404 0.0029261683 -0.004930935 0.0043925527 -0.0017370671 0.006713542 0.009974645 -0.0043735756 -0.0006050642 -0.005716478 0.003858548 0.002799571 0.00690247 0.00610934 0.009526547 0.009269763 0.007910428 -0.007008808 -0.00916451 -0.00033672128 -0.0030898354 0.007890073 0.005923819 -0.001552973 0.001516021 0.0017856265 0.007822941 -0.009514211 -0.00020886083 0.0034666678 -0.00094713847 0.008384139 0.009009283 0.0065234327 -0.0007208324 0.007705209 -0.00853289 0.0032079336 -0.004625999 -0.0050743804 0.0035901158 0.005388813 0.007766254 -0.005744939 0.0074327383 0.006626378 -0.003704473 -0.008735958 0.005445474 0.0065230317 -0.000784768 -0.006700798 -0.007075852 -0.002488528 0.0051543443 -0.0036620772 -0.00938257 0.003815971 0.004890136 -0.0064404616 0.0012033634 -0.0020763231 2.994902e-05 -0.0098790005 0.002700701 -0.004756241 0.0011076172 -0.0015674155 0.0022046466 -0.00787344 -0.0027070795 0.002668326 0.0053478787 -0.002396734 -0.009512201 0.0045024394
mtdna 8.645293e-05 0.003076037 -0.006815487 -0.0013743688 0.0076927417 0.0073529496 -0.0036715195 0.0026677884 -0.008309281 0.00619759 -0.00463892 -0.0031715294 0.009313415 0.00088058383 0.0074962615 -0.00608139 0.005167896 0.009930803 -0.008471472 -0.0051321597 -0.007057574 -0.0048644566 -0.003772668 -0.008518714 0.0079532955 -0.0048361127 0.008438283 0.005270068 -0.0065578814 0.0039592343 0.005482614 -0.007444929 -0.0074228924 -0.002492343 -0.008628872 -0.0015748737 -0.00038757667 0.0032959366 0.0014325404 -0.00088083016 -0.005591098 0.0017297626 -0.00089552783 0.0068030986 0.0039881677 0.004533183 0.0014284542 -0.0027126821 -0.0043595196 -0.0010315293 0.0014437438 -0.0026617546 -0.0070882514 -0.007825746 -0.009136036 -0.005931676 -0.001850123 -0.004323682 -0.0064626597 -0.0037265678 0.004296681 -0.0037233941 0.008404572 0.001539496 -0.007246572 0.009443451 0.007636867 0.0055208146 -0.0068550883 0.0058190743 0.004034045 0.005188155 0.0042629624 0.0019477821 -0.003167882 0.008342064 0.009619138 0.0038047181 -0.0028461283 5.6938893e-07 0.0012001555 -0.0084682545 -0.008234347 -0.00023238244 0.0012304098 -0.005750644 -0.0047139754 -0.0073490315 0.008316314 0.00010242269 -0.004513882 0.005704978 0.009199796 -0.004097329 0.007985275 0.005386452 0.0058861696 0.0005043713 0.008208188 -0.0070221694
in -0.008226077 0.009303831 -0.00018710589 -0.0019704443 0.0046143015 -0.004104392 0.0027394402 0.006979235 0.0060486975 -0.0075411424 0.00939576 0.00465202 0.004012172 -0.006245291 0.008499353 -0.002164537 0.008836197 -0.005347778 -0.008136817 0.006804632 0.0016640095 -0.0022142953 0.009522269 0.009494823 -0.0097868545 0.0025105644 0.0061560757 0.0038842657 0.0020310257 0.00043876152 0.00068163266 -0.0038464246 -0.007141551 -0.0020813115 0.003930752 0.008838634 0.009274302 -0.0059668766 -0.009419525 0.009759848 0.0034291998 0.005158939 0.006265811 -0.0027623416 0.007310359 0.0027998323 0.0028576967 -0.0023982434 -0.003139742 -0.0023701421 0.0042809984 4.8589092e-05 -0.009614385 -0.00968607 -0.006160773 -0.00011437661 0.0019819876 0.009428 0.0056011924 -0.004298171 0.00026028603 0.004974084 0.007744428 -0.001135339 0.004278759 -0.0057750097 -0.0008068469 0.00811882 -0.002369315 -0.009674972 0.0058119837 -0.0039038642 -0.001220125 0.010017389 -0.002241946 -0.0047185957 -0.0053141676 0.0069846674 -0.005741993 0.002120917 -0.0052751247 0.00613608 0.0043662013 0.0026298608 -0.0015129133 -0.002735619 0.008999614 0.0052172863 -0.0021470466 -0.009465257 -0.007413552 -0.0010587372 -0.00078251073 -0.0025414668 0.009710779 -0.00044944565 0.005915 -0.007467981 -0.0024928953 -0.005583053
european -0.007147033 0.0012623417 -0.007189088 -0.0022513974 0.0037773554 0.005857864 0.0012027922 0.0021598793 -0.004109796 0.007198152 -0.006319537 0.0046250015 -0.008186181 0.0020334523 -0.0049318667 -0.0042960607 -0.0030848773 0.0056965156 0.0057683894 -0.004991361 0.00076802005 -0.008515792 0.0078122346 0.009295911 -0.002746969 0.0008081935 0.0007694419 0.00550255 -0.008630911 0.0006062931 0.0068933573 0.0021813295 0.0010798875 -0.009366349 0.008471645 -0.006258249 -0.0029761735 0.0035168754 -0.00078163494 0.0014152499 0.0017921324 -0.006839617 -0.009737293 0.009092817 0.0062128166 -0.00694695 0.0033956417 0.00017217748 0.004755041 -0.0071203653 0.004067516 0.004303939 0.009927 -0.0045391554 -0.0014395243 -0.0073114103 -0.009704934 -0.009090646 -0.0010375449 -0.0065315044 0.0048550633 -0.006148244 0.0026037877 0.000752482 -0.0034296552 -0.00092229253 0.010017935 0.009206015 -0.004494388 0.009070265 -0.0055859834 0.0059493524 -0.0030818144 0.0034673577 0.003029479 0.0069394265 -0.0023470228 0.008820008 0.0075530927 -0.009551933 -0.008064042 -0.007652859 0.0029148757 -0.0027951996 -0.00694831 -0.008136711 0.008356287 0.0019903474 -0.00933717 -0.004817203 0.0031394493 -0.0046995636 0.005327329 -0.0042287502 0.0027155946 -0.008033582 0.0062630265 0.0047997306 0.00079031993 0.0029888113
common -0.008722234 0.0021272295 -0.0008539916 -0.009321866 -0.0094246445 -0.001412531 0.0044288053 0.00372704 -0.006505282 -0.006894708 -0.0049991854 -0.0023061878 -0.007229156 -0.009607243 -0.0027377736 -0.008360431 -0.0060269493 -0.005675304 -0.00234906 -0.0017278373 -0.008954683 -0.000731004 0.008155364 0.007693106 -0.007208155 -0.003644954 0.0031189725 -0.009568674 0.0014795078 0.0065395026 0.0057490384 -0.008770905 -0.0045228535 -0.008156553 4.5400484e-05 0.00927559 0.005980464 0.0050585535 0.0050439127 -0.0032448657 0.009562716 -0.0073605715 -0.0072781076 -0.002255642 -0.00077679846 -0.0032283778 -0.00060498127 0.007476424 -0.00070291053 -0.0016193221 0.002749461 -0.008367007 0.0078366995 0.008528508 -0.009591924 0.0024459555 0.009891981 -0.007673955 -0.006969234 -0.0077365288 0.008389148 -0.00067644875 0.009162579 -0.008137346 0.0037369097 0.0026538277 0.0007320811 0.002340243 -0.007473436 -0.009367513 0.0023810826 0.0061679846 0.007993824 0.005740968 -0.00078188477 0.008307063 -0.009312772 0.0033975116 0.00027130058 0.003872196 0.007375048 -0.0067289495 0.005584901 -0.0095183 -0.0008194822 -0.008691651 -0.0050952802 0.009296191 -0.0018460032 0.0029113942 0.009088126 0.008946764 -0.008196811 -0.0030016953 0.009896215 0.005113277 -0.0015862831 -0.008699891 0.0029696936 -0.0066840183
sequence 0.008134779 -0.0044588344 -0.0010699655 0.001010431 -0.00018677961 0.0011458534 0.0061133304 -1.2402037e-05 -0.0032534893 -0.0015101052 0.0058955555 0.0015073137 -0.0007181427 0.009341042 -0.004917502 -0.0008413052 0.009177319 0.0067567485 0.0015022643 -0.0088886535 0.0011522508 -0.0022903979 0.009365224 0.0012041465 0.0014943897 0.0024040388 -0.0018358674 -0.004996856 0.00023002276 -0.0020175653 0.0066060103 0.008935089 -0.0006746635 0.0029776676 -0.0061099143 0.0017025766 -0.006924371 -0.008690522 -0.005899618 -0.008961226 0.0072769034 -0.005776607 0.00827455 -0.007233702 0.003422895 0.009676102 -0.0077943387 -0.009949275 -0.0043248134 -0.0026828882 -0.0002740396 -0.008833413 -0.008620106 0.0027985822 -0.008205106 -0.009067738 -0.0023404285 -0.00863584 -0.007056119 -0.008398832 -0.0003011976 -0.0045611723 0.006630901 0.0015288803 -0.0033471577 0.006116343 -0.0060124504 -0.004648673 -0.0072044823 -0.0043340866 -0.0018032556 0.00649206 -0.0027680297 0.004921421 0.006912646 -0.007459126 0.004573438 0.006129695 -0.002956148 0.0066218316 0.006121442 -0.0064460207 -0.0067676785 0.002543585 -0.0016248615 -0.006062931 0.009498339 -0.005135456 -0.006549685 -0.000118091535 -0.002699267 0.00044816377 -0.0035289875 -0.00041692218 -0.00070437486 0.00083035015 0.0081978375 -0.005737508 -0.0016556873 0.005569238
bru18 0.008155276 -0.0044185193 0.008987652 0.008259665 -0.0044238693 0.00031090993 0.004277394 -0.0039252234 -0.0055654007 -0.006509729 -0.0006656875 -0.00030213682 0.004489389 -0.0024855223 -0.00015437756 0.0024471143 0.0048732683 -2.8606542e-05 -0.0063628056 -0.009279111 1.8654398e-05 0.006667726 0.0014650559 -0.0089674555 -0.007945727 0.006548857 -0.0037690091 0.006254232 -0.0067004655 0.008482541 -0.0065189763 0.0032740948 -0.001067833 -0.0067885593 -0.0032949874 -0.0011434925 -0.005471747 -0.001204045 -0.0075744605 0.0026601462 0.009080238 -0.0023750134 -0.0009867329 0.0035252234 0.008680149 -0.0059299506 -0.006889695 -0.002942458 0.00913801 0.0008666254 -0.008663911 -0.001442217 0.009477263 -0.0075691855 -0.0053729587 0.009308613 -0.008970956 0.0038234547 0.00065334333 0.0066515543 0.008311967 -0.002862157 -0.003982641 0.008891435 0.0020839446 0.0062542376 -0.009450494 0.0095988605 -0.0013514485 -0.006062315 0.0029950105 -0.0004512243 0.0047055846 -0.0022705523 -0.004145877 0.0022992992 0.008370594 -0.004990823 0.0026696166 -0.00798221 -0.0067810714 -0.000469271 -0.008768882 0.0027844147 0.0015907697 -0.0023179457 0.005011737 0.009743466 0.008472866 -0.001870301 0.0020416898 -0.0039901678 -0.008234559 0.0062697986 -0.0019247098 -0.00066059735 -0.0017619281 -0.004536765 0.004069 -0.0042896206
bru50 -0.009579504 0.008948466 0.0041579367 0.00923892 0.006649052 0.0029269105 0.009801864 -0.0044190143 -0.0068119396 0.004226486 0.0037328962 -0.005664456 0.009715384 -0.0035591167 0.009558758 0.00083636935 -0.006334789 -0.0019748765 -0.007390546 -0.002990235 0.0010405012 0.009480547 0.009361016 -0.0065955063 0.0034724285 0.0022746115 -0.0024764987 -0.009228658 0.0010185506 -0.008164371 0.0063289437 -0.0058100903 0.005530614 0.009826734 -0.00015984276 0.0045368825 -0.0018012718 0.0073676347 0.0039300686 -0.0090082595 -0.0023973046 0.0036249864 -0.00010732573 -0.0011888575 -0.0010430571 -0.0016724848 0.00059902505 0.0041630277 -0.004250072 -0.0038341933 -5.2427928e-05 0.00026678806 -0.00017553278 -0.0047934647 0.0043008197 -0.002173452 0.0020970574 0.00065915886 0.005959963 -0.0068526124 -0.00680708 -0.004473089 0.009448878 -0.001590459 -0.009438289 -0.000534792 -0.0044530216 0.0060103727 -0.009585406 0.002857136 -0.009246552 0.001258808 0.0059965253 0.0074065947 -0.007623657 -0.0060443347 -0.006831209 -0.007910946 -0.009496376 -0.0021281417 -0.0008362788 -0.007265241 0.0067816544 0.0011141741 0.0058228294 0.0014675015 0.00078702695 -0.007366497 -0.0021715113 0.0043177926 -0.005089294 0.001137756 0.0028883398 -0.0015285894 0.009943532 0.008348668 0.0024183327 0.007110643 0.005890512 -0.005592114
vietnam -0.005153963 -0.0066644135 -0.007776157 0.0083126435 -0.0019782323 -0.006856599 -0.004155673 0.0051580225 -0.0028790692 -0.0037560624 0.0016262402 -0.00278304 -0.001570952 0.0010760438 -0.002967586 0.008515032 0.003917556 -0.009953211 0.0062494674 -0.0067655 0.00076895714 0.0043992978 -0.005096968 -0.0021128112 0.00809259 -0.0042428537 -0.0076304777 0.009258844 -0.0021577128 -0.004717085 0.008580298 0.004269408 0.004324098 0.009280228 -0.008452614 0.0052631963 0.0020472223 0.004193831 0.0016919046 0.004460046 0.0044873925 0.0060984488 -0.0032084621 -0.0045590503 -0.0004232687 0.002529075 -0.0032731881 0.006051339 0.0041546253 0.00776509 0.002568826 0.008108382 -0.0013972289 0.008070817 0.003707151 -0.008045609 -0.00393531 -0.0024772724 0.004889826 -0.00087688275 -0.00282919 0.007839672 0.009338199 -0.0016121961 -0.0051723607 -0.0046861414 -0.0048465827 -0.0095901145 0.0013706182 -0.0042283125 0.002539541 0.0056244545 -0.00406352 -0.009583576 0.0015531465 -0.006689678 0.0025049727 -0.0037749638 0.007073151 0.00063951715 0.0035553342 -0.0027433916 -0.001711565 0.007655947 0.0014000075 -0.005851 -0.007834303 0.0012315387 0.006458937 0.0055561876 -0.00897213 0.008598417 0.0040550055 0.007476387 0.00975736 -0.007282407 -0.009030263 0.0058277464 0.009392481 0.0034955258
sample 0.007100903 -0.0015709094 0.007947078 -0.00948947 -0.00802812 -0.006650821 -0.004002562 0.00500194 -0.0038224515 -0.008330948 0.00841617 -0.0037529538 0.008619977 -0.004892141 0.003931126 0.004920354 0.0023956115 -0.0028135795 0.0028564015 -0.008257614 -0.0027645228 -0.0026008752 0.007249391 -0.0034709626 -0.0066022277 0.0043369113 -0.0004823991 -0.0035912786 0.006893536 0.003869671 -0.0038965137 0.0007677057 0.009145668 0.0077625574 0.0063656354 0.004670941 0.0023901698 -0.0018358309 -0.006370667 -0.00030689163 -0.0015674513 -0.00057719386 -0.0062623145 0.0074473424 -0.0066001806 -0.007243944 -0.0027626618 -0.0015170419 -0.007635178 0.0006969715 -0.005330137 -0.0012829994 -0.007370956 0.0019601034 0.003276234 -1.4737604e-05 -0.005451358 -0.001723771 0.00709824 0.003738 -0.008888436 -0.0034084066 0.0023648455 0.0021412992 -0.009477984 0.004583573 -0.008656226 -0.007383396 0.0034825006 -0.0034719554 0.0035707187 0.008896884 -0.003571185 0.009332037 0.0017215977 0.009857596 0.005704204 -0.009146731 -0.0033407472 0.0065290304 0.0055978918 0.008714949 0.0069304765 0.008049887 -0.009821734 0.004303451 -0.0050309277 0.0035138857 0.0060621244 0.0043927776 0.007520648 0.0014953684 -0.0012639741 0.0057787485 -0.0056348047 4.0551466e-05 0.009468461 -0.005486985 0.0038199269 -0.008121091
collected 0.0097750295 0.008170629 0.0012814446 0.0051154387 0.0014172737 -0.006454876 -0.0014259414 0.0064561926 -0.004619688 -0.0039992593 0.004923175 0.0027045405 -0.0018415204 -0.0028716852 0.006021755 -0.005721393 -0.003250512 -0.0064803455 -0.0042360183 -0.008592084 -0.004467861 -0.008505252 0.0013975133 -0.008609542 -0.009919709 -0.008202052 -0.0067797694 0.006683116 0.0037784956 0.0003495915 -0.002959815 -0.007438984 0.0005348175 0.0005005026 0.00019596443 0.0008583165 0.00078985846 -5.4285138e-05 -0.008013045 -0.005872034 -0.00837931 -0.0013207265 0.0018039295 0.0074345516 -0.001966708 -0.0023440684 0.009481904 7.425008e-05 -0.0023982543 0.008607863 0.0026964454 -0.0053582233 0.0065950346 0.0045082304 -0.0070585674 -0.00031050213 0.00083163293 0.005739447 -0.0017207591 -0.0028131874 0.0017429565 0.00085032795 0.0012085037 -0.002637083 -0.0060016937 0.007339091 0.0075857476 0.00830421 -0.008602928 0.0026385786 -0.0035621128 0.0096288975 0.0029010975 0.004643974 0.0023910597 0.006626162 -0.005746352 0.007899223 -0.0024186398 -0.0045691207 -0.0020768652 0.009735589 -0.0068560173 -0.0021970137 0.006994984 -4.366915e-05 -0.0062879827 -0.006398747 0.008941079 0.0064397687 0.004773856 -0.003261329 -0.009269935 0.0038002136 0.0071752095 -0.0056398017 -0.007860231 -0.0029721109 -0.0049388385 -0.0023143636
europe -0.0019466967 -0.005264445 0.009446078 -0.009301849 0.00450806 0.005410841 -0.0014122794 0.009008321 0.009883694 -0.0054709506 -0.0060238987 -0.006749262 -0.007891144 -0.0030501 -0.00559189 -0.008350158 0.000785714 0.002999436 0.0064088805 -0.0026336086 -0.0044599404 0.0012484614 0.00038998463 0.008114584 0.00018636887 0.0072303875 -0.008259172 0.008436813 -0.0018950498 0.008705898 -0.007616939 0.0017924334 0.0010528992 4.4615095e-05 -0.005109563 -0.009249746 -0.0072665187 -0.007951877 0.0019136231 0.00048003704 -0.0018163731 0.007123826 -0.0024782037 -0.0013449806 -0.008898934 -0.0099250255 0.008953352 -0.0057566464 -0.006378906 0.0052002883 0.0066733453 -0.0068328637 0.000956345 -0.0060142023 0.0016413335 -0.004295812 -0.0034417375 0.0021831726 0.008657248 0.0067267795 -0.00967649 -0.0056275628 0.007884859 0.0019889344 -0.0042598336 0.0006024022 0.009526292 -0.0011015745 -0.009430234 0.0016114928 0.0062343916 0.00628738 0.0040935944 -0.0056507527 -0.000374705 -4.9610684e-05 0.004579015 -0.0080420235 -0.008019654 0.0002663556 -0.008607854 0.005816331 -0.00042231655 0.00997148 -0.0053460747 -0.00048954826 0.0077552027 -0.004073562 -0.0050113807 0.0015921831 0.0026467363 -0.0025611357 0.006453244 -0.0076659652 0.003398472 0.00049256504 0.008736541 0.0059848153 0.006820848 0.007819741
ancient -0.00949331 0.009558393 -0.0077741044 -0.0026378995 -0.0048897555 -0.0049655624 -0.008022211 -0.007766241 -0.0045622233 -0.0012816157 -0.0051147 0.0061208857 -0.009519694 -0.005296118 0.009434444 0.0069931676 0.0076746074 0.0042455657 0.0005105317 -0.0060022003 0.006030395 0.002638317 0.007692142 0.0063923756 0.0079497155 0.008663229 -0.009898174 -0.006753931 0.0013303582 0.0064388 0.0073839277 0.0055065546 0.007657052 -0.0051452103 0.006578382 -0.004109781 -0.009049926 0.009156881 0.0013312489 -0.0027684697 -0.0024686211 -0.004237798 0.004802247 0.00442113 -0.0026455545 -0.0073452652 -0.0035828727 -0.00034474322 0.006112652 -0.0028318586 -0.00011603545 0.0008713841 -0.007088451 0.0020616641 -0.0014378024 0.0028043352 0.0048393123 -0.0013679614 -0.0027919079 0.0077378284 0.005049118 0.006718327 0.0045309924 0.00867961 0.0074680797 -0.0010581953 0.008750674 0.0046186065 0.0054406407 -0.0013790869 -0.0020325198 -0.0044157715 -0.008505952 0.0030342783 0.008892043 0.0089222565 -0.0019243953 0.0060931933 0.0037896668 -0.0043041655 0.002026212 -0.005454141 0.008199508 0.005422219 0.003183278 0.0041012214 0.008660769 0.007268954 -0.0008326238 -0.0070764753 0.008396081 0.0072427383 0.0017482204 -0.0013339228 -0.0058783586 -0.004530154 0.008643081 -0.003131084 -0.006341318 0.009878559
neanderthal 0.007692736 0.009126856 0.001134214 -0.008323363 0.008438394 -0.0036978398 0.005743373 0.0044079996 0.0096743805 -0.009301011 0.009201668 -0.009297726 -0.0068989955 -0.009099583 -0.0055382987 0.0073707746 0.009167804 -0.0033190295 0.0037136457 -0.0036417823 0.007886165 0.0058672884 4.5112392e-06 -0.0036315187 -0.0072244583 0.0047761244 0.0014634884 -0.002615084 0.007832942 -0.004045295 -0.00913638 -0.0022702827 0.00011177889 -0.006659164 -0.0054871286 -0.008484606 0.00924395 0.0074312175 -0.00030530593 0.0073675984 0.0079630045 -0.0007988404 0.0066030715 0.0037836921 0.0050928146 0.0072574555 -0.004751798 -0.0021930316 0.00087973 0.0042327694 0.0033078827 0.0050869007 0.004582786 -0.008444151 -0.0031969673 -0.007233252 0.009679768 0.0049946425 0.0001599608 0.0041068383 -0.0076482734 -0.0062929546 0.003092239 0.006544919 0.0039503933 0.006035828 -0.0019895614 -0.0033235473 0.00020525315 -0.0031931365 -0.005507259 -0.0077802544 0.0065467777 -0.0010795805 -0.0018928167 -0.007799526 0.009349405 0.00087477046 0.0017788016 0.0024914553 -0.0073950374 0.0016234348 0.0029714536 -0.008580277 0.0049522887 0.0024255016 0.0074964412 0.0050449395 -0.0030210917 -0.0071717766 0.007105708 0.0019140064 0.005210298 0.0063858717 0.0019259832 -0.0061174775 -5.528207e-06 0.008260976 -0.0060965912 0.009431074
modern -0.0071792696 0.0042354544 0.00216289 0.007438057 -0.0048900596 -0.0045788498 -0.0060949842 0.0033097882 -0.004507435 0.008506253 -0.0042799306 -0.009108578 -0.0047961376 0.0064152437 -0.006351414 -0.0052630682 -0.007296127 0.006024725 0.003365447 0.0028487756 -0.0031356772 0.00602019 -0.0061529716 -0.001984372 -0.0059886468 -0.0009987217 -0.0020279228 0.008489572 9.179515e-05 -0.0085772425 -0.0054273363 -0.0068765874 0.0026914866 0.00946441 -0.0058075436 0.008274624 0.008538083 -0.007054826 -0.008883825 0.009470304 0.008378029 -0.0046964334 -0.0067229234 0.007853816 0.003754884 0.008087255 -0.0075793806 -0.009526273 0.0015759452 -0.009809055 -0.004886255 -0.003462314 0.009610498 0.008620381 -0.002831389 0.005837147 0.008235405 -0.002257783 0.009542199 0.0071611865 0.0020309114 -0.0038430467 -0.005072538 -0.00304804 0.007877576 -0.0061799455 -0.0029184332 0.009190523 0.003460949 0.0060627563 -0.008025261 -0.00075433304 0.0055211782 -0.0046972577 0.0074892025 0.009333807 -0.00041072394 -0.0020574103 -0.00060545607 -0.0057792794 -0.0083910655 -0.0014910942 -0.0025447267 0.0043934747 -0.006866489 0.00542165 -0.006739068 -0.0078106844 0.008480591 0.008917766 -0.0034737175 0.0034897032 -0.005797486 -0.008738294 -0.0055089584 0.0067478465 0.0064329007 0.009427363 0.007059985 0.0067415633
human 0.0013073076 -0.009817197 0.0046000797 -0.00054215814 0.0063516907 0.0017917434 -0.0031376705 0.00779152 0.0015605913 4.5087592e-05 -0.004629277 -0.008477088 -0.0077653346 0.00868444 -0.0089293 0.009021215 -0.009282701 -0.00026340262 -0.0019013402 -0.008945062 0.008634705 0.006775237 0.0030073978 0.00484689 0.000119797296 0.009438227 0.007017406 -0.009846283 -0.0044378787 -0.0012810889 0.0030511408 -0.0043373024 0.0014413317 -0.007862512 0.002772104 0.0047001 0.004937028 -0.0031820575 -0.008430869 -0.009233454 -0.00072350266 -0.007335406 -0.0068239835 0.006137866 0.0071648457 0.0021028868 -0.00790615 -0.0057202103 0.008053211 0.0039317366 -0.0052275606 -0.007412702 0.00076265965 0.0034572822 0.002076003 0.0031028383 -0.0056280685 -0.0099016195 -0.0070258062 0.00023322599 0.0046109683 0.004535595 0.0018992841 0.0051839855 -0.000116945404 0.004136494 -0.009110944 0.0077172276 0.0061438708 0.0051303217 0.0072363587 0.0084579345 0.00074768433 -0.0017087719 0.0005303956 -0.009314834 0.008429295 -0.0063797934 0.008425091 -0.0042409054 0.0006248087 -0.009168093 -0.009569658 -0.007833339 -0.0077458574 0.00037962993 -0.0072201644 -0.004963075 -0.0052754995 -0.004289475 0.0070301695 0.004834569 0.008708495 0.0070971223 -0.0056847483 0.007253502 -0.009290819 -0.0025857396 -0.007757146 0.0042008474
genome 0.0018013249 0.0070483726 0.002941503 -0.006984167 0.0077269375 -0.005990631 0.008982948 0.0029859466 -0.0040263417 -0.0046959417 -0.004423949 -0.006166649 0.009397486 -0.0026410713 0.00779025 -0.009682492 0.0021134273 -0.001217051 0.007545118 -0.009060286 0.007431912 -0.005112224 -0.006022511 -0.0056468663 -0.0033655176 -0.0034046597 -0.0031906026 -0.007475777 0.0007148267 -0.0005725245 -0.0016790004 0.0037438255 -0.00763313 -0.0032234066 0.00514847 0.00855509 -0.009791086 0.0071872775 0.0052953 -0.003874173 0.008570203 -0.009222292 0.0072385296 0.0053781155 0.0012898272 -0.0051951176 -0.004179599 -0.003369767 0.0015944163 0.001581598 0.007396833 0.0099602975 0.008836587 -0.004008733 0.009636086 -0.00063042255 0.0048575792 0.0025363516 -0.0006256454 0.0036644523 -0.005330011 -0.0057551167 -0.007577021 0.0019176035 0.006513916 0.00090115983 0.0012633507 0.0031810037 0.008123854 -0.007687061 0.0022752027 -0.007455608 0.003715618 0.009514587 0.0075186947 0.006441567 0.008026117 0.006552105 0.0068467325 0.00869257 -0.0049556913 0.009209661 0.0050575286 -0.0021248695 0.008474546 0.005080482 0.009641399 0.0028190457 0.009884555 0.001195692 0.009130684 0.0035973836 0.006580412 -0.00361116 0.0068057566 0.007250423 -0.002115621 -0.0018615718 0.003625693 -0.0070385
shows 0.009741375 -0.009785563 -0.006502033 0.0027767855 0.0064354893 -0.005370729 0.0027519849 0.009131747 -0.006819064 -0.0061066505 -0.0049928115 -0.00368126 0.0018522884 0.009683641 0.00644354 0.00039165124 0.0024744181 0.00844649 0.009138178 0.005629969 0.005943013 -0.007629522 -0.0038295696 -0.005683565 0.0061836103 -0.00225932 -0.008786562 0.0076284255 0.008406309 -0.0033179314 0.009119112 -0.00073907804 -0.0036286868 -0.0003802314 0.00019241076 -0.0035078088 0.0028134247 0.005731432 0.006873956 -0.008905951 -0.0021951643 -0.0054816343 0.0075234827 0.0065075015 -0.0043688817 0.002324414 -0.0059516523 0.00023538349 0.00945961 -0.0026105444 -0.0051873005 -0.0074033006 -0.0029152564 -0.0008664178 0.0035291065 0.009743326 -0.0033921245 0.001903681 0.009692432 0.0015337794 0.0009810732 0.009802843 0.00930645 0.007710903 -0.006179333 0.009991138 0.005857104 0.009073708 -0.002001237 0.0033512171 0.0068392376 -0.0038913293 0.006648019 0.0025668114 0.009319553 -0.0030298685 -0.0031094935 0.0062168743 -0.00908894 -0.0072543155 -0.006503641 -0.00074380165 -0.002362113 0.0068256087 0.009239293 -0.00091146474 0.0014132133 0.002020571 -0.0020174456 -0.008035576 0.007445874 -0.004299319 0.004580612 0.009090945 0.0030486963 0.00313993 0.0040727276 -0.0027017219 0.0038345656 0.00033530922
variation 0.005626712 0.005497371 0.0018291199 0.0057494068 -0.008968078 0.0065593575 0.009225992 -0.0042071473 0.0016075504 -0.0052338815 0.0010582185 0.0027701687 0.008160736 0.00054401276 0.0025570584 0.001297735 0.008402523 -0.0057077026 -0.00626183 -0.0036275184 -0.0023005498 0.005041063 -0.008120357 -0.0028335357 -0.008197427 0.00514971 -0.0025680638 -0.009067107 0.0040717293 0.009017323 -0.0030376601 -0.0058385395 0.0030198884 -0.00043584823 -0.009979436 0.008417704 -0.0073388875 -0.004930407 -0.002657081 -0.0054523144 0.00171651 0.009712814 0.0045722723 0.008088603 -0.00047045827 0.0006449234 -0.002668352 -0.008779561 0.0034313034 0.0020933736 -0.009421854 -0.004968437 -0.009734099 -0.0057197916 0.0040645422 0.008642861 0.00411165 0.0023884643 0.008144778 -0.0011192096 -0.0013977134 -0.008746823 -0.00012579202 -0.0025675725 0.00038607715 0.007279662 -0.0070414604 -0.0039464748 -0.0066646053 -0.0035441148 -0.0033158315 0.002137121 0.0033281683 -0.004957187 -0.0045462907 0.0011386942 0.0054534827 0.0053736498 -0.0029685367 -0.0042665256 -0.005616647 -0.00054498314 0.001946373 0.0015253461 0.0073525296 -0.0027333724 -6.592393e-05 -0.0055276332 -0.0011700654 -0.0077119637 -0.0009593296 0.0013096749 -0.008594744 0.0087485835 -0.009207866 -0.009624677 -0.008511624 0.0073132683 0.0054655685 0.009249462
haplogroup 0.0025659278 0.00085168 -0.0025371916 0.00934742 0.0028080416 0.0041162586 -0.0011815964 0.00096541416 0.0066110776 -0.00074895076 0.0033208325 -0.00070219487 0.0052740807 0.003645613 0.0026175152 -0.0053456044 -0.004693721 0.004352339 -0.0059164464 -0.00020070269 -0.0006396672 0.0034715144 -0.008427317 0.0088428045 -0.0014485243 -0.005307692 0.0040584584 -0.001898596 -0.007778139 -0.0044734394 -0.0003679351 -0.0089815045 0.0005416724 0.002407686 -0.003227299 0.0025667753 0.0024930644 0.009990179 0.0014140693 0.0020159276 0.0027784512 -0.0020868885 -0.008718105 0.008073382 -0.0019698895 -0.009723993 -0.006550278 -0.0039781313 0.003948964 0.0050270366 0.0061098747 -0.006815141 0.00066107995 -0.0028290635 -0.0052407067 0.006984182 0.0039222264 -0.003121762 -0.008263934 -0.0051569464 -0.00065567193 0.0078113875 0.006122021 -0.008424067 -0.0096058855 0.0071855173 -0.0022900787 -0.0036282074 0.005704672 -0.0058300486 0.005136189 -0.00020829153 -0.0068513798 -0.00030139415 0.006364283 0.009325248 0.0022419153 0.0050703404 -0.0050120936 -0.0008110871 -0.005373588 0.0011743606 -0.0017981603 -0.0036161384 -0.0070382343 0.009639485 0.003012655 -0.0022897385 -0.0041911877 0.0076894285 -0.0064663296 0.0031200873 0.0008309826 0.008321212 0.0068888706 -0.0028947534 0.002593874 -0.0016730811 -0.009431767 -0.0026270088
h 0.0013225824 0.0065497826 0.009982806 0.009062454 -0.0079781795 0.0065080435 -0.0057147983 -0.0009299061 0.00047654507 0.0065626903 0.0044563343 0.0045750956 0.0095022535 0.00038496728 -0.0060190535 -0.006347197 0.0064362343 -0.005219293 -0.002869563 0.004042792 -0.002286449 -0.006022882 -0.0023193487 0.0012384101 0.0021826315 0.0061027543 -0.005193723 0.003081824 0.0072158594 0.0022087328 0.0054155486 -0.004879429 0.0061283903 -0.007640156 0.0034881763 -0.009306421 -0.0025874602 -0.00905658 -0.0016061858 -0.005364485 -0.0039271545 0.0011356737 0.002771372 -0.0014860439 -0.008151553 -0.0059441784 0.00080055697 -0.0039708167 -0.009422841 -0.0007733177 0.0066586556 0.005949332 -0.0099333245 0.0030846666 -0.006018299 -0.009179041 0.00015740465 -0.0003979007 -0.006993792 -0.0063003623 -0.0024212876 0.0071041975 -0.0074873487 0.0077126683 -0.000499351 0.001135528 0.009489626 0.0047690077 -0.0035878688 0.00373115 0.0035563034 0.0063642766 7.750339e-05 -0.0044055916 0.001321394 -0.005388977 0.0014417345 0.004943775 0.0051506218 0.009180272 -0.0075472356 -0.005428668 0.0064623333 0.0013423576 -0.0066391225 0.0008783591 0.0027003903 -0.0025289776 -0.004963421 0.0049924683 0.009631416 -0.0073435763 -7.912599e-05 -0.0025523733 -0.0063192695 -0.001368983 -0.005227159 0.009048553 -0.005790704 0.003674939
is -0.00023357147 0.004226683 0.0021067455 0.009996419 0.0006458492 -0.005461563 -0.0011838758 0.0020920378 -0.0033855627 -0.007853136 -0.005604329 -0.0067612384 0.006366702 0.0039265845 0.008232181 0.0065088123 -0.0061183744 0.002733512 0.008466464 0.0015833755 0.0030677342 0.0058010546 -0.008839754 0.009125629 0.0068226005 0.008512217 -0.0082233 0.0061861346 0.006626654 -0.0013528146 -0.0062799496 0.0053081806 -0.006868758 -0.005337174 0.0035091531 0.008081314 0.008700704 -0.0043939846 -0.0091931205 0.009603682 0.006290027 -0.0039766026 -0.008465367 -0.004691139 -0.0039542373 -0.0032808431 0.0008109401 -0.00030902817 -0.0031103012 -0.005998526 0.009428418 -0.004739384 -0.007274209 0.0076703983 0.0025008747 0.0086274175 -0.004468981 -0.0069012893 0.0009802914 -0.0011801491 -0.009394523 -0.0015968346 0.0030780574 0.006576642 0.0068287384 0.0032347892 -0.0044282703 -0.0018157784 -0.0039494233 0.0057785274 -0.006343468 0.002114367 -0.0013383601 -0.0057999003 -0.007236314 0.0058711045 -0.008345587 -0.00067066104 0.0028193784 0.00773521 -0.007315293 0.003294973 0.009805078 -0.0069755646 -0.003540081 0.005130921 0.005245436 0.0016209023 0.00797557 0.00082546985 0.0018813204 -0.0015988776 -0.008149317 0.0032639706 0.0019852505 -0.008730082 -0.0006569945 7.3046285e-05 -2.6318648e-06 0.008703764
mitochondrial -0.002508221 -0.0059015388 0.007485539 -0.007257687 -0.008965709 -0.0017888069 -0.008367486 0.00039139786 0.0019467709 -0.0024699308 -0.00644677 -0.00032192905 -0.0010975264 0.0034935323 0.008127049 0.0058537317 0.008440359 -0.0089677265 0.00944024 -0.002368706 0.008696626 0.0023858226 0.0035850583 -0.0095805535 -0.009488111 0.008984071 -0.002896514 0.0028174375 0.0064166263 -0.00029972216 0.00971954 -0.0010352092 -0.009671927 -0.0070548807 -0.0010439103 -0.008674508 0.0074211163 0.0036188734 -0.00874913 0.008480371 0.008929614 0.0058477637 0.0069070626 -0.009568968 0.0004927428 -0.009223568 -0.0036663204 0.00025142074 -0.0002807199 0.0014672013 0.0032786338 0.0021258853 0.005320648 0.0075189634 -0.005886681 0.007957336 0.005991082 0.009785411 0.0046226517 -0.0033269909 -0.0037473391 -0.00062982703 -0.0016548736 0.009871284 0.0011211695 0.00400867 0.0034179776 -0.008850507 0.006720342 0.008190563 -0.0016650181 0.0023356378 -0.0064802184 -0.006126035 0.0082164975 -0.0030429186 0.0067422306 0.001552869 -0.0019822652 0.0030546081 -0.004023311 -0.0017839139 0.0013798403 0.004887597 -0.0014078929 0.0006583137 -0.007930928 0.00949345 -0.008762073 0.007072499 0.0039040898 -0.0069980817 -0.005295161 -0.007937933 -0.0051285303 0.00707022 0.009641066 0.0021544741 0.0006394228 0.009524309
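The file above is the plain word2vec text format: the header "24 100" declares 24 vocabulary entries with 100-dimensional vectors, one word per line. A small sketch of reading it back with gensim (illustrative only, assuming a local checkout with this file present):

# Illustrative only: load the text-format vectors written by wv.save_word2vec_format.
from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format("NER/word2Vec/testModel/test_model.txt", binary=False)
print(kv["mtdna"].shape)                     # (100,)
print(kv.most_similar("haplogroup", topn=3)) # nearest neighbours in the tiny test vocabulary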
NER/word2Vec/testModel/test_model_updated.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1b1b785c79991b857b364ee9863985eaf845087efb1aa40a6b9cfae3b2a50012
size 133
NER/word2Vec/word2vec.py
CHANGED
@@ -1,3 +1,374 @@
1 |
'''WORD TO VECTOR'''
|
2 |
import pandas as pd
|
3 |
import json
|
@@ -10,8 +381,11 @@ from gensim.test.utils import common_texts
|
|
10 |
from gensim.models.word2vec import Word2Vec
|
11 |
from gensim.scripts.glove2word2vec import glove2word2vec
|
12 |
from gensim.test.utils import datapath, get_tmpfile
|
|
|
|
|
13 |
import sys
|
14 |
import subprocess
|
|
|
15 |
# can try multiprocessing to run quicker
|
16 |
import multiprocessing
|
17 |
import copy
|
@@ -32,18 +406,19 @@ class word2Vec():
|
|
32 |
def __init__(self, nameFile=None, modelName=None):
|
33 |
self.nameFile = nameFile
|
34 |
self.modelName = modelName
|
|
|
|
|
35 |
def spacy_similarity(self, word):
|
36 |
# when use word2vec, try medium or large is better
|
37 |
# maybe try odc similarity?
|
38 |
-
|
39 |
-
doc = nlp(word)
|
40 |
for token1 in doc:
|
41 |
for token2 in doc:
|
42 |
print(token1.text, token2.text, token1.similarity(token2))
|
43 |
pass
|
44 |
# clean text before transform to corpus
|
45 |
def cleanTextBeforeCorpus(self,oriText, doi=None):
|
46 |
-
cl = cleanText.cleanGenText()
|
47 |
#cl = cleanGenText()
|
48 |
output = ""
|
49 |
alreadyRemoveDoi = False
|
@@ -51,7 +426,7 @@ class word2Vec():
|
|
51 |
# remove DOI
|
52 |
if doi != None and doi in oriText:
|
53 |
if alreadyRemoveDoi == False:
|
54 |
-
newWord = cl.removeDOI(word,doi)
|
55 |
if len(newWord) > 0 and newWord != word:
|
56 |
alreadyRemoveDoi = True
|
57 |
word = newWord
|
@@ -59,13 +434,13 @@ class word2Vec():
|
|
59 |
# split the sticked words
|
60 |
#word = cl.splitStickWords(word)
|
61 |
# remove punctuation
|
62 |
-
word = cl.removePunct(word,True)
|
63 |
# remove URL
|
64 |
-
word = cl.removeURL(word)
|
65 |
# remove HTMLTag
|
66 |
-
word = cl.removeHTMLTag(word)
|
67 |
# remove tab, white space, newline
|
68 |
-
word = cl.removeTabWhiteSpaceNewLine(word)
|
69 |
# optional: remove stopwords
|
70 |
#word = cl.removeStopWords(word)
|
71 |
if len(word)>0:
|
@@ -75,16 +450,18 @@ class word2Vec():
|
|
75 |
cleanOutput = ""
|
76 |
remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
|
77 |
if len(allText) > 0:
|
78 |
-
corpusText = allText
|
79 |
-
for pos in range(len(corpusText
|
80 |
-
|
81 |
-
|
82 |
for line in lines.split("\n"):
|
83 |
if remove in line: line = line.replace(remove, "")
|
84 |
clean_text = self.cleanTextBeforeCorpus(line, doi)
|
85 |
cleanOutput += clean_text + "\n"
|
86 |
cleanOutput += "\n\n"
|
87 |
return cleanOutput
|
|
|
|
|
88 |
def tableTransformToCorpusText(self, df, excelFile=None):
|
89 |
# PDF, Excel, WordDoc
|
90 |
#cl = cleanText.cleanGenText()
|
@@ -119,10 +496,10 @@ class word2Vec():
|
|
119 |
try:
|
120 |
df = pd.ExcelFile(excelFile)
|
121 |
except:
|
122 |
-
if
|
123 |
-
df = pd.read_excel(
|
124 |
else:
|
125 |
-
df = pd.read_excel(
|
126 |
sheetNames = df.sheet_names
|
127 |
output = []
|
128 |
if len(sheetNames) > 0:
|
@@ -142,7 +519,7 @@ class word2Vec():
|
|
142 |
return corpus
|
143 |
def helperRowTableToCorpus(self, textList):
|
144 |
#cl = cleanGenText()
|
145 |
-
cl = cleanText.cleanGenText()
|
146 |
stopWords = ["NaN","Unnamed:","nan"]
|
147 |
outputDF = []
|
148 |
for line in textList:
|
@@ -154,9 +531,9 @@ class word2Vec():
|
|
154 |
# remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
|
155 |
if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
|
156 |
#word = cl.splitStickWords(word)
|
157 |
-
word = cl.removePunct(word)
|
158 |
-
word = " ".join(cl.removeStopWords(word))
|
159 |
-
word = cl.removeTabWhiteSpaceNewLine(word)
|
160 |
if len(word) > 1:
|
161 |
if len(word.split(" ")) > 1:
|
162 |
for x in word.split(" "):
|
@@ -170,7 +547,7 @@ class word2Vec():
|
|
170 |
return outputDF
|
171 |
def helperColTableToCorpus(self, dfList):
|
172 |
#cl = cleanGenText()
|
173 |
-
cl = cleanText.cleanGenText()
|
174 |
stopWords = ["NaN","Unnamed:","nan"]
|
175 |
outputDF = []
|
176 |
# use the first length line as the column ref
|
@@ -186,9 +563,9 @@ class word2Vec():
|
|
186 |
# remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
|
187 |
if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
|
188 |
#word = cl.splitStickWords(word)
|
189 |
-
word = cl.removePunct(word)
|
190 |
-
word = " ".join(cl.removeStopWords(word))
|
191 |
-
word = cl.removeTabWhiteSpaceNewLine(word)
|
192 |
if len(word) > 1:
|
193 |
if len(word.split(" ")) > 1:
|
194 |
for x in word.split(" "):
|
@@ -216,21 +593,22 @@ class word2Vec():
|
|
216 |
Mouse is an animal.
|
217 |
Jerry is mouse.'''
|
218 |
texts = {}
|
219 |
-
cl = cleanText.cleanGenText()
|
220 |
#cl = cleanGenText()
|
221 |
-
|
222 |
-
|
|
|
223 |
texts["Paragraph "+str(pos)] = []
|
224 |
-
lines =
|
225 |
for line in lines.split("\n"):
|
226 |
for l in line.split("."):
|
227 |
if len(l) > 0:
|
228 |
-
cl.removeTabWhiteSpaceNewLine(l)
|
229 |
l = l.lower()
|
230 |
newL = []
|
231 |
for word in l.split(" "):
|
232 |
if len(word) > 0:
|
233 |
-
word = cl.removeStopWords(word)
|
234 |
for w in word:
|
235 |
if len(w) > 0 and w.isnumeric()==False:
|
236 |
newL.append(w)
|
@@ -239,49 +617,86 @@ class word2Vec():
|
|
239 |
if len(texts["Paragraph "+str(pos)]) == 0:
|
240 |
del texts["Paragraph "+str(pos)]
|
241 |
return texts
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
|
|
|
|
246 |
corSize = len(corpus)
|
247 |
-
|
248 |
-
if
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
270 |
return window, vector_size, sample, negative, epochs, sg
|
271 |
-
|
272 |
-
|
273 |
-
|
|
|
274 |
jsonFile = ""
|
275 |
jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
|
|
|
|
|
|
|
276 |
cores = multiprocessing.cpu_count()
|
277 |
combinedCorpus = []
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
|
|
|
|
|
|
|
|
282 |
window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
|
283 |
-
|
284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
285 |
min_count=1,
|
286 |
window=window,
|
287 |
vector_size=vector_size,
|
@@ -291,43 +706,39 @@ class word2Vec():
|
|
291 |
negative=negative,
|
292 |
workers=cores-1,
|
293 |
epochs = epochs,
|
294 |
-
sg=sg)
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
w2vModel.build_vocab(combinedCorpus)
|
312 |
-
w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
|
313 |
-
accept = True
|
314 |
-
except:
|
315 |
-
for key in jsonFile:
|
316 |
-
combinedCorpus.extend(jsonFile[key])
|
317 |
-
window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
|
318 |
-
print("next is " + str(len(combinedCorpus)))
|
319 |
-
else:
|
320 |
-
print("no parameter to train")
|
321 |
-
break
|
322 |
-
#w2vModel.build_vocab(combinedCorpus)
|
323 |
-
#w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
|
324 |
-
#w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
|
325 |
-
#w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
|
326 |
-
w2vModel.save(saveFolder+"/"+modelName+".model")
|
327 |
-
w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
|
328 |
-
print("done w2v")
|
329 |
-
else: print("no corpus to train")
|
330 |
#return combinedCorpus
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
331 |
def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
|
332 |
# might not be a meaningful keyword
|
333 |
#stopWords = ["show"]
|
@@ -354,6 +765,32 @@ class word2Vec():
|
|
354 |
results.append(moreNewResult)
|
355 |
currN +=1'''
|
356 |
return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
357 |
# adding our model into spacy
|
358 |
# this deals with command line; but instead of using it, we write python script to run command line
|
359 |
def loadWordVec(self,modelName,wordVec):
|
@@ -367,4 +804,5 @@ class word2Vec():
|
|
367 |
modelName, # this modelName comes from the saved modelName of function trainWord2Vec
|
368 |
"--vectors-loc",
|
369 |
wordVec])
|
|
|
370 |
print("done")
|
|
|
1 |
+
<<<<<<< HEAD
|
2 |
+
'''WORD TO VECTOR'''
|
3 |
+
import pandas as pd
|
4 |
+
import json
|
5 |
+
import gensim
|
6 |
+
import spacy
|
7 |
+
from DefaultPackages import openFile, saveFile
|
8 |
+
from NER import cleanText
|
9 |
+
from gensim.models.keyedvectors import KeyedVectors
|
10 |
+
from gensim.test.utils import common_texts
|
11 |
+
from gensim.models.word2vec import Word2Vec
|
12 |
+
from gensim.scripts.glove2word2vec import glove2word2vec
|
13 |
+
from gensim.test.utils import datapath, get_tmpfile
|
14 |
+
import sys
|
15 |
+
import subprocess
|
16 |
+
# can try multiprocessing to run quicker
|
17 |
+
import multiprocessing
|
18 |
+
import copy
|
19 |
+
sys.setrecursionlimit(1000)
|
20 |
+
# creat folder word2Vec
|
21 |
+
#! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
|
22 |
+
# create word2vec model
|
23 |
+
#model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
|
24 |
+
'''Some notes for this model
|
25 |
+
sometimes when we do the corpus, there are some adverbs which are unnecessary but might be seen as
|
26 |
+
a similar word to the word we are finding, so can we try to preprocess text so that
|
27 |
+
we make the corpus more effective and only contains the important words. Then when we
|
28 |
+
train the model, the important words will be seen as important. Or
|
29 |
+
when we already have the similar list of words, we can remove the words in there
|
30 |
+
that are stopwords/unnecessary words.'''
|
31 |
+
### For more complex analysis, consider using sentence embedding models like "Doc2Vec" to represent the meaning of entire sentences instead of just individual words
|
32 |
+
class word2Vec():
|
33 |
+
def __init__(self, nameFile=None, modelName=None):
|
34 |
+
self.nameFile = nameFile
|
35 |
+
self.modelName = modelName
|
36 |
+
def spacy_similarity(self, word):
|
37 |
+
# when use word2vec, try medium or large is better
|
38 |
+
# maybe try odc similarity?
|
39 |
+
nlp = spacy.load("en_core_web_lg")
|
40 |
+
doc = nlp(word)
|
41 |
+
for token1 in doc:
|
42 |
+
for token2 in doc:
|
43 |
+
print(token1.text, token2.text, token1.similarity(token2))
|
44 |
+
pass
|
45 |
+
# clean text before transform to corpus
|
46 |
+
def cleanTextBeforeCorpus(self,oriText, doi=None):
|
47 |
+
cl = cleanText.cleanGenText()
|
48 |
+
#cl = cleanGenText()
|
49 |
+
output = ""
|
50 |
+
alreadyRemoveDoi = False
|
51 |
+
for word in oriText.split(" "):
|
52 |
+
# remove DOI
|
53 |
+
if doi != None and doi in oriText:
|
54 |
+
if alreadyRemoveDoi == False:
|
55 |
+
newWord = cl.removeDOI(word,doi)
|
56 |
+
if len(newWord) > 0 and newWord != word:
|
57 |
+
alreadyRemoveDoi = True
|
58 |
+
word = newWord
|
59 |
+
# remove punctuation
|
60 |
+
# split the sticked words
|
61 |
+
#word = cl.splitStickWords(word)
|
62 |
+
# remove punctuation
|
63 |
+
word = cl.removePunct(word,True)
|
64 |
+
# remove URL
|
65 |
+
word = cl.removeURL(word)
|
66 |
+
# remove HTMLTag
|
67 |
+
word = cl.removeHTMLTag(word)
|
68 |
+
# remove tab, white space, newline
|
69 |
+
word = cl.removeTabWhiteSpaceNewLine(word)
|
70 |
+
# optional: remove stopwords
|
71 |
+
#word = cl.removeStopWords(word)
|
72 |
+
if len(word)>0:
|
73 |
+
output += word + " "
|
74 |
+
return output
|
75 |
+
def cleanAllTextBeforeCorpus(self, allText, doi=None):
|
76 |
+
cleanOutput = ""
|
77 |
+
remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
|
78 |
+
if len(allText) > 0:
|
79 |
+
corpusText = allText
|
80 |
+
for pos in range(len(corpusText.split("\n\n"))):
|
81 |
+
if len(corpusText.split("\n\n")[pos]) > 0:
|
82 |
+
lines = corpusText.split("\n\n")[pos]
|
83 |
+
for line in lines.split("\n"):
|
84 |
+
if remove in line: line = line.replace(remove, "")
|
85 |
+
clean_text = self.cleanTextBeforeCorpus(line, doi)
|
86 |
+
cleanOutput += clean_text + "\n"
|
87 |
+
cleanOutput += "\n\n"
|
88 |
+
return cleanOutput
|
89 |
+
def tableTransformToCorpusText(self, df, excelFile=None):
|
90 |
+
# PDF, Excel, WordDoc
|
91 |
+
#cl = cleanText.cleanGenText()
|
92 |
+
corpus = {}
|
93 |
+
# PDF or df
|
94 |
+
if excelFile == None:
|
95 |
+
if len(df) > 0:
|
96 |
+
try:
|
97 |
+
for i in range(len(df)):
|
98 |
+
# each new dimension/page is considered to be a sentence which ends with the period.
|
99 |
+
# each new line is a new list, and each new df is a new corpus
|
100 |
+
outputDF = []
|
101 |
+
text = df[i].values.tolist()
|
102 |
+
if len(text) > 0:
|
103 |
+
outputRowDF = self.helperRowTableToCorpus(text)
|
104 |
+
#outputColDF = self.helperColTableToCorpus(text)
|
105 |
+
outputDF.extend(outputRowDF)
|
106 |
+
#outputDF.extend(outputColDF)
|
107 |
+
if len(outputDF) > 0:
|
108 |
+
corpus["corpus" + str(i)] = outputDF
|
109 |
+
except:
|
110 |
+
outputDF = []
|
111 |
+
text = df.values.tolist()
|
112 |
+
if len(text) > 0:
|
113 |
+
outputRowDF = self.helperRowTableToCorpus(text)
|
114 |
+
#outputColDF = self.helperColTableToCorpus(text)
|
115 |
+
outputDF.extend(outputRowDF)
|
116 |
+
#outputDF.extend(outputColDF)
|
117 |
+
if len(outputDF) > 0:
|
118 |
+
corpus["corpus0"] = outputDF
|
119 |
+
else:
|
120 |
+
try:
|
121 |
+
df = pd.ExcelFile(excelFile)
|
122 |
+
except:
|
123 |
+
if filepath.endswith('.xls'):
|
124 |
+
df = pd.read_excel(filepath, engine='xlrd')
|
125 |
+
else:
|
126 |
+
df = pd.read_excel(filepath, engine='openpyxl')
|
127 |
+
sheetNames = df.sheet_names
|
128 |
+
output = []
|
129 |
+
if len(sheetNames) > 0:
|
130 |
+
for s in range(len(sheetNames)):
|
131 |
+
outputDF = []
|
132 |
+
with pd.ExcelFile(excelFile) as xls:
|
133 |
+
data = pd.read_excel(xls, sheetNames[s])
|
134 |
+
if sheetNames[s] != 'Evaluation Warning':
|
135 |
+
text = data.values.tolist()
|
136 |
+
if len(text) > 0:
|
137 |
+
outputRowDF = self.helperRowTableToCorpus(text)
|
138 |
+
#outputColDF = self.helperColTableToCorpus(text)
|
139 |
+
outputDF.extend(outputRowDF)
|
140 |
+
#outputDF.extend(outputColDF)
|
141 |
+
if len(outputDF) > 0:
|
142 |
+
corpus["corpus" + str(s)] = outputDF
|
143 |
+
return corpus
+    def helperRowTableToCorpus(self, textList):
+        #cl = cleanGenText()
+        cl = cleanText.cleanGenText()
+        stopWords = ["NaN","Unnamed:","nan"]
+        outputDF = []
+        for line in textList:
+            outputLine = []
+            for words in line:
+                words = str(words)
+                if len(words) > 0:
+                    for word in words.split(" "):
+                        # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
+                        if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
+                            #word = cl.splitStickWords(word)
+                            word = cl.removePunct(word)
+                            word = " ".join(cl.removeStopWords(word))
+                            word = cl.removeTabWhiteSpaceNewLine(word)
+                            if len(word) > 1:
+                                if len(word.split(" ")) > 1:
+                                    for x in word.split(" "):
+                                        if len(x) > 1 and x.isnumeric()==False:
+                                            outputLine.append(x.lower())
+                                else:
+                                    if word.isnumeric() == False:
+                                        outputLine.append(word.lower())
+            if len(outputLine) > 0:
+                outputDF.append(outputLine)
+        return outputDF
+    def helperColTableToCorpus(self, dfList):
+        #cl = cleanGenText()
+        cl = cleanText.cleanGenText()
+        stopWords = ["NaN","Unnamed:","nan"]
+        outputDF = []
+        # use the first length line as the column ref
+        for pos in range(len(dfList[0])):
+            outputLine = []
+            for line in dfList:
+                if pos < len(line):
+                    words = line[pos]
+                    words = str(words)
+                else: words = ""
+                if len(words) > 0:
+                    for word in words.split(" "):
+                        # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
+                        if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
+                            #word = cl.splitStickWords(word)
+                            word = cl.removePunct(word)
+                            word = " ".join(cl.removeStopWords(word))
+                            word = cl.removeTabWhiteSpaceNewLine(word)
+                            if len(word) > 1:
+                                if len(word.split(" ")) > 1:
+                                    for x in word.split(" "):
+                                        if len(x) > 1 and x.isnumeric()==False:
+                                            outputLine.append(x.lower())
+                                else:
+                                    if word.isnumeric() == False:
+                                        outputLine.append(word.lower())
+            if len(outputLine) > 0:
+                outputDF.append(outputLine)
+        return outputDF
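
A simplified, self-contained illustration (without the cleanText helpers) of the row-wise versus column-wise extraction performed by the two helpers above; the table values are made up.

```python
table = [["KM1", "Vietnam", "NaN"],
         ["KM2", "Laos", "nan"]]
stop = {"NaN", "Unnamed:", "nan"}

# Row-wise: each table row becomes one token list.
row_corpus = [[w.lower() for w in row if w not in stop and not w.isnumeric()]
              for row in table]
# Column-wise: each column becomes one token list.
col_corpus = [[str(row[c]).lower() for row in table if str(row[c]) not in stop]
              for c in range(len(table[0]))]

print(row_corpus)  # [['km1', 'vietnam'], ['km2', 'laos']]
print(col_corpus)  # [['km1', 'km2'], ['vietnam', 'laos'], []]
```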
+    # create a corpus
+    def createCorpusText(self, corpusText):
+        '''ex: "Tom is cat. Jerry is mouse."
+        corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
+        # the output should be like this:
+        '''texts = {
+        "Paragraph 1": [["Cat", "is", "an","animal], ["Tom", "is", "cat"]],
+        "Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
+        }
+        '''
+        # separate paragraph
+        '''Ex: Cat is an animal. Tom is cat.
+
+        Mouse is an animal.
+        Jerry is mouse.'''
+        texts = {}
+        cl = cleanText.cleanGenText()
+        #cl = cleanGenText()
+        for pos in range(len(corpusText.split("\n\n"))):
+            if len(corpusText.split("\n\n")[pos]) > 0:
+                texts["Paragraph "+str(pos)] = []
+                lines = corpusText.split("\n\n")[pos]
+                for line in lines.split("\n"):
+                    for l in line.split("."):
+                        if len(l) > 0:
+                            cl.removeTabWhiteSpaceNewLine(l)
+                            l = l.lower()
+                            newL = []
+                            for word in l.split(" "):
+                                if len(word) > 0:
+                                    word = cl.removeStopWords(word)
+                                    for w in word:
+                                        if len(w) > 0 and w.isnumeric()==False:
+                                            newL.append(w)
+                            if len(newL)>0:
+                                texts["Paragraph "+str(pos)].append(newL)
+                if len(texts["Paragraph "+str(pos)]) == 0:
+                    del texts["Paragraph "+str(pos)]
+        return texts
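
A hedged sketch of the paragraph and sentence split that createCorpusText performs on its docstring example; stop-word removal is omitted here for brevity.

```python
text = "Cat is an animal. Tom is cat.\n\nMouse is an animal.\nJerry is mouse."
paragraphs = {}
for i, block in enumerate(text.split("\n\n")):
    sentences = [s.strip().lower().split() for line in block.split("\n")
                 for s in line.split(".") if s.strip()]
    if sentences:
        paragraphs[f"Paragraph {i}"] = sentences
print(paragraphs)
# {'Paragraph 0': [['cat', 'is', 'an', 'animal'], ['tom', 'is', 'cat']],
#  'Paragraph 1': [['mouse', 'is', 'an', 'animal'], ['jerry', 'is', 'mouse']]}
```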
+    def selectParaForWC(self,corpus):
+        ''' corpus should be in the format:
+        corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
+        corSize, window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None, None
+        corSize = len(corpus)
+        # less than 2000
+        if 0 < corSize < 2000:
+            window=3.5
+            vector_size=75
+            sample=1e-3
+            negative=10
+            epochs=10
+            sg=1
+        # 2000 - 100000
+        elif 2000 <= corSize < 100000:
+            window=3.5
+            vector_size=75
+            sample=1e-5
+            negative=10
+            epochs=10
+            sg=1
+        elif 100000 <=corSize < 1000000:
+            window=7.5
+            vector_size=150
+            sample=1e-5
+            negative=10
+            epochs=6
+            sg=0
+        return window, vector_size, sample, negative, epochs, sg
+    def trainWord2Vec(self,nameFile,modelName,saveFolder,window=3.5,
+                      vector_size=75,sample=1e-3,negative=10,epochs=10,sg=1):
+        # if you dont have backup file, you can use again the nameFile just to increase the lenght of corpus
+        jsonFile = ""
+        jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
+        cores = multiprocessing.cpu_count()
+        combinedCorpus = []
+        window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
+        if len(jsonFile) > 0:
+            for key in jsonFile:
+                combinedCorpus.extend(jsonFile[key])
+            window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
+            # # min_count=1 ensures all words are included
+            '''w2vModel = Word2Vec(
+                min_count=1,
+                window=window,
+                vector_size=vector_size,
+                sample=sample,
+                alpha=0.03,
+                min_alpha=0.0007,
+                negative=negative,
+                workers=cores-1,
+                epochs = epochs,
+                sg=sg)'''
+            #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
+            accept = False
+            while not accept:
+                if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
+                    try:
+                        w2vModel = Word2Vec(
+                            min_count=1,
+                            window=window,
+                            vector_size=vector_size,
+                            sample=sample,
+                            alpha=0.03,
+                            min_alpha=0.0007,
+                            negative=negative,
+                            workers=cores-1,
+                            epochs = epochs,
+                            sg=sg)
+                        w2vModel.build_vocab(combinedCorpus)
+                        w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
+                        accept = True
+                    except:
+                        for key in jsonFile:
+                            combinedCorpus.extend(jsonFile[key])
+                        window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
+                        print("next is " + str(len(combinedCorpus)))
+                else:
+                    print("no parameter to train")
+                    break
+            #w2vModel.build_vocab(combinedCorpus)
+            #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
+            #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
+            #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
+            w2vModel.save(saveFolder+"/"+modelName+".model")
+            w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
+            print("done w2v")
+        else: print("no corpus to train")
+        #return combinedCorpus
+    def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
+        # might not be a meaningful keyword
+        #stopWords = ["show"]
+        # same word but just plural nouns, tense
+        simWords = [word+"s",word+"es",word+"ing",word+"ed"]
+        model = KeyedVectors.load_word2vec_format(modelFile, binary = False) # model file in format txt
+        results = model.most_similar(positive=[word],topn=n)
+        #removeIndex = []
+        #currN = copy.deepcopy(n)
+        '''for r in range(len(results)):
+            if len(results[r][0]) < 2:
+                removeIndex.append(results[r])
+            # remove the same word but just plural and singular noun and lower than the cos_thres
+            elif results[r][0] == word:
+                removeIndex.append(results[r])
+            elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
+                removeIndex.append(results[r])
+        for rem in removeIndex:
+            results.remove(rem)
+        while len(results)!=n and len(results) != 0:
+            moreNewResult = model.most_similar(positive=[word],topn=currN+1)[-1]
+            if moreNewResult not in results and len(moreNewResult[0])>1:
+                if moreNewResult[0] not in stopWords and results[0] != word:
+                    results.append(moreNewResult)
+                    currN +=1'''
+        return results
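
For context, a small sketch of querying a saved word2vec-format .txt file the same way genSimilar does; the file path and query term are placeholders, not files guaranteed by this commit.

```python
from gensim.models import KeyedVectors

# Any .txt file written by trainWord2Vec via wv.save_word2vec_format works here.
vectors = KeyedVectors.load_word2vec_format("test_model.txt", binary=False)
for term, score in vectors.most_similar(positive=["haplogroup"], topn=5):
    print(f"{term}\t{score:.3f}")
```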
+    # adding our model into spacy
+    # this deals with command line; but instead of using it, we write python script to run command line
+    def loadWordVec(self,modelName,wordVec):
+        # modelName is the name you want to save into spacy
+        # wordVec is the trained word2vec in txt format
+        subprocess.run([sys.executable,
+                        "-m",
+                        "spacy",
+                        "init-model",
+                        "en",
+                        modelName, # this modelName comes from the saved modelName of function trainWord2Vec
+                        "--vectors-loc",
+                        wordVec])
=======
 '''WORD TO VECTOR'''
 import pandas as pd
 import json
 ...
 from gensim.models.word2vec import Word2Vec
 from gensim.scripts.glove2word2vec import glove2word2vec
 from gensim.test.utils import datapath, get_tmpfile
+from gensim.models import Phrases
+from gensim.models.phrases import Phraser
 import sys
 import subprocess
+import os
 # can try multiprocessing to run quicker
 import multiprocessing
 import copy
 ...
     def __init__(self, nameFile=None, modelName=None):
         self.nameFile = nameFile
         self.modelName = modelName
+        #self.nlp = spacy.load("en_core_web_lg")
+        self.cl = cleanText.cleanGenText()
     def spacy_similarity(self, word):
         # when use word2vec, try medium or large is better
         # maybe try odc similarity?
+        doc = self.nlp(word)
         for token1 in doc:
             for token2 in doc:
                 print(token1.text, token2.text, token1.similarity(token2))
         pass
     # clean text before transform to corpus
     def cleanTextBeforeCorpus(self,oriText, doi=None):
+        #cl = cleanText.cleanGenText()
         #cl = cleanGenText()
         output = ""
         alreadyRemoveDoi = False
 ...
             # remove DOI
             if doi != None and doi in oriText:
                 if alreadyRemoveDoi == False:
+                    newWord = self.cl.removeDOI(word,doi)
                     if len(newWord) > 0 and newWord != word:
                         alreadyRemoveDoi = True
                         word = newWord
 ...
             # split the sticked words
             #word = cl.splitStickWords(word)
             # remove punctuation
+            word = self.cl.removePunct(word,True)
             # remove URL
+            word = self.cl.removeURL(word)
             # remove HTMLTag
+            word = self.cl.removeHTMLTag(word)
             # remove tab, white space, newline
+            word = self.cl.removeTabWhiteSpaceNewLine(word)
             # optional: remove stopwords
             #word = cl.removeStopWords(word)
             if len(word)>0:
 ...
         cleanOutput = ""
         remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
         if len(allText) > 0:
+            corpusText = allText.split("\n\n")
+            for pos in range(len(corpusText)):
+                lines = corpusText[pos]
+                if len(lines) > 0:
                     for line in lines.split("\n"):
                         if remove in line: line = line.replace(remove, "")
                         clean_text = self.cleanTextBeforeCorpus(line, doi)
                         cleanOutput += clean_text + "\n"
                 cleanOutput += "\n\n"
         return cleanOutput
+    import urllib.parse, requests
+
     def tableTransformToCorpusText(self, df, excelFile=None):
         # PDF, Excel, WordDoc
         #cl = cleanText.cleanGenText()
 ...
             try:
                 df = pd.ExcelFile(excelFile)
             except:
+                if excelFile.endswith('.xls'):
+                    df = pd.read_excel(excelFile, engine='xlrd')
                 else:
+                    df = pd.read_excel(excelFile, engine='openpyxl')
             sheetNames = df.sheet_names
             output = []
             if len(sheetNames) > 0:
 ...
         return corpus
     def helperRowTableToCorpus(self, textList):
         #cl = cleanGenText()
+        #cl = cleanText.cleanGenText()
         stopWords = ["NaN","Unnamed:","nan"]
         outputDF = []
         for line in textList:
 ...
                         # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
                         if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
                             #word = cl.splitStickWords(word)
+                            word = self.cl.removePunct(word)
+                            word = " ".join(self.cl.removeStopWords(word))
+                            word = self.cl.removeTabWhiteSpaceNewLine(word)
                             if len(word) > 1:
                                 if len(word.split(" ")) > 1:
                                     for x in word.split(" "):
 ...
         return outputDF
     def helperColTableToCorpus(self, dfList):
         #cl = cleanGenText()
+        #cl = cleanText.cleanGenText()
         stopWords = ["NaN","Unnamed:","nan"]
         outputDF = []
         # use the first length line as the column ref
 ...
                         # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
                         if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
                             #word = cl.splitStickWords(word)
+                            word = self.cl.removePunct(word)
+                            word = " ".join(self.cl.removeStopWords(word))
+                            word = self.cl.removeTabWhiteSpaceNewLine(word)
                             if len(word) > 1:
                                 if len(word.split(" ")) > 1:
                                     for x in word.split(" "):
 ...
         Mouse is an animal.
         Jerry is mouse.'''
         texts = {}
+        #cl = cleanText.cleanGenText()
         #cl = cleanGenText()
+        corpus = corpusText.split("\n\n")
+        for pos in range(len(corpus)):
+            if len(corpus[pos]) > 0:
                 texts["Paragraph "+str(pos)] = []
+                lines = corpus[pos]
                 for line in lines.split("\n"):
                     for l in line.split("."):
                         if len(l) > 0:
+                            l = self.cl.removeTabWhiteSpaceNewLine(l)
                             l = l.lower()
                             newL = []
                             for word in l.split(" "):
                                 if len(word) > 0:
+                                    word = self.cl.removeStopWords(word)
                                     for w in word:
                                         if len(w) > 0 and w.isnumeric()==False:
                                             newL.append(w)
 ...
                 if len(texts["Paragraph "+str(pos)]) == 0:
                     del texts["Paragraph "+str(pos)]
         return texts
+
+    def selectParaForWC(self, corpus):
+        """
+        corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]
+        Heuristically determine Word2Vec parameters.
+        """
         corSize = len(corpus)
+
+        if corSize == 0:
+            return None, None, None, None, None, None
+
+        # Adjust parameters based on corpus size
+        if corSize < 2000:
+            # Small corpus — need high generalization
+            window = 3
+            vector_size = 100
+            sample = 1e-3
+            negative = 5
+            epochs = 20
+            sg = 1  # Skip-gram preferred for rare words
+        elif corSize < 10000:
+            window = 5
+            vector_size = 150
+            sample = 1e-4
+            negative = 10
+            epochs = 20
+            sg = 1
+        elif corSize < 100000:
+            window = 7
+            vector_size = 200
+            sample = 1e-5
+            negative = 15
+            epochs = 15
+            sg = 1
+        elif corSize < 500000:
+            window = 10
+            vector_size = 250
+            sample = 1e-5
+            negative = 15
+            epochs = 10
+            sg = 0  # CBOW is okay when data is large
+        else:
+            # Very large corpus
+            window = 12
+            vector_size = 300
+            sample = 1e-6
+            negative = 20
+            epochs = 5
+            sg = 0
+
         return window, vector_size, sample, negative, epochs, sg
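
To make the size tiers above easy to eyeball, here is a standalone mirror of the same thresholds for a subset of the returned parameters; it is illustrative only and not part of the module.

```python
def pick_params(n_sentences):
    """Mirror of the selectParaForWC size tiers above (subset of parameters)."""
    if n_sentences == 0:
        return None
    if n_sentences < 2000:
        return dict(window=3, vector_size=100, sg=1)
    if n_sentences < 10000:
        return dict(window=5, vector_size=150, sg=1)
    if n_sentences < 100000:
        return dict(window=7, vector_size=200, sg=1)
    if n_sentences < 500000:
        return dict(window=10, vector_size=250, sg=0)
    return dict(window=12, vector_size=300, sg=0)

for size in (500, 5_000, 50_000, 5_000_000):
    print(size, pick_params(size))
```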
+
+
+    def trainWord2Vec(self,nameFile,modelName,saveFolder,window=None,
+                      vector_size=None,sample=None,negative=None,epochs=None,sg=None):
         jsonFile = ""
         jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
+        if not jsonFile:
+            print("No corpus to train")
+            return
         cores = multiprocessing.cpu_count()
         combinedCorpus = []
+        for key in jsonFile:
+            combinedCorpus.extend(jsonFile[key])
+        # detect phrase before choosing parameters
+        phrases = Phrases(combinedCorpus, min_count=2, threshold=10)
+        bigram = Phraser(phrases)
+        combinedCorpus = [bigram[sent] for sent in combinedCorpus]
+
+        if window==None and vector_size==None and sample==None and negative==None and epochs==None and sg==None:
             window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
+        # # min_count=1 ensures all words are included
+        #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
+        accept = False
+        # add retry limit because if training keeps failing (bad corpus or corrupted input), it'll keep retrying without limit.
+        retries = 0
+        while not accept and retries < 3:
+            if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
+                try:
+                    w2vModel = Word2Vec(
                         min_count=1,
                         window=window,
                         vector_size=vector_size,
 ...
                         negative=negative,
                         workers=cores-1,
                         epochs = epochs,
+                        sg=sg)
+                    w2vModel.build_vocab(combinedCorpus)
+                    w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=epochs)
+                    accept = True
+                except Exception as e:
+                    print(f"Retry #{retries+1} failed: {e}")
+                    retries +=1
+            else:
+                print("no parameter to train")
+                break
+        #w2vModel.build_vocab(combinedCorpus)
+        #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
+        #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
+        #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
+        w2vModel.save(saveFolder+"/"+modelName+".model")
+        w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
+        print("done w2v")
         #return combinedCorpus
+    def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
+        if not newCorpus:
+            raise ValueError("New corpus is empty!")
+
+        model = Word2Vec.load(modelPath)
+
+        # Phrase detection on new data
+        phrases = Phrases(newCorpus, min_count=2, threshold=10)
+        bigram = Phraser(phrases)
+        newCorpus = [bigram[sent] for sent in newCorpus]
+
+        # Update vocab & retrain
+        model.build_vocab(newCorpus, update=True)
+        model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)
+
     def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
         # might not be a meaningful keyword
         #stopWords = ["show"]
 ...
                     results.append(moreNewResult)
                     currN +=1'''
         return results
+    # add more data to existing word2vec model
+    def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
+        if not newCorpus:
+            raise ValueError("New corpus is empty!")
+
+        model = Word2Vec.load(modelPath)
+
+        # Phrase detection on new data
+        phrases = Phrases(newCorpus, min_count=2, threshold=10)
+        bigram = Phraser(phrases)
+        newCorpus = [bigram[sent] for sent in newCorpus]
+
+        # Update vocab & retrain
+        model.build_vocab(newCorpus, update=True)
+        model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)
+
+        # Save updated model
+        if saveFolder:
+            os.makedirs(saveFolder, exist_ok=True)
+            name = os.path.basename(modelPath).replace(".model", "_updated.model")
+            model.save(f"{saveFolder}/{name}")
+            print(f"🔁 Model updated and saved to {saveFolder}/{name}")
+        else:
+            model.save(modelPath)
+            print(f"🔁 Model updated and overwritten at {modelPath}")
+
     # adding our model into spacy
     # this deals with command line; but instead of using it, we write python script to run command line
     def loadWordVec(self,modelName,wordVec):
 ...
                         modelName, # this modelName comes from the saved modelName of function trainWord2Vec
                         "--vectors-loc",
                         wordVec])
+    >>>>>>> 597aa7c (WIP: Save local changes which mainly updated appUI before moving to UpdateAppUI)
     print("done")
|
README.md
CHANGED
@@ -1,15 +1,74 @@
----
-setup: bash setup.sh
-title: MtDNALocation
-emoji: 📊
-colorFrom: blue
-colorTo: purple
-sdk: gradio
-sdk_version: 5.25.0
-app_file: app.py
-pinned: false
-license: mit
-short_description: mtDNA Location Classification tool
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+---
+setup: bash setup.sh
+title: MtDNALocation
+emoji: 📊
+colorFrom: blue
+colorTo: purple
+sdk: gradio
+sdk_version: 5.25.0
+app_file: app.py
+pinned: false
+license: mit
+short_description: mtDNA Location Classification tool
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+# Installation
+## Set up environments and start GUI:
+```bash
+git clone https://github.com/Open-Access-Bio-Data/mtDNA-Location-Classifier.git
+```
+If installing with mamba (recommended):
+```bash
+mamba env create -f env.yaml
+```
+If not, check the current Python version in the terminal and make sure it is Python 3.10, then run
+```bash
+pip install -r requirements.txt
+```
+To start the programme, run this in the terminal:
+```bash
+python app.py
+```
+Then follow its instructions.
+# Descriptions:
+mtDNA-Location-Classifier uses [Gradio](https://www.gradio.app/docs) to handle the front-end interactions.
+
+The programme takes **an accession number** (an NCBI GenBank/nuccore identifier) as input and returns the likely origin of the sequence through `classify_sample_location_cached(accession=accession_number)`. This function wraps a pipeline that proceeds as follows (a short calling sketch is shown after this README section):
+## Steps 1-3: Check and retrieve base materials: the Pubmed ID, isolate, DOI and text:
+- These are, respectively:
+
+### Step 1: pubmed_ids and isolates
+`get_info_from_accession(accession=accession_number)`
+- The current input is a string `accession_number`; the output is two lists, one of PUBMED IDs and one of isolate(s).
+- The function looks through the metadata of the sequence with `accession_number` and extracts the `PUBMED ID` (if available) and the `isolate` information.
+- The presence of a PUBMED ID is currently important for the retrieval of texts in the next steps, which are eventually used by method 4.1 (question-answering) and 4.2 (infer from haplogroup).
+- Some sequences might not have `isolate` info, but its availability is optional (it can serve methods 4.1 and 4.2 as an alternative).
+
+### Step 2: dois
+`get_doi_from_pubmed_id(pubmed_ids = pubmed_ids)`
+- Input is the list of PUBMED IDs of the sequence with `accession_number` (retrieved in the previous step); output is a dictionary with keys = PUBMED IDs and values = the corresponding DOIs.
+- The pubmed_ids are retrieved from `get_info_from_accession(accession=accession_number)` mentioned above.
+- The DOIs are passed down to dependent functions that extract the texts of publications for methods 4.1 and 4.2.
+
+### Step 3: get text
+`get_paper_text(dois = dois)`
+- Input is currently the list of DOIs retrieved in the previous step; output is a dictionary with keys = sources (DOI links or file type) and values = texts obtained from those sources. (We might improve this to accept inputs other than DOI links, e.g. uploaded files.)
+- The output of this step is crucial to methods 4.1 and 4.2.
+
+
+## Step 4: Prediction of origin:
+### Method 4.0:
+- The first method attempts to look directly in the metadata for information that was submitted along with the sequence. Thus, it does not require the availability of PUBMED IDs/DOIs or isolates.
+- However, this information is not always available in the submission. Thus, we use the other methods (4.1, 4.2) to retrieve publications from which we can extract the information on the source of the mtDNA.
+
+### Method 4.1:
+-
+
+### Method 4.2:
+-
+
+## More in the package
+### extraction of text from HTML
+### extraction of text from PDF
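
A minimal sketch of calling the pipeline described in the README above from Python rather than through the Gradio UI. The accession is the README's own example; note that in this commit one branch of `classify_sample_location` returns only the per-technique dictionary while the other also returns an ancient/modern flag and its explanation, so check the return shape of the version you are running.

```python
from mtdna_classifier import classify_sample_location

result = classify_sample_location("KU131308")  # example accession from this README
print(result)
```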
accessions.csv
ADDED
@@ -0,0 +1,6 @@
+Accession
+KU131308
+JX123456
+MN908947
+AB123456
+AY123456
accessions.xlsx
ADDED
Binary file (4.98 kB). View file
|
|
app.py
CHANGED
@@ -1,3 +1,177 @@
1 |
# ✅ Optimized mtDNA MVP UI with Faster Pipeline & Required Feedback
|
2 |
|
3 |
import gradio as gr
|
@@ -5,9 +179,15 @@ from collections import Counter
|
|
5 |
import csv
|
6 |
import os
|
7 |
from functools import lru_cache
|
8 |
-
from mtdna_classifier import classify_sample_location
|
9 |
import subprocess
|
10 |
import json
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
@lru_cache(maxsize=128)
|
13 |
def classify_sample_location_cached(accession):
|
@@ -33,8 +213,6 @@ def compute_final_suggested_location(rows):
|
|
33 |
return counts, (top_location, count)
|
34 |
|
35 |
# Store feedback (with required fields)
|
36 |
-
import gspread
|
37 |
-
from oauth2client.service_account import ServiceAccountCredentials
|
38 |
|
39 |
'''creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
|
40 |
|
@@ -58,11 +236,6 @@ def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
|
|
58 |
except Exception as e:
|
59 |
return f"❌ Error submitting feedback: {str(e)}"'''
|
60 |
|
61 |
-
import os
|
62 |
-
import json
|
63 |
-
from oauth2client.service_account import ServiceAccountCredentials
|
64 |
-
import gspread
|
65 |
-
|
66 |
def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
|
67 |
if not answer1.strip() or not answer2.strip():
|
68 |
return "⚠️ Please answer both questions before submitting."
|
@@ -84,16 +257,44 @@ def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
|
|
84 |
except Exception as e:
|
85 |
return f"❌ Error submitting feedback: {e}"
|
86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
|
88 |
def summarize_results(accession):
|
89 |
try:
|
90 |
-
output = classify_sample_location_cached(accession)
|
91 |
print(output)
|
92 |
except Exception as e:
|
93 |
-
return [], f"
|
94 |
|
95 |
if accession not in output:
|
96 |
-
return [], "
|
97 |
|
98 |
isolate = next((k for k in output if k != accession), None)
|
99 |
row_score = []
|
@@ -110,7 +311,7 @@ def summarize_results(accession):
|
|
110 |
haplogroup = content.get("haplogroup", "")
|
111 |
inferred = content.get("inferred_location", "")
|
112 |
context = content.get("context_snippet", "")[:300] if "context_snippet" in content else ""
|
113 |
-
|
114 |
row = {
|
115 |
"Sample ID": sample_id_label,
|
116 |
"Technique": technique,
|
@@ -130,43 +331,202 @@ def summarize_results(accession):
|
|
130 |
summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
|
131 |
summary = "\n".join(summary_lines)
|
132 |
|
133 |
-
return rows, summary
|
|
134 |
# Gradio UI
|
135 |
with gr.Blocks() as interface:
|
136 |
gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
|
137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
|
139 |
with gr.Row():
|
140 |
-
accession = gr.Textbox(label="Enter Accession Number (e.g., KU131308)")
|
141 |
run_button = gr.Button("🔍 Submit and Classify")
|
142 |
reset_button = gr.Button("🔄 Reset")
|
143 |
|
144 |
status = gr.Markdown(visible=False)
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
|
|
|
|
158 |
return gr.update(value="⏳ Please wait... processing...", visible=True)
|
159 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
160 |
def classify_main(accession):
|
161 |
-
table, summary = summarize_results(accession)
|
162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
|
164 |
def reset_fields():
|
165 |
-
return
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
|
|
|
|
|
|
|
|
|
171 |
|
172 |
interface.launch(share=True)
|
|
|
|
1 |
+
<<<<<<< HEAD
|
2 |
+
# ✅ Optimized mtDNA MVP UI with Faster Pipeline & Required Feedback
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
from collections import Counter
|
6 |
+
import csv
|
7 |
+
import os
|
8 |
+
from functools import lru_cache
|
9 |
+
from mtdna_classifier import classify_sample_location
|
10 |
+
import subprocess
|
11 |
+
import json
|
12 |
+
|
13 |
+
@lru_cache(maxsize=128)
|
14 |
+
def classify_sample_location_cached(accession):
|
15 |
+
return classify_sample_location(accession)
|
16 |
+
|
17 |
+
# Count and suggest final location
|
18 |
+
def compute_final_suggested_location(rows):
|
19 |
+
candidates = [
|
20 |
+
row.get("Predicted Location", "").strip()
|
21 |
+
for row in rows
|
22 |
+
if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found", "unknown"]
|
23 |
+
] + [
|
24 |
+
row.get("Inferred Region", "").strip()
|
25 |
+
for row in rows
|
26 |
+
if row.get("Inferred Region", "").strip().lower() not in ["", "sample id not found", "unknown"]
|
27 |
+
]
|
28 |
+
|
29 |
+
if not candidates:
|
30 |
+
return Counter(), ("Unknown", 0)
|
31 |
+
|
32 |
+
counts = Counter(candidates)
|
33 |
+
top_location, count = counts.most_common(1)[0]
|
34 |
+
return counts, (top_location, count)
|
35 |
+
|
36 |
+
# Store feedback (with required fields)
|
37 |
+
import gspread
|
38 |
+
from oauth2client.service_account import ServiceAccountCredentials
|
39 |
+
|
40 |
+
'''creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
|
41 |
+
|
42 |
+
scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
|
43 |
+
creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
|
44 |
+
|
45 |
+
def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
|
46 |
+
if not answer1.strip() or not answer2.strip():
|
47 |
+
return "⚠️ Please answer both questions before submitting."
|
48 |
+
|
49 |
+
try:
|
50 |
+
# Define the scope and authenticate
|
51 |
+
scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
|
52 |
+
creds = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", scope)
|
53 |
+
client = gspread.authorize(creds)
|
54 |
+
|
55 |
+
# Open the spreadsheet and worksheet
|
56 |
+
sheet = client.open("feedback_mtdna").sheet1 # You can change the name
|
57 |
+
sheet.append_row([accession, answer1, answer2, contact])
|
58 |
+
return "✅ Feedback submitted. Thank you!"
|
59 |
+
except Exception as e:
|
60 |
+
return f"❌ Error submitting feedback: {str(e)}"'''
|
61 |
+
|
62 |
+
import os
|
63 |
+
import json
|
64 |
+
from oauth2client.service_account import ServiceAccountCredentials
|
65 |
+
import gspread
|
66 |
+
|
67 |
+
def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
|
68 |
+
if not answer1.strip() or not answer2.strip():
|
69 |
+
return "⚠️ Please answer both questions before submitting."
|
70 |
+
|
71 |
+
try:
|
72 |
+
# ✅ Step: Load credentials from Hugging Face secret
|
73 |
+
creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
|
74 |
+
scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
|
75 |
+
creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
|
76 |
+
|
77 |
+
# Connect to Google Sheet
|
78 |
+
client = gspread.authorize(creds)
|
79 |
+
sheet = client.open("feedback_mtdna").sheet1 # make sure sheet name matches
|
80 |
+
|
81 |
+
# Append feedback
|
82 |
+
sheet.append_row([accession, answer1, answer2, contact])
|
83 |
+
return "✅ Feedback submitted. Thank you!"
|
84 |
+
|
85 |
+
except Exception as e:
|
86 |
+
return f"❌ Error submitting feedback: {e}"
|
87 |
+
|
88 |
+
|
89 |
+
def summarize_results(accession):
|
90 |
+
try:
|
91 |
+
output = classify_sample_location_cached(accession)
|
92 |
+
print(output)
|
93 |
+
except Exception as e:
|
94 |
+
return [], f"❌ Error: {e}"
|
95 |
+
|
96 |
+
if accession not in output:
|
97 |
+
return [], "❌ Accession not found in results."
|
98 |
+
|
99 |
+
isolate = next((k for k in output if k != accession), None)
|
100 |
+
row_score = []
|
101 |
+
rows = []
|
102 |
+
|
103 |
+
for key in [accession, isolate]:
|
104 |
+
if key not in output:
|
105 |
+
continue
|
106 |
+
sample_id_label = f"{key} ({'accession number' if key == accession else 'isolate of accession'})"
|
107 |
+
for section, techniques in output[key].items():
|
108 |
+
for technique, content in techniques.items():
|
109 |
+
source = content.get("source", "")
|
110 |
+
predicted = content.get("predicted_location", "")
|
111 |
+
haplogroup = content.get("haplogroup", "")
|
112 |
+
inferred = content.get("inferred_location", "")
|
113 |
+
context = content.get("context_snippet", "")[:300] if "context_snippet" in content else ""
|
114 |
+
|
115 |
+
row = {
|
116 |
+
"Sample ID": sample_id_label,
|
117 |
+
"Technique": technique,
|
118 |
+
"Source": f"The region of haplogroup is inferred\nby using this source: {source}" if technique == "haplogroup" else source,
|
119 |
+
"Predicted Location": "" if technique == "haplogroup" else predicted,
|
120 |
+
"Haplogroup": haplogroup if technique == "haplogroup" else "",
|
121 |
+
"Inferred Region": inferred if technique == "haplogroup" else "",
|
122 |
+
"Context Snippet": context
|
123 |
+
}
|
124 |
+
|
125 |
+
row_score.append(row)
|
126 |
+
rows.append(list(row.values()))
|
127 |
+
|
128 |
+
location_counts, (final_location, count) = compute_final_suggested_location(row_score)
|
129 |
+
summary_lines = [f"### 🧭 Location Frequency Summary", "After counting all predicted and inferred locations:\n"]
|
130 |
+
summary_lines += [f"- **{loc}**: {cnt} times" for loc, cnt in location_counts.items()]
|
131 |
+
summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
|
132 |
+
summary = "\n".join(summary_lines)
|
133 |
+
|
134 |
+
return rows, summary
|
135 |
+
# Gradio UI
|
136 |
+
with gr.Blocks() as interface:
|
137 |
+
gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
|
138 |
+
gr.Markdown("Enter an accession number to infer geographic origin. You'll see predictions, confidence scores, and can submit feedback.")
|
139 |
+
|
140 |
+
with gr.Row():
|
141 |
+
accession = gr.Textbox(label="Enter Accession Number (e.g., KU131308)")
|
142 |
+
run_button = gr.Button("🔍 Submit and Classify")
|
143 |
+
reset_button = gr.Button("🔄 Reset")
|
144 |
+
|
145 |
+
status = gr.Markdown(visible=False)
|
146 |
+
headers = ["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"]
|
147 |
+
output_table = gr.Dataframe(headers=headers, interactive=False)
|
148 |
+
output_summary = gr.Markdown()
|
149 |
+
|
150 |
+
gr.Markdown("---")
|
151 |
+
gr.Markdown("### 💬 Feedback (required)")
|
152 |
+
q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
|
153 |
+
q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
|
154 |
+
contact = gr.Textbox(label="📧 Your email or institution (optional)")
|
155 |
+
submit_feedback = gr.Button("✅ Submit Feedback")
|
156 |
+
feedback_status = gr.Markdown()
|
157 |
+
|
158 |
+
def classify_with_loading(accession):
|
159 |
+
return gr.update(value="⏳ Please wait... processing...", visible=True)
|
160 |
+
|
161 |
+
def classify_main(accession):
|
162 |
+
table, summary = summarize_results(accession)
|
163 |
+
return table, summary, gr.update(visible=False)
|
164 |
+
|
165 |
+
def reset_fields():
|
166 |
+
return "", "", "", "", "", [], "", gr.update(visible=False)
|
167 |
+
|
168 |
+
run_button.click(fn=classify_with_loading, inputs=accession, outputs=status)
|
169 |
+
run_button.click(fn=classify_main, inputs=accession, outputs=[output_table, output_summary, status])
|
170 |
+
submit_feedback.click(fn=store_feedback_to_google_sheets, inputs=[accession, q1, q2, contact], outputs=feedback_status)
|
171 |
+
reset_button.click(fn=reset_fields, inputs=[], outputs=[accession, q1, q2, contact, feedback_status, output_table, output_summary, status])
|
172 |
+
|
173 |
+
interface.launch(share=True)
|
174 |
+
=======
|
175 |
# ✅ Optimized mtDNA MVP UI with Faster Pipeline & Required Feedback
|
176 |
|
177 |
import gradio as gr
|
|
|
179 |
import csv
|
180 |
import os
|
181 |
from functools import lru_cache
|
182 |
+
from mtdna_classifier import classify_sample_location
|
183 |
import subprocess
|
184 |
import json
|
185 |
+
import pandas as pd
|
186 |
+
import io
|
187 |
+
import re
|
188 |
+
import tempfile
|
189 |
+
import gspread
|
190 |
+
from oauth2client.service_account import ServiceAccountCredentials
|
191 |
|
192 |
@lru_cache(maxsize=128)
|
193 |
def classify_sample_location_cached(accession):
|
|
|
213 |
return counts, (top_location, count)
|
214 |
|
215 |
# Store feedback (with required fields)
|
|
|
|
|
216 |
|
217 |
'''creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
|
218 |
|
|
|
236 |
except Exception as e:
|
237 |
return f"❌ Error submitting feedback: {str(e)}"'''
|
238 |
|
|
|
|
|
|
|
|
|
|
|
239 |
def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
|
240 |
if not answer1.strip() or not answer2.strip():
|
241 |
return "⚠️ Please answer both questions before submitting."
|
|
|
257 |
except Exception as e:
|
258 |
return f"❌ Error submitting feedback: {e}"
|
259 |
|
260 |
+
# helper function to extract accessions
|
261 |
+
def extract_accessions_from_input(file=None, raw_text=""):
|
262 |
+
print(f"RAW TEXT RECEIVED: {raw_text}")
|
263 |
+
accessions = []
|
264 |
+
seen = set()
|
265 |
+
if file:
|
266 |
+
try:
|
267 |
+
if file.name.endswith(".csv"):
|
268 |
+
df = pd.read_csv(file)
|
269 |
+
elif file.name.endswith(".xlsx"):
|
270 |
+
df = pd.read_excel(file)
|
271 |
+
else:
|
272 |
+
return [], "Unsupported file format. Please upload CSV or Excel."
|
273 |
+
for acc in df.iloc[:, 0].dropna().astype(str).str.strip():
|
274 |
+
if acc not in seen:
|
275 |
+
accessions.append(acc)
|
276 |
+
seen.add(acc)
|
277 |
+
except Exception as e:
|
278 |
+
return [], f"Failed to read file: {e}"
|
279 |
+
|
280 |
+
if raw_text:
|
281 |
+
text_ids = [s.strip() for s in re.split(r"[\n,;\t]", raw_text) if s.strip()]
|
282 |
+
for acc in text_ids:
|
283 |
+
if acc not in seen:
|
284 |
+
accessions.append(acc)
|
285 |
+
seen.add(acc)
|
286 |
+
|
287 |
+
return list(accessions), None
|
288 |
|
289 |
def summarize_results(accession):
|
290 |
try:
|
291 |
+
output, labelAncient_Modern, explain_label = classify_sample_location_cached(accession)
|
292 |
print(output)
|
293 |
except Exception as e:
|
294 |
+
return [], f"Error: {e}"
|
295 |
|
296 |
if accession not in output:
|
297 |
+
return [], "Accession not found in results."
|
298 |
|
299 |
isolate = next((k for k in output if k != accession), None)
|
300 |
row_score = []
|
|
|
311 |
haplogroup = content.get("haplogroup", "")
|
312 |
inferred = content.get("inferred_location", "")
|
313 |
context = content.get("context_snippet", "")[:300] if "context_snippet" in content else ""
|
314 |
+
|
315 |
row = {
|
316 |
"Sample ID": sample_id_label,
|
317 |
"Technique": technique,
|
|
|
331 |
summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
|
332 |
summary = "\n".join(summary_lines)
|
333 |
|
334 |
+
return rows, summary, labelAncient_Modern, explain_label
|
335 |
+
|
336 |
+
# save the batch input in excel file
|
337 |
+
def save_to_excel(all_rows, summary_text, flag_text, filename):
|
338 |
+
with pd.ExcelWriter(filename) as writer:
|
339 |
+
# Save table
|
340 |
+
df = pd.DataFrame(all_rows, columns=["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"])
|
341 |
+
df.to_excel(writer, sheet_name="Detailed Results", index=False)
|
342 |
+
|
343 |
+
# Save summary
|
344 |
+
summary_df = pd.DataFrame({"Summary": [summary_text]})
|
345 |
+
summary_df.to_excel(writer, sheet_name="Summary", index=False)
|
346 |
+
|
347 |
+
# Save flag
|
348 |
+
flag_df = pd.DataFrame({"Flag": [flag_text]})
|
349 |
+
flag_df.to_excel(writer, sheet_name="Ancient_Modern_Flag", index=False)
|
350 |
+
|
351 |
+
# save the batch input in JSON file
|
352 |
+
def save_to_json(all_rows, summary_text, flag_text, filename):
|
353 |
+
output_dict = {
|
354 |
+
"Detailed_Results": all_rows,
|
355 |
+
"Summary_Text": summary_text,
|
356 |
+
"Ancient_Modern_Flag": flag_text
|
357 |
+
}
|
358 |
+
with open(filename, "w") as f:
|
359 |
+
json.dump(output_dict, f, indent=2)
|
360 |
+
|
361 |
+
# save the batch input in Text file
|
362 |
+
def save_to_txt(all_rows, summary_text, flag_text, filename):
|
363 |
+
with open(filename, "w") as f:
|
364 |
+
f.write("=== Detailed Results ===\n")
|
365 |
+
for row in all_rows:
|
366 |
+
f.write(", ".join(str(x) for x in row) + "\n")
|
367 |
+
|
368 |
+
f.write("\n=== Summary ===\n")
|
369 |
+
f.write(summary_text + "\n")
|
370 |
+
|
371 |
+
f.write("\n=== Ancient/Modern Flag ===\n")
|
372 |
+
f.write(flag_text + "\n")
|
373 |
+
|
374 |
+
def save_batch_output(all_rows, summary_text, flag_text, output_type):
|
375 |
+
tmp_dir = tempfile.mkdtemp()
|
376 |
+
|
377 |
+
if output_type == "Excel":
|
378 |
+
file_path = f"{tmp_dir}/batch_output.xlsx"
|
379 |
+
save_to_excel(all_rows, summary_text, flag_text, file_path)
|
380 |
+
elif output_type == "JSON":
|
381 |
+
file_path = f"{tmp_dir}/batch_output.json"
|
382 |
+
save_to_json(all_rows, summary_text, flag_text, file_path)
|
383 |
+
elif output_type == "TXT":
|
384 |
+
file_path = f"{tmp_dir}/batch_output.txt"
|
385 |
+
save_to_txt(all_rows, summary_text, flag_text, file_path)
|
386 |
+
else:
|
387 |
+
return None # invalid option
|
388 |
+
|
389 |
+
return file_path
|
390 |
+
|
391 |
+
# run the batch
|
392 |
+
def summarize_batch(file=None, raw_text=""):
|
393 |
+
accessions, error = extract_accessions_from_input(file, raw_text)
|
394 |
+
if error:
|
395 |
+
return [], "", "", f"Error: {error}"
|
396 |
+
|
397 |
+
all_rows = []
|
398 |
+
all_summaries = []
|
399 |
+
all_flags = []
|
400 |
+
|
401 |
+
for acc in accessions:
|
402 |
+
try:
|
403 |
+
rows, summary, label, explain = summarize_results(acc)
|
404 |
+
all_rows.extend(rows)
|
405 |
+
all_summaries.append(f"**{acc}**\n{summary}")
|
406 |
+
all_flags.append(f"**{acc}**: {label}\n_Explanation:_ {explain}")
|
407 |
+
except Exception as e:
|
408 |
+
all_summaries.append(f"**{acc}**: Failed - {e}")
|
409 |
+
|
410 |
+
summary_text = "\n\n---\n\n".join(all_summaries)
|
411 |
+
flag_text = "\n\n".join(all_flags)
|
412 |
+
|
413 |
+
return all_rows, summary_text, flag_text, gr.update(visible=False)
|
414 |
+
|
415 |
# Gradio UI
|
416 |
with gr.Blocks() as interface:
|
417 |
gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
|
418 |
+
|
419 |
+
inputMode = gr.Radio(choices=["Single Accession", "Batch Input"], value="Single Accession", label="Choose Input Mode")
|
420 |
+
|
421 |
+
with gr.Group() as single_input_group:
|
422 |
+
single_accession = gr.Textbox(label="Enter Single Accession (e.g., KU131308)")
|
423 |
+
|
424 |
+
with gr.Group(visible=False) as batch_input_group:
|
425 |
+
raw_text = gr.Textbox(label="🧬 Paste Accession Numbers")
|
426 |
+
file_upload = gr.File(label="📁 Or Upload CSV/Excel File", file_types=[".csv", ".xlsx"], interactive=True, elem_id="file-upload-box")
|
427 |
+
print(raw_text)
|
428 |
+
# Make the file box smaller
|
429 |
+
gr.HTML('<style>#file-upload-box { width: 200px; }</style>')
|
430 |
|
431 |
with gr.Row():
|
|
|
432 |
run_button = gr.Button("🔍 Submit and Classify")
|
433 |
reset_button = gr.Button("🔄 Reset")
|
434 |
|
435 |
status = gr.Markdown(visible=False)
|
436 |
+
|
437 |
+
with gr.Group(visible=False) as results_group:
|
438 |
+
with gr.Row():
|
439 |
+
with gr.Column():
|
440 |
+
output_summary = gr.Markdown()
|
441 |
+
with gr.Column():
|
442 |
+
output_flag = gr.Markdown()
|
443 |
+
|
444 |
+
gr.Markdown("---")
|
445 |
+
output_table = gr.Dataframe(
|
446 |
+
headers=["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"],
|
447 |
+
interactive=False,
|
448 |
+
row_count=(5, "dynamic")
|
449 |
+
)
|
450 |
+
|
451 |
+
with gr.Row():
|
452 |
+
output_type = gr.Dropdown(choices=["Excel", "JSON", "TXT"], label="Select Output Format", value="Excel")
|
453 |
+
download_button = gr.Button("⬇️ Download Output")
|
454 |
+
download_file = gr.File(label="Download File Here")
|
455 |
+
|
456 |
+
gr.Markdown("---")
|
457 |
+
|
458 |
+
gr.Markdown("### 💬 Feedback (required)")
|
459 |
+
q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
|
460 |
+
q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
|
461 |
+
contact = gr.Textbox(label="📧 Your email or institution (optional)")
|
462 |
+
submit_feedback = gr.Button("✅ Submit Feedback")
|
463 |
+
feedback_status = gr.Markdown()
|
464 |
+
|
465 |
+
# Functions
|
466 |
+
|
467 |
+
def toggle_input_mode(mode):
|
468 |
+
if mode == "Single Accession":
|
469 |
+
return gr.update(visible=True), gr.update(visible=False)
|
470 |
+
else:
|
471 |
+
return gr.update(visible=False), gr.update(visible=True)
|
472 |
+
|
473 |
+
def classify_with_loading():
|
474 |
return gr.update(value="⏳ Please wait... processing...", visible=True)
|
475 |
|
476 |
+
def classify_dynamic(single_accession, file, text, mode):
|
477 |
+
print(f"MODE: {mode} | RAW TEXT: {text}")
|
478 |
+
if mode == "Single Accession":
|
479 |
+
return classify_main(single_accession)
|
480 |
+
else:
|
481 |
+
return summarize_batch(file, text)
|
482 |
+
|
483 |
def classify_main(accession):
|
484 |
+
table, summary, labelAncient_Modern, explain_label = summarize_results(accession)
|
485 |
+
flag_output = f"### 🏺 Ancient/Modern Flag\n**{labelAncient_Modern}**\n\n_Explanation:_ {explain_label}"
|
486 |
+
return (
|
487 |
+
table,
|
488 |
+
summary,
|
489 |
+
flag_output,
|
490 |
+
gr.update(visible=True),
|
491 |
+
gr.update(visible=False)
|
492 |
+
)
|
493 |
|
494 |
def reset_fields():
|
495 |
+
return (
|
496 |
+
gr.update(value=""), # single_accession
|
497 |
+
gr.update(value=""), # raw_text
|
498 |
+
gr.update(value=None), # file_upload
|
499 |
+
gr.update(value="Single Accession"), # inputMode
|
500 |
+
gr.update(value=[], visible=True), # output_table
|
501 |
+
gr.update(value="", visible=True), # output_summary
|
502 |
+
gr.update(value="", visible=True), # output_flag
|
503 |
+
gr.update(visible=False), # status
|
504 |
+
gr.update(visible=False) # results_group
|
505 |
+
)
|
506 |
+
|
507 |
+
inputMode.change(fn=toggle_input_mode, inputs=inputMode, outputs=[single_input_group, batch_input_group])
|
508 |
+
run_button.click(fn=classify_with_loading, inputs=[], outputs=status)
|
509 |
+
run_button.click(
|
510 |
+
fn=classify_dynamic,
|
511 |
+
inputs=[single_accession, file_upload, raw_text, inputMode],
|
512 |
+
outputs=[output_table, output_summary, output_flag, results_group, status]
|
513 |
+
)
|
514 |
+
reset_button.click(
|
515 |
+
fn=reset_fields,
|
516 |
+
inputs=[],
|
517 |
+
outputs=[
|
518 |
+
single_accession, raw_text, file_upload, inputMode,
|
519 |
+
output_table, output_summary, output_flag,
|
520 |
+
status, results_group
|
521 |
+
]
|
522 |
+
)
|
523 |
+
|
524 |
+
download_button.click(
|
525 |
+
save_batch_output, [output_table, output_summary, output_flag, output_type], download_file
|
526 |
+
)
|
527 |
+
submit_feedback.click(
|
528 |
+
fn=store_feedback_to_google_sheets, inputs=[single_accession, q1, q2, contact], outputs=feedback_status
|
529 |
+
)
|
530 |
|
531 |
interface.launch(share=True)
|
532 |
+
>>>>>>> 597aa7c (WIP: Save local changes which mainly updated appUI before moving to UpdateAppUI)
|
data/user_fb/feedback_mtdna.xlsx
ADDED
Binary file (5.93 kB). View file
|
|
env.yaml
ADDED
@@ -0,0 +1,8 @@
|
+name: mtDNA
+channels:
+  - conda-forge
+dependencies:
+  - python=3.10
+  - pip
+  - pip:
+    - -r requirements.txt
installedAndUsedRequirements.txt
ADDED
@@ -0,0 +1,637 @@
1 |
+
python_version==3.11.12
|
2 |
+
absl-py==1.4.0
|
3 |
+
accelerate==1.6.0
|
4 |
+
aiofiles==24.1.0
|
5 |
+
aiohappyeyeballs==2.6.1
|
6 |
+
aiohttp==3.11.15
|
7 |
+
aiosignal==1.3.2
|
8 |
+
alabaster==1.0.0
|
9 |
+
albucore==0.0.24
|
10 |
+
albumentations==2.0.6
|
11 |
+
ale-py==0.11.0
|
12 |
+
altair==5.5.0
|
13 |
+
annotated-types==0.7.0
|
14 |
+
anyio==4.9.0
|
15 |
+
argon2-cffi==23.1.0
|
16 |
+
argon2-cffi-bindings==21.2.0
|
17 |
+
array_record==0.7.2
|
18 |
+
arviz==0.21.0
|
19 |
+
astropy==7.0.1
|
20 |
+
astropy-iers-data==0.2025.4.28.0.37.27
|
21 |
+
astunparse==1.6.3
|
22 |
+
atpublic==5.1
|
23 |
+
attrs==25.3.0
|
24 |
+
audioread==3.0.1
|
25 |
+
autograd==1.7.0
|
26 |
+
babel==2.17.0
|
27 |
+
backcall==0.2.0
|
28 |
+
backports.tarfile==1.2.0
|
29 |
+
beautifulsoup4==4.13.4
|
30 |
+
betterproto==2.0.0b6
|
31 |
+
bigframes==2.1.0
|
32 |
+
bigquery-magics==0.9.0
|
33 |
+
biopython==1.85
|
34 |
+
bitarray==3.4.0
|
35 |
+
bleach==6.2.0
|
36 |
+
blinker==1.9.0
|
37 |
+
blis==1.2.1
|
38 |
+
blosc2==3.3.2
|
39 |
+
bokeh==3.7.2
|
40 |
+
Bottleneck==1.4.2
|
41 |
+
bqplot==0.12.44
|
42 |
+
branca==0.8.1
|
43 |
+
bs4==0.0.2
|
44 |
+
build==1.2.2.post1
CacheControl==0.14.3
cachetools==5.5.2
catalogue==2.0.10
certifi==2025.4.26
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.1
chex==0.1.89
clarabel==0.10.0
click==8.1.8
cloudpathlib==0.21.0
cloudpickle==3.1.1
cmake==3.31.6
cmdstanpy==1.2.5
colorcet==3.1.0
colorlover==0.3.0
colour==0.1.5
community==1.0.0b1
confection==0.1.5
cons==0.4.6
contourpy==1.3.2
cramjam==2.10.0
cryptography==43.0.3
cuda-python==12.6.2.post1
cudf-cu12 @ https://pypi.nvidia.com/cudf-cu12/cudf_cu12-25.2.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
cudf-polars-cu12==25.2.2
cufflinks==0.17.3
cuml-cu12==25.2.1
cupy-cuda12x==13.3.0
cuvs-cu12==25.2.1
cvxopt==1.3.2
cvxpy==1.6.5
cycler==0.12.1
cyipopt==1.5.0
cymem==2.0.11
Cython==3.0.12
dask==2024.12.1
dask-cuda==25.2.0
dask-cudf-cu12==25.2.2
dask-expr==1.1.21
dataproc-spark-connect==0.7.2
datascience==0.17.6
db-dtypes==1.4.2
dbus-python==1.2.18
debugpy==1.8.0
decorator==4.4.2
defusedxml==0.7.1
Deprecated==1.2.18
diffusers==0.33.1
distributed==2024.12.1
distributed-ucxx-cu12==0.42.0
distro==1.9.0
dlib==19.24.6
dm-tree==0.1.9
docker-pycreds==0.4.0
docstring_parser==0.16
docutils==0.21.2
dopamine_rl==4.1.2
duckdb==1.2.2
earthengine-api==1.5.13
easydict==1.13
editdistance==0.8.1
eerepr==0.1.1
einops==0.8.1
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85
entrypoints==0.4
et_xmlfile==2.0.0
etils==1.12.2
etuples==0.3.9
Farama-Notifications==0.0.4
fastai==2.7.19
fastapi==0.115.12
fastcore==1.7.29
fastdownload==0.0.7
fastjsonschema==2.21.1
fastprogress==1.0.3
fastrlock==0.8.3
ffmpy==0.5.0
filelock==3.18.0
firebase-admin==6.8.0
Flask==3.1.0
flatbuffers==25.2.10
flax==0.10.6
folium==0.19.5
fonttools==4.57.0
frozendict==2.4.6
frozenlist==1.6.0
fsspec==2025.3.2
future==1.0.0
gast==0.6.0
gcsfs==2025.3.2
GDAL==3.6.4
gdown==5.2.0
geemap==0.35.3
gensim==4.3.3
geocoder==1.38.1
geographiclib==2.0
geopandas==1.0.1
geopy==2.4.1
gin-config==0.5.0
gitdb==4.0.12
GitPython==3.1.44
glob2==0.7
google==2.0.3
google-ai-generativelanguage==0.6.15
google-api-core==2.24.2
google-api-python-client==2.169.0
google-auth==2.38.0
google-auth-httplib2==0.2.0
google-auth-oauthlib==1.2.2
google-cloud-aiplatform==1.91.0
google-cloud-bigquery==3.31.0
google-cloud-bigquery-connection==1.18.2
google-cloud-bigquery-storage==2.31.0
google-cloud-bigtable==2.30.1
google-cloud-core==2.4.3
google-cloud-dataproc==5.18.1
google-cloud-datastore==2.21.0
google-cloud-firestore==2.20.2
google-cloud-functions==1.20.3
google-cloud-iam==2.19.0
google-cloud-language==2.17.1
google-cloud-pubsub==2.25.0
google-cloud-resource-manager==1.14.2
google-cloud-spanner==3.54.0
google-cloud-storage==2.19.0
google-cloud-translate==3.20.2
google-colab @ file:///colabtools/dist/google_colab-1.0.0.tar.gz
google-crc32c==1.7.1
google-genai==1.13.0
google-generativeai==0.8.5
google-pasta==0.2.0
google-resumable-media==2.7.2
googleapis-common-protos==1.70.0
googledrivedownloader==1.1.0
gradio==5.29.0
gradio_client==1.10.0
graphviz==0.20.3
greenlet==3.2.1
groovy==0.1.2
grpc-google-iam-v1==0.14.2
grpc-interceptor==0.15.4
grpcio==1.71.0
grpcio-status==1.71.0
grpclib==0.4.7
gspread==6.2.0
gspread-dataframe==4.0.0
gym==0.25.2
gym-notices==0.0.8
gymnasium==1.1.1
h11==0.16.0
h2==4.2.0
h5netcdf==1.6.1
h5py==3.13.0
hdbscan==0.8.40
highspy==1.10.0
holidays==0.71
holoviews==1.20.2
hpack==4.1.0
html5lib==1.1
httpcore==1.0.9
httpimport==1.4.1
httplib2==0.22.0
httpx==0.28.1
huggingface-hub==0.30.2
humanize==4.12.3
hyperframe==6.1.0
hyperopt==0.2.7
ibis-framework==9.5.0
idna==3.10
imageio==2.37.0
imageio-ffmpeg==0.6.0
imagesize==1.4.1
imbalanced-learn==0.13.0
immutabledict==4.2.1
importlib_metadata==8.7.0
importlib_resources==6.5.2
imutils==0.5.4
inflect==7.5.0
iniconfig==2.1.0
intel-cmplr-lib-ur==2025.1.1
intel-openmp==2025.1.1
ipyevents==2.0.2
ipyfilechooser==0.6.0
ipykernel==6.17.1
ipyleaflet==0.19.2
ipyparallel==8.8.0
ipython==7.34.0
ipython-genutils==0.2.0
ipython-sql==0.5.0
ipytree==0.2.2
ipywidgets==7.7.1
itsdangerous==2.2.0
jaraco.classes==3.4.0
jaraco.context==6.0.1
jaraco.functools==4.1.0
jax==0.5.2
jax-cuda12-pjrt==0.5.1
jax-cuda12-plugin==0.5.1
jaxlib==0.5.1
jeepney==0.9.0
jieba==0.42.1
Jinja2==3.1.6
jiter==0.9.0
joblib==1.4.2
jsonpatch==1.33
jsonpickle==4.0.5
jsonpointer==3.0.0
jsonschema==4.23.0
jsonschema-specifications==2025.4.1
jupyter-client==6.1.12
jupyter-console==6.1.0
jupyter-leaflet==0.19.2
jupyter-server==1.16.0
jupyter_core==5.7.2
jupyter_kernel_gateway @ git+https://github.com/googlecolab/kernel_gateway@b134e9945df25c2dcb98ade9129399be10788671
jupyterlab_pygments==0.3.0
jupyterlab_widgets==3.0.14
kaggle==1.7.4.2
kagglehub==0.3.12
keras==3.8.0
keras-hub==0.18.1
keras-nlp==0.18.1
keyring==25.6.0
keyrings.google-artifactregistry-auth==1.1.2
kiwisolver==1.4.8
langchain==0.3.24
langchain-core==0.3.56
langchain-text-splitters==0.3.8
langcodes==3.5.0
langsmith==0.3.39
language_data==1.3.0
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
lazy_loader==0.4
libclang==18.1.1
libcudf-cu12 @ https://pypi.nvidia.com/libcudf-cu12/libcudf_cu12-25.2.1-py3-none-manylinux_2_28_x86_64.whl
libcugraph-cu12==25.2.0
libcuml-cu12==25.2.1
libcuvs-cu12==25.2.1
libkvikio-cu12==25.2.1
libraft-cu12==25.2.0
librosa==0.11.0
libucx-cu12==1.18.1
libucxx-cu12==0.42.0
lightgbm @ file:///tmp/lightgbm/LightGBM/dist/lightgbm-4.5.0-py3-none-linux_x86_64.whl
linkify-it-py==2.0.3
llvmlite==0.43.0
locket==1.0.0
logical-unification==0.4.6
lxml==5.4.0
Mako==1.1.3
marisa-trie==1.2.1
Markdown==3.8
markdown-it-py==3.0.0
MarkupSafe==3.0.2
matplotlib==3.10.0
matplotlib-inline==0.1.7
matplotlib-venn==1.1.2
mdit-py-plugins==0.4.2
mdurl==0.1.2
miniKanren==1.0.3
missingno==0.5.2
mistune==3.1.3
mizani==0.13.3
mkl==2025.0.1
ml-dtypes==0.4.1
mlxtend==0.23.4
more-itertools==10.7.0
moviepy==1.0.3
mpmath==1.3.0
msgpack==1.1.0
multidict==6.4.3
multipledispatch==1.0.0
multitasking==0.0.11
murmurhash==1.0.12
music21==9.3.0
namex==0.0.9
narwhals==1.37.1
natsort==8.4.0
nbclassic==1.3.0
nbclient==0.10.2
nbconvert==7.16.6
nbformat==5.10.4
ndindex==1.9.2
nest-asyncio==1.6.0
networkx==3.4.2
nibabel==5.3.2
nltk==3.9.1
notebook==6.5.7
notebook_shim==0.2.4
numba==0.60.0
numba-cuda==0.2.0
numexpr==2.10.2
numpy==1.25.2
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvcc-cu12==12.5.82
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.1.3
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu12==12.3.1.170
nvidia-cusparselt-cu12==0.6.2
nvidia-ml-py==12.570.86
nvidia-nccl-cu12==2.21.5
nvidia-nvcomp-cu12==4.2.0.11
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.4.127
nvtx==0.2.11
nx-cugraph-cu12 @ https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-25.2.0-py3-none-any.whl
oauth2client==4.1.3
oauthlib==3.2.2
openai==1.76.2
opencv-contrib-python==4.11.0.86
opencv-python==4.11.0.86
opencv-python-headless==4.11.0.86
openpyxl==3.1.5
opentelemetry-api==1.16.0
opentelemetry-sdk==1.16.0
opentelemetry-semantic-conventions==0.37b0
opt_einsum==3.4.0
optax==0.2.4
optree==0.15.0
orbax-checkpoint==0.11.13
orjson==3.10.18
osqp==1.0.3
packaging==24.2
pandas==2.2.2
pandas-datareader==0.10.0
pandas-gbq==0.28.0
pandas-stubs==2.2.2.240909
pandocfilters==1.5.1
panel==1.6.3
param==2.2.0
parso==0.8.4
parsy==2.1
partd==1.4.2
pathlib==1.0.1
patsy==1.0.1
pdfreader==0.1.15
peewee==3.18.1
peft==0.15.2
pexpect==4.9.0
pickleshare==0.7.5
pillow==11.2.1
platformdirs==4.3.7
plotly==5.24.1
plotnine==0.14.5
pluggy==1.5.0
plum-dispatch==1.7.4
ply==3.11
polars==1.21.0
pooch==1.8.2
portpicker==1.5.2
preshed==3.0.9
prettytable==3.16.0
proglog==0.1.11
progressbar2==4.5.0
prometheus_client==0.21.1
promise==2.3
prompt_toolkit==3.0.51
propcache==0.3.1
prophet==1.1.6
proto-plus==1.26.1
protobuf==5.29.4
psutil==5.9.5
psycopg2==2.9.10
ptyprocess==0.7.0
py-cpuinfo==9.0.0
py4j==0.10.9.7
pyarrow==18.1.0
pyasn1==0.6.1
pyasn1_modules==0.4.2
pycairo==1.28.0
pycocotools==2.0.8
pycparser==2.22
pycryptodome==3.22.0
pydantic==2.11.4
pydantic_core==2.33.2
pydata-google-auth==1.9.1
pydot==3.0.4
pydotplus==2.0.2
PyDrive==1.3.1
PyDrive2==1.21.3
pydub==0.25.1
pyerfa==2.0.1.5
pygame==2.6.1
pygit2==1.18.0
Pygments==2.19.1
PyGObject==3.42.0
PyJWT==2.10.1
pylibcudf-cu12 @ https://pypi.nvidia.com/pylibcudf-cu12/pylibcudf_cu12-25.2.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
pylibcugraph-cu12==25.2.0
pylibraft-cu12==25.2.0
pymc==5.22.0
PyMuPDF==1.25.5
pymystem3==0.2.0
pynndescent==0.5.13
pynvjitlink-cu12==0.5.2
pynvml==12.0.0
pyogrio==0.10.0
pyomo==6.9.2
PyOpenGL==3.1.9
pyOpenSSL==24.2.1
pyparsing==3.2.3
pyperclip==1.9.0
pyproj==3.7.1
pyproject_hooks==1.2.0
pyshp==2.3.1
PySocks==1.7.1
pyspark==3.5.1
pytensor==2.30.3
pytest==8.3.5
python-apt==0.0.0
python-box==7.3.2
python-dateutil==2.9.0.post0
python-louvain==0.16
python-multipart==0.0.20
python-slugify==8.0.4
python-snappy==0.7.3
python-utils==3.9.1
pytz==2025.2
pyviz_comms==3.0.4
PyYAML==6.0.2
pyzmq==24.0.1
raft-dask-cu12==25.2.0
RapidFuzz==3.13.0
rapids-dask-dependency==25.2.0
ratelim==0.1.6
referencing==0.36.2
regex==2024.11.6
requests==2.32.3
requests-oauthlib==2.0.0
requests-toolbelt==1.0.0
requirements-parser==0.9.0
rich==13.9.4
rmm-cu12==25.2.0
roman-numerals-py==3.1.0
rpds-py==0.24.0
rpy2==3.5.17
rsa==4.9.1
ruff==0.11.9
safehttpx==0.1.6
safetensors==0.5.3
scikit-image==0.25.2
scikit-learn==1.6.1
scipy==1.13.1
scooby==0.10.1
scs==3.2.7.post2
seaborn==0.13.2
SecretStorage==3.3.3
semantic-version==2.10.0
Send2Trash==1.8.3
sentence-transformers==3.4.1
sentencepiece==0.2.0
sentry-sdk==2.27.0
setproctitle==1.3.6
shap==0.47.2
shapely==2.1.0
shellingham==1.5.4
simple-parsing==0.1.7
simplejson==3.20.1
simsimd==6.2.1
six==1.17.0
sklearn-compat==0.1.3
sklearn-pandas==2.2.0
slicer==0.0.8
smart-open==7.1.0
smmap==5.0.2
sniffio==1.3.1
snowballstemmer==2.2.0
sortedcontainers==2.4.0
soundfile==0.13.1
soupsieve==2.7
soxr==0.5.0.post1
spacy==3.8.5
spacy-legacy==3.0.12
spacy-loggers==1.0.5
spacy-lookups-data==1.0.5
spanner-graph-notebook==1.1.6
Sphinx==8.2.3
sphinxcontrib-applehelp==2.0.0
sphinxcontrib-devhelp==2.0.0
sphinxcontrib-htmlhelp==2.1.0
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==2.0.0
sphinxcontrib-serializinghtml==2.0.0
spire-doc==13.4.6
Spire.Xls==14.12.0
SQLAlchemy==2.0.40
sqlglot==25.20.2
sqlparse==0.5.3
srsly==2.5.1
stanio==0.5.1
starlette==0.46.2
statsmodels==0.14.4
stringzilla==3.12.5
sympy==1.13.1
tables==3.10.2
tabula-py==2.10.0
tabulate==0.9.0
tbb==2022.1.0
tblib==3.1.0
tcmlib==1.3.0
tenacity==9.1.2
tensorboard==2.18.0
tensorboard-data-server==0.7.2
tensorflow==2.18.0
tensorflow-datasets==4.9.8
tensorflow-hub==0.16.1
tensorflow-io-gcs-filesystem==0.37.1
tensorflow-metadata==1.17.1
tensorflow-probability==0.25.0
tensorflow-text==2.18.1
tensorflow_decision_forests==1.11.0
tensorstore==0.1.74
termcolor==3.1.0
terminado==0.18.1
text-unidecode==1.3
textblob==0.19.0
tf-slim==1.1.0
tf_keras==2.18.0
thefuzz==0.22.1
thinc==8.3.4
threadpoolctl==3.6.0
tifffile==2025.3.30
timm==1.0.15
tinycss2==1.4.0
tokenizers==0.21.1
toml==0.10.2
tomlkit==0.13.2
toolz==0.12.1
torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl
torchaudio @ https://download.pytorch.org/whl/cu124/torchaudio-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl
torchsummary==1.5.1
torchvision @ https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp311-cp311-linux_x86_64.whl
tornado==6.4.2
tqdm==4.67.1
traitlets==5.7.1
traittypes==0.2.1
transformers==4.51.3
treelite==4.4.1
treescope==0.1.9
triton==3.2.0
tweepy==4.15.0
typeguard==4.4.2
typer==0.15.3
types-pytz==2025.2.0.20250326
types-setuptools==80.3.0.20250505
typing-inspection==0.4.0
typing_extensions==4.13.2
tzdata==2025.2
tzlocal==5.3.1
uc-micro-py==1.0.3
ucx-py-cu12==0.42.0
ucxx-cu12==0.42.0
umap-learn==0.5.7
umf==0.10.0
uritemplate==4.1.1
urllib3==2.4.0
uvicorn==0.34.2
vega-datasets==0.9.0
wadllib==1.3.6
wandb==0.19.10
wasabi==1.1.3
wcwidth==0.2.13
weasel==0.4.1
webcolors==24.11.1
webencodings==0.5.1
websocket-client==1.8.0
websockets==15.0.1
Werkzeug==3.1.3
widgetsnbextension==3.6.10
wordcloud==1.9.4
wordsegment==1.3.1
wrapt==1.17.2
wurlitzer==3.1.1
xarray==2025.3.1
xarray-einstats==0.8.0
xgboost==2.1.4
xlrd==2.0.1
xyzservices==2025.4.0
yarl==1.20.0
ydf==0.11.0
yellowbrick==1.5
yfinance==0.2.57
zict==3.0.0
zipp==3.21.0
zstandard==0.23.0
mtdna_backend.py
ADDED
@@ -0,0 +1,252 @@
import gradio as gr
from collections import Counter
import csv
import os
from functools import lru_cache
from mtdna_classifier import classify_sample_location
import subprocess
import json
import pandas as pd
import io
import re
import tempfile
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from io import StringIO

@lru_cache(maxsize=128)
def classify_sample_location_cached(accession):
    return classify_sample_location(accession)

# Count and suggest final location
def compute_final_suggested_location(rows):
    candidates = [
        row.get("Predicted Location", "").strip()
        for row in rows
        if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found", "unknown"]
    ] + [
        row.get("Inferred Region", "").strip()
        for row in rows
        if row.get("Inferred Region", "").strip().lower() not in ["", "sample id not found", "unknown"]
    ]

    if not candidates:
        return Counter(), ("Unknown", 0)
    # Step 1: Combine into one string and split using regex to handle commas, line breaks, etc.
    tokens = []
    for item in candidates:
        # Split by comma, whitespace, and newlines
        parts = re.split(r'[\s,]+', item)
        tokens.extend(parts)

    # Step 2: Clean and normalize tokens
    tokens = [word.strip() for word in tokens if word.strip().isalpha()]  # Keep only alphabetic tokens

    # Step 3: Count
    counts = Counter(tokens)

    # Step 4: Get most common
    top_location, count = counts.most_common(1)[0]
    return counts, (top_location, count)

# Store feedback (with required fields)
def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
    if not answer1.strip() or not answer2.strip():
        return "⚠️ Please answer both questions before submitting."

    try:
        # ✅ Step: Load credentials from Hugging Face secret
        creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
        scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
        creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)

        # Connect to Google Sheet
        client = gspread.authorize(creds)
        sheet = client.open("feedback_mtdna").sheet1  # make sure sheet name matches

        # Append feedback
        sheet.append_row([accession, answer1, answer2, contact])
        return "✅ Feedback submitted. Thank you!"

    except Exception as e:
        return f"❌ Error submitting feedback: {e}"

# helper function to extract accessions
def extract_accessions_from_input(file=None, raw_text=""):
    print(f"RAW TEXT RECEIVED: {raw_text}")
    accessions = []
    seen = set()
    if file:
        try:
            if file.name.endswith(".csv"):
                df = pd.read_csv(file)
            elif file.name.endswith(".xlsx"):
                df = pd.read_excel(file)
            else:
                return [], "Unsupported file format. Please upload CSV or Excel."
            for acc in df.iloc[:, 0].dropna().astype(str).str.strip():
                if acc not in seen:
                    accessions.append(acc)
                    seen.add(acc)
        except Exception as e:
            return [], f"Failed to read file: {e}"

    if raw_text:
        text_ids = [s.strip() for s in re.split(r"[\n,;\t]", raw_text) if s.strip()]
        for acc in text_ids:
            if acc not in seen:
                accessions.append(acc)
                seen.add(acc)

    return list(accessions), None

def summarize_results(accession):
    try:
        output, labelAncient_Modern, explain_label = classify_sample_location_cached(accession)
        #print(output)
    except Exception as e:
        return [], f"Error: {e}", f"Error: {e}", f"Error: {e}"

    if accession not in output:
        return [], "Accession not found in results.", "Accession not found in results.", "Accession not found in results."

    isolate = next((k for k in output if k != accession), None)
    row_score = []
    rows = []

    for key in [accession, isolate]:
        if key not in output:
            continue
        sample_id_label = f"{key} ({'accession number' if key == accession else 'isolate of accession'})"
        for section, techniques in output[key].items():
            for technique, content in techniques.items():
                source = content.get("source", "")
                predicted = content.get("predicted_location", "")
                haplogroup = content.get("haplogroup", "")
                inferred = content.get("inferred_location", "")
                context = content.get("context_snippet", "")[:300] if "context_snippet" in content else ""

                row = {
                    "Sample ID": sample_id_label,
                    "Technique": technique,
                    "Source": f"The region of haplogroup is inferred\nby using this source: {source}" if technique == "haplogroup" else source,
                    "Predicted Location": "" if technique == "haplogroup" else predicted,
                    "Haplogroup": haplogroup if technique == "haplogroup" else "",
                    "Inferred Region": inferred if technique == "haplogroup" else "",
                    "Context Snippet": context
                }

                row_score.append(row)
                rows.append(list(row.values()))

    location_counts, (final_location, count) = compute_final_suggested_location(row_score)
    summary_lines = ["### 🧭 Location Frequency Summary", "After counting all predicted and inferred locations:\n"]
    summary_lines += [f"- **{loc}**: {cnt} times" for loc, cnt in location_counts.items()]
    summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
    summary = "\n".join(summary_lines)
    return rows, summary, labelAncient_Modern, explain_label

# save the batch input in excel file
def save_to_excel(all_rows, summary_text, flag_text, filename):
    with pd.ExcelWriter(filename) as writer:
        # Save table
        df = pd.DataFrame(all_rows, columns=["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"])
        df.to_excel(writer, sheet_name="Detailed Results", index=False)

        # Save summary
        summary_df = pd.DataFrame({"Summary": [summary_text]})
        summary_df.to_excel(writer, sheet_name="Summary", index=False)

        # Save flag
        flag_df = pd.DataFrame({"Flag": [flag_text]})
        flag_df.to_excel(writer, sheet_name="Ancient_Modern_Flag", index=False)

# save the batch input in JSON file
def save_to_json(all_rows, summary_text, flag_text, filename):
    output_dict = {
        "Detailed_Results": all_rows,  # <-- make sure this is a plain list, not a DataFrame
        "Summary_Text": summary_text,
        "Ancient_Modern_Flag": flag_text
    }

    # If all_rows is a DataFrame, convert it
    if isinstance(all_rows, pd.DataFrame):
        output_dict["Detailed_Results"] = all_rows.to_dict(orient="records")

    with open(filename, "w") as external_file:
        json.dump(output_dict, external_file, indent=2)

# save the batch input in Text file
def save_to_txt(all_rows, summary_text, flag_text, filename):
    if isinstance(all_rows, pd.DataFrame):
        detailed_results = all_rows.to_dict(orient="records")
    output = ""
    output += ",".join(list(detailed_results[0].keys())) + "\n\n"
    for r in detailed_results:
        output += ",".join([str(v) for v in r.values()]) + "\n\n"
    with open(filename, "w") as f:
        f.write("=== Detailed Results ===\n")
        f.write(output + "\n")

        f.write("\n=== Summary ===\n")
        f.write(summary_text + "\n")

        f.write("\n=== Ancient/Modern Flag ===\n")
        f.write(flag_text + "\n")

def save_batch_output(all_rows, summary_text, flag_text, output_type):
    tmp_dir = tempfile.mkdtemp()

    #html_table = all_rows.value  # assuming this is stored somewhere

    # Parse back to DataFrame
    #all_rows = pd.read_html(all_rows)[0]  # [0] because read_html returns a list
    all_rows = pd.read_html(StringIO(all_rows))[0]
    print(all_rows)

    if output_type == "Excel":
        file_path = f"{tmp_dir}/batch_output.xlsx"
        save_to_excel(all_rows, summary_text, flag_text, file_path)
    elif output_type == "JSON":
        file_path = f"{tmp_dir}/batch_output.json"
        save_to_json(all_rows, summary_text, flag_text, file_path)
        print("Done with JSON")
    elif output_type == "TXT":
        file_path = f"{tmp_dir}/batch_output.txt"
        save_to_txt(all_rows, summary_text, flag_text, file_path)
    else:
        return gr.update(visible=False)  # invalid option

    return gr.update(value=file_path, visible=True)

# run the batch
def summarize_batch(file=None, raw_text=""):
    accessions, error = extract_accessions_from_input(file, raw_text)
    if error:
        return [], "", "", f"Error: {error}"

    all_rows = []
    all_summaries = []
    all_flags = []

    for acc in accessions:
        try:
            rows, summary, label, explain = summarize_results(acc)
            all_rows.extend(rows)
            all_summaries.append(f"**{acc}**\n{summary}")
            all_flags.append(f"**{acc}**\n### 🏺 Ancient/Modern Flag\n**{label}**\n\n_Explanation:_ {explain}")
        except Exception as e:
            all_summaries.append(f"**{acc}**: Failed - {e}")

    """for row in all_rows:
        source_column = row[2]  # Assuming the "Source" is in the 3rd column (index 2)

        if source_column.startswith("http"):  # Check if the source is a URL
            # Wrap it with HTML anchor tags to make it clickable
            row[2] = f'<a href="{source_column}" target="_blank" style="color: blue; text-decoration: underline;">{source_column}</a>'"""

    summary_text = "\n\n---\n\n".join(all_summaries)
    flag_text = "\n\n---\n\n".join(all_flags)
    return all_rows, summary_text, flag_text, gr.update(visible=True), gr.update(visible=False)
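For orientation, here is a minimal usage sketch of these backend helpers. It is illustrative only and not part of the commit: it assumes the Space's modules and NCBI access are available, and the accession numbers are the example values shown in the UI placeholders.

# Illustrative sketch: classify one accession and export a batch run.
from mtdna_backend import summarize_results, summarize_batch, save_to_excel

# Single accession (example value from the UI placeholder)
rows, summary, flag, explanation = summarize_results("KU131308")
print(summary)

# Batch from pasted text; the success path returns table rows, summary text,
# flag text, and two gr.update() objects used by the UI
all_rows, summary_text, flag_text, _, _ = summarize_batch(raw_text="KU131308, MW291678")
save_to_excel(all_rows, summary_text, flag_text, "batch_output.xlsx")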
mtdna_classifier.py
CHANGED
@@ -1,322 +1,519 @@
# mtDNA Location Classifier MVP (Google Colab)
# Accepts accession number → Fetches PubMed ID + isolate name → Gets abstract → Predicts location
import os
import subprocess
import re
from Bio import Entrez
import fitz
import spacy
from spacy.cli import download
from NER.PDF import pdf
from NER.WordDoc import wordDoc
from NER.html import extractHTML
from NER.word2Vec import word2vec
from transformers import pipeline
import urllib.parse, requests
from pathlib import Path
from upgradeClassify import filter_context_for_sample, infer_location_for_sample
# Set your email (required by NCBI Entrez)
#Entrez.email = "[email protected]"
import nltk

nltk.download("stopwords")
nltk.download("punkt")
nltk.download('punkt_tab')
# Step 1: Get PubMed ID from Accession using EDirect

'''def get_info_from_accession(accession):
    cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    output = result.stdout
    pubmedID, isolate = "", ""
    for line in output.split("\n"):
        if len(line) > 0:
            if "PUBMED" in line:
                pubmedID = line.split()[-1]
            if "isolate" in line:  # Check for isolate information
                # Try direct GenBank annotation: /isolate="XXX"
                match1 = re.search(r'/isolate\s*=\s*"([^"]+)"', line)  # search on current line
                if match1:
                    isolate = match1.group(1)
                else:
                    # Try from DEFINITION line: ...isolate XXX...
                    match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line)  # search on current line
                    if match2:
                        isolate = match2.group(1)'''
from Bio import Entrez, Medline
import re

Entrez.email = "[email protected]"

def get_info_from_accession(accession):
    try:
        handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
        text = handle.read()
        handle.close()

        # Extract PUBMED ID from the Medline text
        pubmed_match = re.search(r'PUBMED\s+(\d+)', text)
        pubmed_id = pubmed_match.group(1) if pubmed_match else ""

        # Extract isolate if available
        isolate_match = re.search(r'/isolate="([^"]+)"', text)
        if not isolate_match:
            isolate_match = re.search(r'isolate\s+([A-Za-z0-9_-]+)', text)
        isolate = isolate_match.group(1) if isolate_match else ""

        if not pubmed_id:
            print(f"⚠️ No PubMed ID found for accession {accession}")

        return pubmed_id, isolate

    except Exception as e:
        print("❌ Entrez error:", e)
        return "", ""
# Step 2: Get doi link to access the paper
'''def get_doi_from_pubmed_id(pubmed_id):
    cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {pubmed_id} -format medline | grep -i "AID"'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    output = result.stdout

    doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+(?=\s*\[doi\])'
    match = re.search(doi_pattern, output, re.IGNORECASE)

    if match:
        return match.group(0)
    else:
        return None  # or raise an Exception with a helpful message'''

def get_doi_from_pubmed_id(pubmed_id):
    try:
        handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="medline", retmode="text")
        records = list(Medline.parse(handle))
        handle.close()

        if not records:
            return None

        record = records[0]
        if "AID" in record:
            for aid in record["AID"]:
                if "[doi]" in aid:
                    return aid.split(" ")[0]  # extract the DOI

        return None

    except Exception as e:
        print(f"❌ Failed to get DOI from PubMed ID {pubmed_id}: {e}")
        return None


# Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing
# Step 3.1: Extract Text
# sub: download excel file
def download_excel_file(url, save_path="temp.xlsx"):
    if "view.officeapps.live.com" in url:
        parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
        real_url = urllib.parse.unquote(parsed_url["src"][0])
        response = requests.get(real_url)
        with open(save_path, "wb") as f:
            f.write(response.content)
        return save_path
    elif url.startswith("http") and (url.endswith(".xls") or url.endswith(".xlsx")):
        response = requests.get(url)
        response.raise_for_status()  # Raises error if download fails
        with open(save_path, "wb") as f:
            f.write(response.content)
        return save_path
    else:
        print("URL must point directly to an .xls or .xlsx file\n or it already downloaded.")
        return url
def get_paper_text(doi, id, manualLinks=None):
    # create the temporary folder to contain the texts
    folder_path = Path("data/" + str(id))
    if not folder_path.exists():
        cmd = f'mkdir data/{id}'
        result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        print("data/" + str(id) + " created.")
    else:
        print("data/" + str(id) + " already exists.")
    saveLinkFolder = "data/" + id

    link = 'https://doi.org/' + doi
    '''textsToExtract = { "doiLink":"paperText"
                       "file1.pdf":"text1",
                       "file2.doc":"text2",
                       "file3.xlsx":excelText3'''
    textsToExtract = {}
    # get the file to create listOfFile for each id
    html = extractHTML.HTML("", link)
    jsonSM = html.getSupMaterial()
    text = ""
    links = [link] + sum((jsonSM[key] for key in jsonSM), [])
    if manualLinks is not None:
        links += manualLinks
    for l in links:
        # get the main paper
        name = l.split("/")[-1]
        file_path = folder_path / name
        if l == link:
            text = html.getListSection()
            textsToExtract[link] = text
        elif l.endswith(".pdf"):
            if file_path.is_file():
                l = saveLinkFolder + "/" + name
                print("File exists.")
            p = pdf.PDF(l, saveLinkFolder, doi)
            f = p.openPDFFile()
            pdf_path = saveLinkFolder + "/" + l.split("/")[-1]
            doc = fitz.open(pdf_path)
            text = "\n".join([page.get_text() for page in doc])
            textsToExtract[l] = text
        elif l.endswith(".doc") or l.endswith(".docx"):
            d = wordDoc.wordDoc(l, saveLinkFolder)
            text = d.extractTextByPage()
            textsToExtract[l] = text
        elif l.split(".")[-1].lower() in ("xls", "xlsx"):
            wc = word2vec.word2Vec()
            # download excel file if it not downloaded yet
            savePath = saveLinkFolder + "/" + l.split("/")[-1]
            excelPath = download_excel_file(l, savePath)
            corpus = wc.tableTransformToCorpusText([], excelPath)
            text = ''
            for c in corpus:
                para = corpus[c]
                for words in para:
                    text += " ".join(words)
            textsToExtract[l] = text
    # delete folder after finishing getting text
    #cmd = f'rm -r data/{id}'
    #result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    return textsToExtract
# Step 3.2: Extract context
def extract_context(text, keyword, window=500):
    # firstly try accession number
    idx = text.find(keyword)
    if idx == -1:
        return "Sample ID not found."
    return text[max(0, idx - window): idx + window]
def extract_relevant_paragraphs(text, accession, keep_if=None, isolate=None):
    if keep_if is None:
        keep_if = ["sample", "method", "mtdna", "sequence", "collected", "dataset", "supplementary", "table"]

    outputs = ""
    text = text.lower()

    # If isolate is provided, prioritize paragraphs that mention it
    if accession and accession.lower() in text:
        if extract_context(text, accession.lower(), window=700) != "Sample ID not found.":
            outputs += extract_context(text, accession.lower(), window=700)
    if isolate and isolate.lower() in text:
        if extract_context(text, isolate.lower(), window=700) != "Sample ID not found.":
            outputs += extract_context(text, isolate.lower(), window=700)
    for keyword in keep_if:
        para = extract_context(text, keyword)
        if para and para not in outputs:
            outputs += para + "\n"
    return outputs
# Step 4: Classification for now (demo purposes)
# 4.1: Using a HuggingFace model (question-answering)
def infer_fromQAModel(context, question="Where is the mtDNA sample from?"):
    try:
        qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
        result = qa({"context": context, "question": question})
        return result.get("answer", "Unknown")
    except Exception as e:
        return f"Error: {str(e)}"

# 4.2: Infer from haplogroup
# Load pre-trained spaCy model for NER
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Define the haplogroup-to-region mapping (simple rule-based)
import csv

def load_haplogroup_mapping(csv_path):
    mapping = {}
    with open(csv_path) as f:
        reader = csv.DictReader(f)
        for row in reader:
            mapping[row["haplogroup"]] = [row["region"], row["source"]]
    return mapping

# Function to extract haplogroup from the text
def extract_haplogroup(text):
    match = re.search(r'\bhaplogroup\s+([A-Z][0-9a-z]*)\b', text)
    if match:
        submatch = re.match(r'^[A-Z][0-9]*', match.group(1))
        if submatch:
            return submatch.group(0)
        else:
            return match.group(1)  # fallback
    fallback = re.search(r'\b([A-Z][0-9a-z]{1,5})\b', text)
    if fallback:
        return fallback.group(1)
    return None


# Function to extract location based on NER
def extract_location(text):
    doc = nlp(text)
    locations = []
    for ent in doc.ents:
        if ent.label_ == "GPE":  # GPE = Geopolitical Entity (location)
            locations.append(ent.text)
    return locations

# Function to infer location from haplogroup
def infer_location_from_haplogroup(haplogroup):
    haplo_map = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
    return haplo_map.get(haplogroup, ["Unknown", "Unknown"])

# Function to classify the mtDNA sample
def classify_mtDNA_sample_from_haplo(text):
    # Extract haplogroup
    haplogroup = extract_haplogroup(text)
    # Extract location based on NER
    locations = extract_location(text)
    # Infer location based on haplogroup
    inferred_location, sourceHaplo = infer_location_from_haplogroup(haplogroup)[0], infer_location_from_haplogroup(haplogroup)[1]
    return {
        "source": sourceHaplo,
        "locations_found_in_context": locations,
        "haplogroup": haplogroup,
        "inferred_location": inferred_location
    }
# 4.3 Get from available NCBI
def infer_location_fromNCBI(accession):
    try:
        handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
        text = handle.read()
        handle.close()
        match = re.search(r'/(geo_loc_name|country|location)\s*=\s*"([^"]+)"', text)
        if match:
            return match.group(2), match.group(0)  # This is the value like "Brunei"
        return "Not found", "Not found"

    except Exception as e:
        print("❌ Entrez error:", e)
        return "Not found", "Not found"

### ANCIENT/MODERN FLAG
from Bio import Entrez
import re

def flag_ancient_modern(accession, textsToExtract, isolate=None):
    """
    Try to classify a sample as Ancient or Modern using:
    1. NCBI accession (if available)
    2. Supplementary text or context fallback
    """
    context = ""
    label, explain = "", ""

    try:
        # Check if we can fetch metadata from NCBI using the accession
        handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
        text = handle.read()
        handle.close()

        isolate_source = re.search(r'/(isolation_source)\s*=\s*"([^"]+)"', text)
        if isolate_source:
            context += isolate_source.group(0) + " "

        specimen = re.search(r'/(specimen|specimen_voucher)\s*=\s*"([^"]+)"', text)
        if specimen:
            context += specimen.group(0) + " "

        if context.strip():
            label, explain = detect_ancient_flag(context)
            if label != "Unknown":
                return label, explain + " from NCBI\n(" + context + ")"

        # If no useful NCBI metadata, check supplementary texts
        if textsToExtract:
            labels = {"modern": [0, ""], "ancient": [0, ""], "unknown": 0}

            for source in textsToExtract:
                text_block = textsToExtract[source]
                context = extract_relevant_paragraphs(text_block, accession, isolate=isolate)  # Reduce to informative paragraph(s)
                label, explain = detect_ancient_flag(context)

                if label == "Ancient":
                    labels["ancient"][0] += 1
                    labels["ancient"][1] += f"{source}:\n{explain}\n\n"
                elif label == "Modern":
                    labels["modern"][0] += 1
                    labels["modern"][1] += f"{source}:\n{explain}\n\n"
                else:
                    labels["unknown"] += 1

            if max(labels["modern"][0], labels["ancient"][0]) > 0:
                if labels["modern"][0] > labels["ancient"][0]:
                    return "Modern", labels["modern"][1]
                else:
                    return "Ancient", labels["ancient"][1]
            else:
                return "Unknown", "No strong keywords detected"
        else:
            print("No DOI or PubMed ID available for inference.")
            return "", ""

    except Exception as e:
        print("Error:", e)
        return "", ""


def detect_ancient_flag(context_snippet):
    context = context_snippet.lower()

    ancient_keywords = [
        "ancient", "archaeological", "prehistoric", "neolithic", "mesolithic", "paleolithic",
        "bronze age", "iron age", "burial", "tomb", "skeleton", "14c", "radiocarbon", "carbon dating",
        "postmortem damage", "udg treatment", "adna", "degradation", "site", "excavation",
        "archaeological context", "temporal transect", "population replacement", "cal bp", "calbp", "carbon dated"
    ]

    modern_keywords = [
        "modern", "hospital", "clinical", "consent", "blood", "buccal", "unrelated", "blood sample", "buccal sample", "informed consent", "donor", "healthy", "patient",
        "genotyping", "screening", "medical", "cohort", "sequencing facility", "ethics approval",
        "we analysed", "we analyzed", "dataset includes", "new sequences", "published data",
        "control cohort", "sink population", "genbank accession", "sequenced", "pipeline",
        "bioinformatic analysis", "samples from", "population genetics", "genome-wide data"
    ]

    ancient_hits = [k for k in ancient_keywords if k in context]
    modern_hits = [k for k in modern_keywords if k in context]

    if ancient_hits and not modern_hits:
        return "Ancient", f"Flagged as ancient due to keywords: {', '.join(ancient_hits)}"
    elif modern_hits and not ancient_hits:
        return "Modern", f"Flagged as modern due to keywords: {', '.join(modern_hits)}"
    elif ancient_hits and modern_hits:
        if len(ancient_hits) >= len(modern_hits):
            return "Ancient", f"Mixed context, leaning ancient due to: {', '.join(ancient_hits)}"
        else:
            return "Modern", f"Mixed context, leaning modern due to: {', '.join(modern_hits)}"

    # Fallback to QA
    answer = infer_fromQAModel(context, question="Are the mtDNA samples ancient or modern? Explain why.")
    if answer.startswith("Error"):
        return "Unknown", answer
    if "ancient" in answer.lower():
        return "Ancient", f"Leaning ancient based on QA: {answer}"
    elif "modern" in answer.lower():
        return "Modern", f"Leaning modern based on QA: {answer}"
    else:
        return "Unknown", f"No strong keywords or QA clues. QA said: {answer}"

# STEP 5: Main pipeline: accession -> 1. get pubmed id and isolate -> 2. get doi -> 3. get text -> 4. prediction -> 5. output: inferred location + explanation + confidence score
def classify_sample_location(accession):
    outputs = {}
    keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
    # Step 1: get pubmed id and isolate
    pubmedID, isolate = get_info_from_accession(accession)
    '''if not pubmedID:
        return {"error": f"Could not retrieve PubMed ID for accession {accession}"}'''
    if not isolate:
        isolate = "UNKNOWN_ISOLATE"
    # Step 2: get doi
    doi = get_doi_from_pubmed_id(pubmedID)
    '''if not doi:
        return {"error": "DOI not found for this accession. Cannot fetch paper or context."}'''
    # Step 3: get text
    '''textsToExtract = { "doiLink":"paperText"
                       "file1.pdf":"text1",
                       "file2.doc":"text2",
                       "file3.xlsx":excelText3'''
    if doi and pubmedID:
        textsToExtract = get_paper_text(doi, pubmedID)
    else:
        textsToExtract = {}
    '''if not textsToExtract:
        return {"error": f"No texts extracted for DOI {doi}"}'''
    if isolate not in [None, "UNKNOWN_ISOLATE"]:
        label, explain = flag_ancient_modern(accession, textsToExtract, isolate)
    else:
        label, explain = flag_ancient_modern(accession, textsToExtract)
    # Step 4: prediction
    outputs[accession] = {}
    outputs[isolate] = {}
    # 4.0 Infer from NCBI
    location, outputNCBI = infer_location_fromNCBI(accession)
    NCBI_result = {
        "source": "NCBI",
        "sample_id": accession,
        "predicted_location": location,
        "context_snippet": outputNCBI}
    outputs[accession]["NCBI"] = {"NCBI": NCBI_result}
    if textsToExtract:
        long_text = ""
        for key in textsToExtract:
            text = textsToExtract[key]
            # try accession number first
            outputs[accession][key] = {}
            keyword = accession
            context = extract_context(text, keyword, window=500)
            # 4.1: Using a HuggingFace model (question-answering)
            location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
            qa_result = {
                "source": key,
                "sample_id": keyword,
                "predicted_location": location,
                "context_snippet": context
            }
            outputs[keyword][key]["QAModel"] = qa_result
            # 4.2: Infer from haplogroup
            haplo_result = classify_mtDNA_sample_from_haplo(context)
            outputs[keyword][key]["haplogroup"] = haplo_result
            # try isolate
            keyword = isolate
            outputs[isolate][key] = {}
            context = extract_context(text, keyword, window=500)
            # 4.1.1: Using a HuggingFace model (question-answering)
            location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
            qa_result = {
                "source": key,
                "sample_id": keyword,
                "predicted_location": location,
                "context_snippet": context
            }
            outputs[keyword][key]["QAModel"] = qa_result
            # 4.2.1: Infer from haplogroup
            haplo_result = classify_mtDNA_sample_from_haplo(context)
            outputs[keyword][key]["haplogroup"] = haplo_result
            # add long text
            long_text += text + ". \n"
        # 4.3: UpgradeClassify
        # try sample_id as accession number
        sample_id = accession
        if sample_id:
            filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
            locations = infer_location_for_sample(sample_id.upper(), filtered_context)
            if locations != "No clear location found in top matches":
                outputs[sample_id]["upgradeClassifier"] = {}
                outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
                    "source": "From these sources combined: " + ", ".join(list(textsToExtract.keys())),
                    "sample_id": sample_id,
                    "predicted_location": ", ".join(locations),
                    "context_snippet": "First 1000 words: \n" + filtered_context[:1000]
                }
        # try sample_id as isolate name
        sample_id = isolate
        if sample_id:
            filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
            locations = infer_location_for_sample(sample_id.upper(), filtered_context)
            if locations != "No clear location found in top matches":
                outputs[sample_id]["upgradeClassifier"] = {}
                outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
                    "source": "From these sources combined: " + ", ".join(list(textsToExtract.keys())),
                    "sample_id": sample_id,
                    "predicted_location": ", ".join(locations),
                    "context_snippet": "First 1000 words: \n" + filtered_context[:1000]
                }
    return outputs, label, explain
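For orientation, here is a minimal usage sketch of the pipeline entry point. It is illustrative only and not part of the commit: it assumes NCBI access and the data/haplogroup_regions_extended.csv mapping are in place, and the accession number is an example value from the UI placeholders.

# Illustrative sketch: run the full pipeline for one accession and print per-technique results.
from mtdna_classifier import classify_sample_location

outputs, label, explain = classify_sample_location("KU131308")
# outputs is keyed by sample id (accession and isolate), then by source, then by technique
for sample_id, sections in outputs.items():
    for source, techniques in sections.items():
        for technique, result in techniques.items():
            # haplogroup results carry "inferred_location"; the others carry "predicted_location"
            print(sample_id, technique, result.get("predicted_location") or result.get("inferred_location"))
print("Ancient/Modern flag:", label, "-", explain)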
mtdna_ui.py
ADDED
@@ -0,0 +1,210 @@
import gradio as gr
from mtdna_backend import *
import json
# Gradio UI
with gr.Blocks() as interface:
    gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")

    inputMode = gr.Radio(choices=["Single Accession", "Batch Input"], value="Single Accession", label="Choose Input Mode")

    with gr.Group() as single_input_group:
        single_accession = gr.Textbox(label="Enter Single Accession (e.g., KU131308)")

    with gr.Group(visible=False) as batch_input_group:
        raw_text = gr.Textbox(label="🧬 Paste Accession Numbers (e.g., MF362736.1,MF362738.1,KU131308,MW291678)")
        gr.HTML("""<a href="https://drive.google.com/file/d/1t-TFeIsGVu5Jh3CUZS-VE9jQWzNFCs_c/view?usp=sharing" download target="_blank">Download Example CSV Format</a>""")
        gr.HTML("""<a href="https://docs.google.com/spreadsheets/d/1lKqPp17EfHsshJGZRWEpcNOZlGo3F5qU/edit?usp=sharing&ouid=112390323314156876153&rtpof=true&sd=true" download target="_blank">Download Example Excel Format</a>""")
        file_upload = gr.File(label="📁 Or Upload CSV/Excel File", file_types=[".csv", ".xlsx"], interactive=True, elem_id="file-upload-box")

    with gr.Row():
        run_button = gr.Button("🔍 Submit and Classify")
        reset_button = gr.Button("🔄 Reset")

    status = gr.Markdown(visible=False)

    with gr.Group(visible=False) as results_group:
        with gr.Accordion("Open to See the Result", open=False) as results:
            with gr.Row():
                output_summary = gr.Markdown(elem_id="output-summary")
                output_flag = gr.Markdown(elem_id="output-flag")

            gr.Markdown("---")

            with gr.Accordion("Open to See the Output Table", open=False) as table_accordion:
                """output_table = gr.Dataframe(
                    headers=["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"],
                    interactive=False,
                    row_count=(5, "dynamic")
                )"""
                output_table = gr.HTML(render=True)

            with gr.Row():
                output_type = gr.Dropdown(choices=["Excel", "JSON", "TXT"], label="Select Output Format", value="Excel")
                download_button = gr.Button("⬇️ Download Output")
                download_file = gr.File(label="Download File Here", visible=False)

            gr.Markdown("---")

            gr.Markdown("### 💬 Feedback (required)")
            q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
            q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
            contact = gr.Textbox(label="📧 Your email or institution (optional)")
            submit_feedback = gr.Button("✅ Submit Feedback")
            feedback_status = gr.Markdown()

    # Functions

    def toggle_input_mode(mode):
        if mode == "Single Accession":
            return gr.update(visible=True), gr.update(visible=False)
        else:
            return gr.update(visible=False), gr.update(visible=True)

    def classify_with_loading():
        return gr.update(value="⏳ Please wait... processing...", visible=True)  # Show processing message

    def classify_dynamic(single_accession, file, text, mode):
        if mode == "Single Accession":
            return classify_main(single_accession) + (gr.update(visible=False),)
        else:
            #return summarize_batch(file, text) + (gr.update(visible=False),)  # Hide processing message
            return classify_mulAcc(file, text) + (gr.update(visible=False),)  # Hide processing message

    # for single accession
    def classify_main(accession):
        table, summary, labelAncient_Modern, explain_label = summarize_results(accession)
        flag_output = f"### 🏺 Ancient/Modern Flag\n**{labelAncient_Modern}**\n\n_Explanation:_ {explain_label}"
        return (
            #table,
            make_html_table(table),
            summary,
            flag_output,
            gr.update(visible=True),
            gr.update(visible=False)
        )
    # for batch accessions
    def classify_mulAcc(file, text):
        table, summary, flag_output, gr1, gr2 = summarize_batch(file, text)
        #flag_output = f"### 🏺 Ancient/Modern Flag\n**{labelAncient_Modern}**\n\n_Explanation:_ {explain_label}"
        return (
            #table,
            make_html_table(table),
            summary,
            flag_output,
            gr.update(visible=True),
            gr.update(visible=False)
        )

    def make_html_table(rows):
        html = """
        <div style='overflow-x: auto; padding: 10px;'>
          <div style='max-height: 400px; overflow-y: auto; border: 1px solid #444; border-radius: 8px;'>
            <table style='width:100%; border-collapse: collapse; table-layout: auto; font-size: 14px; color: #f1f1f1; background-color: #1e1e1e;'>
              <thead style='position: sticky; top: 0; background-color: #2c2c2c; z-index: 1;'>
                <tr>
        """
        headers = ["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"]
        html += "".join(
            f"<th style='padding: 10px; border: 1px solid #555; text-align: left; white-space: nowrap;'>{h}</th>"
+
for h in headers
|
113 |
+
)
|
114 |
+
html += "</tr></thead><tbody>"
|
115 |
+
|
116 |
+
for row in rows:
|
117 |
+
html += "<tr>"
|
118 |
+
for i, col in enumerate(row):
|
119 |
+
header = headers[i]
|
120 |
+
style = "padding: 10px; border: 1px solid #555; vertical-align: top;"
|
121 |
+
|
122 |
+
# For specific columns like Haplogroup, force nowrap
|
123 |
+
if header in ["Haplogroup", "Sample ID", "Technique"]:
|
124 |
+
style += " white-space: nowrap; text-overflow: ellipsis; max-width: 200px; overflow: hidden;"
|
125 |
+
|
126 |
+
if header == "Source" and isinstance(col, str) and col.strip().lower().startswith("http"):
|
127 |
+
col = f"<a href='{col}' target='_blank' style='color: #4ea1f3; text-decoration: underline;'>{col}</a>"
|
128 |
+
|
129 |
+
html += f"<td style='{style}'>{col}</td>"
|
130 |
+
html += "</tr>"
|
131 |
+
|
132 |
+
html += "</tbody></table></div></div>"
|
133 |
+
return html
|
134 |
+
|
135 |
+
|
136 |
+
def reset_fields():
|
137 |
+
return (
|
138 |
+
gr.update(value=""), # single_accession
|
139 |
+
gr.update(value=""), # raw_text
|
140 |
+
gr.update(value=None), # file_upload
|
141 |
+
gr.update(value="Single Accession"), # inputMode
|
142 |
+
gr.update(value=[], visible=True), # output_table
|
143 |
+
gr.update(value="", visible=True), # output_summary
|
144 |
+
gr.update(value="", visible=True), # output_flag
|
145 |
+
gr.update(visible=False), # status
|
146 |
+
gr.update(visible=False) # results_group
|
147 |
+
)
|
148 |
+
|
149 |
+
inputMode.change(fn=toggle_input_mode, inputs=inputMode, outputs=[single_input_group, batch_input_group])
|
150 |
+
run_button.click(fn=classify_with_loading, inputs=[], outputs=[status])
|
151 |
+
run_button.click(
|
152 |
+
fn=classify_dynamic,
|
153 |
+
inputs=[single_accession, file_upload, raw_text, inputMode],
|
154 |
+
outputs=[output_table, output_summary, output_flag, results_group, status]
|
155 |
+
)
|
156 |
+
reset_button.click(
|
157 |
+
fn=reset_fields,
|
158 |
+
inputs=[],
|
159 |
+
outputs=[
|
160 |
+
single_accession, raw_text, file_upload, inputMode,
|
161 |
+
output_table, output_summary, output_flag,
|
162 |
+
status, results_group
|
163 |
+
]
|
164 |
+
)
|
165 |
+
|
166 |
+
download_button.click(
|
167 |
+
fn=save_batch_output,
|
168 |
+
inputs=[output_table, output_summary, output_flag, output_type],
|
169 |
+
outputs=[download_file])
|
170 |
+
|
171 |
+
submit_feedback.click(
|
172 |
+
fn=store_feedback_to_google_sheets, inputs=[single_accession, q1, q2, contact], outputs=feedback_status
|
173 |
+
)
|
174 |
+
# Custom CSS styles
|
175 |
+
gr.HTML("""
|
176 |
+
<style>
|
177 |
+
/* Ensures both sections are equally spaced with the same background size */
|
178 |
+
#output-summary, #output-flag {
|
179 |
+
background-color: #f0f4f8; /* Light Grey for both */
|
180 |
+
padding: 20px;
|
181 |
+
border-radius: 10px;
|
182 |
+
margin-top: 10px;
|
183 |
+
width: 100%; /* Ensure full width */
|
184 |
+
min-height: 150px; /* Ensures both have a minimum height */
|
185 |
+
box-sizing: border-box; /* Prevents padding from increasing size */
|
186 |
+
display: flex;
|
187 |
+
flex-direction: column;
|
188 |
+
justify-content: space-between;
|
189 |
+
}
|
190 |
+
|
191 |
+
/* Specific background colors */
|
192 |
+
#output-summary {
|
193 |
+
background-color: #434a4b;
|
194 |
+
}
|
195 |
+
|
196 |
+
#output-flag {
|
197 |
+
background-color: #141616;
|
198 |
+
}
|
199 |
+
|
200 |
+
/* Ensuring they are in a row and evenly spaced */
|
201 |
+
.gradio-row {
|
202 |
+
display: flex;
|
203 |
+
justify-content: space-between;
|
204 |
+
width: 100%;
|
205 |
+
}
|
206 |
+
</style>
|
207 |
+
""")
|
208 |
+
|
209 |
+
|
210 |
+
interface.launch(share=True,debug=True)
|
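A quick way to sanity-check the table renderer above, assuming make_html_table is available in the current Python session (the row values are illustrative):

# Minimal sketch: one row in the same column order as `headers` in make_html_table.
row = ["KU131308 (accession number)", "NCBI", "https://doi.org/10.1007/s00439-015-1620-z", "Brunei", "M7", "East Asia", "context snippet..."]
html = make_html_table([row])
print("<table" in html, "<a href=" in html)  # the Source cell should be rendered as a link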
output.json
ADDED
@@ -0,0 +1,276 @@
{
  "Detailed_Results": [
    ["MF362736.1 (accession number)", "NCBI", "NCBI", "Armenia", "", "", "/geo_loc_name=\"Armenia\""],
    ["MF362736.1 (accession number)", "QAModel", "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>", "Sample ID not found", "", "", "Sample ID not found."],
    ["MF362736.1 (accession number)", "haplogroup", "The region of haplogroup is inferred\nby using this source: Unknown", "", "Sample", "Unknown", ""],
    ["rise396_mt (isolate of accession)", "QAModel", "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>", "Sample ID not found", "", "", "Sample ID not found."],
    ["rise396_mt (isolate of accession)", "haplogroup", "The region of haplogroup is inferred\nby using this source: Unknown", "", "Sample", "Unknown", ""],
    ["MF362738.1 (accession number)", "NCBI", "NCBI", "Armenia", "", "", "/geo_loc_name=\"Armenia\""],
    ["MF362738.1 (accession number)", "QAModel", "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>", "Sample ID not found", "", "", "Sample ID not found."],
    ["MF362738.1 (accession number)", "haplogroup", "The region of haplogroup is inferred\nby using this source: Unknown", "", "Sample", "Unknown", ""],
    ["rise407_mt (isolate of accession)", "QAModel", "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>", "Sample ID not found", "", "", "Sample ID not found."],
    ["rise407_mt (isolate of accession)", "haplogroup", "The region of haplogroup is inferred\nby using this source: Unknown", "", "Sample", "Unknown", ""],
    ["MF362739.1 (accession number)", "NCBI", "NCBI", "Armenia", "", "", "/geo_loc_name=\"Armenia\""],
    ["MF362739.1 (accession number)", "QAModel", "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>", "Sample ID not found", "", "", "Sample ID not found."],
    ["MF362739.1 (accession number)", "haplogroup", "The region of haplogroup is inferred\nby using this source: Unknown", "", "Sample", "Unknown", ""],
    ["rise408_mt (isolate of accession)", "QAModel", "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>", "Sample ID not found", "", "", "Sample ID not found."],
    ["rise408_mt (isolate of accession)", "haplogroup", "The region of haplogroup is inferred\nby using this source: Unknown", "", "Sample", "Unknown", ""],
    ["KU131308 (accession number)", "NCBI", "NCBI", "Brunei", "", "", "/geo_loc_name=\"Brunei\""],
    ["KU131308 (accession number)", "QAModel", "<a href=\"https://doi.org/10.1007/s00439-015-1620-z\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1007/s00439-015-1620-z</a>", "GenBank", "", "", "t (unavailable at the start of this study). We performed whole-mtDNA sequencing as previously described (Torroni et al. 2001) using an ABI 48-capillary 3730 DNA Analyser (Taipei) an ABI 16-capillary 3130XL DNA Analyser (Leeds) and an ABI 16-capillary 3100 DNA Analyser (Porto). Details on the new and"],
    ["KU131308 (accession number)", "haplogroup", "The region of haplogroup is inferred\nby using this source: EMPOP", "", "M7", "East Asia", ""],
    ["KU131308 (accession number)", "QAModel", "<a href=\"https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf</a>", "Sample ID not found", "", "", "Sample ID not found."],
    ["KU131308 (accession number)", "haplogroup", "The region of haplogroup is inferred\nby using this source: Unknown", "", "Sample", "Unknown", ""],
    ["KU131308 (accession number)", "QAModel", "<a href=\"https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls</a>", "Sample ID not found", "", "", "Sample ID not found."],
    ["KU131308 (accession number)", "haplogroup", "The region of haplogroup is inferred\nby using this source: Unknown", "", "Sample", "Unknown", ""],
    ["BRU18 (isolate of accession)", "QAModel", "<a href=\"https://doi.org/10.1007/s00439-015-1620-z\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1007/s00439-015-1620-z</a>", "Sample ID not found", "", "", "Sample ID not found."],
    ["BRU18 (isolate of accession)", "haplogroup", "The region of haplogroup is inferred\nby using this source: Unknown", "", "Sample", "Unknown", ""],
    ["BRU18 (isolate of accession)", "QAModel", "<a href=\"https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf</a>", "Borneo", "", "", ", NA18138, NA18149, NA18152, \nNA18674, NA18707 \n Chinese in Denver, \nUSA \n[86] \nNA17971, NA18124, NA18550, NA18574, NA18582, \nNA18618, NA18636, NA18638, NA18639, NA18644, \nNA18756, NA18769, NA18771 \nHan Chinese in Beijing \n[86] \nNA18755 \nBeijing Han Chinese \n[86] \nNA18940, NA18943, NA18952, NA18953"],
    ["BRU18 (isolate of accession)", "haplogroup", "The region of haplogroup is inferred\nby using this source: Unknown", "", "Denver", "Unknown", ""],
    ["BRU18 (isolate of accession)", "QAModel", "<a href=\"https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls</a>", "Sample ID not found", "", "", "Sample ID not found."],
    ["BRU18 (isolate of accession)", "haplogroup", "The region of haplogroup is inferred\nby using this source: Unknown", "", "Sample", "Unknown", ""],
    ["MW291678 (accession number)", "NCBI", "NCBI", "Argentina", "", "", "/geo_loc_name=\"Argentina\""],
    ["MN006856 (accession number)", "NCBI", "NCBI", "Not found", "", "", "Not found"]
  ],
"Summary_Text": "**MF362736.1**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Armenia**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Armenia** (mentioned 1 times)\n\n---\n\n**MF362738.1**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Armenia**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Armenia** (mentioned 1 times)\n\n---\n\n**MF362739.1**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Armenia**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Armenia** (mentioned 1 times)\n\n---\n\n**KU131308**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Brunei**: 1 times\n- **GenBank**: 1 times\n- **Borneo**: 1 times\n- **East Asia**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Brunei** (mentioned 1 times)\n\n---\n\n**MW291678**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Argentina**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Argentina** (mentioned 1 times)\n\n---\n\n**MN006856**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Not found**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Not found** (mentioned 1 times)",
"Ancient_Modern_Flag": "**MF362736.1**\n### \ud83c\udffa Ancient/Modern Flag\n**Ancient**\n\n_Explanation:_ Flagged as ancient due to keywords: tomb, skeleton from NCBI\n(/isolation_source=\"Tomb 6; skeleton 1\" /specimen_voucher=\"Kapan;Tomb 6; skeleton 1\" )\n\n---\n\n**MF362738.1**\n### \ud83c\udffa Ancient/Modern Flag\n**Ancient**\n\n_Explanation:_ https://doi.org/10.1016/j.cub.2017.05.087:\nMixed context, leaning ancient due to: ancient, archaeological, bronze age, iron age, tomb, skeleton, carbon dating, adna, site\n\n\n\n---\n\n**MF362739.1**\n### \ud83c\udffa Ancient/Modern Flag\n**Ancient**\n\n_Explanation:_ https://doi.org/10.1016/j.cub.2017.05.087:\nMixed context, leaning ancient due to: ancient, archaeological, bronze age, iron age, tomb, skeleton, carbon dating, adna, site\n\n\n\n---\n\n**KU131308**\n### \ud83c\udffa Ancient/Modern Flag\n**Modern**\n\n_Explanation:_ https://doi.org/10.1007/s00439-015-1620-z:\nMixed context, leaning modern due to: we analysed, new sequences, published data, sink population, genome-wide data\n\n\n\n---\n\n**MW291678**\n### \ud83c\udffa Ancient/Modern Flag\n**Ancient**\n\n_Explanation:_ Flagged as ancient due to keywords: archaeological from NCBI\n(/isolation_source=\"archaeological human bone\" )\n\n---\n\n**MN006856**\n### \ud83c\udffa Ancient/Modern Flag\n****\n\n_Explanation:_ "
}
output.txt
ADDED
@@ -0,0 +1,176 @@
=== Detailed Results ===
MF362736.1 (accession number), NCBI, NCBI, Armenia, /geo_loc_name="Armenia"
MF362736.1 (accession number), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
MF362736.1 (accession number), haplogroup, The region of haplogroup is inferred
by using this source: Unknown, Sample, Unknown
rise396_mt (isolate of accession), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
rise396_mt (isolate of accession), haplogroup, The region of haplogroup is inferred
by using this source: Unknown, Sample, Unknown
MF362738.1 (accession number), NCBI, NCBI, Armenia, /geo_loc_name="Armenia"
MF362738.1 (accession number), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
MF362738.1 (accession number), haplogroup, The region of haplogroup is inferred
by using this source: Unknown, Sample, Unknown
rise407_mt (isolate of accession), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
rise407_mt (isolate of accession), haplogroup, The region of haplogroup is inferred
by using this source: Unknown, Sample, Unknown
MF362739.1 (accession number), NCBI, NCBI, Armenia, /geo_loc_name="Armenia"
MF362739.1 (accession number), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
MF362739.1 (accession number), haplogroup, The region of haplogroup is inferred
by using this source: Unknown, Sample, Unknown
rise408_mt (isolate of accession), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
rise408_mt (isolate of accession), haplogroup, The region of haplogroup is inferred
by using this source: Unknown, Sample, Unknown
KU131308 (accession number), NCBI, NCBI, Brunei, /geo_loc_name="Brunei"
KU131308 (accession number), QAModel, <a href="https://doi.org/10.1007/s00439-015-1620-z" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1007/s00439-015-1620-z</a>, GenBank, t (unavailable at the start of this study). We performed whole-mtDNA sequencing as previously described (Torroni et al. 2001) using an ABI 48-capillary 3730 DNA Analyser (Taipei) an ABI 16-capillary 3130XL DNA Analyser (Leeds) and an ABI 16-capillary 3100 DNA Analyser (Porto). Details on the new and
KU131308 (accession number), haplogroup, The region of haplogroup is inferred
by using this source: EMPOP, M7, East Asia
KU131308 (accession number), QAModel, <a href="https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf" target="_blank" style="color: blue; text-decoration: underline;">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf</a>, Sample ID not found, Sample ID not found.
KU131308 (accession number), haplogroup, The region of haplogroup is inferred
by using this source: Unknown, Sample, Unknown
KU131308 (accession number), QAModel, <a href="https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls" target="_blank" style="color: blue; text-decoration: underline;">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls</a>, Sample ID not found, Sample ID not found.
KU131308 (accession number), haplogroup, The region of haplogroup is inferred
by using this source: Unknown, Sample, Unknown
BRU18 (isolate of accession), QAModel, <a href="https://doi.org/10.1007/s00439-015-1620-z" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1007/s00439-015-1620-z</a>, Sample ID not found, Sample ID not found.
BRU18 (isolate of accession), haplogroup, The region of haplogroup is inferred
by using this source: Unknown, Sample, Unknown
BRU18 (isolate of accession), QAModel, <a href="https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf" target="_blank" style="color: blue; text-decoration: underline;">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf</a>, Borneo, , NA18138, NA18149, NA18152,
NA18674, NA18707
Chinese in Denver,
USA
[86]
NA17971, NA18124, NA18550, NA18574, NA18582,
NA18618, NA18636, NA18638, NA18639, NA18644,
NA18756, NA18769, NA18771
Han Chinese in Beijing
[86]
NA18755
Beijing Han Chinese
[86]
NA18940, NA18943, NA18952, NA18953
BRU18 (isolate of accession), haplogroup, The region of haplogroup is inferred
by using this source: Unknown, Denver, Unknown
BRU18 (isolate of accession), QAModel, <a href="https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls" target="_blank" style="color: blue; text-decoration: underline;">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls</a>, Sample ID not found, Sample ID not found.
BRU18 (isolate of accession), haplogroup, The region of haplogroup is inferred
by using this source: Unknown, Sample, Unknown
MW291678 (accession number), NCBI, NCBI, Argentina, /geo_loc_name="Argentina"
MN006856 (accession number), NCBI, NCBI, Not found, Not found

=== Summary ===
**MF362736.1**
### 🧭 Location Frequency Summary
After counting all predicted and inferred locations:

- **Armenia**: 1 times

**Final Suggested Location:** 🗺️ **Armenia** (mentioned 1 times)

---

**MF362738.1**
### 🧭 Location Frequency Summary
After counting all predicted and inferred locations:

- **Armenia**: 1 times

**Final Suggested Location:** 🗺️ **Armenia** (mentioned 1 times)

---

**MF362739.1**
### 🧭 Location Frequency Summary
After counting all predicted and inferred locations:

- **Armenia**: 1 times

**Final Suggested Location:** 🗺️ **Armenia** (mentioned 1 times)

---

**KU131308**
### 🧭 Location Frequency Summary
After counting all predicted and inferred locations:

- **Brunei**: 1 times
- **GenBank**: 1 times
- **Borneo**: 1 times
- **East Asia**: 1 times

**Final Suggested Location:** 🗺️ **Brunei** (mentioned 1 times)

---

**MW291678**
### 🧭 Location Frequency Summary
After counting all predicted and inferred locations:

- **Argentina**: 1 times

**Final Suggested Location:** 🗺️ **Argentina** (mentioned 1 times)

---

**MN006856**
### 🧭 Location Frequency Summary
After counting all predicted and inferred locations:

- **Not found**: 1 times

**Final Suggested Location:** 🗺️ **Not found** (mentioned 1 times)

=== Ancient/Modern Flag ===
**MF362736.1**
### 🏺 Ancient/Modern Flag
**Ancient**

_Explanation:_ Flagged as ancient due to keywords: tomb, skeleton from NCBI
(/isolation_source="Tomb 6; skeleton 1" /specimen_voucher="Kapan;Tomb 6; skeleton 1" )

---

**MF362738.1**
### 🏺 Ancient/Modern Flag
**Ancient**

_Explanation:_ https://doi.org/10.1016/j.cub.2017.05.087:
Mixed context, leaning ancient due to: ancient, archaeological, bronze age, iron age, tomb, skeleton, carbon dating, adna, site

---

**MF362739.1**
### 🏺 Ancient/Modern Flag
**Ancient**

_Explanation:_ https://doi.org/10.1016/j.cub.2017.05.087:
Mixed context, leaning ancient due to: ancient, archaeological, bronze age, iron age, tomb, skeleton, carbon dating, adna, site

---

**KU131308**
### 🏺 Ancient/Modern Flag
**Modern**

_Explanation:_ https://doi.org/10.1007/s00439-015-1620-z:
Mixed context, leaning modern due to: we analysed, new sequences, published data, sink population, genome-wide data

---

**MW291678**
### 🏺 Ancient/Modern Flag
**Ancient**

_Explanation:_ Flagged as ancient due to keywords: archaeological from NCBI
(/isolation_source="archaeological human bone" )

---

**MN006856**
### 🏺 Ancient/Modern Flag
****

_Explanation:_
requirements.txt
CHANGED
@@ -1,24 +1,29 @@
biopython==1.85
bs4==0.0.2
gensim==4.3.3
gradio==5.29.0
gspread==6.2.0
gspread-dataframe==4.0.0
huggingface-hub==0.30.2
nltk==3.9.1
oauth2client==4.1.3
openai==1.76.2
openpyxl==3.1.5
pandas==2.2.2
pdfreader==0.1.15
PyMuPDF==1.25.5
pytest==8.3.5
requests==2.32.3
scikit-learn==1.6.1
scipy==1.13.1
spacy==3.8.5
spacy-lookups-data==1.0.5
spire-doc==13.4.6
Spire.Xls==14.12.0
statsmodels==0.14.4
tabula-py==2.10.0
thefuzz==0.22.1
torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp310-cp310-linux_x86_64.whl
transformers==4.51.3
wordsegment==1.3.1
xlrd==2.0.1
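With a Python 3.10 environment (the pinned torch wheel above targets CPython 3.10 / CUDA 12.4), this set is typically installed with `pip install -r requirements.txt`. Note that sentence_transformers, which upgradeClassify.py imports, is not pinned here and may need to be installed separately; the spaCy model download below is also an assumption, based on the en_core_web_sm lookup in upgradeClassify.py:

pip install -r requirements.txt
python -m spacy download en_core_web_sm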
setup.sh
CHANGED
@@ -1,8 +1,8 @@
(Old and new contents shown in the diff are identical; likely a whitespace or line-ending change.)
#!/bin/bash

# Install EDirect tools and set up PATH
yes | sh -c "$(wget -q https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh -O -)"
echo 'export PATH=$HOME/edirect:$PATH' >> ~/.bashrc
export PATH=$HOME/edirect:$PATH
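Once setup.sh has run, the EDirect utilities should be on PATH; a quick check (the accession in the query is illustrative):

esearch -db nucleotide -query "KU131308" | efetch -format gb | head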
standardize_location.py
ADDED
@@ -0,0 +1,74 @@
import requests
import re

# Normalize input
def normalize_key(text):
    return re.sub(r"[^a-z0-9]", "", text.strip().lower())

# Search for city/place (normal flow)
def get_country_from_geonames(city_name, username="vyphung"):
    url = "http://api.geonames.org/searchJSON"
    params = {
        "q": city_name,
        "maxRows": 1,
        "username": username
    }
    try:
        r = requests.get(url, params=params, timeout=5)
        data = r.json()
        if data.get("geonames"):
            return data["geonames"][0]["countryName"]
    except Exception as e:
        print("GeoNames searchJSON error:", e)
    return None

# Search for country info using alpha-2/3 codes or name
def get_country_from_countryinfo(input_code, username="vyphung"):
    url = "http://api.geonames.org/countryInfoJSON"
    params = {
        "username": username
    }
    try:
        r = requests.get(url, params=params, timeout=5)
        data = r.json()
        if data.get("geonames"):
            input_code = input_code.strip().upper()
            for country in data["geonames"]:
                # Match against country name, country code (alpha-2), iso alpha-3
                if input_code in [
                    country.get("countryName", "").upper(),
                    country.get("countryCode", "").upper(),
                    country.get("isoAlpha3", "").upper()
                ]:
                    return country["countryName"]
    except Exception as e:
        print("GeoNames countryInfoJSON error:", e)
    return None

# Combined smart lookup
def smart_country_lookup(user_input, username="vyphung"):
    raw_input = user_input.strip()
    normalized = re.sub(r"[^a-zA-Z0-9]", "", user_input).upper()  # normalize for codes (no strip spaces!)

    # Special case: if user writes "UK: London" → split and take main country part
    if ":" in raw_input:
        raw_input = raw_input.split(":")[0].strip()  # only take "UK"
    # First try as country code (if 2-3 letters or common abbreviation)
    if len(normalized) <= 3:
        if normalized.upper() in ["UK", "U.K", "U.K."]:
            country = get_country_from_geonames(normalized.upper(), username=username)
            if country:
                return country
        else:
            country = get_country_from_countryinfo(raw_input, username=username)
            if country:
                return country
    country = get_country_from_countryinfo(raw_input, username=username)  # try full names
    if country:
        return country
    # Otherwise, treat as city/place
    country = get_country_from_geonames(raw_input, username=username)
    if country:
        return country

    return "Not found"
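A minimal usage sketch for the lookup above (the inputs are illustrative; results depend on the GeoNames account passed via `username` and on service availability):

from standardize_location import smart_country_lookup

for raw in ["UK: London", "BRA", "Yerevan"]:
    print(raw, "->", smart_country_lookup(raw))
# Each call returns a country name string, or "Not found" when nothing matches.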
upgradeClassify.py
ADDED
@@ -0,0 +1,276 @@
import re
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
nltk.download('punkt_tab')
#import coreferee
import copy
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_distances
from collections import defaultdict
import numpy as np
#from mtdna_classifier import infer_fromQAModel

# 1. SENTENCE-BERT MODEL
# Step 1: Preprocess the text
def normalize_text(text):
    # Normalize various separators to "-"
    text = re.sub(r'\s*(–+|—+|--+>|–>|->|-->|to|→|➝|➔|➡)\s*', '-', text, flags=re.IGNORECASE)
    # Fix GEN10GEN30 → GEN10-GEN30
    text = re.sub(r'\b([a-zA-Z]+)(\d+)(\1)(\d+)\b', r'\1\2-\1\4', text)
    # Fix GEN10-30 → GEN10-GEN30
    text = re.sub(r'\b([a-zA-Z]+)(\d+)-(\d+)\b', r'\1\2-\1\3', text)
    return text

def preprocess_text(text):
    normalized = normalize_text(text)
    sentences = sent_tokenize(normalized)
    return [re.sub(r"[^a-zA-Z0-9\s\-]", "", s).strip() for s in sentences]

# Before step 2, check the NLP cache to avoid loading the model multiple times:
# Global model cache
_spacy_models = {}

def get_spacy_model(model_name, add_coreferee=False):
    global _spacy_models
    if model_name not in _spacy_models:
        nlp = spacy.load(model_name)
        if add_coreferee and "coreferee" not in nlp.pipe_names:
            nlp.add_pipe("coreferee")
        _spacy_models[model_name] = nlp
    return _spacy_models[model_name]

# Step 2: NER to extract locations and sample names
def extract_entities(text, sample_id=None):
    nlp = get_spacy_model("en_core_web_sm")
    doc = nlp(text)

    # Filter entities by GPE, but exclude strings that match the sample ID format
    gpe_candidates = [ent.text for ent in doc.ents if ent.label_ == "GPE"]

    # Remove entries that match sample ID patterns like XXX123 or similar
    gpe_filtered = [gpe for gpe in gpe_candidates if not re.fullmatch(r'[A-Z]{2,5}\d{2,4}', gpe.strip())]

    # Optional: further filter known invalid patterns (e.g., shorter than 3 chars, numeric only)
    gpe_filtered = [gpe for gpe in gpe_filtered if len(gpe) > 2 and not gpe.strip().isdigit()]

    if sample_id is None:
        return list(set(gpe_filtered)), []
    else:
        sample_prefix = re.match(r'[A-Z]+', sample_id).group()
        samples = re.findall(rf'{sample_prefix}\d+', text)
        return list(set(gpe_filtered)), list(set(samples))

# Step 3: Build a soft matching layer
# Handle patterns like "BRU1–BRU20" and identify BRU18 as part of it.
def is_sample_in_range(sample_id, sentence):
    # Match prefix up to digits
    sample_prefix_match = re.match(r'^([A-Z0-9]+?)(?=\d+$)', sample_id)
    sample_number_match = re.search(r'(\d+)$', sample_id)

    if not sample_prefix_match or not sample_number_match:
        return False

    sample_prefix = sample_prefix_match.group(1)
    sample_number = int(sample_number_match.group(1))
    sentence = normalize_text(sentence)
    # Case 1: Full prefix on both sides
    pattern1 = rf'{sample_prefix}(\d+)\s*-\s*{sample_prefix}(\d+)'
    for match in re.findall(pattern1, sentence):
        start, end = int(match[0]), int(match[1])
        if start <= sample_number <= end:
            return True

    # Case 2: Prefix only on the first number
    pattern2 = rf'{sample_prefix}(\d+)\s*-\s*(\d+)'
    for match in re.findall(pattern2, sentence):
        start, end = int(match[0]), int(match[1])
        if start <= sample_number <= end:
            return True

    return False

# Step 4: Use coreferee to merge sentences that share a coreference (currently disabled because of package conflicts)
# ========== HEURISTIC GROUP → LOCATION MAPPERS ==========
# === Generalized version replacing the old extract_sample_to_group_general ===
# === Generalized version replacing the old extract_group_to_location_general ===
def extract_population_locations(text):
    text = normalize_text(text)
    pattern = r'([A-Za-z ,\-]+)\n([A-Z]+\d*)\n([A-Za-z ,\-]+)\n([A-Za-z ,\-]+)'
    pop_to_location = {}

    for match in re.finditer(pattern, text, flags=re.IGNORECASE):
        _, pop_code, region, country = match.groups()
        pop_to_location[pop_code.upper()] = f"{region.strip()}\n{country.strip()}"

    return pop_to_location

def extract_sample_ranges(text):
    text = normalize_text(text)
    # Updated pattern to handle punctuation and line breaks
    pattern = r'\b([A-Z0-9]+\d+)[–\-]([A-Z0-9]+\d+)[,:\.\s]*([A-Z0-9]+\d+)\b'
    sample_to_pop = {}
    for match in re.finditer(pattern, text, flags=re.IGNORECASE):
        start_id, end_id, pop_code = match.groups()
        start_prefix = re.match(r'^([A-Z0-9]+?)(?=\d+$)', start_id, re.IGNORECASE).group(1).upper()
        end_prefix = re.match(r'^([A-Z0-9]+?)(?=\d+$)', end_id, re.IGNORECASE).group(1).upper()
        if start_prefix != end_prefix:
            continue
        start_num = int(re.search(r'(\d+)$', start_id).group())
        end_num = int(re.search(r'(\d+)$', end_id).group())
        for i in range(start_num, end_num + 1):
            sample_id = f"{start_prefix}{i:03d}"
            sample_to_pop[sample_id] = pop_code.upper()

    return sample_to_pop

def filter_context_for_sample(sample_id, full_text, window_size=2):
    # Normalize and tokenize
    full_text = normalize_text(full_text)
    sentences = sent_tokenize(full_text)

    # Step 1: Find indices with a direct mention or a range match
    match_indices = [
        i for i, s in enumerate(sentences)
        if sample_id in s or is_sample_in_range(sample_id, s)
    ]

    # Step 2: Get the sample → group mapping from the full text
    sample_to_group = extract_sample_ranges(full_text)
    group_id = sample_to_group.get(sample_id)

    # Step 3: Find group-related sentences
    group_indices = []
    if group_id:
        for i, s in enumerate(sentences):
            if group_id in s:
                group_indices.append(i)

    # Step 4: Collect sentences within the window
    selected_indices = set()
    if len(match_indices + group_indices) > 0:
        for i in match_indices + group_indices:
            start = max(0, i - window_size)
            end = min(len(sentences), i + window_size + 1)
            selected_indices.update(range(start, end))

        filtered_sentences = [sentences[i] for i in sorted(selected_indices)]
        return " ".join(filtered_sentences)
    return full_text

# Load the SpaCy transformer model with coreferee
def mergeCorefSen(text):
    sen = preprocess_text(text)
    return sen

# Before step 5 and below, check the transformer cache to avoid loading the model again
# Global SBERT model cache
_sbert_models = {}

def get_sbert_model(model_name="all-MiniLM-L6-v2"):
    global _sbert_models
    if model_name not in _sbert_models:
        _sbert_models[model_name] = SentenceTransformer(model_name)
    return _sbert_models[model_name]

# Step 5: Sentence-BERT retriever → find the top paragraphs related to the keyword.
'''Use sentence transformers to embed the sentence that mentions the sample and
compare it to sentences that mention locations.'''

def find_top_para(sample_id, text, top_k=5):
    sentences = mergeCorefSen(text)
    model = get_sbert_model("all-mpnet-base-v2")
    embeddings = model.encode(sentences, convert_to_tensor=True)

    # Find the sentence that best matches the sample_id
    sample_matches = [s for s in sentences if sample_id in s or is_sample_in_range(sample_id, s)]
    if not sample_matches:
        return [], "No context found for sample"

    sample_embedding = model.encode(sample_matches[0], convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(sample_embedding, embeddings)[0]

    # Get top-k most similar sentence indices
    top_indices = cos_scores.argsort(descending=True)[:top_k]
    return top_indices, sentences

# Step 6: DBSCAN to cluster groups of similar paragraphs.
def clusterPara(tokens):
    # Load Sentence-BERT model
    sbert_model = get_sbert_model("all-mpnet-base-v2")
    sentence_embeddings = sbert_model.encode(tokens)

    # Compute cosine distance matrix
    distance_matrix = cosine_distances(sentence_embeddings)

    # DBSCAN clustering
    clustering_model = DBSCAN(eps=0.3, min_samples=1, metric="precomputed")
    cluster_labels = clustering_model.fit_predict(distance_matrix)

    # Group sentences by cluster
    clusters = defaultdict(list)
    cluster_embeddings = defaultdict(list)
    sentence_to_cluster = {}
    for i, label in enumerate(cluster_labels):
        clusters[label].append(tokens[i])
        cluster_embeddings[label].append(sentence_embeddings[i])
        sentence_to_cluster[tokens[i]] = label
    # Compute cluster centroids
    centroids = {
        label: np.mean(embs, axis=0)
        for label, embs in cluster_embeddings.items()
    }
    return clusters, sentence_to_cluster, centroids

def rankSenFromCluster(clusters, sentence_to_cluster, centroids, target_sentence):
    target_cluster = sentence_to_cluster[target_sentence]
    target_centroid = centroids[target_cluster]
    sen_rank = []
    sen_order = list(sentence_to_cluster.keys())
    # Compute distances to other cluster centroids
    dists = []
    for label, centroid in centroids.items():
        dist = cosine_distances([target_centroid], [centroid])[0][0]
        dists.append((label, dist))
    dists.sort(key=lambda x: x[1])  # sort by proximity
    for d in dists:
        cluster = clusters[d[0]]
        for sen in cluster:
            if sen != target_sentence:
                sen_rank.append(sen_order.index(sen))
    return sen_rank

# Step 7: Final inference wrapper
def infer_location_for_sample(sample_id, context_text):
    # Go through each of the top sentences in order
    top_indices, sentences = find_top_para(sample_id, context_text, top_k=5)
    if len(top_indices) == 0 or sentences == "No context found for sample":
        return "No clear location found in top matches"
    clusters, sentence_to_cluster, centroids = clusterPara(sentences)
    topRankSen_DBSCAN = []
    mostTopSen = ""
    locations = ""
    i = 0
    # Iterate over the top-ranked candidates only, so we never index past the end of top_indices
    while i < len(top_indices):
        # Firstly, start with the top-ranked Sentence-BERT result
        idx = top_indices[i]
        best_sentence = sentences[idx]
        if i == 0:
            mostTopSen = best_sentence
        locations, _ = extract_entities(best_sentence, sample_id)
        if locations:
            return locations
        # If no location, then look for sample overlap in the same DBSCAN cluster
        # Compute distances to other cluster centroids
        if len(topRankSen_DBSCAN) == 0 and mostTopSen:
            topRankSen_DBSCAN = rankSenFromCluster(clusters, sentence_to_cluster, centroids, mostTopSen)
        if i >= len(topRankSen_DBSCAN):
            break
        idx_DBSCAN = topRankSen_DBSCAN[i]
        best_sentence_DBSCAN = sentences[idx_DBSCAN]
        # Check the DBSCAN-ranked sentence for locations
        locations, _ = extract_entities(best_sentence_DBSCAN, sample_id)
        if locations:
            return locations
        # If not found, backtrack to the next best Sentence-BERT sentence (e.g. the 2nd-ranked one) and repeat steps 1-2 until the candidates run out
        i += 1
    # Last resort: LLM (e.g. chatGPT, deepseek, etc.)
    #if len(locations) == 0:
    return "No clear location found in top matches"
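To close, a minimal sketch of how the helpers above are meant to be combined for one sample, mirroring the call pattern used in mtdna_classifier.py (the sample ID and the text are illustrative):

from upgradeClassify import filter_context_for_sample, infer_location_for_sample

long_text = "Samples BRU1-BRU20 were collected in Brunei. BRU18 was sequenced together with the other Borneo samples."
context = filter_context_for_sample("BRU18", long_text, window_size=1)
locations = infer_location_for_sample("BRU18", context)
print(locations)  # a list of place names, or the string "No clear location found in top matches"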