VyLala committed on
Commit 7fc87fe · verified · 1 Parent(s): 0ac2f54

update new codes

DefaultPackages/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/DefaultPackages/__pycache__/__init__.cpython-310.pyc and b/DefaultPackages/__pycache__/__init__.cpython-310.pyc differ
 
DefaultPackages/__pycache__/__init__.cpython-311.pyc CHANGED
Binary files a/DefaultPackages/__pycache__/__init__.cpython-311.pyc and b/DefaultPackages/__pycache__/__init__.cpython-311.pyc differ
 
DefaultPackages/__pycache__/openFile.cpython-310.pyc CHANGED
Binary files a/DefaultPackages/__pycache__/openFile.cpython-310.pyc and b/DefaultPackages/__pycache__/openFile.cpython-310.pyc differ
 
DefaultPackages/__pycache__/openFile.cpython-311.pyc CHANGED
Binary files a/DefaultPackages/__pycache__/openFile.cpython-311.pyc and b/DefaultPackages/__pycache__/openFile.cpython-311.pyc differ
 
DefaultPackages/__pycache__/saveFile.cpython-310.pyc CHANGED
Binary files a/DefaultPackages/__pycache__/saveFile.cpython-310.pyc and b/DefaultPackages/__pycache__/saveFile.cpython-310.pyc differ
 
DefaultPackages/__pycache__/saveFile.cpython-311.pyc CHANGED
Binary files a/DefaultPackages/__pycache__/saveFile.cpython-311.pyc and b/DefaultPackages/__pycache__/saveFile.cpython-311.pyc differ
 
NER/PDF/__pycache__/pdf.cpython-310.pyc ADDED
Binary file (4.27 kB).
 
NER/PDF/__pycache__/pdf.cpython-311.pyc CHANGED
Binary files a/NER/PDF/__pycache__/pdf.cpython-311.pyc and b/NER/PDF/__pycache__/pdf.cpython-311.pyc differ
 
NER/WordDoc/__pycache__/wordDoc.cpython-310.pyc ADDED
Binary file (3.76 kB).
 
NER/WordDoc/__pycache__/wordDoc.cpython-311.pyc CHANGED
Binary files a/NER/WordDoc/__pycache__/wordDoc.cpython-311.pyc and b/NER/WordDoc/__pycache__/wordDoc.cpython-311.pyc differ
 
NER/__pycache__/cleanText.cpython-310.pyc ADDED
Binary file (3.42 kB).
 
NER/__pycache__/cleanText.cpython-311.pyc CHANGED
Binary files a/NER/__pycache__/cleanText.cpython-311.pyc and b/NER/__pycache__/cleanText.cpython-311.pyc differ
 
NER/html/__pycache__/extractHTML.cpython-310.pyc ADDED
Binary file (5.07 kB).
 
NER/html/__pycache__/extractHTML.cpython-311.pyc CHANGED
Binary files a/NER/html/__pycache__/extractHTML.cpython-311.pyc and b/NER/html/__pycache__/extractHTML.cpython-311.pyc differ
 
NER/word2Vec/__pycache__/word2vec.cpython-310.pyc ADDED
Binary file (7.81 kB).
 
NER/word2Vec/__pycache__/word2vec.cpython-311.pyc CHANGED
Binary files a/NER/word2Vec/__pycache__/word2vec.cpython-311.pyc and b/NER/word2Vec/__pycache__/word2vec.cpython-311.pyc differ
 
NER/word2Vec/heuristic.py ADDED
@@ -0,0 +1,52 @@
+ import logging
+ from datetime import datetime
+
+ class HeuristicManager:
+     def __init__(self, model, log_file="heuristic_log.txt", min_similarity_threshold=0.5, min_new_data_len=50):
+         self.model = model
+         self.min_similarity_threshold = min_similarity_threshold
+         self.min_new_data_len = min_new_data_len
+         self.log_file = log_file
+         logging.basicConfig(filename=self.log_file, level=logging.INFO)
+
+     def log(self, message):
+         timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+         logging.info(f"[{timestamp}] {message}")
+         print(f"[{timestamp}] {message}")
+
+     def check_similarity(self, test_terms):
+         triggers = []
+         for term in test_terms:
+             try:
+                 sim = self.model.wv.most_similar(term)[0][1]
+                 if sim < self.min_similarity_threshold:
+                     triggers.append(f"Low similarity for '{term}': {sim}")
+             except KeyError:
+                 triggers.append(f"'{term}' not in vocabulary")
+         return triggers
+
+     def check_metadata(self, metadata):
+         triggers = []
+         if any(keyword in str(metadata).lower() for keyword in ["haplogroup b", "eastasia", "asian"]):
+             triggers.append("Detected new haplogroup or regional bias: 'Asian' or 'B'")
+         return triggers
+
+     def check_new_data_volume(self, new_data):
+         if len(new_data) < self.min_new_data_len:
+             return ["Not enough new data to justify retraining"]
+         return []
+
+     def should_retrain(self, test_terms, new_data, metadata):
+         triggers = []
+         triggers += self.check_similarity(test_terms)
+         triggers += self.check_metadata(metadata)
+         triggers += self.check_new_data_volume(new_data)
+
+         if triggers:
+             self.log("Retraining triggered due to:")
+             for trigger in triggers:
+                 self.log(f" - {trigger}")
+             return True
+         else:
+             self.log("No retraining needed.")
+             return False
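A minimal usage sketch for the HeuristicManager added above (not part of this commit). It assumes a gensim Word2Vec model such as the one saved by word2Vec.trainWord2Vec() and that NER/word2Vec is importable as a package; the paths, corpus and metadata below are illustrative only:

    # illustrative only: wire HeuristicManager to an existing gensim model
    from gensim.models.word2vec import Word2Vec
    from NER.word2Vec.heuristic import HeuristicManager

    model = Word2Vec.load("NER/word2Vec/testModel/test_model.model")   # hypothetical path
    manager = HeuristicManager(model, min_similarity_threshold=0.5, min_new_data_len=50)

    new_corpus = [["mtdna", "haplogroup", "b", "sample", "eastasia"]]  # hypothetical new sentences
    metadata = {"region": "EastAsia", "haplogroup": "B"}               # hypothetical article metadata
    if manager.should_retrain(["mtdna", "haplogroup"], new_corpus, metadata):
        # incrementally update the existing model with the new sentences
        model.build_vocab(new_corpus, update=True)
        model.train(new_corpus, total_examples=len(new_corpus), epochs=model.epochs)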
NER/word2Vec/testModel/test_model.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:734185116a1d2099dba0d04efc0eb1b7e0e8213fe1259b57bbcb7aaac3cd46ea
+ size 133
NER/word2Vec/testModel/test_model.txt ADDED
@@ -0,0 +1,25 @@
+ 24 100
+ dna -0.0005385255 0.0002430238 0.005111818 0.009016951 -0.009293036 -0.007109866 0.0064572324 0.008987154 -0.0050192317 -0.0037659889 0.0073785 -0.0015431087 -0.0045221853 0.006557529 -0.004854595 -0.0018278129 0.002881375 0.0010002495 -0.00829578 -0.009462763 0.007312361 0.0050688535 0.0067577288 0.0007685764 0.006347226 -0.003397316 -0.0009421973 0.0057741464 -0.007532499 -0.0039303782 -0.0075064874 -0.0009439946 0.009533595 -0.0073319245 -0.002333888 -0.0019326513 0.0080786925 -0.005930193 3.549824e-05 -0.00475331 -0.0095964745 0.005000012 -0.008770563 -0.0043735923 -2.9246534e-05 -0.00030931013 -0.007669701 0.009599569 0.004982613 0.009233704 -0.008148657 0.004488859 -0.0041414667 0.00081141765 0.008487031 -0.00446156 0.0045125154 -0.006793622 -0.0035560841 0.009394251 -0.0015774865 0.00032431752 -0.004129968 -0.0076763057 -0.0015165819 0.0024841889 -0.00088440755 0.0055526863 -0.0027446826 0.002259023 0.0054701897 0.008356409 -0.0014508999 -0.009201209 0.004375452 0.00058271736 0.0074576377 -0.00080706284 -0.0026372937 -0.008752899 -0.00087625836 0.00282087 0.005398569 0.0070530027 -0.0057170955 0.0018605916 0.006099475 -0.0048024287 -0.003104349 0.0067992285 0.0016360026 0.00019302641 0.00348545 0.00021818833 0.009630539 0.0050670514 -0.008908632 -0.007042295 0.0009007676 0.0063867364
+ from -0.00861988 0.0036778022 0.005193427 0.005744547 0.0074751326 -0.0061739217 0.0011082628 0.0060625207 -0.0028567386 -0.006184132 -0.00041290926 -0.008384168 -0.0055893976 0.007104685 0.003362318 0.007228353 0.0068033817 0.007533677 -0.003792071 -0.000581891 0.0023577819 -0.0045196284 0.008395244 -0.009858517 0.006761404 0.0029261683 -0.004930935 0.0043925527 -0.0017370671 0.006713542 0.009974645 -0.0043735756 -0.0006050642 -0.005716478 0.003858548 0.002799571 0.00690247 0.00610934 0.009526547 0.009269763 0.007910428 -0.007008808 -0.00916451 -0.00033672128 -0.0030898354 0.007890073 0.005923819 -0.001552973 0.001516021 0.0017856265 0.007822941 -0.009514211 -0.00020886083 0.0034666678 -0.00094713847 0.008384139 0.009009283 0.0065234327 -0.0007208324 0.007705209 -0.00853289 0.0032079336 -0.004625999 -0.0050743804 0.0035901158 0.005388813 0.007766254 -0.005744939 0.0074327383 0.006626378 -0.003704473 -0.008735958 0.005445474 0.0065230317 -0.000784768 -0.006700798 -0.007075852 -0.002488528 0.0051543443 -0.0036620772 -0.00938257 0.003815971 0.004890136 -0.0064404616 0.0012033634 -0.0020763231 2.994902e-05 -0.0098790005 0.002700701 -0.004756241 0.0011076172 -0.0015674155 0.0022046466 -0.00787344 -0.0027070795 0.002668326 0.0053478787 -0.002396734 -0.009512201 0.0045024394
+ mtdna 8.645293e-05 0.003076037 -0.006815487 -0.0013743688 0.0076927417 0.0073529496 -0.0036715195 0.0026677884 -0.008309281 0.00619759 -0.00463892 -0.0031715294 0.009313415 0.00088058383 0.0074962615 -0.00608139 0.005167896 0.009930803 -0.008471472 -0.0051321597 -0.007057574 -0.0048644566 -0.003772668 -0.008518714 0.0079532955 -0.0048361127 0.008438283 0.005270068 -0.0065578814 0.0039592343 0.005482614 -0.007444929 -0.0074228924 -0.002492343 -0.008628872 -0.0015748737 -0.00038757667 0.0032959366 0.0014325404 -0.00088083016 -0.005591098 0.0017297626 -0.00089552783 0.0068030986 0.0039881677 0.004533183 0.0014284542 -0.0027126821 -0.0043595196 -0.0010315293 0.0014437438 -0.0026617546 -0.0070882514 -0.007825746 -0.009136036 -0.005931676 -0.001850123 -0.004323682 -0.0064626597 -0.0037265678 0.004296681 -0.0037233941 0.008404572 0.001539496 -0.007246572 0.009443451 0.007636867 0.0055208146 -0.0068550883 0.0058190743 0.004034045 0.005188155 0.0042629624 0.0019477821 -0.003167882 0.008342064 0.009619138 0.0038047181 -0.0028461283 5.6938893e-07 0.0012001555 -0.0084682545 -0.008234347 -0.00023238244 0.0012304098 -0.005750644 -0.0047139754 -0.0073490315 0.008316314 0.00010242269 -0.004513882 0.005704978 0.009199796 -0.004097329 0.007985275 0.005386452 0.0058861696 0.0005043713 0.008208188 -0.0070221694
+ in -0.008226077 0.009303831 -0.00018710589 -0.0019704443 0.0046143015 -0.004104392 0.0027394402 0.006979235 0.0060486975 -0.0075411424 0.00939576 0.00465202 0.004012172 -0.006245291 0.008499353 -0.002164537 0.008836197 -0.005347778 -0.008136817 0.006804632 0.0016640095 -0.0022142953 0.009522269 0.009494823 -0.0097868545 0.0025105644 0.0061560757 0.0038842657 0.0020310257 0.00043876152 0.00068163266 -0.0038464246 -0.007141551 -0.0020813115 0.003930752 0.008838634 0.009274302 -0.0059668766 -0.009419525 0.009759848 0.0034291998 0.005158939 0.006265811 -0.0027623416 0.007310359 0.0027998323 0.0028576967 -0.0023982434 -0.003139742 -0.0023701421 0.0042809984 4.8589092e-05 -0.009614385 -0.00968607 -0.006160773 -0.00011437661 0.0019819876 0.009428 0.0056011924 -0.004298171 0.00026028603 0.004974084 0.007744428 -0.001135339 0.004278759 -0.0057750097 -0.0008068469 0.00811882 -0.002369315 -0.009674972 0.0058119837 -0.0039038642 -0.001220125 0.010017389 -0.002241946 -0.0047185957 -0.0053141676 0.0069846674 -0.005741993 0.002120917 -0.0052751247 0.00613608 0.0043662013 0.0026298608 -0.0015129133 -0.002735619 0.008999614 0.0052172863 -0.0021470466 -0.009465257 -0.007413552 -0.0010587372 -0.00078251073 -0.0025414668 0.009710779 -0.00044944565 0.005915 -0.007467981 -0.0024928953 -0.005583053
+ european -0.007147033 0.0012623417 -0.007189088 -0.0022513974 0.0037773554 0.005857864 0.0012027922 0.0021598793 -0.004109796 0.007198152 -0.006319537 0.0046250015 -0.008186181 0.0020334523 -0.0049318667 -0.0042960607 -0.0030848773 0.0056965156 0.0057683894 -0.004991361 0.00076802005 -0.008515792 0.0078122346 0.009295911 -0.002746969 0.0008081935 0.0007694419 0.00550255 -0.008630911 0.0006062931 0.0068933573 0.0021813295 0.0010798875 -0.009366349 0.008471645 -0.006258249 -0.0029761735 0.0035168754 -0.00078163494 0.0014152499 0.0017921324 -0.006839617 -0.009737293 0.009092817 0.0062128166 -0.00694695 0.0033956417 0.00017217748 0.004755041 -0.0071203653 0.004067516 0.004303939 0.009927 -0.0045391554 -0.0014395243 -0.0073114103 -0.009704934 -0.009090646 -0.0010375449 -0.0065315044 0.0048550633 -0.006148244 0.0026037877 0.000752482 -0.0034296552 -0.00092229253 0.010017935 0.009206015 -0.004494388 0.009070265 -0.0055859834 0.0059493524 -0.0030818144 0.0034673577 0.003029479 0.0069394265 -0.0023470228 0.008820008 0.0075530927 -0.009551933 -0.008064042 -0.007652859 0.0029148757 -0.0027951996 -0.00694831 -0.008136711 0.008356287 0.0019903474 -0.00933717 -0.004817203 0.0031394493 -0.0046995636 0.005327329 -0.0042287502 0.0027155946 -0.008033582 0.0062630265 0.0047997306 0.00079031993 0.0029888113
+ common -0.008722234 0.0021272295 -0.0008539916 -0.009321866 -0.0094246445 -0.001412531 0.0044288053 0.00372704 -0.006505282 -0.006894708 -0.0049991854 -0.0023061878 -0.007229156 -0.009607243 -0.0027377736 -0.008360431 -0.0060269493 -0.005675304 -0.00234906 -0.0017278373 -0.008954683 -0.000731004 0.008155364 0.007693106 -0.007208155 -0.003644954 0.0031189725 -0.009568674 0.0014795078 0.0065395026 0.0057490384 -0.008770905 -0.0045228535 -0.008156553 4.5400484e-05 0.00927559 0.005980464 0.0050585535 0.0050439127 -0.0032448657 0.009562716 -0.0073605715 -0.0072781076 -0.002255642 -0.00077679846 -0.0032283778 -0.00060498127 0.007476424 -0.00070291053 -0.0016193221 0.002749461 -0.008367007 0.0078366995 0.008528508 -0.009591924 0.0024459555 0.009891981 -0.007673955 -0.006969234 -0.0077365288 0.008389148 -0.00067644875 0.009162579 -0.008137346 0.0037369097 0.0026538277 0.0007320811 0.002340243 -0.007473436 -0.009367513 0.0023810826 0.0061679846 0.007993824 0.005740968 -0.00078188477 0.008307063 -0.009312772 0.0033975116 0.00027130058 0.003872196 0.007375048 -0.0067289495 0.005584901 -0.0095183 -0.0008194822 -0.008691651 -0.0050952802 0.009296191 -0.0018460032 0.0029113942 0.009088126 0.008946764 -0.008196811 -0.0030016953 0.009896215 0.005113277 -0.0015862831 -0.008699891 0.0029696936 -0.0066840183
+ sequence 0.008134779 -0.0044588344 -0.0010699655 0.001010431 -0.00018677961 0.0011458534 0.0061133304 -1.2402037e-05 -0.0032534893 -0.0015101052 0.0058955555 0.0015073137 -0.0007181427 0.009341042 -0.004917502 -0.0008413052 0.009177319 0.0067567485 0.0015022643 -0.0088886535 0.0011522508 -0.0022903979 0.009365224 0.0012041465 0.0014943897 0.0024040388 -0.0018358674 -0.004996856 0.00023002276 -0.0020175653 0.0066060103 0.008935089 -0.0006746635 0.0029776676 -0.0061099143 0.0017025766 -0.006924371 -0.008690522 -0.005899618 -0.008961226 0.0072769034 -0.005776607 0.00827455 -0.007233702 0.003422895 0.009676102 -0.0077943387 -0.009949275 -0.0043248134 -0.0026828882 -0.0002740396 -0.008833413 -0.008620106 0.0027985822 -0.008205106 -0.009067738 -0.0023404285 -0.00863584 -0.007056119 -0.008398832 -0.0003011976 -0.0045611723 0.006630901 0.0015288803 -0.0033471577 0.006116343 -0.0060124504 -0.004648673 -0.0072044823 -0.0043340866 -0.0018032556 0.00649206 -0.0027680297 0.004921421 0.006912646 -0.007459126 0.004573438 0.006129695 -0.002956148 0.0066218316 0.006121442 -0.0064460207 -0.0067676785 0.002543585 -0.0016248615 -0.006062931 0.009498339 -0.005135456 -0.006549685 -0.000118091535 -0.002699267 0.00044816377 -0.0035289875 -0.00041692218 -0.00070437486 0.00083035015 0.0081978375 -0.005737508 -0.0016556873 0.005569238
+ bru18 0.008155276 -0.0044185193 0.008987652 0.008259665 -0.0044238693 0.00031090993 0.004277394 -0.0039252234 -0.0055654007 -0.006509729 -0.0006656875 -0.00030213682 0.004489389 -0.0024855223 -0.00015437756 0.0024471143 0.0048732683 -2.8606542e-05 -0.0063628056 -0.009279111 1.8654398e-05 0.006667726 0.0014650559 -0.0089674555 -0.007945727 0.006548857 -0.0037690091 0.006254232 -0.0067004655 0.008482541 -0.0065189763 0.0032740948 -0.001067833 -0.0067885593 -0.0032949874 -0.0011434925 -0.005471747 -0.001204045 -0.0075744605 0.0026601462 0.009080238 -0.0023750134 -0.0009867329 0.0035252234 0.008680149 -0.0059299506 -0.006889695 -0.002942458 0.00913801 0.0008666254 -0.008663911 -0.001442217 0.009477263 -0.0075691855 -0.0053729587 0.009308613 -0.008970956 0.0038234547 0.00065334333 0.0066515543 0.008311967 -0.002862157 -0.003982641 0.008891435 0.0020839446 0.0062542376 -0.009450494 0.0095988605 -0.0013514485 -0.006062315 0.0029950105 -0.0004512243 0.0047055846 -0.0022705523 -0.004145877 0.0022992992 0.008370594 -0.004990823 0.0026696166 -0.00798221 -0.0067810714 -0.000469271 -0.008768882 0.0027844147 0.0015907697 -0.0023179457 0.005011737 0.009743466 0.008472866 -0.001870301 0.0020416898 -0.0039901678 -0.008234559 0.0062697986 -0.0019247098 -0.00066059735 -0.0017619281 -0.004536765 0.004069 -0.0042896206
+ bru50 -0.009579504 0.008948466 0.0041579367 0.00923892 0.006649052 0.0029269105 0.009801864 -0.0044190143 -0.0068119396 0.004226486 0.0037328962 -0.005664456 0.009715384 -0.0035591167 0.009558758 0.00083636935 -0.006334789 -0.0019748765 -0.007390546 -0.002990235 0.0010405012 0.009480547 0.009361016 -0.0065955063 0.0034724285 0.0022746115 -0.0024764987 -0.009228658 0.0010185506 -0.008164371 0.0063289437 -0.0058100903 0.005530614 0.009826734 -0.00015984276 0.0045368825 -0.0018012718 0.0073676347 0.0039300686 -0.0090082595 -0.0023973046 0.0036249864 -0.00010732573 -0.0011888575 -0.0010430571 -0.0016724848 0.00059902505 0.0041630277 -0.004250072 -0.0038341933 -5.2427928e-05 0.00026678806 -0.00017553278 -0.0047934647 0.0043008197 -0.002173452 0.0020970574 0.00065915886 0.005959963 -0.0068526124 -0.00680708 -0.004473089 0.009448878 -0.001590459 -0.009438289 -0.000534792 -0.0044530216 0.0060103727 -0.009585406 0.002857136 -0.009246552 0.001258808 0.0059965253 0.0074065947 -0.007623657 -0.0060443347 -0.006831209 -0.007910946 -0.009496376 -0.0021281417 -0.0008362788 -0.007265241 0.0067816544 0.0011141741 0.0058228294 0.0014675015 0.00078702695 -0.007366497 -0.0021715113 0.0043177926 -0.005089294 0.001137756 0.0028883398 -0.0015285894 0.009943532 0.008348668 0.0024183327 0.007110643 0.005890512 -0.005592114
+ vietnam -0.005153963 -0.0066644135 -0.007776157 0.0083126435 -0.0019782323 -0.006856599 -0.004155673 0.0051580225 -0.0028790692 -0.0037560624 0.0016262402 -0.00278304 -0.001570952 0.0010760438 -0.002967586 0.008515032 0.003917556 -0.009953211 0.0062494674 -0.0067655 0.00076895714 0.0043992978 -0.005096968 -0.0021128112 0.00809259 -0.0042428537 -0.0076304777 0.009258844 -0.0021577128 -0.004717085 0.008580298 0.004269408 0.004324098 0.009280228 -0.008452614 0.0052631963 0.0020472223 0.004193831 0.0016919046 0.004460046 0.0044873925 0.0060984488 -0.0032084621 -0.0045590503 -0.0004232687 0.002529075 -0.0032731881 0.006051339 0.0041546253 0.00776509 0.002568826 0.008108382 -0.0013972289 0.008070817 0.003707151 -0.008045609 -0.00393531 -0.0024772724 0.004889826 -0.00087688275 -0.00282919 0.007839672 0.009338199 -0.0016121961 -0.0051723607 -0.0046861414 -0.0048465827 -0.0095901145 0.0013706182 -0.0042283125 0.002539541 0.0056244545 -0.00406352 -0.009583576 0.0015531465 -0.006689678 0.0025049727 -0.0037749638 0.007073151 0.00063951715 0.0035553342 -0.0027433916 -0.001711565 0.007655947 0.0014000075 -0.005851 -0.007834303 0.0012315387 0.006458937 0.0055561876 -0.00897213 0.008598417 0.0040550055 0.007476387 0.00975736 -0.007282407 -0.009030263 0.0058277464 0.009392481 0.0034955258
+ sample 0.007100903 -0.0015709094 0.007947078 -0.00948947 -0.00802812 -0.006650821 -0.004002562 0.00500194 -0.0038224515 -0.008330948 0.00841617 -0.0037529538 0.008619977 -0.004892141 0.003931126 0.004920354 0.0023956115 -0.0028135795 0.0028564015 -0.008257614 -0.0027645228 -0.0026008752 0.007249391 -0.0034709626 -0.0066022277 0.0043369113 -0.0004823991 -0.0035912786 0.006893536 0.003869671 -0.0038965137 0.0007677057 0.009145668 0.0077625574 0.0063656354 0.004670941 0.0023901698 -0.0018358309 -0.006370667 -0.00030689163 -0.0015674513 -0.00057719386 -0.0062623145 0.0074473424 -0.0066001806 -0.007243944 -0.0027626618 -0.0015170419 -0.007635178 0.0006969715 -0.005330137 -0.0012829994 -0.007370956 0.0019601034 0.003276234 -1.4737604e-05 -0.005451358 -0.001723771 0.00709824 0.003738 -0.008888436 -0.0034084066 0.0023648455 0.0021412992 -0.009477984 0.004583573 -0.008656226 -0.007383396 0.0034825006 -0.0034719554 0.0035707187 0.008896884 -0.003571185 0.009332037 0.0017215977 0.009857596 0.005704204 -0.009146731 -0.0033407472 0.0065290304 0.0055978918 0.008714949 0.0069304765 0.008049887 -0.009821734 0.004303451 -0.0050309277 0.0035138857 0.0060621244 0.0043927776 0.007520648 0.0014953684 -0.0012639741 0.0057787485 -0.0056348047 4.0551466e-05 0.009468461 -0.005486985 0.0038199269 -0.008121091
+ collected 0.0097750295 0.008170629 0.0012814446 0.0051154387 0.0014172737 -0.006454876 -0.0014259414 0.0064561926 -0.004619688 -0.0039992593 0.004923175 0.0027045405 -0.0018415204 -0.0028716852 0.006021755 -0.005721393 -0.003250512 -0.0064803455 -0.0042360183 -0.008592084 -0.004467861 -0.008505252 0.0013975133 -0.008609542 -0.009919709 -0.008202052 -0.0067797694 0.006683116 0.0037784956 0.0003495915 -0.002959815 -0.007438984 0.0005348175 0.0005005026 0.00019596443 0.0008583165 0.00078985846 -5.4285138e-05 -0.008013045 -0.005872034 -0.00837931 -0.0013207265 0.0018039295 0.0074345516 -0.001966708 -0.0023440684 0.009481904 7.425008e-05 -0.0023982543 0.008607863 0.0026964454 -0.0053582233 0.0065950346 0.0045082304 -0.0070585674 -0.00031050213 0.00083163293 0.005739447 -0.0017207591 -0.0028131874 0.0017429565 0.00085032795 0.0012085037 -0.002637083 -0.0060016937 0.007339091 0.0075857476 0.00830421 -0.008602928 0.0026385786 -0.0035621128 0.0096288975 0.0029010975 0.004643974 0.0023910597 0.006626162 -0.005746352 0.007899223 -0.0024186398 -0.0045691207 -0.0020768652 0.009735589 -0.0068560173 -0.0021970137 0.006994984 -4.366915e-05 -0.0062879827 -0.006398747 0.008941079 0.0064397687 0.004773856 -0.003261329 -0.009269935 0.0038002136 0.0071752095 -0.0056398017 -0.007860231 -0.0029721109 -0.0049388385 -0.0023143636
+ europe -0.0019466967 -0.005264445 0.009446078 -0.009301849 0.00450806 0.005410841 -0.0014122794 0.009008321 0.009883694 -0.0054709506 -0.0060238987 -0.006749262 -0.007891144 -0.0030501 -0.00559189 -0.008350158 0.000785714 0.002999436 0.0064088805 -0.0026336086 -0.0044599404 0.0012484614 0.00038998463 0.008114584 0.00018636887 0.0072303875 -0.008259172 0.008436813 -0.0018950498 0.008705898 -0.007616939 0.0017924334 0.0010528992 4.4615095e-05 -0.005109563 -0.009249746 -0.0072665187 -0.007951877 0.0019136231 0.00048003704 -0.0018163731 0.007123826 -0.0024782037 -0.0013449806 -0.008898934 -0.0099250255 0.008953352 -0.0057566464 -0.006378906 0.0052002883 0.0066733453 -0.0068328637 0.000956345 -0.0060142023 0.0016413335 -0.004295812 -0.0034417375 0.0021831726 0.008657248 0.0067267795 -0.00967649 -0.0056275628 0.007884859 0.0019889344 -0.0042598336 0.0006024022 0.009526292 -0.0011015745 -0.009430234 0.0016114928 0.0062343916 0.00628738 0.0040935944 -0.0056507527 -0.000374705 -4.9610684e-05 0.004579015 -0.0080420235 -0.008019654 0.0002663556 -0.008607854 0.005816331 -0.00042231655 0.00997148 -0.0053460747 -0.00048954826 0.0077552027 -0.004073562 -0.0050113807 0.0015921831 0.0026467363 -0.0025611357 0.006453244 -0.0076659652 0.003398472 0.00049256504 0.008736541 0.0059848153 0.006820848 0.007819741
+ ancient -0.00949331 0.009558393 -0.0077741044 -0.0026378995 -0.0048897555 -0.0049655624 -0.008022211 -0.007766241 -0.0045622233 -0.0012816157 -0.0051147 0.0061208857 -0.009519694 -0.005296118 0.009434444 0.0069931676 0.0076746074 0.0042455657 0.0005105317 -0.0060022003 0.006030395 0.002638317 0.007692142 0.0063923756 0.0079497155 0.008663229 -0.009898174 -0.006753931 0.0013303582 0.0064388 0.0073839277 0.0055065546 0.007657052 -0.0051452103 0.006578382 -0.004109781 -0.009049926 0.009156881 0.0013312489 -0.0027684697 -0.0024686211 -0.004237798 0.004802247 0.00442113 -0.0026455545 -0.0073452652 -0.0035828727 -0.00034474322 0.006112652 -0.0028318586 -0.00011603545 0.0008713841 -0.007088451 0.0020616641 -0.0014378024 0.0028043352 0.0048393123 -0.0013679614 -0.0027919079 0.0077378284 0.005049118 0.006718327 0.0045309924 0.00867961 0.0074680797 -0.0010581953 0.008750674 0.0046186065 0.0054406407 -0.0013790869 -0.0020325198 -0.0044157715 -0.008505952 0.0030342783 0.008892043 0.0089222565 -0.0019243953 0.0060931933 0.0037896668 -0.0043041655 0.002026212 -0.005454141 0.008199508 0.005422219 0.003183278 0.0041012214 0.008660769 0.007268954 -0.0008326238 -0.0070764753 0.008396081 0.0072427383 0.0017482204 -0.0013339228 -0.0058783586 -0.004530154 0.008643081 -0.003131084 -0.006341318 0.009878559
+ neanderthal 0.007692736 0.009126856 0.001134214 -0.008323363 0.008438394 -0.0036978398 0.005743373 0.0044079996 0.0096743805 -0.009301011 0.009201668 -0.009297726 -0.0068989955 -0.009099583 -0.0055382987 0.0073707746 0.009167804 -0.0033190295 0.0037136457 -0.0036417823 0.007886165 0.0058672884 4.5112392e-06 -0.0036315187 -0.0072244583 0.0047761244 0.0014634884 -0.002615084 0.007832942 -0.004045295 -0.00913638 -0.0022702827 0.00011177889 -0.006659164 -0.0054871286 -0.008484606 0.00924395 0.0074312175 -0.00030530593 0.0073675984 0.0079630045 -0.0007988404 0.0066030715 0.0037836921 0.0050928146 0.0072574555 -0.004751798 -0.0021930316 0.00087973 0.0042327694 0.0033078827 0.0050869007 0.004582786 -0.008444151 -0.0031969673 -0.007233252 0.009679768 0.0049946425 0.0001599608 0.0041068383 -0.0076482734 -0.0062929546 0.003092239 0.006544919 0.0039503933 0.006035828 -0.0019895614 -0.0033235473 0.00020525315 -0.0031931365 -0.005507259 -0.0077802544 0.0065467777 -0.0010795805 -0.0018928167 -0.007799526 0.009349405 0.00087477046 0.0017788016 0.0024914553 -0.0073950374 0.0016234348 0.0029714536 -0.008580277 0.0049522887 0.0024255016 0.0074964412 0.0050449395 -0.0030210917 -0.0071717766 0.007105708 0.0019140064 0.005210298 0.0063858717 0.0019259832 -0.0061174775 -5.528207e-06 0.008260976 -0.0060965912 0.009431074
+ modern -0.0071792696 0.0042354544 0.00216289 0.007438057 -0.0048900596 -0.0045788498 -0.0060949842 0.0033097882 -0.004507435 0.008506253 -0.0042799306 -0.009108578 -0.0047961376 0.0064152437 -0.006351414 -0.0052630682 -0.007296127 0.006024725 0.003365447 0.0028487756 -0.0031356772 0.00602019 -0.0061529716 -0.001984372 -0.0059886468 -0.0009987217 -0.0020279228 0.008489572 9.179515e-05 -0.0085772425 -0.0054273363 -0.0068765874 0.0026914866 0.00946441 -0.0058075436 0.008274624 0.008538083 -0.007054826 -0.008883825 0.009470304 0.008378029 -0.0046964334 -0.0067229234 0.007853816 0.003754884 0.008087255 -0.0075793806 -0.009526273 0.0015759452 -0.009809055 -0.004886255 -0.003462314 0.009610498 0.008620381 -0.002831389 0.005837147 0.008235405 -0.002257783 0.009542199 0.0071611865 0.0020309114 -0.0038430467 -0.005072538 -0.00304804 0.007877576 -0.0061799455 -0.0029184332 0.009190523 0.003460949 0.0060627563 -0.008025261 -0.00075433304 0.0055211782 -0.0046972577 0.0074892025 0.009333807 -0.00041072394 -0.0020574103 -0.00060545607 -0.0057792794 -0.0083910655 -0.0014910942 -0.0025447267 0.0043934747 -0.006866489 0.00542165 -0.006739068 -0.0078106844 0.008480591 0.008917766 -0.0034737175 0.0034897032 -0.005797486 -0.008738294 -0.0055089584 0.0067478465 0.0064329007 0.009427363 0.007059985 0.0067415633
+ human 0.0013073076 -0.009817197 0.0046000797 -0.00054215814 0.0063516907 0.0017917434 -0.0031376705 0.00779152 0.0015605913 4.5087592e-05 -0.004629277 -0.008477088 -0.0077653346 0.00868444 -0.0089293 0.009021215 -0.009282701 -0.00026340262 -0.0019013402 -0.008945062 0.008634705 0.006775237 0.0030073978 0.00484689 0.000119797296 0.009438227 0.007017406 -0.009846283 -0.0044378787 -0.0012810889 0.0030511408 -0.0043373024 0.0014413317 -0.007862512 0.002772104 0.0047001 0.004937028 -0.0031820575 -0.008430869 -0.009233454 -0.00072350266 -0.007335406 -0.0068239835 0.006137866 0.0071648457 0.0021028868 -0.00790615 -0.0057202103 0.008053211 0.0039317366 -0.0052275606 -0.007412702 0.00076265965 0.0034572822 0.002076003 0.0031028383 -0.0056280685 -0.0099016195 -0.0070258062 0.00023322599 0.0046109683 0.004535595 0.0018992841 0.0051839855 -0.000116945404 0.004136494 -0.009110944 0.0077172276 0.0061438708 0.0051303217 0.0072363587 0.0084579345 0.00074768433 -0.0017087719 0.0005303956 -0.009314834 0.008429295 -0.0063797934 0.008425091 -0.0042409054 0.0006248087 -0.009168093 -0.009569658 -0.007833339 -0.0077458574 0.00037962993 -0.0072201644 -0.004963075 -0.0052754995 -0.004289475 0.0070301695 0.004834569 0.008708495 0.0070971223 -0.0056847483 0.007253502 -0.009290819 -0.0025857396 -0.007757146 0.0042008474
+ genome 0.0018013249 0.0070483726 0.002941503 -0.006984167 0.0077269375 -0.005990631 0.008982948 0.0029859466 -0.0040263417 -0.0046959417 -0.004423949 -0.006166649 0.009397486 -0.0026410713 0.00779025 -0.009682492 0.0021134273 -0.001217051 0.007545118 -0.009060286 0.007431912 -0.005112224 -0.006022511 -0.0056468663 -0.0033655176 -0.0034046597 -0.0031906026 -0.007475777 0.0007148267 -0.0005725245 -0.0016790004 0.0037438255 -0.00763313 -0.0032234066 0.00514847 0.00855509 -0.009791086 0.0071872775 0.0052953 -0.003874173 0.008570203 -0.009222292 0.0072385296 0.0053781155 0.0012898272 -0.0051951176 -0.004179599 -0.003369767 0.0015944163 0.001581598 0.007396833 0.0099602975 0.008836587 -0.004008733 0.009636086 -0.00063042255 0.0048575792 0.0025363516 -0.0006256454 0.0036644523 -0.005330011 -0.0057551167 -0.007577021 0.0019176035 0.006513916 0.00090115983 0.0012633507 0.0031810037 0.008123854 -0.007687061 0.0022752027 -0.007455608 0.003715618 0.009514587 0.0075186947 0.006441567 0.008026117 0.006552105 0.0068467325 0.00869257 -0.0049556913 0.009209661 0.0050575286 -0.0021248695 0.008474546 0.005080482 0.009641399 0.0028190457 0.009884555 0.001195692 0.009130684 0.0035973836 0.006580412 -0.00361116 0.0068057566 0.007250423 -0.002115621 -0.0018615718 0.003625693 -0.0070385
+ shows 0.009741375 -0.009785563 -0.006502033 0.0027767855 0.0064354893 -0.005370729 0.0027519849 0.009131747 -0.006819064 -0.0061066505 -0.0049928115 -0.00368126 0.0018522884 0.009683641 0.00644354 0.00039165124 0.0024744181 0.00844649 0.009138178 0.005629969 0.005943013 -0.007629522 -0.0038295696 -0.005683565 0.0061836103 -0.00225932 -0.008786562 0.0076284255 0.008406309 -0.0033179314 0.009119112 -0.00073907804 -0.0036286868 -0.0003802314 0.00019241076 -0.0035078088 0.0028134247 0.005731432 0.006873956 -0.008905951 -0.0021951643 -0.0054816343 0.0075234827 0.0065075015 -0.0043688817 0.002324414 -0.0059516523 0.00023538349 0.00945961 -0.0026105444 -0.0051873005 -0.0074033006 -0.0029152564 -0.0008664178 0.0035291065 0.009743326 -0.0033921245 0.001903681 0.009692432 0.0015337794 0.0009810732 0.009802843 0.00930645 0.007710903 -0.006179333 0.009991138 0.005857104 0.009073708 -0.002001237 0.0033512171 0.0068392376 -0.0038913293 0.006648019 0.0025668114 0.009319553 -0.0030298685 -0.0031094935 0.0062168743 -0.00908894 -0.0072543155 -0.006503641 -0.00074380165 -0.002362113 0.0068256087 0.009239293 -0.00091146474 0.0014132133 0.002020571 -0.0020174456 -0.008035576 0.007445874 -0.004299319 0.004580612 0.009090945 0.0030486963 0.00313993 0.0040727276 -0.0027017219 0.0038345656 0.00033530922
+ variation 0.005626712 0.005497371 0.0018291199 0.0057494068 -0.008968078 0.0065593575 0.009225992 -0.0042071473 0.0016075504 -0.0052338815 0.0010582185 0.0027701687 0.008160736 0.00054401276 0.0025570584 0.001297735 0.008402523 -0.0057077026 -0.00626183 -0.0036275184 -0.0023005498 0.005041063 -0.008120357 -0.0028335357 -0.008197427 0.00514971 -0.0025680638 -0.009067107 0.0040717293 0.009017323 -0.0030376601 -0.0058385395 0.0030198884 -0.00043584823 -0.009979436 0.008417704 -0.0073388875 -0.004930407 -0.002657081 -0.0054523144 0.00171651 0.009712814 0.0045722723 0.008088603 -0.00047045827 0.0006449234 -0.002668352 -0.008779561 0.0034313034 0.0020933736 -0.009421854 -0.004968437 -0.009734099 -0.0057197916 0.0040645422 0.008642861 0.00411165 0.0023884643 0.008144778 -0.0011192096 -0.0013977134 -0.008746823 -0.00012579202 -0.0025675725 0.00038607715 0.007279662 -0.0070414604 -0.0039464748 -0.0066646053 -0.0035441148 -0.0033158315 0.002137121 0.0033281683 -0.004957187 -0.0045462907 0.0011386942 0.0054534827 0.0053736498 -0.0029685367 -0.0042665256 -0.005616647 -0.00054498314 0.001946373 0.0015253461 0.0073525296 -0.0027333724 -6.592393e-05 -0.0055276332 -0.0011700654 -0.0077119637 -0.0009593296 0.0013096749 -0.008594744 0.0087485835 -0.009207866 -0.009624677 -0.008511624 0.0073132683 0.0054655685 0.009249462
+ haplogroup 0.0025659278 0.00085168 -0.0025371916 0.00934742 0.0028080416 0.0041162586 -0.0011815964 0.00096541416 0.0066110776 -0.00074895076 0.0033208325 -0.00070219487 0.0052740807 0.003645613 0.0026175152 -0.0053456044 -0.004693721 0.004352339 -0.0059164464 -0.00020070269 -0.0006396672 0.0034715144 -0.008427317 0.0088428045 -0.0014485243 -0.005307692 0.0040584584 -0.001898596 -0.007778139 -0.0044734394 -0.0003679351 -0.0089815045 0.0005416724 0.002407686 -0.003227299 0.0025667753 0.0024930644 0.009990179 0.0014140693 0.0020159276 0.0027784512 -0.0020868885 -0.008718105 0.008073382 -0.0019698895 -0.009723993 -0.006550278 -0.0039781313 0.003948964 0.0050270366 0.0061098747 -0.006815141 0.00066107995 -0.0028290635 -0.0052407067 0.006984182 0.0039222264 -0.003121762 -0.008263934 -0.0051569464 -0.00065567193 0.0078113875 0.006122021 -0.008424067 -0.0096058855 0.0071855173 -0.0022900787 -0.0036282074 0.005704672 -0.0058300486 0.005136189 -0.00020829153 -0.0068513798 -0.00030139415 0.006364283 0.009325248 0.0022419153 0.0050703404 -0.0050120936 -0.0008110871 -0.005373588 0.0011743606 -0.0017981603 -0.0036161384 -0.0070382343 0.009639485 0.003012655 -0.0022897385 -0.0041911877 0.0076894285 -0.0064663296 0.0031200873 0.0008309826 0.008321212 0.0068888706 -0.0028947534 0.002593874 -0.0016730811 -0.009431767 -0.0026270088
+ h 0.0013225824 0.0065497826 0.009982806 0.009062454 -0.0079781795 0.0065080435 -0.0057147983 -0.0009299061 0.00047654507 0.0065626903 0.0044563343 0.0045750956 0.0095022535 0.00038496728 -0.0060190535 -0.006347197 0.0064362343 -0.005219293 -0.002869563 0.004042792 -0.002286449 -0.006022882 -0.0023193487 0.0012384101 0.0021826315 0.0061027543 -0.005193723 0.003081824 0.0072158594 0.0022087328 0.0054155486 -0.004879429 0.0061283903 -0.007640156 0.0034881763 -0.009306421 -0.0025874602 -0.00905658 -0.0016061858 -0.005364485 -0.0039271545 0.0011356737 0.002771372 -0.0014860439 -0.008151553 -0.0059441784 0.00080055697 -0.0039708167 -0.009422841 -0.0007733177 0.0066586556 0.005949332 -0.0099333245 0.0030846666 -0.006018299 -0.009179041 0.00015740465 -0.0003979007 -0.006993792 -0.0063003623 -0.0024212876 0.0071041975 -0.0074873487 0.0077126683 -0.000499351 0.001135528 0.009489626 0.0047690077 -0.0035878688 0.00373115 0.0035563034 0.0063642766 7.750339e-05 -0.0044055916 0.001321394 -0.005388977 0.0014417345 0.004943775 0.0051506218 0.009180272 -0.0075472356 -0.005428668 0.0064623333 0.0013423576 -0.0066391225 0.0008783591 0.0027003903 -0.0025289776 -0.004963421 0.0049924683 0.009631416 -0.0073435763 -7.912599e-05 -0.0025523733 -0.0063192695 -0.001368983 -0.005227159 0.009048553 -0.005790704 0.003674939
+ is -0.00023357147 0.004226683 0.0021067455 0.009996419 0.0006458492 -0.005461563 -0.0011838758 0.0020920378 -0.0033855627 -0.007853136 -0.005604329 -0.0067612384 0.006366702 0.0039265845 0.008232181 0.0065088123 -0.0061183744 0.002733512 0.008466464 0.0015833755 0.0030677342 0.0058010546 -0.008839754 0.009125629 0.0068226005 0.008512217 -0.0082233 0.0061861346 0.006626654 -0.0013528146 -0.0062799496 0.0053081806 -0.006868758 -0.005337174 0.0035091531 0.008081314 0.008700704 -0.0043939846 -0.0091931205 0.009603682 0.006290027 -0.0039766026 -0.008465367 -0.004691139 -0.0039542373 -0.0032808431 0.0008109401 -0.00030902817 -0.0031103012 -0.005998526 0.009428418 -0.004739384 -0.007274209 0.0076703983 0.0025008747 0.0086274175 -0.004468981 -0.0069012893 0.0009802914 -0.0011801491 -0.009394523 -0.0015968346 0.0030780574 0.006576642 0.0068287384 0.0032347892 -0.0044282703 -0.0018157784 -0.0039494233 0.0057785274 -0.006343468 0.002114367 -0.0013383601 -0.0057999003 -0.007236314 0.0058711045 -0.008345587 -0.00067066104 0.0028193784 0.00773521 -0.007315293 0.003294973 0.009805078 -0.0069755646 -0.003540081 0.005130921 0.005245436 0.0016209023 0.00797557 0.00082546985 0.0018813204 -0.0015988776 -0.008149317 0.0032639706 0.0019852505 -0.008730082 -0.0006569945 7.3046285e-05 -2.6318648e-06 0.008703764
+ mitochondrial -0.002508221 -0.0059015388 0.007485539 -0.007257687 -0.008965709 -0.0017888069 -0.008367486 0.00039139786 0.0019467709 -0.0024699308 -0.00644677 -0.00032192905 -0.0010975264 0.0034935323 0.008127049 0.0058537317 0.008440359 -0.0089677265 0.00944024 -0.002368706 0.008696626 0.0023858226 0.0035850583 -0.0095805535 -0.009488111 0.008984071 -0.002896514 0.0028174375 0.0064166263 -0.00029972216 0.00971954 -0.0010352092 -0.009671927 -0.0070548807 -0.0010439103 -0.008674508 0.0074211163 0.0036188734 -0.00874913 0.008480371 0.008929614 0.0058477637 0.0069070626 -0.009568968 0.0004927428 -0.009223568 -0.0036663204 0.00025142074 -0.0002807199 0.0014672013 0.0032786338 0.0021258853 0.005320648 0.0075189634 -0.005886681 0.007957336 0.005991082 0.009785411 0.0046226517 -0.0033269909 -0.0037473391 -0.00062982703 -0.0016548736 0.009871284 0.0011211695 0.00400867 0.0034179776 -0.008850507 0.006720342 0.008190563 -0.0016650181 0.0023356378 -0.0064802184 -0.006126035 0.0082164975 -0.0030429186 0.0067422306 0.001552869 -0.0019822652 0.0030546081 -0.004023311 -0.0017839139 0.0013798403 0.004887597 -0.0014078929 0.0006583137 -0.007930928 0.00949345 -0.008762073 0.007072499 0.0039040898 -0.0069980817 -0.005295161 -0.007937933 -0.0051285303 0.00707022 0.009641066 0.0021544741 0.0006394228 0.009524309
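The test_model.txt dump above is in the standard word2vec text format: the header "24 100" declares 24 vocabulary entries of 100 dimensions each, followed by one "word value1 ... value100" row per entry. A short, illustrative way to load and query such a file with gensim, mirroring what genSimilar() in word2vec.py does:

    from gensim.models.keyedvectors import KeyedVectors

    # binary=False because the vectors are stored as plain text
    kv = KeyedVectors.load_word2vec_format("NER/word2Vec/testModel/test_model.txt", binary=False)
    print(kv.most_similar(positive=["mtdna"], topn=5))  # nearest neighbours of "mtdna"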
NER/word2Vec/testModel/test_model_updated.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b1b785c79991b857b364ee9863985eaf845087efb1aa40a6b9cfae3b2a50012
+ size 133
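For orientation before the word2vec.py diff below: the intended flow is to train a model from a corpus JSON file and then query it for similar terms. A condensed, illustrative sketch using only the committed method signatures (file paths and names are placeholders; the import assumes the repository layout NER/word2Vec/word2vec.py is importable as a package):

    from NER.word2Vec.word2vec import word2Vec

    wv = word2Vec()
    # nameFile is a corpus JSON file (e.g. built from createCorpusText output)
    wv.trainWord2Vec(nameFile="corpus.json", modelName="test_model",
                     saveFolder="NER/word2Vec/testModel")
    # genSimilar loads the saved text-format vectors and returns the most similar terms
    print(wv.genSimilar("mtdna", "NER/word2Vec/testModel/test_model.txt", n=10))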
NER/word2Vec/word2vec.py CHANGED
@@ -1,3 +1,374 @@
1
  '''WORD TO VECTOR'''
2
  import pandas as pd
3
  import json
@@ -10,8 +381,11 @@ from gensim.test.utils import common_texts
10
  from gensim.models.word2vec import Word2Vec
11
  from gensim.scripts.glove2word2vec import glove2word2vec
12
  from gensim.test.utils import datapath, get_tmpfile
13
  import sys
14
  import subprocess
 
15
  # can try multiprocessing to run quicker
16
  import multiprocessing
17
  import copy
@@ -32,18 +406,19 @@ class word2Vec():
32
  def __init__(self, nameFile=None, modelName=None):
33
  self.nameFile = nameFile
34
  self.modelName = modelName
35
  def spacy_similarity(self, word):
36
  # when use word2vec, try medium or large is better
37
  # maybe try odc similarity?
38
- nlp = spacy.load("en_core_web_lg")
39
- doc = nlp(word)
40
  for token1 in doc:
41
  for token2 in doc:
42
  print(token1.text, token2.text, token1.similarity(token2))
43
  pass
44
  # clean text before transform to corpus
45
  def cleanTextBeforeCorpus(self,oriText, doi=None):
46
- cl = cleanText.cleanGenText()
47
  #cl = cleanGenText()
48
  output = ""
49
  alreadyRemoveDoi = False
@@ -51,7 +426,7 @@ class word2Vec():
51
  # remove DOI
52
  if doi != None and doi in oriText:
53
  if alreadyRemoveDoi == False:
54
- newWord = cl.removeDOI(word,doi)
55
  if len(newWord) > 0 and newWord != word:
56
  alreadyRemoveDoi = True
57
  word = newWord
@@ -59,13 +434,13 @@ class word2Vec():
59
  # split the sticked words
60
  #word = cl.splitStickWords(word)
61
  # remove punctuation
62
- word = cl.removePunct(word,True)
63
  # remove URL
64
- word = cl.removeURL(word)
65
  # remove HTMLTag
66
- word = cl.removeHTMLTag(word)
67
  # remove tab, white space, newline
68
- word = cl.removeTabWhiteSpaceNewLine(word)
69
  # optional: remove stopwords
70
  #word = cl.removeStopWords(word)
71
  if len(word)>0:
@@ -75,16 +450,18 @@ class word2Vec():
75
  cleanOutput = ""
76
  remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
77
  if len(allText) > 0:
78
- corpusText = allText
79
- for pos in range(len(corpusText.split("\n\n"))):
80
- if len(corpusText.split("\n\n")[pos]) > 0:
81
- lines = corpusText.split("\n\n")[pos]
82
  for line in lines.split("\n"):
83
  if remove in line: line = line.replace(remove, "")
84
  clean_text = self.cleanTextBeforeCorpus(line, doi)
85
  cleanOutput += clean_text + "\n"
86
  cleanOutput += "\n\n"
87
  return cleanOutput
88
  def tableTransformToCorpusText(self, df, excelFile=None):
89
  # PDF, Excel, WordDoc
90
  #cl = cleanText.cleanGenText()
@@ -119,10 +496,10 @@ class word2Vec():
119
  try:
120
  df = pd.ExcelFile(excelFile)
121
  except:
122
- if filepath.endswith('.xls'):
123
- df = pd.read_excel(filepath, engine='xlrd')
124
  else:
125
- df = pd.read_excel(filepath, engine='openpyxl')
126
  sheetNames = df.sheet_names
127
  output = []
128
  if len(sheetNames) > 0:
@@ -142,7 +519,7 @@ class word2Vec():
142
  return corpus
143
  def helperRowTableToCorpus(self, textList):
144
  #cl = cleanGenText()
145
- cl = cleanText.cleanGenText()
146
  stopWords = ["NaN","Unnamed:","nan"]
147
  outputDF = []
148
  for line in textList:
@@ -154,9 +531,9 @@ class word2Vec():
154
  # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
155
  if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
156
  #word = cl.splitStickWords(word)
157
- word = cl.removePunct(word)
158
- word = " ".join(cl.removeStopWords(word))
159
- word = cl.removeTabWhiteSpaceNewLine(word)
160
  if len(word) > 1:
161
  if len(word.split(" ")) > 1:
162
  for x in word.split(" "):
@@ -170,7 +547,7 @@ class word2Vec():
170
  return outputDF
171
  def helperColTableToCorpus(self, dfList):
172
  #cl = cleanGenText()
173
- cl = cleanText.cleanGenText()
174
  stopWords = ["NaN","Unnamed:","nan"]
175
  outputDF = []
176
  # use the first length line as the column ref
@@ -186,9 +563,9 @@ class word2Vec():
186
  # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
187
  if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
188
  #word = cl.splitStickWords(word)
189
- word = cl.removePunct(word)
190
- word = " ".join(cl.removeStopWords(word))
191
- word = cl.removeTabWhiteSpaceNewLine(word)
192
  if len(word) > 1:
193
  if len(word.split(" ")) > 1:
194
  for x in word.split(" "):
@@ -216,21 +593,22 @@ class word2Vec():
216
  Mouse is an animal.
217
  Jerry is mouse.'''
218
  texts = {}
219
- cl = cleanText.cleanGenText()
220
  #cl = cleanGenText()
221
- for pos in range(len(corpusText.split("\n\n"))):
222
- if len(corpusText.split("\n\n")[pos]) > 0:
 
223
  texts["Paragraph "+str(pos)] = []
224
- lines = corpusText.split("\n\n")[pos]
225
  for line in lines.split("\n"):
226
  for l in line.split("."):
227
  if len(l) > 0:
228
- cl.removeTabWhiteSpaceNewLine(l)
229
  l = l.lower()
230
  newL = []
231
  for word in l.split(" "):
232
  if len(word) > 0:
233
- word = cl.removeStopWords(word)
234
  for w in word:
235
  if len(w) > 0 and w.isnumeric()==False:
236
  newL.append(w)
@@ -239,49 +617,86 @@ class word2Vec():
239
  if len(texts["Paragraph "+str(pos)]) == 0:
240
  del texts["Paragraph "+str(pos)]
241
  return texts
242
- def selectParaForWC(self,corpus):
243
- ''' corpus should be in the format:
244
- corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
245
- corSize, window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None, None
246
  corSize = len(corpus)
247
- # less than 2000
248
- if 0 < corSize < 2000:
249
- window=3.5
250
- vector_size=75
251
- sample=1e-3
252
- negative=10
253
- epochs=10
254
- sg=1
255
- # 2000 - 100000
256
- elif 2000 <= corSize < 100000:
257
- window=3.5
258
- vector_size=75
259
- sample=1e-5
260
- negative=10
261
- epochs=10
262
- sg=1
263
- elif 100000 <=corSize < 1000000:
264
- window=7.5
265
- vector_size=150
266
- sample=1e-5
267
- negative=10
268
- epochs=6
269
- sg=0
270
  return window, vector_size, sample, negative, epochs, sg
271
- def trainWord2Vec(self,nameFile,modelName,saveFolder,window=3.5,
272
- vector_size=75,sample=1e-3,negative=10,epochs=10,sg=1):
273
- # if you dont have backup file, you can use again the nameFile just to increase the lenght of corpus
 
274
  jsonFile = ""
275
  jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
276
  cores = multiprocessing.cpu_count()
277
  combinedCorpus = []
278
- window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
279
- if len(jsonFile) > 0:
280
- for key in jsonFile:
281
- combinedCorpus.extend(jsonFile[key])
282
  window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
283
- # # min_count=1 ensures all words are included
284
- '''w2vModel = Word2Vec(
285
  min_count=1,
286
  window=window,
287
  vector_size=vector_size,
@@ -291,43 +706,39 @@ class word2Vec():
291
  negative=negative,
292
  workers=cores-1,
293
  epochs = epochs,
294
- sg=sg)'''
295
- #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
296
- accept = False
297
- while not accept:
298
- if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
299
- try:
300
- w2vModel = Word2Vec(
301
- min_count=1,
302
- window=window,
303
- vector_size=vector_size,
304
- sample=sample,
305
- alpha=0.03,
306
- min_alpha=0.0007,
307
- negative=negative,
308
- workers=cores-1,
309
- epochs = epochs,
310
- sg=sg)
311
- w2vModel.build_vocab(combinedCorpus)
312
- w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
313
- accept = True
314
- except:
315
- for key in jsonFile:
316
- combinedCorpus.extend(jsonFile[key])
317
- window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
318
- print("next is " + str(len(combinedCorpus)))
319
- else:
320
- print("no parameter to train")
321
- break
322
- #w2vModel.build_vocab(combinedCorpus)
323
- #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
324
- #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
325
- #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
326
- w2vModel.save(saveFolder+"/"+modelName+".model")
327
- w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
328
- print("done w2v")
329
- else: print("no corpus to train")
330
  #return combinedCorpus
331
  def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
332
  # might not be a meaningful keyword
333
  #stopWords = ["show"]
@@ -354,6 +765,32 @@ class word2Vec():
354
  results.append(moreNewResult)
355
  currN +=1'''
356
  return results
357
  # adding our model into spacy
358
  # this deals with command line; but instead of using it, we write python script to run command line
359
  def loadWordVec(self,modelName,wordVec):
@@ -367,4 +804,5 @@ class word2Vec():
367
  modelName, # this modelName comes from the saved modelName of function trainWord2Vec
368
  "--vectors-loc",
369
  wordVec])
 
370
  print("done")
 
1
+ <<<<<<< HEAD
2
+ '''WORD TO VECTOR'''
3
+ import pandas as pd
4
+ import json
5
+ import gensim
6
+ import spacy
7
+ from DefaultPackages import openFile, saveFile
8
+ from NER import cleanText
9
+ from gensim.models.keyedvectors import KeyedVectors
10
+ from gensim.test.utils import common_texts
11
+ from gensim.models.word2vec import Word2Vec
12
+ from gensim.scripts.glove2word2vec import glove2word2vec
13
+ from gensim.test.utils import datapath, get_tmpfile
14
+ import sys
15
+ import subprocess
16
+ # can try multiprocessing to run quicker
17
+ import multiprocessing
18
+ import copy
19
+ sys.setrecursionlimit(1000)
20
+ # creat folder word2Vec
21
+ #! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
22
+ # create word2vec model
23
+ #model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
24
+ '''Some notes for this model
25
+ sometimes when we do the corpus, there are some adverbs which are unnecessary but might be seen as
26
+ a similar word to the word we are finding, so can we try to preprocess text so that
27
+ we make the corpus more effective and only contains the important words. Then when we
28
+ train the model, the important words will be seen as important. Or
29
+ when we already have the similar list of words, we can remove the words in there
30
+ that are stopwords/unnecessary words.'''
31
+ ### For more complex analysis, consider using sentence embedding models like "Doc2Vec" to represent the meaning of entire sentences instead of just individual words
32
+ class word2Vec():
33
+ def __init__(self, nameFile=None, modelName=None):
34
+ self.nameFile = nameFile
35
+ self.modelName = modelName
36
+ def spacy_similarity(self, word):
37
+ # when use word2vec, try medium or large is better
38
+ # maybe try odc similarity?
39
+ nlp = spacy.load("en_core_web_lg")
40
+ doc = nlp(word)
41
+ for token1 in doc:
42
+ for token2 in doc:
43
+ print(token1.text, token2.text, token1.similarity(token2))
44
+ pass
45
+ # clean text before transform to corpus
46
+ def cleanTextBeforeCorpus(self,oriText, doi=None):
47
+ cl = cleanText.cleanGenText()
48
+ #cl = cleanGenText()
49
+ output = ""
50
+ alreadyRemoveDoi = False
51
+ for word in oriText.split(" "):
52
+ # remove DOI
53
+ if doi != None and doi in oriText:
54
+ if alreadyRemoveDoi == False:
55
+ newWord = cl.removeDOI(word,doi)
56
+ if len(newWord) > 0 and newWord != word:
57
+ alreadyRemoveDoi = True
58
+ word = newWord
59
+ # remove punctuation
60
+ # split the sticked words
61
+ #word = cl.splitStickWords(word)
62
+ # remove punctuation
63
+ word = cl.removePunct(word,True)
64
+ # remove URL
65
+ word = cl.removeURL(word)
66
+ # remove HTMLTag
67
+ word = cl.removeHTMLTag(word)
68
+ # remove tab, white space, newline
69
+ word = cl.removeTabWhiteSpaceNewLine(word)
70
+ # optional: remove stopwords
71
+ #word = cl.removeStopWords(word)
72
+ if len(word)>0:
73
+ output += word + " "
74
+ return output
75
+ def cleanAllTextBeforeCorpus(self, allText, doi=None):
76
+ cleanOutput = ""
77
+ remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
78
+ if len(allText) > 0:
79
+ corpusText = allText
80
+ for pos in range(len(corpusText.split("\n\n"))):
81
+ if len(corpusText.split("\n\n")[pos]) > 0:
82
+ lines = corpusText.split("\n\n")[pos]
83
+ for line in lines.split("\n"):
84
+ if remove in line: line = line.replace(remove, "")
85
+ clean_text = self.cleanTextBeforeCorpus(line, doi)
86
+ cleanOutput += clean_text + "\n"
87
+ cleanOutput += "\n\n"
88
+ return cleanOutput
89
+ def tableTransformToCorpusText(self, df, excelFile=None):
90
+ # PDF, Excel, WordDoc
91
+ #cl = cleanText.cleanGenText()
92
+ corpus = {}
93
+ # PDF or df
94
+ if excelFile == None:
95
+ if len(df) > 0:
96
+ try:
97
+ for i in range(len(df)):
98
+ # each new dimension/page is considered to be a sentence which ends with the period.
99
+ # each new line is a new list, and each new df is a new corpus
100
+ outputDF = []
101
+ text = df[i].values.tolist()
102
+ if len(text) > 0:
103
+ outputRowDF = self.helperRowTableToCorpus(text)
104
+ #outputColDF = self.helperColTableToCorpus(text)
105
+ outputDF.extend(outputRowDF)
106
+ #outputDF.extend(outputColDF)
107
+ if len(outputDF) > 0:
108
+ corpus["corpus" + str(i)] = outputDF
109
+ except:
110
+ outputDF = []
111
+ text = df.values.tolist()
112
+ if len(text) > 0:
113
+ outputRowDF = self.helperRowTableToCorpus(text)
114
+ #outputColDF = self.helperColTableToCorpus(text)
115
+ outputDF.extend(outputRowDF)
116
+ #outputDF.extend(outputColDF)
117
+ if len(outputDF) > 0:
118
+ corpus["corpus0"] = outputDF
119
+ else:
120
+ try:
121
+ df = pd.ExcelFile(excelFile)
122
+ except:
123
+ if filepath.endswith('.xls'):
124
+ df = pd.read_excel(filepath, engine='xlrd')
125
+ else:
126
+ df = pd.read_excel(filepath, engine='openpyxl')
127
+ sheetNames = df.sheet_names
128
+ output = []
129
+ if len(sheetNames) > 0:
130
+ for s in range(len(sheetNames)):
131
+ outputDF = []
132
+ with pd.ExcelFile(excelFile) as xls:
133
+ data = pd.read_excel(xls, sheetNames[s])
134
+ if sheetNames[s] != 'Evaluation Warning':
135
+ text = data.values.tolist()
136
+ if len(text) > 0:
137
+ outputRowDF = self.helperRowTableToCorpus(text)
138
+ #outputColDF = self.helperColTableToCorpus(text)
139
+ outputDF.extend(outputRowDF)
140
+ #outputDF.extend(outputColDF)
141
+ if len(outputDF) > 0:
142
+ corpus["corpus" + str(s)] = outputDF
143
+ return corpus
144
+ def helperRowTableToCorpus(self, textList):
145
+ #cl = cleanGenText()
146
+ cl = cleanText.cleanGenText()
147
+ stopWords = ["NaN","Unnamed:","nan"]
148
+ outputDF = []
149
+ for line in textList:
150
+ outputLine = []
151
+ for words in line:
152
+ words = str(words)
153
+ if len(words) > 0:
154
+ for word in words.split(" "):
155
+ # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
156
+ if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
157
+ #word = cl.splitStickWords(word)
158
+ word = cl.removePunct(word)
159
+ word = " ".join(cl.removeStopWords(word))
160
+ word = cl.removeTabWhiteSpaceNewLine(word)
161
+ if len(word) > 1:
162
+ if len(word.split(" ")) > 1:
163
+ for x in word.split(" "):
164
+ if len(x) > 1 and x.isnumeric()==False:
165
+ outputLine.append(x.lower())
166
+ else:
167
+ if word.isnumeric() == False:
168
+ outputLine.append(word.lower())
169
+ if len(outputLine) > 0:
170
+ outputDF.append(outputLine)
171
+ return outputDF
172
+ def helperColTableToCorpus(self, dfList):
173
+ #cl = cleanGenText()
174
+ cl = cleanText.cleanGenText()
175
+ stopWords = ["NaN","Unnamed:","nan"]
176
+ outputDF = []
177
+ # use the first length line as the column ref
178
+ for pos in range(len(dfList[0])):
179
+ outputLine = []
180
+ for line in dfList:
181
+ if pos < len(line):
182
+ words = line[pos]
183
+ words = str(words)
184
+ else: words = ""
185
+ if len(words) > 0:
186
+ for word in words.split(" "):
187
+ # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
188
+ if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
189
+ #word = cl.splitStickWords(word)
190
+ word = cl.removePunct(word)
191
+ word = " ".join(cl.removeStopWords(word))
192
+ word = cl.removeTabWhiteSpaceNewLine(word)
193
+ if len(word) > 1:
194
+ if len(word.split(" ")) > 1:
195
+ for x in word.split(" "):
196
+ if len(x) > 1 and x.isnumeric()==False:
197
+ outputLine.append(x.lower())
198
+ else:
199
+ if word.isnumeric() == False:
200
+ outputLine.append(word.lower())
201
+ if len(outputLine) > 0:
202
+ outputDF.append(outputLine)
203
+ return outputDF
204
+ # create a corpus
205
+ def createCorpusText(self, corpusText):
206
+ '''ex: "Tom is cat. Jerry is mouse."
207
+ corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
208
+ # the output should be like this:
209
+ '''texts = {
210
+ "Paragraph 1": [["Cat", "is", "an","animal], ["Tom", "is", "cat"]],
211
+ "Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
212
+ }
213
+ '''
214
+ # separate paragraph
215
+ '''Ex: Cat is an animal. Tom is cat.
216
+
217
+ Mouse is an animal.
218
+ Jerry is mouse.'''
219
+ texts = {}
220
+ cl = cleanText.cleanGenText()
221
+ #cl = cleanGenText()
222
+ for pos in range(len(corpusText.split("\n\n"))):
223
+ if len(corpusText.split("\n\n")[pos]) > 0:
224
+ texts["Paragraph "+str(pos)] = []
225
+ lines = corpusText.split("\n\n")[pos]
226
+ for line in lines.split("\n"):
227
+ for l in line.split("."):
228
+ if len(l) > 0:
229
+ cl.removeTabWhiteSpaceNewLine(l)
230
+ l = l.lower()
231
+ newL = []
232
+ for word in l.split(" "):
233
+ if len(word) > 0:
234
+ word = cl.removeStopWords(word)
235
+ for w in word:
236
+ if len(w) > 0 and w.isnumeric()==False:
237
+ newL.append(w)
238
+ if len(newL)>0:
239
+ texts["Paragraph "+str(pos)].append(newL)
240
+ if len(texts["Paragraph "+str(pos)]) == 0:
241
+ del texts["Paragraph "+str(pos)]
242
+ return texts
243
+ def selectParaForWC(self,corpus):
244
+ ''' corpus should be in the format:
245
+ corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
246
+ corSize, window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None, None
247
+ corSize = len(corpus)
248
+ # less than 2000
249
+ if 0 < corSize < 2000:
250
+ window=3.5
251
+ vector_size=75
252
+ sample=1e-3
253
+ negative=10
254
+ epochs=10
255
+ sg=1
256
+ # 2000 - 100000
257
+ elif 2000 <= corSize < 100000:
258
+ window=3.5
259
+ vector_size=75
260
+ sample=1e-5
261
+ negative=10
262
+ epochs=10
263
+ sg=1
264
+ elif 100000 <=corSize < 1000000:
265
+ window=7.5
266
+ vector_size=150
267
+ sample=1e-5
268
+ negative=10
269
+ epochs=6
270
+ sg=0
271
+ return window, vector_size, sample, negative, epochs, sg
272
+ def trainWord2Vec(self,nameFile,modelName,saveFolder,window=3.5,
273
+ vector_size=75,sample=1e-3,negative=10,epochs=10,sg=1):
274
+ # if you don't have a backup file, you can reuse nameFile just to increase the length of the corpus
275
+ jsonFile = ""
276
+ jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
277
+ cores = multiprocessing.cpu_count()
278
+ combinedCorpus = []
279
+ window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
280
+ if len(jsonFile) > 0:
281
+ for key in jsonFile:
282
+ combinedCorpus.extend(jsonFile[key])
283
+ window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
284
+ # # min_count=1 ensures all words are included
285
+ '''w2vModel = Word2Vec(
286
+ min_count=1,
287
+ window=window,
288
+ vector_size=vector_size,
289
+ sample=sample,
290
+ alpha=0.03,
291
+ min_alpha=0.0007,
292
+ negative=negative,
293
+ workers=cores-1,
294
+ epochs = epochs,
295
+ sg=sg)'''
296
+ #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
297
+ accept = False
298
+ while not accept:
299
+ if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
300
+ try:
301
+ w2vModel = Word2Vec(
302
+ min_count=1,
303
+ window=window,
304
+ vector_size=vector_size,
305
+ sample=sample,
306
+ alpha=0.03,
307
+ min_alpha=0.0007,
308
+ negative=negative,
309
+ workers=cores-1,
310
+ epochs = epochs,
311
+ sg=sg)
312
+ w2vModel.build_vocab(combinedCorpus)
313
+ w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
314
+ accept = True
315
+ except:
316
+ for key in jsonFile:
317
+ combinedCorpus.extend(jsonFile[key])
318
+ window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
319
+ print("next is " + str(len(combinedCorpus)))
320
+ else:
321
+ print("no parameter to train")
322
+ break
323
+ #w2vModel.build_vocab(combinedCorpus)
324
+ #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
325
+ #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
326
+ #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
327
+ w2vModel.save(saveFolder+"/"+modelName+".model")
328
+ w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
329
+ print("done w2v")
330
+ else: print("no corpus to train")
331
+ #return combinedCorpus
332
+ def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
333
+ # might not be a meaningful keyword
334
+ #stopWords = ["show"]
335
+ # same word but just plural nouns, tense
336
+ simWords = [word+"s",word+"es",word+"ing",word+"ed"]
337
+ model = KeyedVectors.load_word2vec_format(modelFile, binary = False) # model file in format txt
338
+ results = model.most_similar(positive=[word],topn=n)
339
+ #removeIndex = []
340
+ #currN = copy.deepcopy(n)
341
+ '''for r in range(len(results)):
342
+ if len(results[r][0]) < 2:
343
+ removeIndex.append(results[r])
344
+ # remove the same word but just plural and singular noun and lower than the cos_thres
345
+ elif results[r][0] == word:
346
+ removeIndex.append(results[r])
347
+ elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
348
+ removeIndex.append(results[r])
349
+ for rem in removeIndex:
350
+ results.remove(rem)
351
+ while len(results)!=n and len(results) != 0:
352
+ moreNewResult = model.most_similar(positive=[word],topn=currN+1)[-1]
353
+ if moreNewResult not in results and len(moreNewResult[0])>1:
354
+ if moreNewResult[0] not in stopWords and results[0] != word:
355
+ results.append(moreNewResult)
356
+ currN +=1'''
357
+ return results
358
+ # adding our model into spacy
359
+ # this deals with command line; but instead of using it, we write python script to run command line
360
+ def loadWordVec(self,modelName,wordVec):
361
+ # modelName is the name you want to save into spacy
362
+ # wordVec is the trained word2vec in txt format
363
+ subprocess.run([sys.executable,
364
+ "-m",
365
+ "spacy",
366
+ "init-model",
367
+ "en",
368
+ modelName, # this modelName comes from the saved modelName of function trainWord2Vec
369
+ "--vectors-loc",
370
+ wordVec])
371
+ =======
372
  '''WORD TO VECTOR'''
373
  import pandas as pd
374
  import json
 
381
  from gensim.models.word2vec import Word2Vec
382
  from gensim.scripts.glove2word2vec import glove2word2vec
383
  from gensim.test.utils import datapath, get_tmpfile
384
+ from gensim.models import Phrases
385
+ from gensim.models.phrases import Phraser
386
  import sys
387
  import subprocess
388
+ import os
389
  # can try multiprocessing to run quicker
390
  import multiprocessing
391
  import copy
 
406
  def __init__(self, nameFile=None, modelName=None):
407
  self.nameFile = nameFile
408
  self.modelName = modelName
409
+ #self.nlp = spacy.load("en_core_web_lg")
410
+ self.cl = cleanText.cleanGenText()
411
  def spacy_similarity(self, word):
412
  # when use word2vec, try medium or large is better
413
  # maybe try odc similarity?
414
+ doc = self.nlp(word)
 
415
  for token1 in doc:
416
  for token2 in doc:
417
  print(token1.text, token2.text, token1.similarity(token2))
418
  pass
419
  # clean text before transform to corpus
420
  def cleanTextBeforeCorpus(self,oriText, doi=None):
421
+ #cl = cleanText.cleanGenText()
422
  #cl = cleanGenText()
423
  output = ""
424
  alreadyRemoveDoi = False
 
426
  # remove DOI
427
  if doi != None and doi in oriText:
428
  if alreadyRemoveDoi == False:
429
+ newWord = self.cl.removeDOI(word,doi)
430
  if len(newWord) > 0 and newWord != word:
431
  alreadyRemoveDoi = True
432
  word = newWord
 
434
  # split the sticked words
435
  #word = cl.splitStickWords(word)
436
  # remove punctuation
437
+ word = self.cl.removePunct(word,True)
438
  # remove URL
439
+ word = self.cl.removeURL(word)
440
  # remove HTMLTag
441
+ word = self.cl.removeHTMLTag(word)
442
  # remove tab, white space, newline
443
+ word = self.cl.removeTabWhiteSpaceNewLine(word)
444
  # optional: remove stopwords
445
  #word = cl.removeStopWords(word)
446
  if len(word)>0:
 
450
  cleanOutput = ""
451
  remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
452
  if len(allText) > 0:
453
+ corpusText = allText.split("\n\n")
454
+ for pos in range(len(corpusText)):
455
+ lines = corpusText[pos]
456
+ if len(lines) > 0:
457
  for line in lines.split("\n"):
458
  if remove in line: line = line.replace(remove, "")
459
  clean_text = self.cleanTextBeforeCorpus(line, doi)
460
  cleanOutput += clean_text + "\n"
461
  cleanOutput += "\n\n"
462
  return cleanOutput
463
+ import urllib.parse, requests
464
+
465
  def tableTransformToCorpusText(self, df, excelFile=None):
466
  # PDF, Excel, WordDoc
467
  #cl = cleanText.cleanGenText()
 
496
  try:
497
  df = pd.ExcelFile(excelFile)
498
  except:
499
+ if excelFile.endswith('.xls'):
500
+ df = pd.read_excel(excelFile, engine='xlrd')
501
  else:
502
+ df = pd.read_excel(excelFile, engine='openpyxl')
503
  sheetNames = df.sheet_names
504
  output = []
505
  if len(sheetNames) > 0:
 
519
  return corpus
520
  def helperRowTableToCorpus(self, textList):
521
  #cl = cleanGenText()
522
+ #cl = cleanText.cleanGenText()
523
  stopWords = ["NaN","Unnamed:","nan"]
524
  outputDF = []
525
  for line in textList:
 
531
  # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
532
  if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
533
  #word = cl.splitStickWords(word)
534
+ word = self.cl.removePunct(word)
535
+ word = " ".join(self.cl.removeStopWords(word))
536
+ word = self.cl.removeTabWhiteSpaceNewLine(word)
537
  if len(word) > 1:
538
  if len(word.split(" ")) > 1:
539
  for x in word.split(" "):
 
547
  return outputDF
548
  def helperColTableToCorpus(self, dfList):
549
  #cl = cleanGenText()
550
+ #cl = cleanText.cleanGenText()
551
  stopWords = ["NaN","Unnamed:","nan"]
552
  outputDF = []
553
  # use the first length line as the column ref
 
563
  # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
564
  if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
565
  #word = cl.splitStickWords(word)
566
+ word = self.cl.removePunct(word)
567
+ word = " ".join(self.cl.removeStopWords(word))
568
+ word = self.cl.removeTabWhiteSpaceNewLine(word)
569
  if len(word) > 1:
570
  if len(word.split(" ")) > 1:
571
  for x in word.split(" "):
 
593
  Mouse is an animal.
594
  Jerry is mouse.'''
595
  texts = {}
596
+ #cl = cleanText.cleanGenText()
597
  #cl = cleanGenText()
598
+ corpus = corpusText.split("\n\n")
599
+ for pos in range(len(corpus)):
600
+ if len(corpus[pos]) > 0:
601
  texts["Paragraph "+str(pos)] = []
602
+ lines = corpus[pos]
603
  for line in lines.split("\n"):
604
  for l in line.split("."):
605
  if len(l) > 0:
606
+ l = self.cl.removeTabWhiteSpaceNewLine(l)
607
  l = l.lower()
608
  newL = []
609
  for word in l.split(" "):
610
  if len(word) > 0:
611
+ word = self.cl.removeStopWords(word)
612
  for w in word:
613
  if len(w) > 0 and w.isnumeric()==False:
614
  newL.append(w)
 
617
  if len(texts["Paragraph "+str(pos)]) == 0:
618
  del texts["Paragraph "+str(pos)]
619
  return texts
620
+
621
+ def selectParaForWC(self, corpus):
622
+ """
623
+ corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]
624
+ Heuristically determine Word2Vec parameters.
625
+ """
626
  corSize = len(corpus)
627
+
628
+ if corSize == 0:
629
+ return None, None, None, None, None, None
630
+
631
+ # Adjust parameters based on corpus size
632
+ if corSize < 2000:
633
+ # Small corpus — need high generalization
634
+ window = 3
635
+ vector_size = 100
636
+ sample = 1e-3
637
+ negative = 5
638
+ epochs = 20
639
+ sg = 1 # Skip-gram preferred for rare words
640
+ elif corSize < 10000:
641
+ window = 5
642
+ vector_size = 150
643
+ sample = 1e-4
644
+ negative = 10
645
+ epochs = 20
646
+ sg = 1
647
+ elif corSize < 100000:
648
+ window = 7
649
+ vector_size = 200
650
+ sample = 1e-5
651
+ negative = 15
652
+ epochs = 15
653
+ sg = 1
654
+ elif corSize < 500000:
655
+ window = 10
656
+ vector_size = 250
657
+ sample = 1e-5
658
+ negative = 15
659
+ epochs = 10
660
+ sg = 0 # CBOW is okay when data is large
661
+ else:
662
+ # Very large corpus
663
+ window = 12
664
+ vector_size = 300
665
+ sample = 1e-6
666
+ negative = 20
667
+ epochs = 5
668
+ sg = 0
669
+
670
  return window, vector_size, sample, negative, epochs, sg
671
+
672
+
673
+ def trainWord2Vec(self,nameFile,modelName,saveFolder,window=None,
674
+ vector_size=None,sample=None,negative=None,epochs=None,sg=None):
675
  jsonFile = ""
676
  jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
677
+ if not jsonFile:
678
+ print("No corpus to train")
679
+ return
680
  cores = multiprocessing.cpu_count()
681
  combinedCorpus = []
682
+ for key in jsonFile:
683
+ combinedCorpus.extend(jsonFile[key])
684
+ # detect phrases (bigram collocations) before choosing parameters
685
+ phrases = Phrases(combinedCorpus, min_count=2, threshold=10)
686
+ bigram = Phraser(phrases)
687
+ combinedCorpus = [bigram[sent] for sent in combinedCorpus]
688
+
689
+ if window==None and vector_size==None and sample==None and negative==None and epochs==None and sg==None:
690
  window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
691
+ # # min_count=1 ensures all words are included
692
+ #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
693
+ accept = False
694
+ # add retry limit because if training keeps failing (bad corpus or corrupted input), it’ll keep retrying without limit.
695
+ retries = 0
696
+ while not accept and retries < 3:
697
+ if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
698
+ try:
699
+ w2vModel = Word2Vec(
700
  min_count=1,
701
  window=window,
702
  vector_size=vector_size,
 
706
  negative=negative,
707
  workers=cores-1,
708
  epochs = epochs,
709
+ sg=sg)
710
+ w2vModel.build_vocab(combinedCorpus)
711
+ w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=epochs)
712
+ accept = True
713
+ except Exception as e:
714
+ print(f"Retry #{retries+1} failed: {e}")
715
+ retries +=1
716
+ else:
717
+ print("no parameter to train")
718
+ break
719
+ #w2vModel.build_vocab(combinedCorpus)
720
+ #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
721
+ #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
722
+ #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
723
+ w2vModel.save(saveFolder+"/"+modelName+".model")
724
+ w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
725
+ print("done w2v")
 
 
 
726
  #return combinedCorpus
727
+ def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
728
+ if not newCorpus:
729
+ raise ValueError("New corpus is empty!")
730
+
731
+ model = Word2Vec.load(modelPath)
732
+
733
+ # Phrase detection on new data
734
+ phrases = Phrases(newCorpus, min_count=2, threshold=10)
735
+ bigram = Phraser(phrases)
736
+ newCorpus = [bigram[sent] for sent in newCorpus]
737
+
738
+ # Update vocab & retrain
739
+ model.build_vocab(newCorpus, update=True)
740
+ model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)
741
+
742
  def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
743
  # might not be a meaningful keyword
744
  #stopWords = ["show"]
 
765
  results.append(moreNewResult)
766
  currN +=1'''
767
  return results
768
+ # add more data to existing word2vec model
769
+ def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
770
+ if not newCorpus:
771
+ raise ValueError("New corpus is empty!")
772
+
773
+ model = Word2Vec.load(modelPath)
774
+
775
+ # Phrase detection on new data
776
+ phrases = Phrases(newCorpus, min_count=2, threshold=10)
777
+ bigram = Phraser(phrases)
778
+ newCorpus = [bigram[sent] for sent in newCorpus]
779
+
780
+ # Update vocab & retrain
781
+ model.build_vocab(newCorpus, update=True)
782
+ model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)
783
+
784
+ # Save updated model
785
+ if saveFolder:
786
+ os.makedirs(saveFolder, exist_ok=True)
787
+ name = os.path.basename(modelPath).replace(".model", "_updated.model")
788
+ model.save(f"{saveFolder}/{name}")
789
+ print(f"🔁 Model updated and saved to {saveFolder}/{name}")
790
+ else:
791
+ model.save(modelPath)
792
+ print(f"🔁 Model updated and overwritten at {modelPath}")
793
+
794
  # adding our model into spacy
795
  # this deals with command line; but instead of using it, we write python script to run command line
796
  def loadWordVec(self,modelName,wordVec):
 
804
  modelName, # this modelName comes from the saved modelName of function trainWord2Vec
805
  "--vectors-loc",
806
  wordVec])
807
+ >>>>>>> 597aa7c (WIP: Save local changes which mainly updated appUI before moving to UpdateAppUI)
808
  print("done")
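A minimal usage sketch of the training, update, and similarity helpers defined above. The module path and class name are assumptions (the class definition line is not shown in this diff); the method signatures follow the code above.

```python
# Sketch only: exercising the Word2Vec helpers above (module/class names are assumed).
from NER.word2Vec.word2vec import word2Vec  # assumption: the class shown above lives here

wv = word2Vec()

# nameFile is a corpus JSON built by createCorpusText(); paths below are placeholders.
wv.trainWord2Vec(nameFile="corpus.json", modelName="mtdna_w2v", saveFolder="models")

# Grow the same model later with new sentences instead of retraining from scratch.
wv.updateWord2Vec(modelPath="models/mtdna_w2v.model",
                  newCorpus=[["haplogroup", "b4a1a1", "vietnam"]],
                  saveFolder="models")

# Query nearest neighbours from the exported text-format vectors.
print(wv.genSimilar("haplogroup", "models/mtdna_w2v.txt", n=5))
```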
README.md CHANGED
@@ -1,15 +1,74 @@
1
- ---
2
- setup: bash setup.sh
3
- title: MtDNALocation
4
- emoji: 📊
5
- colorFrom: blue
6
- colorTo: purple
7
- sdk: gradio
8
- sdk_version: 5.25.0
9
- app_file: app.py
10
- pinned: false
11
- license: mit
12
- short_description: mtDNA Location Classification tool
13
- ---
14
-
15
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
+ ---
2
+ setup: bash setup.sh
3
+ title: MtDNALocation
4
+ emoji: 📊
5
+ colorFrom: blue
6
+ colorTo: purple
7
+ sdk: gradio
8
+ sdk_version: 5.25.0
9
+ app_file: app.py
10
+ pinned: false
11
+ license: mit
12
+ short_description: mtDNA Location Classification tool
13
+ ---
14
+
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
16
+
17
+ # Installation
18
+ ## Set up environments and start GUI:
19
+ ```bash
20
+ git clone https://github.com/Open-Access-Bio-Data/mtDNA-Location-Classifier.git
21
+ ```
22
+ If installing with mamba (recommended):
23
+ ```bash
24
+ mamba env create -f env.yaml
25
+ ```
26
+ If not, check your current Python version in the terminal and make sure it is Python 3.10, then run:
27
+ ```bash
28
+ pip install -r requirements.txt
29
+ ```
30
+ To start the programme, run this in terminal:
31
+ ```bash
32
+ python app.py
33
+ ```
34
+ Then follow its instructions.
35
+ # Descriptions:
36
+ mtDNA-Location-Classifier uses [Gradio](https://www.gradio.app/docs) to handle the front-end interactions.
37
+
38
+ The programme takes **an accession number** (an NCBI GenBank/nuccore identifier) as input and returns the likely origin of the sequence through `classify_sample_location_cached(accession=accession_number)`. This function wraps around a pipeline that proceeds as follows:
39
+ ## Steps 1-3: Check and retrieve base materials: the Pubmed ID, isolate, DOI and text:
40
+ - These are, respectively:
41
+
42
+ ### Step 1: pubmed_ids and isolates
43
+ `get_info_from_accession(accession=accession_number)`
44
+ - The current input is a string `accession_number`, and the output is two lists: one of PUBMED IDs and one of isolate(s).
45
+ - This function looks through the metadata of the sequence with `accession_number` and extracts the `PUBMED ID` (if available) and/or the `isolate` information.
46
+ - The presence of a PUBMED ID is currently important for retrieving texts in the next steps, which are eventually used by method 4.1 (question-answering) and method 4.2 (inferring from haplogroup).
47
+ - Some sequences might not have `isolate` info, but its availability is optional (it may be used by methods 4.1 and 4.2 as an alternative).
48
+
49
+ ### Step 2: dois
50
+ `get_doi_from_pubmed_id(pubmed_ids = pubmed_ids)`
51
+ - Input is a list of PUBMED IDs for the sequence with `accession_number` (retrieved from the previous step), and output is a dictionary with keys = PUBMED IDs and values = the corresponding DOIs.
52
+ - The pubmed_ids are retrieved from the `get_info_from_accession(accession=accession_number)` call mentioned above.
53
+ - The DOIs are then passed down to dependent functions to extract the texts of publications, which are passed on to methods 4.1 and 4.2.
54
+
55
+ ### Step 3: get text
56
+ `get_paper_text(dois = dois)`
57
+ - Input is currently a list of DOIs retrieved from the previous step, and output is a dictionary with keys = sources (DOI links or file type) and values = texts obtained from those sources. (We might improve this to accept other inputs besides DOI links, e.g. files.)
58
+ - The output of this step is crucial to methods 4.1 and 4.2 (a usage sketch follows below).
59
+
60
+
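+ A minimal sketch of how these three steps chain together for one accession (the module that exposes these helpers is assumed to be `mtdna_classifier`, as in `app.py`; exact signatures may differ):
+ ```python
+ # Sketch only: chaining steps 1-3 (function names taken from the descriptions above).
+ from mtdna_classifier import get_info_from_accession, get_doi_from_pubmed_id, get_paper_text
+ 
+ pubmed_ids, isolates = get_info_from_accession(accession="KU131308")   # Step 1: two lists
+ dois = get_doi_from_pubmed_id(pubmed_ids=pubmed_ids)                   # Step 2: {pubmed_id: doi}
+ texts = get_paper_text(dois=list(dois.values()))                       # Step 3: {source: text}
+ ```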
61
+ ## Step 4: Prediction of origin:
62
+ ### Method 4.0:
63
+ - The first method attempts to directly look in the metadata for information that was submitted along with the sequence. Thus, it does not require availability of PUBMED IDs/DOIs or isolates.
64
+ - However, this information is not always available in the submission. Thus, we use the other methods (4.1, 4.2) to retrieve publications from which we can extract the source of the mtDNA.
65
+
66
+ ### Method 4.1:
67
+ -
68
+
69
+ ### Method 4.2:
70
+ -
71
+
72
+ ## More in the package
73
+ ### extraction of text from HTML
74
+ ### extraction of text from PDF
accessions.csv ADDED
@@ -0,0 +1,6 @@
 
 
1
+ Accession
2
+ KU131308
3
+ JX123456
4
+ MN908947
5
+ AB123456
6
+ AY123456
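The batch mode in `app.py` reads the first column of a sheet like this one (see `extract_accessions_from_input` further below); a quick way to check what will be picked up:

```python
import pandas as pd

# Mirrors the CSV branch of extract_accessions_from_input in app.py.
df = pd.read_csv("accessions.csv")
accessions = [acc for acc in df.iloc[:, 0].dropna().astype(str).str.strip()]
print(accessions)  # ['KU131308', 'JX123456', 'MN908947', 'AB123456', 'AY123456']
```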
accessions.xlsx ADDED
Binary file (4.98 kB). View file
 
app.py CHANGED
@@ -1,3 +1,177 @@
 
 
1
  # ✅ Optimized mtDNA MVP UI with Faster Pipeline & Required Feedback
2
 
3
  import gradio as gr
@@ -5,9 +179,15 @@ from collections import Counter
5
  import csv
6
  import os
7
  from functools import lru_cache
8
- from mtdna_classifier import classify_sample_location
9
  import subprocess
10
  import json
 
 
 
 
 
 
11
 
12
  @lru_cache(maxsize=128)
13
  def classify_sample_location_cached(accession):
@@ -33,8 +213,6 @@ def compute_final_suggested_location(rows):
33
  return counts, (top_location, count)
34
 
35
  # Store feedback (with required fields)
36
- import gspread
37
- from oauth2client.service_account import ServiceAccountCredentials
38
 
39
  '''creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
40
 
@@ -58,11 +236,6 @@ def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
58
  except Exception as e:
59
  return f"❌ Error submitting feedback: {str(e)}"'''
60
 
61
- import os
62
- import json
63
- from oauth2client.service_account import ServiceAccountCredentials
64
- import gspread
65
-
66
  def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
67
  if not answer1.strip() or not answer2.strip():
68
  return "⚠️ Please answer both questions before submitting."
@@ -84,16 +257,44 @@ def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
84
  except Exception as e:
85
  return f"❌ Error submitting feedback: {e}"
86
 
 
 
87
 
88
  def summarize_results(accession):
89
  try:
90
- output = classify_sample_location_cached(accession)
91
  print(output)
92
  except Exception as e:
93
- return [], f"Error: {e}"
94
 
95
  if accession not in output:
96
- return [], "Accession not found in results."
97
 
98
  isolate = next((k for k in output if k != accession), None)
99
  row_score = []
@@ -110,7 +311,7 @@ def summarize_results(accession):
110
  haplogroup = content.get("haplogroup", "")
111
  inferred = content.get("inferred_location", "")
112
  context = content.get("context_snippet", "")[:300] if "context_snippet" in content else ""
113
-
114
  row = {
115
  "Sample ID": sample_id_label,
116
  "Technique": technique,
@@ -130,43 +331,202 @@ def summarize_results(accession):
130
  summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
131
  summary = "\n".join(summary_lines)
132
 
133
- return rows, summary
 
 
 
 
 
134
  # Gradio UI
135
  with gr.Blocks() as interface:
136
  gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
137
- gr.Markdown("Enter an accession number to infer geographic origin. You'll see predictions, confidence scores, and can submit feedback.")
 
 
138
 
139
  with gr.Row():
140
- accession = gr.Textbox(label="Enter Accession Number (e.g., KU131308)")
141
  run_button = gr.Button("🔍 Submit and Classify")
142
  reset_button = gr.Button("🔄 Reset")
143
 
144
  status = gr.Markdown(visible=False)
145
- headers = ["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"]
146
- output_table = gr.Dataframe(headers=headers, interactive=False)
147
- output_summary = gr.Markdown()
148
-
149
- gr.Markdown("---")
150
- gr.Markdown("### 💬 Feedback (required)")
151
- q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
152
- q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
153
- contact = gr.Textbox(label="📧 Your email or institution (optional)")
154
- submit_feedback = gr.Button("✅ Submit Feedback")
155
- feedback_status = gr.Markdown()
156
-
157
- def classify_with_loading(accession):
 
 
158
  return gr.update(value="⏳ Please wait... processing...", visible=True)
159
 
 
 
 
 
 
 
 
160
  def classify_main(accession):
161
- table, summary = summarize_results(accession)
162
- return table, summary, gr.update(visible=False)
 
 
 
 
 
 
 
163
 
164
  def reset_fields():
165
- return "", "", "", "", "", [], "", gr.update(visible=False)
166
-
167
- run_button.click(fn=classify_with_loading, inputs=accession, outputs=status)
168
- run_button.click(fn=classify_main, inputs=accession, outputs=[output_table, output_summary, status])
169
- submit_feedback.click(fn=store_feedback_to_google_sheets, inputs=[accession, q1, q2, contact], outputs=feedback_status)
170
- reset_button.click(fn=reset_fields, inputs=[], outputs=[accession, q1, q2, contact, feedback_status, output_table, output_summary, status])
 
 
171
 
172
  interface.launch(share=True)
 
 
1
+ <<<<<<< HEAD
2
+ # ✅ Optimized mtDNA MVP UI with Faster Pipeline & Required Feedback
3
+
4
+ import gradio as gr
5
+ from collections import Counter
6
+ import csv
7
+ import os
8
+ from functools import lru_cache
9
+ from mtdna_classifier import classify_sample_location
10
+ import subprocess
11
+ import json
12
+
13
+ @lru_cache(maxsize=128)
14
+ def classify_sample_location_cached(accession):
15
+ return classify_sample_location(accession)
16
+
17
+ # Count and suggest final location
18
+ def compute_final_suggested_location(rows):
19
+ candidates = [
20
+ row.get("Predicted Location", "").strip()
21
+ for row in rows
22
+ if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found", "unknown"]
23
+ ] + [
24
+ row.get("Inferred Region", "").strip()
25
+ for row in rows
26
+ if row.get("Inferred Region", "").strip().lower() not in ["", "sample id not found", "unknown"]
27
+ ]
28
+
29
+ if not candidates:
30
+ return Counter(), ("Unknown", 0)
31
+
32
+ counts = Counter(candidates)
33
+ top_location, count = counts.most_common(1)[0]
34
+ return counts, (top_location, count)
35
+
36
+ # Store feedback (with required fields)
37
+ import gspread
38
+ from oauth2client.service_account import ServiceAccountCredentials
39
+
40
+ '''creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
41
+
42
+ scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
43
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
44
+
45
+ def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
46
+ if not answer1.strip() or not answer2.strip():
47
+ return "⚠️ Please answer both questions before submitting."
48
+
49
+ try:
50
+ # Define the scope and authenticate
51
+ scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
52
+ creds = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", scope)
53
+ client = gspread.authorize(creds)
54
+
55
+ # Open the spreadsheet and worksheet
56
+ sheet = client.open("feedback_mtdna").sheet1 # You can change the name
57
+ sheet.append_row([accession, answer1, answer2, contact])
58
+ return "✅ Feedback submitted. Thank you!"
59
+ except Exception as e:
60
+ return f"❌ Error submitting feedback: {str(e)}"'''
61
+
62
+ import os
63
+ import json
64
+ from oauth2client.service_account import ServiceAccountCredentials
65
+ import gspread
66
+
67
+ def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
68
+ if not answer1.strip() or not answer2.strip():
69
+ return "⚠️ Please answer both questions before submitting."
70
+
71
+ try:
72
+ # ✅ Step: Load credentials from Hugging Face secret
73
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
74
+ scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
75
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
76
+
77
+ # Connect to Google Sheet
78
+ client = gspread.authorize(creds)
79
+ sheet = client.open("feedback_mtdna").sheet1 # make sure sheet name matches
80
+
81
+ # Append feedback
82
+ sheet.append_row([accession, answer1, answer2, contact])
83
+ return "✅ Feedback submitted. Thank you!"
84
+
85
+ except Exception as e:
86
+ return f"❌ Error submitting feedback: {e}"
87
+
88
+
89
+ def summarize_results(accession):
90
+ try:
91
+ output = classify_sample_location_cached(accession)
92
+ print(output)
93
+ except Exception as e:
94
+ return [], f"❌ Error: {e}"
95
+
96
+ if accession not in output:
97
+ return [], "❌ Accession not found in results."
98
+
99
+ isolate = next((k for k in output if k != accession), None)
100
+ row_score = []
101
+ rows = []
102
+
103
+ for key in [accession, isolate]:
104
+ if key not in output:
105
+ continue
106
+ sample_id_label = f"{key} ({'accession number' if key == accession else 'isolate of accession'})"
107
+ for section, techniques in output[key].items():
108
+ for technique, content in techniques.items():
109
+ source = content.get("source", "")
110
+ predicted = content.get("predicted_location", "")
111
+ haplogroup = content.get("haplogroup", "")
112
+ inferred = content.get("inferred_location", "")
113
+ context = content.get("context_snippet", "")[:300] if "context_snippet" in content else ""
114
+
115
+ row = {
116
+ "Sample ID": sample_id_label,
117
+ "Technique": technique,
118
+ "Source": f"The region of haplogroup is inferred\nby using this source: {source}" if technique == "haplogroup" else source,
119
+ "Predicted Location": "" if technique == "haplogroup" else predicted,
120
+ "Haplogroup": haplogroup if technique == "haplogroup" else "",
121
+ "Inferred Region": inferred if technique == "haplogroup" else "",
122
+ "Context Snippet": context
123
+ }
124
+
125
+ row_score.append(row)
126
+ rows.append(list(row.values()))
127
+
128
+ location_counts, (final_location, count) = compute_final_suggested_location(row_score)
129
+ summary_lines = [f"### 🧭 Location Frequency Summary", "After counting all predicted and inferred locations:\n"]
130
+ summary_lines += [f"- **{loc}**: {cnt} times" for loc, cnt in location_counts.items()]
131
+ summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
132
+ summary = "\n".join(summary_lines)
133
+
134
+ return rows, summary
135
+ # Gradio UI
136
+ with gr.Blocks() as interface:
137
+ gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
138
+ gr.Markdown("Enter an accession number to infer geographic origin. You'll see predictions, confidence scores, and can submit feedback.")
139
+
140
+ with gr.Row():
141
+ accession = gr.Textbox(label="Enter Accession Number (e.g., KU131308)")
142
+ run_button = gr.Button("🔍 Submit and Classify")
143
+ reset_button = gr.Button("🔄 Reset")
144
+
145
+ status = gr.Markdown(visible=False)
146
+ headers = ["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"]
147
+ output_table = gr.Dataframe(headers=headers, interactive=False)
148
+ output_summary = gr.Markdown()
149
+
150
+ gr.Markdown("---")
151
+ gr.Markdown("### 💬 Feedback (required)")
152
+ q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
153
+ q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
154
+ contact = gr.Textbox(label="📧 Your email or institution (optional)")
155
+ submit_feedback = gr.Button("✅ Submit Feedback")
156
+ feedback_status = gr.Markdown()
157
+
158
+ def classify_with_loading(accession):
159
+ return gr.update(value="⏳ Please wait... processing...", visible=True)
160
+
161
+ def classify_main(accession):
162
+ table, summary = summarize_results(accession)
163
+ return table, summary, gr.update(visible=False)
164
+
165
+ def reset_fields():
166
+ return "", "", "", "", "", [], "", gr.update(visible=False)
167
+
168
+ run_button.click(fn=classify_with_loading, inputs=accession, outputs=status)
169
+ run_button.click(fn=classify_main, inputs=accession, outputs=[output_table, output_summary, status])
170
+ submit_feedback.click(fn=store_feedback_to_google_sheets, inputs=[accession, q1, q2, contact], outputs=feedback_status)
171
+ reset_button.click(fn=reset_fields, inputs=[], outputs=[accession, q1, q2, contact, feedback_status, output_table, output_summary, status])
172
+
173
+ interface.launch(share=True)
174
+ =======
175
  # ✅ Optimized mtDNA MVP UI with Faster Pipeline & Required Feedback
176
 
177
  import gradio as gr
 
179
  import csv
180
  import os
181
  from functools import lru_cache
182
+ from mtdna_classifier import classify_sample_location
183
  import subprocess
184
  import json
185
+ import pandas as pd
186
+ import io
187
+ import re
188
+ import tempfile
189
+ import gspread
190
+ from oauth2client.service_account import ServiceAccountCredentials
191
 
192
  @lru_cache(maxsize=128)
193
  def classify_sample_location_cached(accession):
 
213
  return counts, (top_location, count)
214
 
215
  # Store feedback (with required fields)
 
 
216
 
217
  '''creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
218
 
 
236
  except Exception as e:
237
  return f"❌ Error submitting feedback: {str(e)}"'''
238
 
 
 
 
 
 
239
  def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
240
  if not answer1.strip() or not answer2.strip():
241
  return "⚠️ Please answer both questions before submitting."
 
257
  except Exception as e:
258
  return f"❌ Error submitting feedback: {e}"
259
 
260
+ # helper function to extract accessions
261
+ def extract_accessions_from_input(file=None, raw_text=""):
262
+ print(f"RAW TEXT RECEIVED: {raw_text}")
263
+ accessions = []
264
+ seen = set()
265
+ if file:
266
+ try:
267
+ if file.name.endswith(".csv"):
268
+ df = pd.read_csv(file)
269
+ elif file.name.endswith(".xlsx"):
270
+ df = pd.read_excel(file)
271
+ else:
272
+ return [], "Unsupported file format. Please upload CSV or Excel."
273
+ for acc in df.iloc[:, 0].dropna().astype(str).str.strip():
274
+ if acc not in seen:
275
+ accessions.append(acc)
276
+ seen.add(acc)
277
+ except Exception as e:
278
+ return [], f"Failed to read file: {e}"
279
+
280
+ if raw_text:
281
+ text_ids = [s.strip() for s in re.split(r"[\n,;\t]", raw_text) if s.strip()]
282
+ for acc in text_ids:
283
+ if acc not in seen:
284
+ accessions.append(acc)
285
+ seen.add(acc)
286
+
287
+ return list(accessions), None
288
 
289
  def summarize_results(accession):
290
  try:
291
+ output, labelAncient_Modern, explain_label = classify_sample_location_cached(accession)
292
  print(output)
293
  except Exception as e:
294
+ return [], f"Error: {e}"
295
 
296
  if accession not in output:
297
+ return [], "Accession not found in results."
298
 
299
  isolate = next((k for k in output if k != accession), None)
300
  row_score = []
 
311
  haplogroup = content.get("haplogroup", "")
312
  inferred = content.get("inferred_location", "")
313
  context = content.get("context_snippet", "")[:300] if "context_snippet" in content else ""
314
+
315
  row = {
316
  "Sample ID": sample_id_label,
317
  "Technique": technique,
 
331
  summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
332
  summary = "\n".join(summary_lines)
333
 
334
+ return rows, summary, labelAncient_Modern, explain_label
335
+
336
+ # save the batch input in excel file
337
+ def save_to_excel(all_rows, summary_text, flag_text, filename):
338
+ with pd.ExcelWriter(filename) as writer:
339
+ # Save table
340
+ df = pd.DataFrame(all_rows, columns=["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"])
341
+ df.to_excel(writer, sheet_name="Detailed Results", index=False)
342
+
343
+ # Save summary
344
+ summary_df = pd.DataFrame({"Summary": [summary_text]})
345
+ summary_df.to_excel(writer, sheet_name="Summary", index=False)
346
+
347
+ # Save flag
348
+ flag_df = pd.DataFrame({"Flag": [flag_text]})
349
+ flag_df.to_excel(writer, sheet_name="Ancient_Modern_Flag", index=False)
350
+
351
+ # save the batch input in JSON file
352
+ def save_to_json(all_rows, summary_text, flag_text, filename):
353
+ output_dict = {
354
+ "Detailed_Results": all_rows,
355
+ "Summary_Text": summary_text,
356
+ "Ancient_Modern_Flag": flag_text
357
+ }
358
+ with open(filename, "w") as f:
359
+ json.dump(output_dict, f, indent=2)
360
+
361
+ # save the batch input in Text file
362
+ def save_to_txt(all_rows, summary_text, flag_text, filename):
363
+ with open(filename, "w") as f:
364
+ f.write("=== Detailed Results ===\n")
365
+ for row in all_rows:
366
+ f.write(", ".join(str(x) for x in row) + "\n")
367
+
368
+ f.write("\n=== Summary ===\n")
369
+ f.write(summary_text + "\n")
370
+
371
+ f.write("\n=== Ancient/Modern Flag ===\n")
372
+ f.write(flag_text + "\n")
373
+
374
+ def save_batch_output(all_rows, summary_text, flag_text, output_type):
375
+ tmp_dir = tempfile.mkdtemp()
376
+
377
+ if output_type == "Excel":
378
+ file_path = f"{tmp_dir}/batch_output.xlsx"
379
+ save_to_excel(all_rows, summary_text, flag_text, file_path)
380
+ elif output_type == "JSON":
381
+ file_path = f"{tmp_dir}/batch_output.json"
382
+ save_to_json(all_rows, summary_text, flag_text, file_path)
383
+ elif output_type == "TXT":
384
+ file_path = f"{tmp_dir}/batch_output.txt"
385
+ save_to_txt(all_rows, summary_text, flag_text, file_path)
386
+ else:
387
+ return None # invalid option
388
+
389
+ return file_path
390
+
391
+ # run the batch
392
+ def summarize_batch(file=None, raw_text=""):
393
+ accessions, error = extract_accessions_from_input(file, raw_text)
394
+ if error:
395
+ return [], "", "", f"Error: {error}"
396
+
397
+ all_rows = []
398
+ all_summaries = []
399
+ all_flags = []
400
+
401
+ for acc in accessions:
402
+ try:
403
+ rows, summary, label, explain = summarize_results(acc)
404
+ all_rows.extend(rows)
405
+ all_summaries.append(f"**{acc}**\n{summary}")
406
+ all_flags.append(f"**{acc}**: {label}\n_Explanation:_ {explain}")
407
+ except Exception as e:
408
+ all_summaries.append(f"**{acc}**: Failed - {e}")
409
+
410
+ summary_text = "\n\n---\n\n".join(all_summaries)
411
+ flag_text = "\n\n".join(all_flags)
412
+
413
+ return all_rows, summary_text, flag_text, gr.update(visible=False)
414
+
415
  # Gradio UI
416
  with gr.Blocks() as interface:
417
  gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
418
+
419
+ inputMode = gr.Radio(choices=["Single Accession", "Batch Input"], value="Single Accession", label="Choose Input Mode")
420
+
421
+ with gr.Group() as single_input_group:
422
+ single_accession = gr.Textbox(label="Enter Single Accession (e.g., KU131308)")
423
+
424
+ with gr.Group(visible=False) as batch_input_group:
425
+ raw_text = gr.Textbox(label="🧬 Paste Accession Numbers")
426
+ file_upload = gr.File(label="📁 Or Upload CSV/Excel File", file_types=[".csv", ".xlsx"], interactive=True, elem_id="file-upload-box")
427
+ print(raw_text)
428
+ # Make the file box smaller
429
+ gr.HTML('<style>#file-upload-box { width: 200px; }</style>')
430
 
431
  with gr.Row():
 
432
  run_button = gr.Button("🔍 Submit and Classify")
433
  reset_button = gr.Button("🔄 Reset")
434
 
435
  status = gr.Markdown(visible=False)
436
+
437
+ with gr.Group(visible=False) as results_group:
438
+ with gr.Row():
439
+ with gr.Column():
440
+ output_summary = gr.Markdown()
441
+ with gr.Column():
442
+ output_flag = gr.Markdown()
443
+
444
+ gr.Markdown("---")
445
+ output_table = gr.Dataframe(
446
+ headers=["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"],
447
+ interactive=False,
448
+ row_count=(5, "dynamic")
449
+ )
450
+
451
+ with gr.Row():
452
+ output_type = gr.Dropdown(choices=["Excel", "JSON", "TXT"], label="Select Output Format", value="Excel")
453
+ download_button = gr.Button("⬇️ Download Output")
454
+ download_file = gr.File(label="Download File Here")
455
+
456
+ gr.Markdown("---")
457
+
458
+ gr.Markdown("### 💬 Feedback (required)")
459
+ q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
460
+ q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
461
+ contact = gr.Textbox(label="📧 Your email or institution (optional)")
462
+ submit_feedback = gr.Button("✅ Submit Feedback")
463
+ feedback_status = gr.Markdown()
464
+
465
+ # Functions
466
+
467
+ def toggle_input_mode(mode):
468
+ if mode == "Single Accession":
469
+ return gr.update(visible=True), gr.update(visible=False)
470
+ else:
471
+ return gr.update(visible=False), gr.update(visible=True)
472
+
473
+ def classify_with_loading():
474
  return gr.update(value="⏳ Please wait... processing...", visible=True)
475
 
476
+ def classify_dynamic(single_accession, file, text, mode):
477
+ print(f"MODE: {mode} | RAW TEXT: {text}")
478
+ if mode == "Single Accession":
479
+ return classify_main(single_accession)
480
+ else:
481
+ return summarize_batch(file, text)
482
+
483
  def classify_main(accession):
484
+ table, summary, labelAncient_Modern, explain_label = summarize_results(accession)
485
+ flag_output = f"### 🏺 Ancient/Modern Flag\n**{labelAncient_Modern}**\n\n_Explanation:_ {explain_label}"
486
+ return (
487
+ table,
488
+ summary,
489
+ flag_output,
490
+ gr.update(visible=True),
491
+ gr.update(visible=False)
492
+ )
493
 
494
  def reset_fields():
495
+ return (
496
+ gr.update(value=""), # single_accession
497
+ gr.update(value=""), # raw_text
498
+ gr.update(value=None), # file_upload
499
+ gr.update(value="Single Accession"), # inputMode
500
+ gr.update(value=[], visible=True), # output_table
501
+ gr.update(value="", visible=True), # output_summary
502
+ gr.update(value="", visible=True), # output_flag
503
+ gr.update(visible=False), # status
504
+ gr.update(visible=False) # results_group
505
+ )
506
+
507
+ inputMode.change(fn=toggle_input_mode, inputs=inputMode, outputs=[single_input_group, batch_input_group])
508
+ run_button.click(fn=classify_with_loading, inputs=[], outputs=status)
509
+ run_button.click(
510
+ fn=classify_dynamic,
511
+ inputs=[single_accession, file_upload, raw_text, inputMode],
512
+ outputs=[output_table, output_summary, output_flag, results_group, status]
513
+ )
514
+ reset_button.click(
515
+ fn=reset_fields,
516
+ inputs=[],
517
+ outputs=[
518
+ single_accession, raw_text, file_upload, inputMode,
519
+ output_table, output_summary, output_flag,
520
+ status, results_group
521
+ ]
522
+ )
523
+
524
+ download_button.click(
525
+ save_batch_output, [output_table, output_summary, output_flag, output_type], download_file
526
+ )
527
+ submit_feedback.click(
528
+ fn=store_feedback_to_google_sheets, inputs=[single_accession, q1, q2, contact], outputs=feedback_status
529
+ )
530
 
531
  interface.launch(share=True)
532
+ >>>>>>> 597aa7c (WIP: Save local changes which mainly updated appUI before moving to UpdateAppUI)
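A small sketch of driving the batch export helpers defined in `app.py` above (assuming they are in scope, e.g. called from within `app.py` itself; the row layout matches the `save_to_excel` columns and the values are placeholders):

```python
# Sketch only: one batch result written with the helpers above (values are placeholders).
rows = [["KU131308 (accession number)", "mtdna", "NCBI metadata", "Vietnam", "", "", "..."]]
summary_text = "**KU131308**\n- Vietnam: 1 times"
flag_text = "**KU131308**: Modern\n_Explanation:_ ..."

path = save_batch_output(rows, summary_text, flag_text, "Excel")  # or "JSON" / "TXT"
print(path)  # a temp-dir path like .../batch_output.xlsx
```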
data/user_fb/feedback_mtdna.xlsx ADDED
Binary file (5.93 kB). View file
 
env.yaml ADDED
@@ -0,0 +1,8 @@
 
 
1
+ name: mtDNA
2
+ channels:
3
+ - conda-forge
4
+ dependencies:
5
+ - python=3.10
6
+ - pip
7
+ - pip:
8
+ - -r requirements.txt
installedAndUsedRequirements.txt ADDED
@@ -0,0 +1,637 @@
 
 
1
+ python_version==3.11.12
2
+ absl-py==1.4.0
3
+ accelerate==1.6.0
4
+ aiofiles==24.1.0
5
+ aiohappyeyeballs==2.6.1
6
+ aiohttp==3.11.15
7
+ aiosignal==1.3.2
8
+ alabaster==1.0.0
9
+ albucore==0.0.24
10
+ albumentations==2.0.6
11
+ ale-py==0.11.0
12
+ altair==5.5.0
13
+ annotated-types==0.7.0
14
+ anyio==4.9.0
15
+ argon2-cffi==23.1.0
16
+ argon2-cffi-bindings==21.2.0
17
+ array_record==0.7.2
18
+ arviz==0.21.0
19
+ astropy==7.0.1
20
+ astropy-iers-data==0.2025.4.28.0.37.27
21
+ astunparse==1.6.3
22
+ atpublic==5.1
23
+ attrs==25.3.0
24
+ audioread==3.0.1
25
+ autograd==1.7.0
26
+ babel==2.17.0
27
+ backcall==0.2.0
28
+ backports.tarfile==1.2.0
29
+ beautifulsoup4==4.13.4
30
+ betterproto==2.0.0b6
31
+ bigframes==2.1.0
32
+ bigquery-magics==0.9.0
33
+ biopython==1.85
34
+ bitarray==3.4.0
35
+ bleach==6.2.0
36
+ blinker==1.9.0
37
+ blis==1.2.1
38
+ blosc2==3.3.2
39
+ bokeh==3.7.2
40
+ Bottleneck==1.4.2
41
+ bqplot==0.12.44
42
+ branca==0.8.1
43
+ bs4==0.0.2
44
+ build==1.2.2.post1
45
+ CacheControl==0.14.3
46
+ cachetools==5.5.2
47
+ catalogue==2.0.10
48
+ certifi==2025.4.26
49
+ cffi==1.17.1
50
+ chardet==5.2.0
51
+ charset-normalizer==3.4.1
52
+ chex==0.1.89
53
+ clarabel==0.10.0
54
+ click==8.1.8
55
+ cloudpathlib==0.21.0
56
+ cloudpickle==3.1.1
57
+ cmake==3.31.6
58
+ cmdstanpy==1.2.5
59
+ colorcet==3.1.0
60
+ colorlover==0.3.0
61
+ colour==0.1.5
62
+ community==1.0.0b1
63
+ confection==0.1.5
64
+ cons==0.4.6
65
+ contourpy==1.3.2
66
+ cramjam==2.10.0
67
+ cryptography==43.0.3
68
+ cuda-python==12.6.2.post1
69
+ cudf-cu12 @ https://pypi.nvidia.com/cudf-cu12/cudf_cu12-25.2.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
70
+ cudf-polars-cu12==25.2.2
71
+ cufflinks==0.17.3
72
+ cuml-cu12==25.2.1
73
+ cupy-cuda12x==13.3.0
74
+ cuvs-cu12==25.2.1
75
+ cvxopt==1.3.2
76
+ cvxpy==1.6.5
77
+ cycler==0.12.1
78
+ cyipopt==1.5.0
79
+ cymem==2.0.11
80
+ Cython==3.0.12
81
+ dask==2024.12.1
82
+ dask-cuda==25.2.0
83
+ dask-cudf-cu12==25.2.2
84
+ dask-expr==1.1.21
85
+ dataproc-spark-connect==0.7.2
86
+ datascience==0.17.6
87
+ db-dtypes==1.4.2
88
+ dbus-python==1.2.18
89
+ debugpy==1.8.0
90
+ decorator==4.4.2
91
+ defusedxml==0.7.1
92
+ Deprecated==1.2.18
93
+ diffusers==0.33.1
94
+ distributed==2024.12.1
95
+ distributed-ucxx-cu12==0.42.0
96
+ distro==1.9.0
97
+ dlib==19.24.6
98
+ dm-tree==0.1.9
99
+ docker-pycreds==0.4.0
100
+ docstring_parser==0.16
101
+ docutils==0.21.2
102
+ dopamine_rl==4.1.2
103
+ duckdb==1.2.2
104
+ earthengine-api==1.5.13
105
+ easydict==1.13
106
+ editdistance==0.8.1
107
+ eerepr==0.1.1
108
+ einops==0.8.1
109
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85
110
+ entrypoints==0.4
111
+ et_xmlfile==2.0.0
112
+ etils==1.12.2
113
+ etuples==0.3.9
114
+ Farama-Notifications==0.0.4
115
+ fastai==2.7.19
116
+ fastapi==0.115.12
117
+ fastcore==1.7.29
118
+ fastdownload==0.0.7
119
+ fastjsonschema==2.21.1
120
+ fastprogress==1.0.3
121
+ fastrlock==0.8.3
122
+ ffmpy==0.5.0
123
+ filelock==3.18.0
124
+ firebase-admin==6.8.0
125
+ Flask==3.1.0
126
+ flatbuffers==25.2.10
127
+ flax==0.10.6
128
+ folium==0.19.5
129
+ fonttools==4.57.0
130
+ frozendict==2.4.6
131
+ frozenlist==1.6.0
132
+ fsspec==2025.3.2
133
+ future==1.0.0
134
+ gast==0.6.0
135
+ gcsfs==2025.3.2
136
+ GDAL==3.6.4
137
+ gdown==5.2.0
138
+ geemap==0.35.3
139
+ gensim==4.3.3
140
+ geocoder==1.38.1
141
+ geographiclib==2.0
142
+ geopandas==1.0.1
143
+ geopy==2.4.1
144
+ gin-config==0.5.0
145
+ gitdb==4.0.12
146
+ GitPython==3.1.44
147
+ glob2==0.7
148
+ google==2.0.3
149
+ google-ai-generativelanguage==0.6.15
150
+ google-api-core==2.24.2
151
+ google-api-python-client==2.169.0
152
+ google-auth==2.38.0
153
+ google-auth-httplib2==0.2.0
154
+ google-auth-oauthlib==1.2.2
155
+ google-cloud-aiplatform==1.91.0
156
+ google-cloud-bigquery==3.31.0
157
+ google-cloud-bigquery-connection==1.18.2
158
+ google-cloud-bigquery-storage==2.31.0
159
+ google-cloud-bigtable==2.30.1
160
+ google-cloud-core==2.4.3
161
+ google-cloud-dataproc==5.18.1
162
+ google-cloud-datastore==2.21.0
163
+ google-cloud-firestore==2.20.2
164
+ google-cloud-functions==1.20.3
165
+ google-cloud-iam==2.19.0
166
+ google-cloud-language==2.17.1
167
+ google-cloud-pubsub==2.25.0
168
+ google-cloud-resource-manager==1.14.2
169
+ google-cloud-spanner==3.54.0
170
+ google-cloud-storage==2.19.0
171
+ google-cloud-translate==3.20.2
172
+ google-colab @ file:///colabtools/dist/google_colab-1.0.0.tar.gz
173
+ google-crc32c==1.7.1
174
+ google-genai==1.13.0
175
+ google-generativeai==0.8.5
176
+ google-pasta==0.2.0
177
+ google-resumable-media==2.7.2
178
+ googleapis-common-protos==1.70.0
179
+ googledrivedownloader==1.1.0
180
+ gradio==5.29.0
181
+ gradio_client==1.10.0
182
+ graphviz==0.20.3
183
+ greenlet==3.2.1
184
+ groovy==0.1.2
185
+ grpc-google-iam-v1==0.14.2
186
+ grpc-interceptor==0.15.4
187
+ grpcio==1.71.0
188
+ grpcio-status==1.71.0
189
+ grpclib==0.4.7
190
+ gspread==6.2.0
191
+ gspread-dataframe==4.0.0
192
+ gym==0.25.2
193
+ gym-notices==0.0.8
194
+ gymnasium==1.1.1
195
+ h11==0.16.0
196
+ h2==4.2.0
197
+ h5netcdf==1.6.1
198
+ h5py==3.13.0
199
+ hdbscan==0.8.40
200
+ highspy==1.10.0
201
+ holidays==0.71
202
+ holoviews==1.20.2
203
+ hpack==4.1.0
204
+ html5lib==1.1
205
+ httpcore==1.0.9
206
+ httpimport==1.4.1
207
+ httplib2==0.22.0
208
+ httpx==0.28.1
209
+ huggingface-hub==0.30.2
210
+ humanize==4.12.3
211
+ hyperframe==6.1.0
212
+ hyperopt==0.2.7
213
+ ibis-framework==9.5.0
214
+ idna==3.10
215
+ imageio==2.37.0
216
+ imageio-ffmpeg==0.6.0
217
+ imagesize==1.4.1
218
+ imbalanced-learn==0.13.0
219
+ immutabledict==4.2.1
220
+ importlib_metadata==8.7.0
221
+ importlib_resources==6.5.2
222
+ imutils==0.5.4
223
+ inflect==7.5.0
224
+ iniconfig==2.1.0
225
+ intel-cmplr-lib-ur==2025.1.1
226
+ intel-openmp==2025.1.1
227
+ ipyevents==2.0.2
228
+ ipyfilechooser==0.6.0
229
+ ipykernel==6.17.1
230
+ ipyleaflet==0.19.2
231
+ ipyparallel==8.8.0
232
+ ipython==7.34.0
233
+ ipython-genutils==0.2.0
234
+ ipython-sql==0.5.0
235
+ ipytree==0.2.2
236
+ ipywidgets==7.7.1
237
+ itsdangerous==2.2.0
238
+ jaraco.classes==3.4.0
239
+ jaraco.context==6.0.1
240
+ jaraco.functools==4.1.0
241
+ jax==0.5.2
242
+ jax-cuda12-pjrt==0.5.1
243
+ jax-cuda12-plugin==0.5.1
244
+ jaxlib==0.5.1
245
+ jeepney==0.9.0
246
+ jieba==0.42.1
247
+ Jinja2==3.1.6
248
+ jiter==0.9.0
249
+ joblib==1.4.2
250
+ jsonpatch==1.33
251
+ jsonpickle==4.0.5
252
+ jsonpointer==3.0.0
253
+ jsonschema==4.23.0
254
+ jsonschema-specifications==2025.4.1
255
+ jupyter-client==6.1.12
256
+ jupyter-console==6.1.0
257
+ jupyter-leaflet==0.19.2
258
+ jupyter-server==1.16.0
259
+ jupyter_core==5.7.2
260
+ jupyter_kernel_gateway @ git+https://github.com/googlecolab/kernel_gateway@b134e9945df25c2dcb98ade9129399be10788671
261
+ jupyterlab_pygments==0.3.0
262
+ jupyterlab_widgets==3.0.14
263
+ kaggle==1.7.4.2
264
+ kagglehub==0.3.12
265
+ keras==3.8.0
266
+ keras-hub==0.18.1
267
+ keras-nlp==0.18.1
268
+ keyring==25.6.0
269
+ keyrings.google-artifactregistry-auth==1.1.2
270
+ kiwisolver==1.4.8
271
+ langchain==0.3.24
272
+ langchain-core==0.3.56
273
+ langchain-text-splitters==0.3.8
274
+ langcodes==3.5.0
275
+ langsmith==0.3.39
276
+ language_data==1.3.0
277
+ launchpadlib==1.10.16
278
+ lazr.restfulclient==0.14.4
279
+ lazr.uri==1.0.6
280
+ lazy_loader==0.4
281
+ libclang==18.1.1
282
+ libcudf-cu12 @ https://pypi.nvidia.com/libcudf-cu12/libcudf_cu12-25.2.1-py3-none-manylinux_2_28_x86_64.whl
283
+ libcugraph-cu12==25.2.0
284
+ libcuml-cu12==25.2.1
285
+ libcuvs-cu12==25.2.1
286
+ libkvikio-cu12==25.2.1
287
+ libraft-cu12==25.2.0
288
+ librosa==0.11.0
289
+ libucx-cu12==1.18.1
290
+ libucxx-cu12==0.42.0
291
+ lightgbm @ file:///tmp/lightgbm/LightGBM/dist/lightgbm-4.5.0-py3-none-linux_x86_64.whl
292
+ linkify-it-py==2.0.3
293
+ llvmlite==0.43.0
294
+ locket==1.0.0
295
+ logical-unification==0.4.6
296
+ lxml==5.4.0
297
+ Mako==1.1.3
298
+ marisa-trie==1.2.1
299
+ Markdown==3.8
300
+ markdown-it-py==3.0.0
301
+ MarkupSafe==3.0.2
302
+ matplotlib==3.10.0
303
+ matplotlib-inline==0.1.7
304
+ matplotlib-venn==1.1.2
305
+ mdit-py-plugins==0.4.2
306
+ mdurl==0.1.2
307
+ miniKanren==1.0.3
308
+ missingno==0.5.2
309
+ mistune==3.1.3
310
+ mizani==0.13.3
311
+ mkl==2025.0.1
312
+ ml-dtypes==0.4.1
313
+ mlxtend==0.23.4
314
+ more-itertools==10.7.0
315
+ moviepy==1.0.3
316
+ mpmath==1.3.0
317
+ msgpack==1.1.0
318
+ multidict==6.4.3
319
+ multipledispatch==1.0.0
320
+ multitasking==0.0.11
321
+ murmurhash==1.0.12
322
+ music21==9.3.0
323
+ namex==0.0.9
324
+ narwhals==1.37.1
325
+ natsort==8.4.0
326
+ nbclassic==1.3.0
327
+ nbclient==0.10.2
328
+ nbconvert==7.16.6
329
+ nbformat==5.10.4
330
+ ndindex==1.9.2
331
+ nest-asyncio==1.6.0
332
+ networkx==3.4.2
333
+ nibabel==5.3.2
334
+ nltk==3.9.1
335
+ notebook==6.5.7
336
+ notebook_shim==0.2.4
337
+ numba==0.60.0
338
+ numba-cuda==0.2.0
339
+ numexpr==2.10.2
340
+ numpy==1.25.2
341
+ nvidia-cublas-cu12==12.4.5.8
342
+ nvidia-cuda-cupti-cu12==12.4.127
343
+ nvidia-cuda-nvcc-cu12==12.5.82
344
+ nvidia-cuda-nvrtc-cu12==12.4.127
345
+ nvidia-cuda-runtime-cu12==12.4.127
346
+ nvidia-cudnn-cu12==9.1.0.70
347
+ nvidia-cufft-cu12==11.2.1.3
348
+ nvidia-curand-cu12==10.3.5.147
349
+ nvidia-cusolver-cu12==11.6.1.9
350
+ nvidia-cusparse-cu12==12.3.1.170
351
+ nvidia-cusparselt-cu12==0.6.2
352
+ nvidia-ml-py==12.570.86
353
+ nvidia-nccl-cu12==2.21.5
354
+ nvidia-nvcomp-cu12==4.2.0.11
355
+ nvidia-nvjitlink-cu12==12.4.127
356
+ nvidia-nvtx-cu12==12.4.127
357
+ nvtx==0.2.11
358
+ nx-cugraph-cu12 @ https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-25.2.0-py3-none-any.whl
359
+ oauth2client==4.1.3
360
+ oauthlib==3.2.2
361
+ openai==1.76.2
362
+ opencv-contrib-python==4.11.0.86
363
+ opencv-python==4.11.0.86
364
+ opencv-python-headless==4.11.0.86
365
+ openpyxl==3.1.5
366
+ opentelemetry-api==1.16.0
367
+ opentelemetry-sdk==1.16.0
368
+ opentelemetry-semantic-conventions==0.37b0
369
+ opt_einsum==3.4.0
370
+ optax==0.2.4
371
+ optree==0.15.0
372
+ orbax-checkpoint==0.11.13
373
+ orjson==3.10.18
374
+ osqp==1.0.3
375
+ packaging==24.2
376
+ pandas==2.2.2
377
+ pandas-datareader==0.10.0
378
+ pandas-gbq==0.28.0
379
+ pandas-stubs==2.2.2.240909
380
+ pandocfilters==1.5.1
381
+ panel==1.6.3
382
+ param==2.2.0
383
+ parso==0.8.4
384
+ parsy==2.1
385
+ partd==1.4.2
386
+ pathlib==1.0.1
387
+ patsy==1.0.1
388
+ pdfreader==0.1.15
389
+ peewee==3.18.1
390
+ peft==0.15.2
391
+ pexpect==4.9.0
392
+ pickleshare==0.7.5
393
+ pillow==11.2.1
394
+ platformdirs==4.3.7
395
+ plotly==5.24.1
396
+ plotnine==0.14.5
397
+ pluggy==1.5.0
398
+ plum-dispatch==1.7.4
399
+ ply==3.11
400
+ polars==1.21.0
401
+ pooch==1.8.2
402
+ portpicker==1.5.2
403
+ preshed==3.0.9
404
+ prettytable==3.16.0
405
+ proglog==0.1.11
406
+ progressbar2==4.5.0
407
+ prometheus_client==0.21.1
408
+ promise==2.3
409
+ prompt_toolkit==3.0.51
410
+ propcache==0.3.1
411
+ prophet==1.1.6
412
+ proto-plus==1.26.1
413
+ protobuf==5.29.4
414
+ psutil==5.9.5
415
+ psycopg2==2.9.10
416
+ ptyprocess==0.7.0
417
+ py-cpuinfo==9.0.0
418
+ py4j==0.10.9.7
419
+ pyarrow==18.1.0
420
+ pyasn1==0.6.1
421
+ pyasn1_modules==0.4.2
422
+ pycairo==1.28.0
423
+ pycocotools==2.0.8
424
+ pycparser==2.22
425
+ pycryptodome==3.22.0
426
+ pydantic==2.11.4
427
+ pydantic_core==2.33.2
428
+ pydata-google-auth==1.9.1
429
+ pydot==3.0.4
430
+ pydotplus==2.0.2
431
+ PyDrive==1.3.1
432
+ PyDrive2==1.21.3
433
+ pydub==0.25.1
434
+ pyerfa==2.0.1.5
435
+ pygame==2.6.1
436
+ pygit2==1.18.0
437
+ Pygments==2.19.1
438
+ PyGObject==3.42.0
439
+ PyJWT==2.10.1
440
+ pylibcudf-cu12 @ https://pypi.nvidia.com/pylibcudf-cu12/pylibcudf_cu12-25.2.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
441
+ pylibcugraph-cu12==25.2.0
442
+ pylibraft-cu12==25.2.0
443
+ pymc==5.22.0
444
+ PyMuPDF==1.25.5
445
+ pymystem3==0.2.0
446
+ pynndescent==0.5.13
447
+ pynvjitlink-cu12==0.5.2
448
+ pynvml==12.0.0
449
+ pyogrio==0.10.0
450
+ pyomo==6.9.2
451
+ PyOpenGL==3.1.9
452
+ pyOpenSSL==24.2.1
453
+ pyparsing==3.2.3
454
+ pyperclip==1.9.0
455
+ pyproj==3.7.1
456
+ pyproject_hooks==1.2.0
457
+ pyshp==2.3.1
458
+ PySocks==1.7.1
459
+ pyspark==3.5.1
460
+ pytensor==2.30.3
461
+ pytest==8.3.5
462
+ python-apt==0.0.0
463
+ python-box==7.3.2
464
+ python-dateutil==2.9.0.post0
465
+ python-louvain==0.16
466
+ python-multipart==0.0.20
467
+ python-slugify==8.0.4
468
+ python-snappy==0.7.3
469
+ python-utils==3.9.1
470
+ pytz==2025.2
471
+ pyviz_comms==3.0.4
472
+ PyYAML==6.0.2
473
+ pyzmq==24.0.1
474
+ raft-dask-cu12==25.2.0
475
+ RapidFuzz==3.13.0
476
+ rapids-dask-dependency==25.2.0
477
+ ratelim==0.1.6
478
+ referencing==0.36.2
479
+ regex==2024.11.6
480
+ requests==2.32.3
481
+ requests-oauthlib==2.0.0
482
+ requests-toolbelt==1.0.0
483
+ requirements-parser==0.9.0
484
+ rich==13.9.4
485
+ rmm-cu12==25.2.0
486
+ roman-numerals-py==3.1.0
487
+ rpds-py==0.24.0
488
+ rpy2==3.5.17
489
+ rsa==4.9.1
490
+ ruff==0.11.9
491
+ safehttpx==0.1.6
492
+ safetensors==0.5.3
493
+ scikit-image==0.25.2
494
+ scikit-learn==1.6.1
495
+ scipy==1.13.1
496
+ scooby==0.10.1
497
+ scs==3.2.7.post2
498
+ seaborn==0.13.2
499
+ SecretStorage==3.3.3
500
+ semantic-version==2.10.0
501
+ Send2Trash==1.8.3
502
+ sentence-transformers==3.4.1
503
+ sentencepiece==0.2.0
504
+ sentry-sdk==2.27.0
505
+ setproctitle==1.3.6
506
+ shap==0.47.2
507
+ shapely==2.1.0
508
+ shellingham==1.5.4
509
+ simple-parsing==0.1.7
510
+ simplejson==3.20.1
511
+ simsimd==6.2.1
512
+ six==1.17.0
513
+ sklearn-compat==0.1.3
514
+ sklearn-pandas==2.2.0
515
+ slicer==0.0.8
516
+ smart-open==7.1.0
517
+ smmap==5.0.2
518
+ sniffio==1.3.1
519
+ snowballstemmer==2.2.0
520
+ sortedcontainers==2.4.0
521
+ soundfile==0.13.1
522
+ soupsieve==2.7
523
+ soxr==0.5.0.post1
524
+ spacy==3.8.5
525
+ spacy-legacy==3.0.12
526
+ spacy-loggers==1.0.5
527
+ spacy-lookups-data==1.0.5
528
+ spanner-graph-notebook==1.1.6
529
+ Sphinx==8.2.3
530
+ sphinxcontrib-applehelp==2.0.0
531
+ sphinxcontrib-devhelp==2.0.0
532
+ sphinxcontrib-htmlhelp==2.1.0
533
+ sphinxcontrib-jsmath==1.0.1
534
+ sphinxcontrib-qthelp==2.0.0
535
+ sphinxcontrib-serializinghtml==2.0.0
536
+ spire-doc==13.4.6
537
+ Spire.Xls==14.12.0
538
+ SQLAlchemy==2.0.40
539
+ sqlglot==25.20.2
540
+ sqlparse==0.5.3
541
+ srsly==2.5.1
542
+ stanio==0.5.1
543
+ starlette==0.46.2
544
+ statsmodels==0.14.4
545
+ stringzilla==3.12.5
546
+ sympy==1.13.1
547
+ tables==3.10.2
548
+ tabula-py==2.10.0
549
+ tabulate==0.9.0
550
+ tbb==2022.1.0
551
+ tblib==3.1.0
552
+ tcmlib==1.3.0
553
+ tenacity==9.1.2
554
+ tensorboard==2.18.0
555
+ tensorboard-data-server==0.7.2
556
+ tensorflow==2.18.0
557
+ tensorflow-datasets==4.9.8
558
+ tensorflow-hub==0.16.1
559
+ tensorflow-io-gcs-filesystem==0.37.1
560
+ tensorflow-metadata==1.17.1
561
+ tensorflow-probability==0.25.0
562
+ tensorflow-text==2.18.1
563
+ tensorflow_decision_forests==1.11.0
564
+ tensorstore==0.1.74
565
+ termcolor==3.1.0
566
+ terminado==0.18.1
567
+ text-unidecode==1.3
568
+ textblob==0.19.0
569
+ tf-slim==1.1.0
570
+ tf_keras==2.18.0
571
+ thefuzz==0.22.1
572
+ thinc==8.3.4
573
+ threadpoolctl==3.6.0
574
+ tifffile==2025.3.30
575
+ timm==1.0.15
576
+ tinycss2==1.4.0
577
+ tokenizers==0.21.1
578
+ toml==0.10.2
579
+ tomlkit==0.13.2
580
+ toolz==0.12.1
581
+ torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl
582
+ torchaudio @ https://download.pytorch.org/whl/cu124/torchaudio-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl
583
+ torchsummary==1.5.1
584
+ torchvision @ https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp311-cp311-linux_x86_64.whl
585
+ tornado==6.4.2
586
+ tqdm==4.67.1
587
+ traitlets==5.7.1
588
+ traittypes==0.2.1
589
+ transformers==4.51.3
590
+ treelite==4.4.1
591
+ treescope==0.1.9
592
+ triton==3.2.0
593
+ tweepy==4.15.0
594
+ typeguard==4.4.2
595
+ typer==0.15.3
596
+ types-pytz==2025.2.0.20250326
597
+ types-setuptools==80.3.0.20250505
598
+ typing-inspection==0.4.0
599
+ typing_extensions==4.13.2
600
+ tzdata==2025.2
601
+ tzlocal==5.3.1
602
+ uc-micro-py==1.0.3
603
+ ucx-py-cu12==0.42.0
604
+ ucxx-cu12==0.42.0
605
+ umap-learn==0.5.7
606
+ umf==0.10.0
607
+ uritemplate==4.1.1
608
+ urllib3==2.4.0
609
+ uvicorn==0.34.2
610
+ vega-datasets==0.9.0
611
+ wadllib==1.3.6
612
+ wandb==0.19.10
613
+ wasabi==1.1.3
614
+ wcwidth==0.2.13
615
+ weasel==0.4.1
616
+ webcolors==24.11.1
617
+ webencodings==0.5.1
618
+ websocket-client==1.8.0
619
+ websockets==15.0.1
620
+ Werkzeug==3.1.3
621
+ widgetsnbextension==3.6.10
622
+ wordcloud==1.9.4
623
+ wordsegment==1.3.1
624
+ wrapt==1.17.2
625
+ wurlitzer==3.1.1
626
+ xarray==2025.3.1
627
+ xarray-einstats==0.8.0
628
+ xgboost==2.1.4
629
+ xlrd==2.0.1
630
+ xyzservices==2025.4.0
631
+ yarl==1.20.0
632
+ ydf==0.11.0
633
+ yellowbrick==1.5
634
+ yfinance==0.2.57
635
+ zict==3.0.0
636
+ zipp==3.21.0
637
+ zstandard==0.23.0
mtdna_backend.py ADDED
@@ -0,0 +1,252 @@
1
+ import gradio as gr
2
+ from collections import Counter
3
+ import csv
4
+ import os
5
+ from functools import lru_cache
6
+ from mtdna_classifier import classify_sample_location
7
+ import subprocess
8
+ import json
9
+ import pandas as pd
10
+ import io
11
+ import re
12
+ import tempfile
13
+ import gspread
14
+ from oauth2client.service_account import ServiceAccountCredentials
15
+ from io import StringIO
16
+
17
+ @lru_cache(maxsize=128)
18
+ def classify_sample_location_cached(accession):
19
+ return classify_sample_location(accession)
20
+
21
+ # Count and suggest final location
22
+ def compute_final_suggested_location(rows):
23
+ candidates = [
24
+ row.get("Predicted Location", "").strip()
25
+ for row in rows
26
+ if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found", "unknown"]
27
+ ] + [
28
+ row.get("Inferred Region", "").strip()
29
+ for row in rows
30
+ if row.get("Inferred Region", "").strip().lower() not in ["", "sample id not found", "unknown"]
31
+ ]
32
+
33
+ if not candidates:
34
+ return Counter(), ("Unknown", 0)
35
+ # Step 1: Combine into one string and split using regex to handle commas, line breaks, etc.
36
+ tokens = []
37
+ for item in candidates:
38
+ # Split by comma, whitespace, and newlines
39
+ parts = re.split(r'[\s,]+', item)
40
+ tokens.extend(parts)
41
+
42
+ # Step 2: Clean and normalize tokens
43
+ tokens = [word.strip() for word in tokens if word.strip().isalpha()] # Keep only alphabetic tokens
44
+
45
+ # Step 3: Count
46
+ counts = Counter(tokens)
47
+
48
+ # Step 4: Get most common
49
+ top_location, count = counts.most_common(1)[0]
50
+ return counts, (top_location, count)
51
+
52
+ # Store feedback (with required fields)
53
+
54
+ def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
55
+ if not answer1.strip() or not answer2.strip():
56
+ return "⚠️ Please answer both questions before submitting."
57
+
58
+ try:
59
+ # ✅ Step: Load credentials from Hugging Face secret
60
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
61
+ scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
62
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
63
+
64
+ # Connect to Google Sheet
65
+ client = gspread.authorize(creds)
66
+ sheet = client.open("feedback_mtdna").sheet1 # make sure sheet name matches
67
+
68
+ # Append feedback
69
+ sheet.append_row([accession, answer1, answer2, contact])
70
+ return "✅ Feedback submitted. Thank you!"
71
+
72
+ except Exception as e:
73
+ return f"❌ Error submitting feedback: {e}"
74
+
75
+ # helper function to extract accessions
76
+ def extract_accessions_from_input(file=None, raw_text=""):
77
+ print(f"RAW TEXT RECEIVED: {raw_text}")
78
+ accessions = []
79
+ seen = set()
80
+ if file:
81
+ try:
82
+ if file.name.endswith(".csv"):
83
+ df = pd.read_csv(file)
84
+ elif file.name.endswith(".xlsx"):
85
+ df = pd.read_excel(file)
86
+ else:
87
+ return [], "Unsupported file format. Please upload CSV or Excel."
88
+ for acc in df.iloc[:, 0].dropna().astype(str).str.strip():
89
+ if acc not in seen:
90
+ accessions.append(acc)
91
+ seen.add(acc)
92
+ except Exception as e:
93
+ return [], f"Failed to read file: {e}"
94
+
95
+ if raw_text:
96
+ text_ids = [s.strip() for s in re.split(r"[\n,;\t]", raw_text) if s.strip()]
97
+ for acc in text_ids:
98
+ if acc not in seen:
99
+ accessions.append(acc)
100
+ seen.add(acc)
101
+
102
+ return list(accessions), None
103
+
104
+ def summarize_results(accession):
105
+ try:
106
+ output, labelAncient_Modern, explain_label = classify_sample_location_cached(accession)
107
+ #print(output)
108
+ except Exception as e:
109
+ return [], f"Error: {e}", f"Error: {e}", f"Error: {e}"
110
+
111
+ if accession not in output:
112
+ return [], "Accession not found in results.", "Accession not found in results.", "Accession not found in results."
113
+
114
+ isolate = next((k for k in output if k != accession), None)
115
+ row_score = []
116
+ rows = []
117
+
118
+ for key in [accession, isolate]:
119
+ if key not in output:
120
+ continue
121
+ sample_id_label = f"{key} ({'accession number' if key == accession else 'isolate of accession'})"
122
+ for section, techniques in output[key].items():
123
+ for technique, content in techniques.items():
124
+ source = content.get("source", "")
125
+ predicted = content.get("predicted_location", "")
126
+ haplogroup = content.get("haplogroup", "")
127
+ inferred = content.get("inferred_location", "")
128
+ context = content.get("context_snippet", "")[:300] if "context_snippet" in content else ""
129
+
130
+ row = {
131
+ "Sample ID": sample_id_label,
132
+ "Technique": technique,
133
+ "Source": f"The region of haplogroup is inferred\nby using this source: {source}" if technique == "haplogroup" else source,
134
+ "Predicted Location": "" if technique == "haplogroup" else predicted,
135
+ "Haplogroup": haplogroup if technique == "haplogroup" else "",
136
+ "Inferred Region": inferred if technique == "haplogroup" else "",
137
+ "Context Snippet": context
138
+ }
139
+
140
+ row_score.append(row)
141
+ rows.append(list(row.values()))
142
+
143
+ location_counts, (final_location, count) = compute_final_suggested_location(row_score)
144
+ summary_lines = [f"### 🧭 Location Frequency Summary", "After counting all predicted and inferred locations:\n"]
145
+ summary_lines += [f"- **{loc}**: {cnt} times" for loc, cnt in location_counts.items()]
146
+ summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
147
+ summary = "\n".join(summary_lines)
148
+ return rows, summary, labelAncient_Modern, explain_label
149
+
150
+ # save the batch input in excel file
151
+ def save_to_excel(all_rows, summary_text, flag_text, filename):
152
+ with pd.ExcelWriter(filename) as writer:
153
+ # Save table
154
+ df = pd.DataFrame(all_rows, columns=["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"])
155
+ df.to_excel(writer, sheet_name="Detailed Results", index=False)
156
+
157
+ # Save summary
158
+ summary_df = pd.DataFrame({"Summary": [summary_text]})
159
+ summary_df.to_excel(writer, sheet_name="Summary", index=False)
160
+
161
+ # Save flag
162
+ flag_df = pd.DataFrame({"Flag": [flag_text]})
163
+ flag_df.to_excel(writer, sheet_name="Ancient_Modern_Flag", index=False)
164
+
165
+ # save the batch input in JSON file
166
+ def save_to_json(all_rows, summary_text, flag_text, filename):
167
+ output_dict = {
168
+ "Detailed_Results": all_rows, # <-- make sure this is a plain list, not a DataFrame
169
+ "Summary_Text": summary_text,
170
+ "Ancient_Modern_Flag": flag_text
171
+ }
172
+
173
+ # If all_rows is a DataFrame, convert it
174
+ if isinstance(all_rows, pd.DataFrame):
175
+ output_dict["Detailed_Results"] = all_rows.to_dict(orient="records")
176
+
177
+ with open(filename, "w") as external_file:
178
+ json.dump(output_dict, external_file, indent=2)
179
+
180
+ # save the batch input in Text file
181
+ def save_to_txt(all_rows, summary_text, flag_text, filename):
182
+ if isinstance(all_rows, pd.DataFrame):
183
+ detailed_results = all_rows.to_dict(orient="records")
184
+ output = ""
185
+ output += ",".join(list(detailed_results[0].keys())) + "\n\n"
186
+ for r in detailed_results:
187
+ output += ",".join([str(v) for v in r.values()]) + "\n\n"
188
+ with open(filename, "w") as f:
189
+ f.write("=== Detailed Results ===\n")
190
+ f.write(output + "\n")
191
+
192
+ f.write("\n=== Summary ===\n")
193
+ f.write(summary_text + "\n")
194
+
195
+ f.write("\n=== Ancient/Modern Flag ===\n")
196
+ f.write(flag_text + "\n")
197
+
198
+ def save_batch_output(all_rows, summary_text, flag_text, output_type):
199
+ tmp_dir = tempfile.mkdtemp()
200
+
201
+ #html_table = all_rows.value # assuming this is stored somewhere
202
+
203
+ # Parse back to DataFrame
204
+ #all_rows = pd.read_html(all_rows)[0] # [0] because read_html returns a list
205
+ all_rows = pd.read_html(StringIO(all_rows))[0]
206
+ print(all_rows)
207
+
208
+ if output_type == "Excel":
209
+ file_path = f"{tmp_dir}/batch_output.xlsx"
210
+ save_to_excel(all_rows, summary_text, flag_text, file_path)
211
+ elif output_type == "JSON":
212
+ file_path = f"{tmp_dir}/batch_output.json"
213
+ save_to_json(all_rows, summary_text, flag_text, file_path)
214
+ print("Done with JSON")
215
+ elif output_type == "TXT":
216
+ file_path = f"{tmp_dir}/batch_output.txt"
217
+ save_to_txt(all_rows, summary_text, flag_text, file_path)
218
+ else:
219
+ return gr.update(visible=False) # invalid option
220
+
221
+ return gr.update(value=file_path, visible=True)
222
+
223
+ # run the batch
224
+ def summarize_batch(file=None, raw_text=""):
225
+ accessions, error = extract_accessions_from_input(file, raw_text)
226
+ if error:
227
+ return [], f"Error: {error}", "", gr.update(visible=True), gr.update(visible=False)
228
+
229
+ all_rows = []
230
+ all_summaries = []
231
+ all_flags = []
232
+
233
+ for acc in accessions:
234
+ try:
235
+ rows, summary, label, explain = summarize_results(acc)
236
+ all_rows.extend(rows)
237
+ all_summaries.append(f"**{acc}**\n{summary}")
238
+ all_flags.append(f"**{acc}**\n### 🏺 Ancient/Modern Flag\n**{label}**\n\n_Explanation:_ {explain}")
239
+ except Exception as e:
240
+ all_summaries.append(f"**{acc}**: Failed - {e}")
241
+
242
+ """for row in all_rows:
243
+ source_column = row[2] # Assuming the "Source" is in the 3rd column (index 2)
244
+
245
+ if source_column.startswith("http"): # Check if the source is a URL
246
+ # Wrap it with HTML anchor tags to make it clickable
247
+ row[2] = f'<a href="{source_column}" target="_blank" style="color: blue; text-decoration: underline;">{source_column}</a>'"""
248
+
249
+
250
+ summary_text = "\n\n---\n\n".join(all_summaries)
251
+ flag_text = "\n\n---\n\n".join(all_flags)
252
+ return all_rows, summary_text, flag_text, gr.update(visible=True), gr.update(visible=False)
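A minimal usage sketch (not part of this commit) of the backend added above, assuming mtdna_backend.py is importable and the Google-Sheets feedback path is not exercised, so no GCP_CREDS_JSON secret is needed:

from mtdna_backend import extract_accessions_from_input, summarize_results

# Parse a comma/newline-separated accession list, deduplicating while preserving order
accessions, error = extract_accessions_from_input(raw_text="KU131308, MW291678")
if error:
    print(error)
else:
    for acc in accessions:
        # rows feed the results table; summary/label/explain feed the markdown panels
        rows, summary, label, explain = summarize_results(acc)
        print(acc, label)
        print(summary)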
mtdna_classifier.py CHANGED
@@ -1,322 +1,519 @@
1
- # mtDNA Location Classifier MVP (Google Colab)
2
- # Accepts accession number → Fetches PubMed ID + isolate name → Gets abstract → Predicts location
3
- import os
4
- import subprocess
5
- import re
6
- from Bio import Entrez
7
- import fitz
8
- import spacy
9
- from spacy.cli import download
10
- from NER.PDF import pdf
11
- from NER.WordDoc import wordDoc
12
- from NER.html import extractHTML
13
- from NER.word2Vec import word2vec
14
- from transformers import pipeline
15
- # Set your email (required by NCBI Entrez)
16
- #Entrez.email = "[email protected]"
17
- import nltk
18
-
19
- nltk.download("stopwords")
20
- nltk.download("punkt")
21
- nltk.download('punkt_tab')
22
- # Step 1: Get PubMed ID from Accession using EDirect
23
-
24
- '''def get_info_from_accession(accession):
25
- cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
26
- result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
27
- output = result.stdout
28
- pubmedID, isolate = "", ""
29
- for line in output.split("\n"):
30
- if len(line) > 0:
31
- if "PUBMED" in line:
32
- pubmedID = line.split()[-1]
33
- if "isolate" in line: # Check for isolate information
34
- # Try direct GenBank annotation: /isolate="XXX"
35
- match1 = re.search(r'/isolate\s*=\s*"([^"]+)"', line) # search on current line
36
- if match1:
37
- isolate = match1.group(1)
38
- else:
39
- # Try from DEFINITION line: ...isolate XXX...
40
- match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on current line
41
- if match2:
42
- isolate = match2.group(1)'''
43
- from Bio import Entrez, Medline
44
- import re
45
-
46
- Entrez.email = "[email protected]"
47
-
48
- def get_info_from_accession(accession):
49
- try:
50
- handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
51
- text = handle.read()
52
- handle.close()
53
-
54
- # Extract PUBMED ID from the Medline text
55
- pubmed_match = re.search(r'PUBMED\s+(\d+)', text)
56
- pubmed_id = pubmed_match.group(1) if pubmed_match else ""
57
-
58
- # Extract isolate if available
59
- isolate_match = re.search(r'/isolate="([^"]+)"', text)
60
- if not isolate_match:
61
- isolate_match = re.search(r'isolate\s+([A-Za-z0-9_-]+)', text)
62
- isolate = isolate_match.group(1) if isolate_match else ""
63
-
64
- if not pubmed_id:
65
- print(f"⚠️ No PubMed ID found for accession {accession}")
66
-
67
- return pubmed_id, isolate
68
-
69
- except Exception as e:
70
- print("❌ Entrez error:", e)
71
- return "", ""
72
- # Step 2: Get doi link to access the paper
73
- '''def get_doi_from_pubmed_id(pubmed_id):
74
- cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {pubmed_id} -format medline | grep -i "AID"'
75
- result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
76
- output = result.stdout
77
-
78
- doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+(?=\s*\[doi\])'
79
- match = re.search(doi_pattern, output, re.IGNORECASE)
80
-
81
- if match:
82
- return match.group(0)
83
- else:
84
- return None # or raise an Exception with a helpful message'''
85
-
86
- def get_doi_from_pubmed_id(pubmed_id):
87
- try:
88
- handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="medline", retmode="text")
89
- records = list(Medline.parse(handle))
90
- handle.close()
91
-
92
- if not records:
93
- return None
94
-
95
- record = records[0]
96
- if "AID" in record:
97
- for aid in record["AID"]:
98
- if "[doi]" in aid:
99
- return aid.split(" ")[0] # extract the DOI
100
-
101
- return None
102
-
103
- except Exception as e:
104
- print(f"❌ Failed to get DOI from PubMed ID {pubmed_id}: {e}")
105
- return None
106
-
107
-
108
- # Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing
109
- # Step 3.1: Extract Text
110
- def get_paper_text(doi,id):
111
- # create the temporary folder to contain the texts
112
- cmd = f'mkdir data/{id}'
113
- result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
114
- saveLinkFolder = "data/"+id
115
-
116
- link = 'https://doi.org/' + doi
117
- '''textsToExtract = { "doiLink":"paperText"
118
- "file1.pdf":"text1",
119
- "file2.doc":"text2",
120
- "file3.xlsx":excelText3'''
121
- textsToExtract = {}
122
- # get the file to create listOfFile for each id
123
- html = extractHTML.HTML("",link)
124
- jsonSM = html.getSupMaterial()
125
- text = ""
126
- links = [link] + sum((jsonSM[key] for key in jsonSM),[])
127
- #print(links)
128
- for l in links:
129
- # get the main paper
130
- if l == link:
131
- text = html.getListSection()
132
- textsToExtract[link] = text
133
- elif l.endswith(".pdf"):
134
- p = pdf.PDF(l,saveLinkFolder,doi)
135
- f = p.openPDFFile()
136
- pdf_path = saveLinkFolder + "/" + l.split("/")[-1]
137
- doc = fitz.open(pdf_path)
138
- text = "\n".join([page.get_text() for page in doc])
139
- textsToExtract[l] = text
140
- elif l.endswith(".doc") or l.endswith(".docx"):
141
- d = wordDoc.wordDoc(l,saveLinkFolder)
142
- text = d.extractTextByPage()
143
- textsToExtract[l] = text
144
- elif l.split(".")[-1].lower() in "xlsx":
145
- wc = word2vec.word2Vec()
146
- corpus = wc.tableTransformToCorpusText([],l)
147
- text = ''
148
- for c in corpus:
149
- para = corpus[c]
150
- for words in para:
151
- text += " ".join(words)
152
- textsToExtract[l] = text
153
- # delete folder after finishing getting text
154
- cmd = f'rm -r data/{id}'
155
- result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
156
- return textsToExtract
157
- # Step 3.2: Extract context
158
- def extract_context(text, keyword, window=500):
159
- idx = text.find(keyword)
160
- if idx == -1:
161
- return "Sample ID not found."
162
- return text[max(0, idx-window): idx+window]
163
- # Step 4: Classification for now (demo purposes)
164
- # 4.1: Using a HuggingFace model (question-answering)
165
- def infer_location_fromQAModel(context, question="Where is the mtDNA sample from?"):
166
- try:
167
- qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
168
- result = qa({"context": context, "question": question})
169
- return result.get("answer", "Unknown")
170
- except Exception as e:
171
- return f"Error: {str(e)}"
172
-
173
- # 4.2: Infer from haplogroup
174
- # Load pre-trained spaCy model for NER
175
- try:
176
- nlp = spacy.load("en_core_web_sm")
177
- except OSError:
178
- download("en_core_web_sm")
179
- nlp = spacy.load("en_core_web_sm")
180
-
181
- nlp = spacy.load("en_core_web_sm")
182
- # Define the haplogroup-to-region mapping (simple rule-based)
183
- import csv
184
-
185
- def load_haplogroup_mapping(csv_path):
186
- mapping = {}
187
- with open(csv_path) as f:
188
- reader = csv.DictReader(f)
189
- for row in reader:
190
- mapping[row["haplogroup"]] = [row["region"],row["source"]]
191
- return mapping
192
-
193
- # Function to extract haplogroup from the text
194
- def extract_haplogroup(text):
195
- match = re.search(r'\bhaplogroup\s+([A-Z][0-9a-z]*)\b', text)
196
- if match:
197
- submatch = re.match(r'^[A-Z][0-9]*', match.group(1))
198
- if submatch:
199
- return submatch.group(0)
200
- else:
201
- return match.group(1) # fallback
202
- fallback = re.search(r'\b([A-Z][0-9a-z]{1,5})\b', text)
203
- if fallback:
204
- return fallback.group(1)
205
- return None
206
-
207
-
208
- # Function to extract location based on NER
209
- def extract_location(text):
210
- doc = nlp(text)
211
- locations = []
212
- for ent in doc.ents:
213
- if ent.label_ == "GPE": # GPE = Geopolitical Entity (location)
214
- locations.append(ent.text)
215
- return locations
216
-
217
- # Function to infer location from haplogroup
218
- def infer_location_from_haplogroup(haplogroup):
219
- haplo_map = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
220
- return haplo_map.get(haplogroup, ["Unknown","Unknown"])
221
-
222
- # Function to classify the mtDNA sample
223
- def classify_mtDNA_sample_from_haplo(text):
224
- # Extract haplogroup
225
- haplogroup = extract_haplogroup(text)
226
- # Extract location based on NER
227
- locations = extract_location(text)
228
- # Infer location based on haplogroup
229
- inferred_location, sourceHaplo = infer_location_from_haplogroup(haplogroup)[0],infer_location_from_haplogroup(haplogroup)[1]
230
- return {
231
- "source":sourceHaplo,
232
- "locations_found_in_context": locations,
233
- "haplogroup": haplogroup,
234
- "inferred_location": inferred_location
235
-
236
- }
237
- # 4.3 Get from available NCBI
238
- def infer_location_fromNCBI(accession):
239
- try:
240
- handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
241
- text = handle.read()
242
- handle.close()
243
- match = re.search(r'/(geo_loc_name|country|location)\s*=\s*"([^"]+)"', text)
244
- if match:
245
- return match.group(2), match.group(0) # This is the value like "Brunei"
246
- return None
247
-
248
- except Exception as e:
249
- print("❌ Entrez error:", e)
250
- return "",""
251
-
252
-
253
- # STEP 5: Main pipeline: accession -> 1. get pubmed id and isolate -> 2. get doi -> 3. get text -> 4. prediction -> 5. output: inferred location + explanation + confidence score
254
- def classify_sample_location(accession):
255
- outputs = {}
256
- keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
257
- # Step 1: get pubmed id and isolate
258
- pubmedID, isolate = get_info_from_accession(accession)
259
- if not pubmedID:
260
- return {"error": f"Could not retrieve PubMed ID for accession {accession}"}
261
- if not isolate:
262
- isolate = "UNKNOWN_ISOLATE"
263
- # Step 2: get doi
264
- doi = get_doi_from_pubmed_id(pubmedID)
265
- if not doi:
266
- return {"error": "DOI not found for this accession. Cannot fetch paper or context."}
267
-
268
- # Step 3: get text
269
- '''textsToExtract = { "doiLink":"paperText"
270
- "file1.pdf":"text1",
271
- "file2.doc":"text2",
272
- "file3.xlsx":excelText3'''
273
- textsToExtract = get_paper_text(doi,pubmedID)
274
- if not textsToExtract:
275
- return {"error": f"No texts extracted for DOI {doi}"}
276
-
277
- # Step 4: prediction
278
- outputs[accession] = {}
279
- outputs[isolate] = {}
280
- # 4.0 Infer from NCBI
281
- location, outputNCBI = infer_location_fromNCBI(accession)
282
- NCBI_result = {
283
- "source": "NCBI",
284
- "sample_id": accession,
285
- "predicted_location": location,
286
- "context_snippet": outputNCBI}
287
- outputs[accession]["NCBI"]= {"NCBI": NCBI_result}
288
- for key in textsToExtract:
289
- text = textsToExtract[key]
290
- # try accession number first
291
- outputs[accession][key] = {}
292
- keyword = accession
293
- context = extract_context(text, keyword, window=500)
294
- # 4.1: Using a HuggingFace model (question-answering)
295
- location = infer_location_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
296
- qa_result = {
297
- "source": key,
298
- "sample_id": keyword,
299
- "predicted_location": location,
300
- "context_snippet": context
301
- }
302
- outputs[keyword][key]["QAModel"] = qa_result
303
- # 4.2: Infer from haplogroup
304
- haplo_result = classify_mtDNA_sample_from_haplo(context)
305
- outputs[keyword][key]["haplogroup"] = haplo_result
306
- # try isolate
307
- keyword = isolate
308
- outputs[isolate][key] = {}
309
- context = extract_context(text, keyword, window=500)
310
- # 4.1.1: Using a HuggingFace model (question-answering)
311
- location = infer_location_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
312
- qa_result = {
313
- "source": key,
314
- "sample_id": keyword,
315
- "predicted_location": location,
316
- "context_snippet": context
317
- }
318
- outputs[keyword][key]["QAModel"] = qa_result
319
- # 4.2.1: Infer from haplogroup
320
- haplo_result = classify_mtDNA_sample_from_haplo(context)
321
- outputs[keyword][key]["haplogroup"] = haplo_result
322
- return outputs
1
+ # mtDNA Location Classifier MVP (Google Colab)
2
+ # Accepts accession number → Fetches PubMed ID + isolate name → Gets abstract → Predicts location
3
+ import os
4
+ import subprocess
5
+ import re
6
+ from Bio import Entrez
7
+ import fitz
8
+ import spacy
9
+ from spacy.cli import download
10
+ from NER.PDF import pdf
11
+ from NER.WordDoc import wordDoc
12
+ from NER.html import extractHTML
13
+ from NER.word2Vec import word2vec
14
+ from transformers import pipeline
15
+ import urllib.parse, requests
16
+ from pathlib import Path
17
+ from upgradeClassify import filter_context_for_sample, infer_location_for_sample
18
+ # Set your email (required by NCBI Entrez)
19
+ #Entrez.email = "[email protected]"
20
+ import nltk
21
+
22
+ nltk.download("stopwords")
23
+ nltk.download("punkt")
24
+ nltk.download('punkt_tab')
25
+ # Step 1: Get PubMed ID from Accession using EDirect
26
+
27
+ '''def get_info_from_accession(accession):
28
+ cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
29
+ result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
30
+ output = result.stdout
31
+ pubmedID, isolate = "", ""
32
+ for line in output.split("\n"):
33
+ if len(line) > 0:
34
+ if "PUBMED" in line:
35
+ pubmedID = line.split()[-1]
36
+ if "isolate" in line: # Check for isolate information
37
+ # Try direct GenBank annotation: /isolate="XXX"
38
+ match1 = re.search(r'/isolate\s*=\s*"([^"]+)"', line) # search on current line
39
+ if match1:
40
+ isolate = match1.group(1)
41
+ else:
42
+ # Try from DEFINITION line: ...isolate XXX...
43
+ match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on current line
44
+ if match2:
45
+ isolate = match2.group(1)'''
46
+ from Bio import Entrez, Medline
47
+ import re
48
+
49
+ Entrez.email = "[email protected]"
50
+
51
+ def get_info_from_accession(accession):
52
+ try:
53
+ handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
54
+ text = handle.read()
55
+ handle.close()
56
+
57
+ # Extract PUBMED ID from the Medline text
58
+ pubmed_match = re.search(r'PUBMED\s+(\d+)', text)
59
+ pubmed_id = pubmed_match.group(1) if pubmed_match else ""
60
+
61
+ # Extract isolate if available
62
+ isolate_match = re.search(r'/isolate="([^"]+)"', text)
63
+ if not isolate_match:
64
+ isolate_match = re.search(r'isolate\s+([A-Za-z0-9_-]+)', text)
65
+ isolate = isolate_match.group(1) if isolate_match else ""
66
+
67
+ if not pubmed_id:
68
+ print(f"⚠️ No PubMed ID found for accession {accession}")
69
+
70
+ return pubmed_id, isolate
71
+
72
+ except Exception as e:
73
+ print("❌ Entrez error:", e)
74
+ return "", ""
75
+ # Step 2: Get doi link to access the paper
76
+ '''def get_doi_from_pubmed_id(pubmed_id):
77
+ cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {pubmed_id} -format medline | grep -i "AID"'
78
+ result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
79
+ output = result.stdout
80
+
81
+ doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+(?=\s*\[doi\])'
82
+ match = re.search(doi_pattern, output, re.IGNORECASE)
83
+
84
+ if match:
85
+ return match.group(0)
86
+ else:
87
+ return None # or raise an Exception with a helpful message'''
88
+
89
+ def get_doi_from_pubmed_id(pubmed_id):
90
+ try:
91
+ handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="medline", retmode="text")
92
+ records = list(Medline.parse(handle))
93
+ handle.close()
94
+
95
+ if not records:
96
+ return None
97
+
98
+ record = records[0]
99
+ if "AID" in record:
100
+ for aid in record["AID"]:
101
+ if "[doi]" in aid:
102
+ return aid.split(" ")[0] # extract the DOI
103
+
104
+ return None
105
+
106
+ except Exception as e:
107
+ print(f"❌ Failed to get DOI from PubMed ID {pubmed_id}: {e}")
108
+ return None
109
+
110
+
111
+ # Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing
112
+ # Step 3.1: Extract Text
113
+ # sub: download excel file
114
+ def download_excel_file(url, save_path="temp.xlsx"):
115
+ if "view.officeapps.live.com" in url:
116
+ parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
117
+ real_url = urllib.parse.unquote(parsed_url["src"][0])
118
+ response = requests.get(real_url)
119
+ with open(save_path, "wb") as f:
120
+ f.write(response.content)
121
+ return save_path
122
+ elif url.startswith("http") and (url.endswith(".xls") or url.endswith(".xlsx")):
123
+ response = requests.get(url)
124
+ response.raise_for_status() # Raises error if download fails
125
+ with open(save_path, "wb") as f:
126
+ f.write(response.content)
127
+ return save_path
128
+ else:
129
+ print("URL must point directly to an .xls or .xlsx file, or it has already been downloaded.")
130
+ return url
131
+ def get_paper_text(doi,id,manualLinks=None):
132
+ # create the temporary folder to contain the texts
133
+ folder_path = Path("data/"+str(id))
134
+ if not folder_path.exists():
135
+ cmd = f'mkdir data/{id}'
136
+ result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
137
+ print("data/"+str(id) +" created.")
138
+ else:
139
+ print("data/"+str(id) +" already exists.")
140
+ saveLinkFolder = "data/"+id
141
+
142
+ link = 'https://doi.org/' + doi
143
+ '''textsToExtract = { "doiLink":"paperText"
144
+ "file1.pdf":"text1",
145
+ "file2.doc":"text2",
146
+ "file3.xlsx":excelText3'''
147
+ textsToExtract = {}
148
+ # get the file to create listOfFile for each id
149
+ html = extractHTML.HTML("",link)
150
+ jsonSM = html.getSupMaterial()
151
+ text = ""
152
+ links = [link] + sum((jsonSM[key] for key in jsonSM),[])
153
+ if manualLinks != None:
154
+ links += manualLinks
155
+ for l in links:
156
+ # get the main paper
157
+ name = l.split("/")[-1]
158
+ file_path = folder_path / name
159
+ if l == link:
160
+ text = html.getListSection()
161
+ textsToExtract[link] = text
162
+ elif l.endswith(".pdf"):
163
+ if file_path.is_file():
164
+ l = saveLinkFolder + "/" + name
165
+ print("File exists.")
166
+ p = pdf.PDF(l,saveLinkFolder,doi)
167
+ f = p.openPDFFile()
168
+ pdf_path = saveLinkFolder + "/" + l.split("/")[-1]
169
+ doc = fitz.open(pdf_path)
170
+ text = "\n".join([page.get_text() for page in doc])
171
+ textsToExtract[l] = text
172
+ elif l.endswith(".doc") or l.endswith(".docx"):
173
+ d = wordDoc.wordDoc(l,saveLinkFolder)
174
+ text = d.extractTextByPage()
175
+ textsToExtract[l] = text
176
+ elif l.split(".")[-1].lower() in "xlsx":
177
+ wc = word2vec.word2Vec()
178
+ # download excel file if it not downloaded yet
179
+ savePath = saveLinkFolder +"/"+ l.split("/")[-1]
180
+ excelPath = download_excel_file(l, savePath)
181
+ corpus = wc.tableTransformToCorpusText([],excelPath)
182
+ text = ''
183
+ for c in corpus:
184
+ para = corpus[c]
185
+ for words in para:
186
+ text += " ".join(words)
187
+ textsToExtract[l] = text
188
+ # delete folder after finishing getting text
189
+ #cmd = f'rm -r data/{id}'
190
+ #result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
191
+ return textsToExtract
192
+ # Step 3.2: Extract context
193
+ def extract_context(text, keyword, window=500):
194
+ # firstly try accession number
195
+ idx = text.find(keyword)
196
+ if idx == -1:
197
+ return "Sample ID not found."
198
+ return text[max(0, idx-window): idx+window]
199
+ def extract_relevant_paragraphs(text, accession, keep_if=None, isolate=None):
200
+ if keep_if is None:
201
+ keep_if = ["sample", "method", "mtdna", "sequence", "collected", "dataset", "supplementary", "table"]
202
+
203
+ outputs = ""
204
+ text = text.lower()
205
+
206
+ # Prioritize paragraphs that mention the accession number
207
+ # and, if provided, the isolate name
208
+ if accession and accession.lower() in text:
209
+ if extract_context(text, accession.lower(), window=700) != "Sample ID not found.":
210
+ outputs += extract_context(text, accession.lower(), window=700)
211
+ if isolate and isolate.lower() in text:
212
+ if extract_context(text, isolate.lower(), window=700) != "Sample ID not found.":
213
+ outputs += extract_context(text, isolate.lower(), window=700)
214
+ for keyword in keep_if:
215
+ para = extract_context(text, keyword)
216
+ if para and para not in outputs:
217
+ outputs += para + "\n"
218
+ return outputs
219
+ # Step 4: Classification for now (demo purposes)
220
+ # 4.1: Using a HuggingFace model (question-answering)
221
+ def infer_fromQAModel(context, question="Where is the mtDNA sample from?"):
222
+ try:
223
+ qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
224
+ result = qa({"context": context, "question": question})
225
+ return result.get("answer", "Unknown")
226
+ except Exception as e:
227
+ return f"Error: {str(e)}"
228
+
229
+ # 4.2: Infer from haplogroup
230
+ # Load pre-trained spaCy model for NER
231
+ try:
232
+ nlp = spacy.load("en_core_web_sm")
233
+ except OSError:
234
+ download("en_core_web_sm")
235
+ nlp = spacy.load("en_core_web_sm")
236
+
237
+ # Define the haplogroup-to-region mapping (simple rule-based)
238
+ import csv
239
+
240
+ def load_haplogroup_mapping(csv_path):
241
+ mapping = {}
242
+ with open(csv_path) as f:
243
+ reader = csv.DictReader(f)
244
+ for row in reader:
245
+ mapping[row["haplogroup"]] = [row["region"],row["source"]]
246
+ return mapping
247
+
248
+ # Function to extract haplogroup from the text
249
+ def extract_haplogroup(text):
250
+ match = re.search(r'\bhaplogroup\s+([A-Z][0-9a-z]*)\b', text)
251
+ if match:
252
+ submatch = re.match(r'^[A-Z][0-9]*', match.group(1))
253
+ if submatch:
254
+ return submatch.group(0)
255
+ else:
256
+ return match.group(1) # fallback
257
+ fallback = re.search(r'\b([A-Z][0-9a-z]{1,5})\b', text)
258
+ if fallback:
259
+ return fallback.group(1)
260
+ return None
261
+
262
+
263
+ # Function to extract location based on NER
264
+ def extract_location(text):
265
+ doc = nlp(text)
266
+ locations = []
267
+ for ent in doc.ents:
268
+ if ent.label_ == "GPE": # GPE = Geopolitical Entity (location)
269
+ locations.append(ent.text)
270
+ return locations
271
+
272
+ # Function to infer location from haplogroup
273
+ def infer_location_from_haplogroup(haplogroup):
274
+ haplo_map = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
275
+ return haplo_map.get(haplogroup, ["Unknown","Unknown"])
276
+
277
+ # Function to classify the mtDNA sample
278
+ def classify_mtDNA_sample_from_haplo(text):
279
+ # Extract haplogroup
280
+ haplogroup = extract_haplogroup(text)
281
+ # Extract location based on NER
282
+ locations = extract_location(text)
283
+ # Infer location based on haplogroup
284
+ inferred_location, sourceHaplo = infer_location_from_haplogroup(haplogroup)[0],infer_location_from_haplogroup(haplogroup)[1]
285
+ return {
286
+ "source":sourceHaplo,
287
+ "locations_found_in_context": locations,
288
+ "haplogroup": haplogroup,
289
+ "inferred_location": inferred_location
290
+
291
+ }
292
+ # 4.3 Get from available NCBI
293
+ def infer_location_fromNCBI(accession):
294
+ try:
295
+ handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
296
+ text = handle.read()
297
+ handle.close()
298
+ match = re.search(r'/(geo_loc_name|country|location)\s*=\s*"([^"]+)"', text)
299
+ if match:
300
+ return match.group(2), match.group(0) # This is the value like "Brunei"
301
+ return "Not found", "Not found"
302
+
303
+ except Exception as e:
304
+ print("❌ Entrez error:", e)
305
+ return "Not found", "Not found"
306
+
307
+ ### ANCIENT/MODERN FLAG
308
+ from Bio import Entrez
309
+ import re
310
+
311
+ def flag_ancient_modern(accession, textsToExtract, isolate=None):
312
+ """
313
+ Try to classify a sample as Ancient or Modern using:
314
+ 1. NCBI accession (if available)
315
+ 2. Supplementary text or context fallback
316
+ """
317
+ context = ""
318
+ label, explain = "", ""
319
+
320
+ try:
321
+ # Check if we can fetch metadata from NCBI using the accession
322
+ handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
323
+ text = handle.read()
324
+ handle.close()
325
+
326
+ isolate_source = re.search(r'/(isolation_source)\s*=\s*"([^"]+)"', text)
327
+ if isolate_source:
328
+ context += isolate_source.group(0) + " "
329
+
330
+ specimen = re.search(r'/(specimen|specimen_voucher)\s*=\s*"([^"]+)"', text)
331
+ if specimen:
332
+ context += specimen.group(0) + " "
333
+
334
+ if context.strip():
335
+ label, explain = detect_ancient_flag(context)
336
+ if label!="Unknown":
337
+ return label, explain + " from NCBI\n(" + context + ")"
338
+
339
+ # If no useful NCBI metadata, check supplementary texts
340
+ if textsToExtract:
341
+ labels = {"modern": [0, ""], "ancient": [0, ""], "unknown": 0}
342
+
343
+ for source in textsToExtract:
344
+ text_block = textsToExtract[source]
345
+ context = extract_relevant_paragraphs(text_block, accession, isolate=isolate) # Reduce to informative paragraph(s)
346
+ label, explain = detect_ancient_flag(context)
347
+
348
+ if label == "Ancient":
349
+ labels["ancient"][0] += 1
350
+ labels["ancient"][1] += f"{source}:\n{explain}\n\n"
351
+ elif label == "Modern":
352
+ labels["modern"][0] += 1
353
+ labels["modern"][1] += f"{source}:\n{explain}\n\n"
354
+ else:
355
+ labels["unknown"] += 1
356
+
357
+ if max(labels["modern"][0],labels["ancient"][0]) > 0:
358
+ if labels["modern"][0] > labels["ancient"][0]:
359
+ return "Modern", labels["modern"][1]
360
+ else:
361
+ return "Ancient", labels["ancient"][1]
362
+ else:
363
+ return "Unknown", "No strong keywords detected"
364
+ else:
365
+ print("No DOI or PubMed ID available for inference.")
366
+ return "", ""
367
+
368
+ except Exception as e:
369
+ print("Error:", e)
370
+ return "", ""
371
+
372
+
373
+ def detect_ancient_flag(context_snippet):
374
+ context = context_snippet.lower()
375
+
376
+ ancient_keywords = [
377
+ "ancient", "archaeological", "prehistoric", "neolithic", "mesolithic", "paleolithic",
378
+ "bronze age", "iron age", "burial", "tomb", "skeleton", "14c", "radiocarbon", "carbon dating",
379
+ "postmortem damage", "udg treatment", "adna", "degradation", "site", "excavation",
380
+ "archaeological context", "temporal transect", "population replacement", "cal bp", "calbp", "carbon dated"
381
+ ]
382
+
383
+ modern_keywords = [
384
+ "modern", "hospital", "clinical", "consent","blood","buccal","unrelated", "blood sample","buccal sample","informed consent", "donor", "healthy", "patient",
385
+ "genotyping", "screening", "medical", "cohort", "sequencing facility", "ethics approval",
386
+ "we analysed", "we analyzed", "dataset includes", "new sequences", "published data",
387
+ "control cohort", "sink population", "genbank accession", "sequenced", "pipeline",
388
+ "bioinformatic analysis", "samples from", "population genetics", "genome-wide data"
389
+ ]
390
+
391
+ ancient_hits = [k for k in ancient_keywords if k in context]
392
+ modern_hits = [k for k in modern_keywords if k in context]
393
+
394
+ if ancient_hits and not modern_hits:
395
+ return "Ancient", f"Flagged as ancient due to keywords: {', '.join(ancient_hits)}"
396
+ elif modern_hits and not ancient_hits:
397
+ return "Modern", f"Flagged as modern due to keywords: {', '.join(modern_hits)}"
398
+ elif ancient_hits and modern_hits:
399
+ if len(ancient_hits) >= len(modern_hits):
400
+ return "Ancient", f"Mixed context, leaning ancient due to: {', '.join(ancient_hits)}"
401
+ else:
402
+ return "Modern", f"Mixed context, leaning modern due to: {', '.join(modern_hits)}"
403
+
404
+ # Fallback to QA
405
+ answer = infer_fromQAModel(context, question="Are the mtDNA samples ancient or modern? Explain why.")
406
+ if answer.startswith("Error"):
407
+ return "Unknown", answer
408
+ if "ancient" in answer.lower():
409
+ return "Ancient", f"Leaning ancient based on QA: {answer}"
410
+ elif "modern" in answer.lower():
411
+ return "Modern", f"Leaning modern based on QA: {answer}"
412
+ else:
413
+ return "Unknown", f"No strong keywords or QA clues. QA said: {answer}"
414
+
415
+ # STEP 5: Main pipeline: accession -> 1. get pubmed id and isolate -> 2. get doi -> 3. get text -> 4. prediction -> 5. output: inferred location + explanation + confidence score
416
+ def classify_sample_location(accession):
417
+ outputs = {}
418
+ keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
419
+ # Step 1: get pubmed id and isolate
420
+ pubmedID, isolate = get_info_from_accession(accession)
421
+ '''if not pubmedID:
422
+ return {"error": f"Could not retrieve PubMed ID for accession {accession}"}'''
423
+ if not isolate:
424
+ isolate = "UNKNOWN_ISOLATE"
425
+ # Step 2: get doi
426
+ doi = get_doi_from_pubmed_id(pubmedID)
427
+ '''if not doi:
428
+ return {"error": "DOI not found for this accession. Cannot fetch paper or context."}'''
429
+ # Step 3: get text
430
+ '''textsToExtract = { "doiLink":"paperText"
431
+ "file1.pdf":"text1",
432
+ "file2.doc":"text2",
433
+ "file3.xlsx":excelText3'''
434
+ if doi and pubmedID:
435
+ textsToExtract = get_paper_text(doi,pubmedID)
436
+ else: textsToExtract = {}
437
+ '''if not textsToExtract:
438
+ return {"error": f"No texts extracted for DOI {doi}"}'''
439
+ if isolate not in [None, "UNKNOWN_ISOLATE"]:
440
+ label, explain = flag_ancient_modern(accession,textsToExtract,isolate)
441
+ else:
442
+ label, explain = flag_ancient_modern(accession,textsToExtract)
443
+ # Step 4: prediction
444
+ outputs[accession] = {}
445
+ outputs[isolate] = {}
446
+ # 4.0 Infer from NCBI
447
+ location, outputNCBI = infer_location_fromNCBI(accession)
448
+ NCBI_result = {
449
+ "source": "NCBI",
450
+ "sample_id": accession,
451
+ "predicted_location": location,
452
+ "context_snippet": outputNCBI}
453
+ outputs[accession]["NCBI"]= {"NCBI": NCBI_result}
454
+ if textsToExtract:
455
+ long_text = ""
456
+ for key in textsToExtract:
457
+ text = textsToExtract[key]
458
+ # try accession number first
459
+ outputs[accession][key] = {}
460
+ keyword = accession
461
+ context = extract_context(text, keyword, window=500)
462
+ # 4.1: Using a HuggingFace model (question-answering)
463
+ location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
464
+ qa_result = {
465
+ "source": key,
466
+ "sample_id": keyword,
467
+ "predicted_location": location,
468
+ "context_snippet": context
469
+ }
470
+ outputs[keyword][key]["QAModel"] = qa_result
471
+ # 4.2: Infer from haplogroup
472
+ haplo_result = classify_mtDNA_sample_from_haplo(context)
473
+ outputs[keyword][key]["haplogroup"] = haplo_result
474
+ # try isolate
475
+ keyword = isolate
476
+ outputs[isolate][key] = {}
477
+ context = extract_context(text, keyword, window=500)
478
+ # 4.1.1: Using a HuggingFace model (question-answering)
479
+ location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
480
+ qa_result = {
481
+ "source": key,
482
+ "sample_id": keyword,
483
+ "predicted_location": location,
484
+ "context_snippet": context
485
+ }
486
+ outputs[keyword][key]["QAModel"] = qa_result
487
+ # 4.2.1: Infer from haplogroup
488
+ haplo_result = classify_mtDNA_sample_from_haplo(context)
489
+ outputs[keyword][key]["haplogroup"] = haplo_result
490
+ # add long text
491
+ long_text += text + ". \n"
492
+ # 4.3: UpgradeClassify
493
+ # try sample_id as accession number
494
+ sample_id = accession
495
+ if sample_id:
496
+ filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
497
+ locations = infer_location_for_sample(sample_id.upper(), filtered_context)
498
+ if locations!="No clear location found in top matches":
499
+ outputs[sample_id]["upgradeClassifier"] = {}
500
+ outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
501
+ "source": "From these sources combined: "+ ", ".join(list(textsToExtract.keys())),
502
+ "sample_id": sample_id,
503
+ "predicted_location": ", ".join(locations),
504
+ "context_snippet": "First 1000 words: \n"+ filtered_context[:1000]
505
+ }
506
+ # try sample_id as isolate name
507
+ sample_id = isolate
508
+ if sample_id:
509
+ filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
510
+ locations = infer_location_for_sample(sample_id.upper(), filtered_context)
511
+ if locations!="No clear location found in top matches":
512
+ outputs[sample_id]["upgradeClassifier"] = {}
513
+ outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
514
+ "source": "From these sources combined: "+ ", ".join(list(textsToExtract.keys())),
515
+ "sample_id": sample_id,
516
+ "predicted_location": ", ".join(locations),
517
+ "context_snippet": "First 1000 words: \n"+ filtered_context[:1000]
518
+ }
519
+ return outputs, label, explain
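A minimal sketch (not part of this commit) of driving the pipeline above directly, outside the Gradio app, assuming NCBI Entrez is reachable and data/haplogroup_regions_extended.csv is present:

from mtdna_classifier import classify_sample_location

outputs, label, explain = classify_sample_location("KU131308")
print("Ancient/Modern flag:", label)
# outputs is keyed by sample id (accession or isolate), then by source, then by technique
for sample_id, sources in outputs.items():
    for source, techniques in sources.items():
        for technique, result in techniques.items():
            location = result.get("predicted_location") or result.get("inferred_location")
            print(sample_id, technique, location)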
mtdna_ui.py ADDED
@@ -0,0 +1,210 @@
1
+ import gradio as gr
2
+ from mtdna_backend import *
3
+ import json
4
+ # Gradio UI
5
+ with gr.Blocks() as interface:
6
+ gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
7
+
8
+ inputMode = gr.Radio(choices=["Single Accession", "Batch Input"], value="Single Accession", label="Choose Input Mode")
9
+
10
+ with gr.Group() as single_input_group:
11
+ single_accession = gr.Textbox(label="Enter Single Accession (e.g., KU131308)")
12
+
13
+ with gr.Group(visible=False) as batch_input_group:
14
+ raw_text = gr.Textbox(label="🧬 Paste Accession Numbers (e.g., MF362736.1,MF362738.1,KU131308,MW291678)")
15
+ gr.HTML("""<a href="https://drive.google.com/file/d/1t-TFeIsGVu5Jh3CUZS-VE9jQWzNFCs_c/view?usp=sharing" download target="_blank">Download Example CSV Format</a>""")
16
+ gr.HTML("""<a href="https://docs.google.com/spreadsheets/d/1lKqPp17EfHsshJGZRWEpcNOZlGo3F5qU/edit?usp=sharing&ouid=112390323314156876153&rtpof=true&sd=true" download target="_blank">Download Example Excel Format</a>""")
17
+ file_upload = gr.File(label="📁 Or Upload CSV/Excel File", file_types=[".csv", ".xlsx"], interactive=True, elem_id="file-upload-box")
18
+
19
+
20
+
21
+ with gr.Row():
22
+ run_button = gr.Button("🔍 Submit and Classify")
23
+ reset_button = gr.Button("🔄 Reset")
24
+
25
+ status = gr.Markdown(visible=False)
26
+
27
+ with gr.Group(visible=False) as results_group:
28
+ with gr.Accordion("Open to See the Result", open=False) as results:
29
+ with gr.Row():
30
+ output_summary = gr.Markdown(elem_id="output-summary")
31
+ output_flag = gr.Markdown(elem_id="output-flag")
32
+
33
+ gr.Markdown("---")
34
+
35
+ with gr.Accordion("Open to See the Output Table", open=False) as table_accordion:
36
+ """output_table = gr.Dataframe(
37
+ headers=["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"],
38
+ interactive=False,
39
+ row_count=(5, "dynamic")
40
+ )"""
41
+ output_table = gr.HTML(render=True)
42
+
43
+
44
+ with gr.Row():
45
+ output_type = gr.Dropdown(choices=["Excel", "JSON", "TXT"], label="Select Output Format", value="Excel")
46
+ download_button = gr.Button("⬇️ Download Output")
47
+ download_file = gr.File(label="Download File Here",visible=False)
48
+
49
+ gr.Markdown("---")
50
+
51
+ gr.Markdown("### 💬 Feedback (required)")
52
+ q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
53
+ q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
54
+ contact = gr.Textbox(label="📧 Your email or institution (optional)")
55
+ submit_feedback = gr.Button("✅ Submit Feedback")
56
+ feedback_status = gr.Markdown()
57
+
58
+ # Functions
59
+
60
+ def toggle_input_mode(mode):
61
+ if mode == "Single Accession":
62
+ return gr.update(visible=True), gr.update(visible=False)
63
+ else:
64
+ return gr.update(visible=False), gr.update(visible=True)
65
+
66
+ def classify_with_loading():
67
+ return gr.update(value="⏳ Please wait... processing...",visible=True) # Show processing message
68
+
69
+ def classify_dynamic(single_accession, file, text, mode):
70
+ if mode == "Single Accession":
71
+ return classify_main(single_accession) + (gr.update(visible=False),)
72
+ else:
73
+ #return summarize_batch(file, text) + (gr.update(visible=False),) # Hide processing message
74
+ return classify_mulAcc(file, text) + (gr.update(visible=False),) # Hide processing message
75
+
76
+ # for single accession
77
+ def classify_main(accession):
78
+ table, summary, labelAncient_Modern, explain_label = summarize_results(accession)
79
+ flag_output = f"### 🏺 Ancient/Modern Flag\n**{labelAncient_Modern}**\n\n_Explanation:_ {explain_label}"
80
+ return (
81
+ #table,
82
+ make_html_table(table),
83
+ summary,
84
+ flag_output,
85
+ gr.update(visible=True),
86
+ gr.update(visible=False)
87
+ )
88
+ # for batch accessions
89
+ def classify_mulAcc(file, text):
90
+ table, summary, flag_output, gr1, gr2 = summarize_batch(file, text)
91
+ #flag_output = f"### 🏺 Ancient/Modern Flag\n**{labelAncient_Modern}**\n\n_Explanation:_ {explain_label}"
92
+ return (
93
+ #table,
94
+ make_html_table(table),
95
+ summary,
96
+ flag_output,
97
+ gr.update(visible=True),
98
+ gr.update(visible=False)
99
+ )
100
+
101
+ def make_html_table(rows):
102
+ html = """
103
+ <div style='overflow-x: auto; padding: 10px;'>
104
+ <div style='max-height: 400px; overflow-y: auto; border: 1px solid #444; border-radius: 8px;'>
105
+ <table style='width:100%; border-collapse: collapse; table-layout: auto; font-size: 14px; color: #f1f1f1; background-color: #1e1e1e;'>
106
+ <thead style='position: sticky; top: 0; background-color: #2c2c2c; z-index: 1;'>
107
+ <tr>
108
+ """
109
+ headers = ["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"]
110
+ html += "".join(
111
+ f"<th style='padding: 10px; border: 1px solid #555; text-align: left; white-space: nowrap;'>{h}</th>"
112
+ for h in headers
113
+ )
114
+ html += "</tr></thead><tbody>"
115
+
116
+ for row in rows:
117
+ html += "<tr>"
118
+ for i, col in enumerate(row):
119
+ header = headers[i]
120
+ style = "padding: 10px; border: 1px solid #555; vertical-align: top;"
121
+
122
+ # For specific columns like Haplogroup, force nowrap
123
+ if header in ["Haplogroup", "Sample ID", "Technique"]:
124
+ style += " white-space: nowrap; text-overflow: ellipsis; max-width: 200px; overflow: hidden;"
125
+
126
+ if header == "Source" and isinstance(col, str) and col.strip().lower().startswith("http"):
127
+ col = f"<a href='{col}' target='_blank' style='color: #4ea1f3; text-decoration: underline;'>{col}</a>"
128
+
129
+ html += f"<td style='{style}'>{col}</td>"
130
+ html += "</tr>"
131
+
132
+ html += "</tbody></table></div></div>"
133
+ return html
134
+
135
+
136
+ def reset_fields():
137
+ return (
138
+ gr.update(value=""), # single_accession
139
+ gr.update(value=""), # raw_text
140
+ gr.update(value=None), # file_upload
141
+ gr.update(value="Single Accession"), # inputMode
142
+ gr.update(value=[], visible=True), # output_table
143
+ gr.update(value="", visible=True), # output_summary
144
+ gr.update(value="", visible=True), # output_flag
145
+ gr.update(visible=False), # status
146
+ gr.update(visible=False) # results_group
147
+ )
148
+
149
+ inputMode.change(fn=toggle_input_mode, inputs=inputMode, outputs=[single_input_group, batch_input_group])
150
+ run_button.click(fn=classify_with_loading, inputs=[], outputs=[status])
151
+ run_button.click(
152
+ fn=classify_dynamic,
153
+ inputs=[single_accession, file_upload, raw_text, inputMode],
154
+ outputs=[output_table, output_summary, output_flag, results_group, status]
155
+ )
156
+ reset_button.click(
157
+ fn=reset_fields,
158
+ inputs=[],
159
+ outputs=[
160
+ single_accession, raw_text, file_upload, inputMode,
161
+ output_table, output_summary, output_flag,
162
+ status, results_group
163
+ ]
164
+ )
165
+
166
+ download_button.click(
167
+ fn=save_batch_output,
168
+ inputs=[output_table, output_summary, output_flag, output_type],
169
+ outputs=[download_file])
170
+
171
+ submit_feedback.click(
172
+ fn=store_feedback_to_google_sheets, inputs=[single_accession, q1, q2, contact], outputs=feedback_status
173
+ )
174
+ # Custom CSS styles
175
+ gr.HTML("""
176
+ <style>
177
+ /* Ensures both sections are equally spaced with the same background size */
178
+ #output-summary, #output-flag {
179
+ background-color: #f0f4f8; /* Light Grey for both */
180
+ padding: 20px;
181
+ border-radius: 10px;
182
+ margin-top: 10px;
183
+ width: 100%; /* Ensure full width */
184
+ min-height: 150px; /* Ensures both have a minimum height */
185
+ box-sizing: border-box; /* Prevents padding from increasing size */
186
+ display: flex;
187
+ flex-direction: column;
188
+ justify-content: space-between;
189
+ }
190
+
191
+ /* Specific background colors */
192
+ #output-summary {
193
+ background-color: #434a4b;
194
+ }
195
+
196
+ #output-flag {
197
+ background-color: #141616;
198
+ }
199
+
200
+ /* Ensuring they are in a row and evenly spaced */
201
+ .gradio-row {
202
+ display: flex;
203
+ justify-content: space-between;
204
+ width: 100%;
205
+ }
206
+ </style>
207
+ """)
208
+
209
+
210
+ interface.launch(share=True, debug=True)
output.json ADDED
@@ -0,0 +1,276 @@
1
+ {
2
+ "Detailed_Results": [
3
+ [
4
+ "MF362736.1 (accession number)",
5
+ "NCBI",
6
+ "NCBI",
7
+ "Armenia",
8
+ "",
9
+ "",
10
+ "/geo_loc_name=\"Armenia\""
11
+ ],
12
+ [
13
+ "MF362736.1 (accession number)",
14
+ "QAModel",
15
+ "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>",
16
+ "Sample ID not found",
17
+ "",
18
+ "",
19
+ "Sample ID not found."
20
+ ],
21
+ [
22
+ "MF362736.1 (accession number)",
23
+ "haplogroup",
24
+ "The region of haplogroup is inferred\nby using this source: Unknown",
25
+ "",
26
+ "Sample",
27
+ "Unknown",
28
+ ""
29
+ ],
30
+ [
31
+ "rise396_mt (isolate of accession)",
32
+ "QAModel",
33
+ "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>",
34
+ "Sample ID not found",
35
+ "",
36
+ "",
37
+ "Sample ID not found."
38
+ ],
39
+ [
40
+ "rise396_mt (isolate of accession)",
41
+ "haplogroup",
42
+ "The region of haplogroup is inferred\nby using this source: Unknown",
43
+ "",
44
+ "Sample",
45
+ "Unknown",
46
+ ""
47
+ ],
48
+ [
49
+ "MF362738.1 (accession number)",
50
+ "NCBI",
51
+ "NCBI",
52
+ "Armenia",
53
+ "",
54
+ "",
55
+ "/geo_loc_name=\"Armenia\""
56
+ ],
57
+ [
58
+ "MF362738.1 (accession number)",
59
+ "QAModel",
60
+ "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>",
61
+ "Sample ID not found",
62
+ "",
63
+ "",
64
+ "Sample ID not found."
65
+ ],
66
+ [
67
+ "MF362738.1 (accession number)",
68
+ "haplogroup",
69
+ "The region of haplogroup is inferred\nby using this source: Unknown",
70
+ "",
71
+ "Sample",
72
+ "Unknown",
73
+ ""
74
+ ],
75
+ [
76
+ "rise407_mt (isolate of accession)",
77
+ "QAModel",
78
+ "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>",
79
+ "Sample ID not found",
80
+ "",
81
+ "",
82
+ "Sample ID not found."
83
+ ],
84
+ [
85
+ "rise407_mt (isolate of accession)",
86
+ "haplogroup",
87
+ "The region of haplogroup is inferred\nby using this source: Unknown",
88
+ "",
89
+ "Sample",
90
+ "Unknown",
91
+ ""
92
+ ],
93
+ [
94
+ "MF362739.1 (accession number)",
95
+ "NCBI",
96
+ "NCBI",
97
+ "Armenia",
98
+ "",
99
+ "",
100
+ "/geo_loc_name=\"Armenia\""
101
+ ],
102
+ [
103
+ "MF362739.1 (accession number)",
104
+ "QAModel",
105
+ "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>",
106
+ "Sample ID not found",
107
+ "",
108
+ "",
109
+ "Sample ID not found."
110
+ ],
111
+ [
112
+ "MF362739.1 (accession number)",
113
+ "haplogroup",
114
+ "The region of haplogroup is inferred\nby using this source: Unknown",
115
+ "",
116
+ "Sample",
117
+ "Unknown",
118
+ ""
119
+ ],
120
+ [
121
+ "rise408_mt (isolate of accession)",
122
+ "QAModel",
123
+ "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>",
124
+ "Sample ID not found",
125
+ "",
126
+ "",
127
+ "Sample ID not found."
128
+ ],
129
+ [
130
+ "rise408_mt (isolate of accession)",
131
+ "haplogroup",
132
+ "The region of haplogroup is inferred\nby using this source: Unknown",
133
+ "",
134
+ "Sample",
135
+ "Unknown",
136
+ ""
137
+ ],
138
+ [
139
+ "KU131308 (accession number)",
140
+ "NCBI",
141
+ "NCBI",
142
+ "Brunei",
143
+ "",
144
+ "",
145
+ "/geo_loc_name=\"Brunei\""
146
+ ],
147
+ [
148
+ "KU131308 (accession number)",
149
+ "QAModel",
150
+ "<a href=\"https://doi.org/10.1007/s00439-015-1620-z\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1007/s00439-015-1620-z</a>",
151
+ "GenBank",
152
+ "",
153
+ "",
154
+ "t (unavailable at the start of this study). We performed whole-mtDNA sequencing as previously described (Torroni et al. 2001) using an ABI 48-capillary 3730 DNA Analyser (Taipei) an ABI 16-capillary 3130XL DNA Analyser (Leeds) and an ABI 16-capillary 3100 DNA Analyser (Porto). Details on the new and"
155
+ ],
156
+ [
157
+ "KU131308 (accession number)",
158
+ "haplogroup",
159
+ "The region of haplogroup is inferred\nby using this source: EMPOP",
160
+ "",
161
+ "M7",
162
+ "East Asia",
163
+ ""
164
+ ],
165
+ [
166
+ "KU131308 (accession number)",
167
+ "QAModel",
168
+ "<a href=\"https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf</a>",
169
+ "Sample ID not found",
170
+ "",
171
+ "",
172
+ "Sample ID not found."
173
+ ],
174
+ [
175
+ "KU131308 (accession number)",
176
+ "haplogroup",
177
+ "The region of haplogroup is inferred\nby using this source: Unknown",
178
+ "",
179
+ "Sample",
180
+ "Unknown",
181
+ ""
182
+ ],
183
+ [
184
+ "KU131308 (accession number)",
185
+ "QAModel",
186
+ "<a href=\"https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls</a>",
187
+ "Sample ID not found",
188
+ "",
189
+ "",
190
+ "Sample ID not found."
191
+ ],
192
+ [
193
+ "KU131308 (accession number)",
194
+ "haplogroup",
195
+ "The region of haplogroup is inferred\nby using this source: Unknown",
196
+ "",
197
+ "Sample",
198
+ "Unknown",
199
+ ""
200
+ ],
201
+ [
202
+ "BRU18 (isolate of accession)",
203
+ "QAModel",
204
+ "<a href=\"https://doi.org/10.1007/s00439-015-1620-z\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1007/s00439-015-1620-z</a>",
205
+ "Sample ID not found",
206
+ "",
207
+ "",
208
+ "Sample ID not found."
209
+ ],
210
+ [
211
+ "BRU18 (isolate of accession)",
212
+ "haplogroup",
213
+ "The region of haplogroup is inferred\nby using this source: Unknown",
214
+ "",
215
+ "Sample",
216
+ "Unknown",
217
+ ""
218
+ ],
219
+ [
220
+ "BRU18 (isolate of accession)",
221
+ "QAModel",
222
+ "<a href=\"https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf</a>",
223
+ "Borneo",
224
+ "",
225
+ "",
226
+ ", NA18138, NA18149, NA18152, \nNA18674, NA18707 \n Chinese in Denver, \nUSA \n[86] \nNA17971, NA18124, NA18550, NA18574, NA18582, \nNA18618, NA18636, NA18638, NA18639, NA18644, \nNA18756, NA18769, NA18771 \nHan Chinese in Beijing \n[86] \nNA18755 \nBeijing Han Chinese \n[86] \nNA18940, NA18943, NA18952, NA18953"
227
+ ],
228
+ [
229
+ "BRU18 (isolate of accession)",
230
+ "haplogroup",
231
+ "The region of haplogroup is inferred\nby using this source: Unknown",
232
+ "",
233
+ "Denver",
234
+ "Unknown",
235
+ ""
236
+ ],
237
+ [
238
+ "BRU18 (isolate of accession)",
239
+ "QAModel",
240
+ "<a href=\"https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls</a>",
241
+ "Sample ID not found",
242
+ "",
243
+ "",
244
+ "Sample ID not found."
245
+ ],
246
+ [
247
+ "BRU18 (isolate of accession)",
248
+ "haplogroup",
249
+ "The region of haplogroup is inferred\nby using this source: Unknown",
250
+ "",
251
+ "Sample",
252
+ "Unknown",
253
+ ""
254
+ ],
255
+ [
256
+ "MW291678 (accession number)",
257
+ "NCBI",
258
+ "NCBI",
259
+ "Argentina",
260
+ "",
261
+ "",
262
+ "/geo_loc_name=\"Argentina\""
263
+ ],
264
+ [
265
+ "MN006856 (accession number)",
266
+ "NCBI",
267
+ "NCBI",
268
+ "Not found",
269
+ "",
270
+ "",
271
+ "Not found"
272
+ ]
273
+ ],
274
+ "Summary_Text": "**MF362736.1**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Armenia**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Armenia** (mentioned 1 times)\n\n---\n\n**MF362738.1**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Armenia**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Armenia** (mentioned 1 times)\n\n---\n\n**MF362739.1**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Armenia**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Armenia** (mentioned 1 times)\n\n---\n\n**KU131308**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Brunei**: 1 times\n- **GenBank**: 1 times\n- **Borneo**: 1 times\n- **East Asia**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Brunei** (mentioned 1 times)\n\n---\n\n**MW291678**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Argentina**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Argentina** (mentioned 1 times)\n\n---\n\n**MN006856**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Not found**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Not found** (mentioned 1 times)",
275
+ "Ancient_Modern_Flag": "**MF362736.1**\n### \ud83c\udffa Ancient/Modern Flag\n**Ancient**\n\n_Explanation:_ Flagged as ancient due to keywords: tomb, skeleton from NCBI\n(/isolation_source=\"Tomb 6; skeleton 1\" /specimen_voucher=\"Kapan;Tomb 6; skeleton 1\" )\n\n---\n\n**MF362738.1**\n### \ud83c\udffa Ancient/Modern Flag\n**Ancient**\n\n_Explanation:_ https://doi.org/10.1016/j.cub.2017.05.087:\nMixed context, leaning ancient due to: ancient, archaeological, bronze age, iron age, tomb, skeleton, carbon dating, adna, site\n\n\n\n---\n\n**MF362739.1**\n### \ud83c\udffa Ancient/Modern Flag\n**Ancient**\n\n_Explanation:_ https://doi.org/10.1016/j.cub.2017.05.087:\nMixed context, leaning ancient due to: ancient, archaeological, bronze age, iron age, tomb, skeleton, carbon dating, adna, site\n\n\n\n---\n\n**KU131308**\n### \ud83c\udffa Ancient/Modern Flag\n**Modern**\n\n_Explanation:_ https://doi.org/10.1007/s00439-015-1620-z:\nMixed context, leaning modern due to: we analysed, new sequences, published data, sink population, genome-wide data\n\n\n\n---\n\n**MW291678**\n### \ud83c\udffa Ancient/Modern Flag\n**Ancient**\n\n_Explanation:_ Flagged as ancient due to keywords: archaeological from NCBI\n(/isolation_source=\"archaeological human bone\" )\n\n---\n\n**MN006856**\n### \ud83c\udffa Ancient/Modern Flag\n****\n\n_Explanation:_ "
276
+ }
output.txt ADDED
@@ -0,0 +1,176 @@
1
+ === Detailed Results ===
2
+ MF362736.1 (accession number), NCBI, NCBI, Armenia, /geo_loc_name="Armenia"
3
+ MF362736.1 (accession number), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
4
+ MF362736.1 (accession number), haplogroup, The region of haplogroup is inferred
5
+ by using this source: Unknown, Sample, Unknown
6
+ rise396_mt (isolate of accession), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
7
+ rise396_mt (isolate of accession), haplogroup, The region of haplogroup is inferred
8
+ by using this source: Unknown, Sample, Unknown
9
+ MF362738.1 (accession number), NCBI, NCBI, Armenia, /geo_loc_name="Armenia"
10
+ MF362738.1 (accession number), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
11
+ MF362738.1 (accession number), haplogroup, The region of haplogroup is inferred
12
+ by using this source: Unknown, Sample, Unknown
13
+ rise407_mt (isolate of accession), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
14
+ rise407_mt (isolate of accession), haplogroup, The region of haplogroup is inferred
15
+ by using this source: Unknown, Sample, Unknown
16
+ MF362739.1 (accession number), NCBI, NCBI, Armenia, /geo_loc_name="Armenia"
17
+ MF362739.1 (accession number), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
18
+ MF362739.1 (accession number), haplogroup, The region of haplogroup is inferred
19
+ by using this source: Unknown, Sample, Unknown
20
+ rise408_mt (isolate of accession), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
21
+ rise408_mt (isolate of accession), haplogroup, The region of haplogroup is inferred
22
+ by using this source: Unknown, Sample, Unknown
23
+ KU131308 (accession number), NCBI, NCBI, Brunei, /geo_loc_name="Brunei"
24
+ KU131308 (accession number), QAModel, <a href="https://doi.org/10.1007/s00439-015-1620-z" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1007/s00439-015-1620-z</a>, GenBank, t (unavailable at the start of this study). We performed whole-mtDNA sequencing as previously described (Torroni et al. 2001) using an ABI 48-capillary 3730 DNA Analyser (Taipei) an ABI 16-capillary 3130XL DNA Analyser (Leeds) and an ABI 16-capillary 3100 DNA Analyser (Porto). Details on the new and
25
+ KU131308 (accession number), haplogroup, The region of haplogroup is inferred
26
+ by using this source: EMPOP, M7, East Asia
27
+ KU131308 (accession number), QAModel, <a href="https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf" target="_blank" style="color: blue; text-decoration: underline;">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf</a>, Sample ID not found, Sample ID not found.
28
+ KU131308 (accession number), haplogroup, The region of haplogroup is inferred
29
+ by using this source: Unknown, Sample, Unknown
30
+ KU131308 (accession number), QAModel, <a href="https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls" target="_blank" style="color: blue; text-decoration: underline;">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls</a>, Sample ID not found, Sample ID not found.
31
+ KU131308 (accession number), haplogroup, The region of haplogroup is inferred
32
+ by using this source: Unknown, Sample, Unknown
33
+ BRU18 (isolate of accession), QAModel, <a href="https://doi.org/10.1007/s00439-015-1620-z" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1007/s00439-015-1620-z</a>, Sample ID not found, Sample ID not found.
34
+ BRU18 (isolate of accession), haplogroup, The region of haplogroup is inferred
35
+ by using this source: Unknown, Sample, Unknown
36
+ BRU18 (isolate of accession), QAModel, <a href="https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf" target="_blank" style="color: blue; text-decoration: underline;">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf</a>, Borneo, , NA18138, NA18149, NA18152,
37
+ NA18674, NA18707
38
+ Chinese in Denver,
39
+ USA
40
+ [86]
41
+ NA17971, NA18124, NA18550, NA18574, NA18582,
42
+ NA18618, NA18636, NA18638, NA18639, NA18644,
43
+ NA18756, NA18769, NA18771
44
+ Han Chinese in Beijing
45
+ [86]
46
+ NA18755
47
+ Beijing Han Chinese
48
+ [86]
49
+ NA18940, NA18943, NA18952, NA18953
50
+ BRU18 (isolate of accession), haplogroup, The region of haplogroup is inferred
51
+ by using this source: Unknown, Denver, Unknown
52
+ BRU18 (isolate of accession), QAModel, <a href="https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls" target="_blank" style="color: blue; text-decoration: underline;">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls</a>, Sample ID not found, Sample ID not found.
53
+ BRU18 (isolate of accession), haplogroup, The region of haplogroup is inferred
54
+ by using this source: Unknown, Sample, Unknown
55
+ MW291678 (accession number), NCBI, NCBI, Argentina, /geo_loc_name="Argentina"
56
+ MN006856 (accession number), NCBI, NCBI, Not found, Not found
57
+
58
+ === Summary ===
59
+ **MF362736.1**
60
+ ### 🧭 Location Frequency Summary
61
+ After counting all predicted and inferred locations:
62
+
63
+ - **Armenia**: 1 times
64
+
65
+ **Final Suggested Location:** 🗺️ **Armenia** (mentioned 1 times)
66
+
67
+ ---
68
+
69
+ **MF362738.1**
70
+ ### 🧭 Location Frequency Summary
71
+ After counting all predicted and inferred locations:
72
+
73
+ - **Armenia**: 1 times
74
+
75
+ **Final Suggested Location:** 🗺️ **Armenia** (mentioned 1 times)
76
+
77
+ ---
78
+
79
+ **MF362739.1**
80
+ ### 🧭 Location Frequency Summary
81
+ After counting all predicted and inferred locations:
82
+
83
+ - **Armenia**: 1 times
84
+
85
+ **Final Suggested Location:** 🗺️ **Armenia** (mentioned 1 times)
86
+
87
+ ---
88
+
89
+ **KU131308**
90
+ ### 🧭 Location Frequency Summary
91
+ After counting all predicted and inferred locations:
92
+
93
+ - **Brunei**: 1 times
94
+ - **GenBank**: 1 times
95
+ - **Borneo**: 1 times
96
+ - **East Asia**: 1 times
97
+
98
+ **Final Suggested Location:** 🗺️ **Brunei** (mentioned 1 times)
99
+
100
+ ---
101
+
102
+ **MW291678**
103
+ ### 🧭 Location Frequency Summary
104
+ After counting all predicted and inferred locations:
105
+
106
+ - **Argentina**: 1 times
107
+
108
+ **Final Suggested Location:** 🗺️ **Argentina** (mentioned 1 times)
109
+
110
+ ---
111
+
112
+ **MN006856**
113
+ ### 🧭 Location Frequency Summary
114
+ After counting all predicted and inferred locations:
115
+
116
+ - **Not found**: 1 times
117
+
118
+ **Final Suggested Location:** 🗺️ **Not found** (mentioned 1 times)
119
+
120
+ === Ancient/Modern Flag ===
121
+ **MF362736.1**
122
+ ### 🏺 Ancient/Modern Flag
123
+ **Ancient**
124
+
125
+ _Explanation:_ Flagged as ancient due to keywords: tomb, skeleton from NCBI
126
+ (/isolation_source="Tomb 6; skeleton 1" /specimen_voucher="Kapan;Tomb 6; skeleton 1" )
127
+
128
+ ---
129
+
130
+ **MF362738.1**
131
+ ### 🏺 Ancient/Modern Flag
132
+ **Ancient**
133
+
134
+ _Explanation:_ https://doi.org/10.1016/j.cub.2017.05.087:
135
+ Mixed context, leaning ancient due to: ancient, archaeological, bronze age, iron age, tomb, skeleton, carbon dating, adna, site
136
+
137
+
138
+
139
+ ---
140
+
141
+ **MF362739.1**
142
+ ### 🏺 Ancient/Modern Flag
143
+ **Ancient**
144
+
145
+ _Explanation:_ https://doi.org/10.1016/j.cub.2017.05.087:
146
+ Mixed context, leaning ancient due to: ancient, archaeological, bronze age, iron age, tomb, skeleton, carbon dating, adna, site
147
+
148
+
149
+
150
+ ---
151
+
152
+ **KU131308**
153
+ ### 🏺 Ancient/Modern Flag
154
+ **Modern**
155
+
156
+ _Explanation:_ https://doi.org/10.1007/s00439-015-1620-z:
157
+ Mixed context, leaning modern due to: we analysed, new sequences, published data, sink population, genome-wide data
158
+
159
+
160
+
161
+ ---
162
+
163
+ **MW291678**
164
+ ### 🏺 Ancient/Modern Flag
165
+ **Ancient**
166
+
167
+ _Explanation:_ Flagged as ancient due to keywords: archaeological from NCBI
168
+ (/isolation_source="archaeological human bone" )
169
+
170
+ ---
171
+
172
+ **MN006856**
173
+ ### 🏺 Ancient/Modern Flag
174
+ ****
175
+
176
+ _Explanation:_
requirements.txt CHANGED
@@ -1,24 +1,29 @@
1
- gradio
2
- transformers
3
- torch
4
- pandas
5
- scikit-learn
6
- spacy
7
- pymupdf
8
- requests
9
- biopython
10
- bs4
11
- pdfreader
12
- tabula-py
13
- spire.doc
14
- Spire.XLS
15
- thefuzz
16
- wordsegment
17
- spacy
18
- spacy-lookups-data
19
- gensim
20
- xlrd>=2.0.1
21
- openpyxl
22
- gspread
23
- oauth2client
24
- nltk
 
 
 
 
 
 
1
+ biopython==1.85
2
+ bs4==0.0.2
3
+ gensim==4.3.3
4
+ gradio==5.29.0
5
+ gspread==6.2.0
6
+ gspread-dataframe==4.0.0
7
+ huggingface-hub==0.30.2
8
+ nltk==3.9.1
9
+ oauth2client==4.1.3
10
+ openai==1.76.2
11
+ openpyxl==3.1.5
12
+ pandas==2.2.2
13
+ pdfreader==0.1.15
14
+ PyMuPDF==1.25.5
15
+ pytest==8.3.5
16
+ requests==2.32.3
17
+ scikit-learn==1.6.1
18
+ scipy==1.13.1
19
+ spacy==3.8.5
20
+ spacy-lookups-data==1.0.5
21
+ spire-doc==13.4.6
22
+ Spire.Xls==14.12.0
23
+ statsmodels==0.14.4
24
+ tabula-py==2.10.0
25
+ thefuzz==0.22.1
26
+ torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp310-cp310-linux_x86_64.whl
27
+ transformers==4.51.3
28
+ wordsegment==1.3.1
29
+ xlrd==2.0.1
setup.sh CHANGED
@@ -1,8 +1,8 @@
1
- #!/bin/bash
2
-
3
- # Install EDirect tools and set up PATH
4
- yes | sh -c "$(wget -q https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh -O -)"
5
- echo 'export PATH=$HOME/edirect:$PATH' >> ~/.bashrc
6
- export PATH=$HOME/edirect:$PATH
7
-
8
-
 
1
+ #!/bin/bash
2
+
3
+ # Install EDirect tools and set up PATH
4
+ yes | sh -c "$(wget -q https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh -O -)"
5
+ echo 'export PATH=$HOME/edirect:$PATH' >> ~/.bashrc
6
+ export PATH=$HOME/edirect:$PATH
7
+
8
+
standardize_location.py ADDED
@@ -0,0 +1,74 @@
1
+ import requests
2
+ import re
3
+
4
+ # Normalize input
5
+ def normalize_key(text):
6
+ return re.sub(r"[^a-z0-9]", "", text.strip().lower())
7
+
8
+ # Search for city/place (normal flow)
9
+ def get_country_from_geonames(city_name, username="vyphung"):
10
+ url = "http://api.geonames.org/searchJSON"
11
+ params = {
12
+ "q": city_name,
13
+ "maxRows": 1,
14
+ "username": username
15
+ }
16
+ try:
17
+ r = requests.get(url, params=params, timeout=5)
18
+ data = r.json()
19
+ if data.get("geonames"):
20
+ return data["geonames"][0]["countryName"]
21
+ except Exception as e:
22
+ print("GeoNames searchJSON error:", e)
23
+ return None
24
+
25
+ # Search for country info using alpha-2/3 codes or name
26
+ def get_country_from_countryinfo(input_code, username="vyphung"):
27
+ url = "http://api.geonames.org/countryInfoJSON"
28
+ params = {
29
+ "username": username
30
+ }
31
+ try:
32
+ r = requests.get(url, params=params, timeout=5)
33
+ data = r.json()
34
+ if data.get("geonames"):
35
+ input_code = input_code.strip().upper()
36
+ for country in data["geonames"]:
37
+ # Match against country name, country code (alpha-2), iso alpha-3
38
+ if input_code in [
39
+ country.get("countryName", "").upper(),
40
+ country.get("countryCode", "").upper(),
41
+ country.get("isoAlpha3", "").upper()
42
+ ]:
43
+ return country["countryName"]
44
+ except Exception as e:
45
+ print("GeoNames countryInfoJSON error:", e)
46
+ return None
47
+
48
+ # Combined smart lookup
49
+ def smart_country_lookup(user_input, username="vyphung"):
50
+ raw_input = user_input.strip()
51
+ normalized = re.sub(r"[^a-zA-Z0-9]", "", user_input).upper() # normalize for code matching (remove spaces and punctuation)
52
+
53
+ # Special case: if user writes "UK: London" → split and take main country part
54
+ if ":" in raw_input:
55
+ raw_input = raw_input.split(":")[0].strip() # only take "UK"
56
+ # First try as country code (if 2-3 letters or common abbreviation)
57
+ if len(normalized) <= 3:
58
+ if normalized.upper() in ["UK","U.K","U.K."]:
59
+ country = get_country_from_geonames(normalized.upper(), username=username)
60
+ if country:
61
+ return country
62
+ else:
63
+ country = get_country_from_countryinfo(raw_input, username=username)
64
+ if country:
65
+ return country
66
+ country = get_country_from_countryinfo(raw_input, username=username) # try full names
67
+ if country:
68
+ return country
69
+ # Otherwise, treat as city/place
70
+ country = get_country_from_geonames(raw_input, username=username)
71
+ if country:
72
+ return country
73
+
74
+ return "Not found"
upgradeClassify.py ADDED
@@ -0,0 +1,276 @@
1
+ import re
2
+ import spacy
3
+ from nltk.tokenize import sent_tokenize, word_tokenize
4
+ import nltk
5
+ nltk.download('punkt_tab')
6
+ #import coreferee
7
+ import copy
8
+ from sentence_transformers import SentenceTransformer, util
9
+ from sklearn.cluster import DBSCAN
10
+ from sklearn.metrics.pairwise import cosine_distances
11
+ from collections import defaultdict
12
+ import numpy as np
13
+ #from mtdna_classifier import infer_fromQAModel
14
+ # 1. SENTENCE-BERT MODEL
15
+ # Step 1: Preprocess the text
16
+ def normalize_text(text):
17
+ # Normalize various separators to "-"
18
+ text = re.sub(r'\s*(–+|—+|--+>|–>|->|-->|to|→|➝|➔|➡)\s*', '-', text, flags=re.IGNORECASE)
19
+ # Fix GEN10GEN30 → GEN10-GEN30
20
+ text = re.sub(r'\b([a-zA-Z]+)(\d+)(\1)(\d+)\b', r'\1\2-\1\4', text)
21
+ # Fix GEN10-30 → GEN10-GEN30
22
+ text = re.sub(r'\b([a-zA-Z]+)(\d+)-(\d+)\b', r'\1\2-\1\3', text)
23
+ return text
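+
+ # Illustrative behaviour of normalize_text (a minimal sketch based on the regexes above):
+ # normalize_text("GEN10 to GEN30") # -> "GEN10-GEN30" (separator words/arrows collapse to "-")
+ # normalize_text("GEN10GEN30") # -> "GEN10-GEN30" (fused ranges are split)
+ # normalize_text("BRU10-30") # -> "BRU10-BRU30" (missing prefix on the end is restored)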
24
+
25
+ def preprocess_text(text):
26
+ normalized = normalize_text(text)
27
+ sentences = sent_tokenize(normalized)
28
+ return [re.sub(r"[^a-zA-Z0-9\s\-]", "", s).strip() for s in sentences]
29
+
30
+ # Before step 2, cache the spaCy model so it is not loaded multiple times:
31
+ # Global model cache
32
+ _spacy_models = {}
33
+
34
+ def get_spacy_model(model_name, add_coreferee=False):
35
+ global _spacy_models
36
+ if model_name not in _spacy_models:
37
+ nlp = spacy.load(model_name)
38
+ if add_coreferee and "coreferee" not in nlp.pipe_names:
39
+ nlp.add_pipe("coreferee")
40
+ _spacy_models[model_name] = nlp
41
+ return _spacy_models[model_name]
42
+
43
+ # Step 2: NER to Extract Locations and Sample Names
44
+ def extract_entities(text, sample_id=None):
45
+ nlp = get_spacy_model("en_core_web_sm")
46
+ doc = nlp(text)
47
+
48
+ # Filter entities by GPE, but exclude things that match sample ID format
49
+ gpe_candidates = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
50
+
51
+ # Remove entries that match SAMPLE ID patterns like XXX123 or similar
52
+ gpe_filtered = [gpe for gpe in gpe_candidates if not re.fullmatch(r'[A-Z]{2,5}\d{2,4}', gpe.strip())]
53
+
54
+ # Optional: further filter known invalid patterns (e.g., things shorter than 3 chars, numeric only)
55
+ gpe_filtered = [gpe for gpe in gpe_filtered if len(gpe) > 2 and not gpe.strip().isdigit()]
56
+
57
+ if sample_id is None:
58
+ return list(set(gpe_filtered)), []
59
+ else:
60
+ sample_prefix = re.match(r'[A-Z]+', sample_id).group()
61
+ samples = re.findall(rf'{sample_prefix}\d+', text)
62
+ return list(set(gpe_filtered)), list(set(samples))
63
+
64
+ # Step 3: Build a Soft Matching Layer
65
+ # Handle patterns like "BRU1–BRU20" and identify BRU18 as part of it.
66
+ def is_sample_in_range(sample_id, sentence):
67
+ # Match prefix up to digits
68
+ sample_prefix_match = re.match(r'^([A-Z0-9]+?)(?=\d+$)', sample_id)
69
+ sample_number_match = re.search(r'(\d+)$', sample_id)
70
+
71
+ if not sample_prefix_match or not sample_number_match:
72
+ return False
73
+
74
+ sample_prefix = sample_prefix_match.group(1)
75
+ sample_number = int(sample_number_match.group(1))
76
+ sentence = normalize_text(sentence)
77
+ # Case 1: Full prefix on both sides
78
+ pattern1 = rf'{sample_prefix}(\d+)\s*-\s*{sample_prefix}(\d+)'
79
+ for match in re.findall(pattern1, sentence):
80
+ start, end = int(match[0]), int(match[1])
81
+ if start <= sample_number <= end:
82
+ return True
83
+
84
+ # Case 2: Prefix only on first number
85
+ pattern2 = rf'{sample_prefix}(\d+)\s*-\s*(\d+)'
86
+ for match in re.findall(pattern2, sentence):
87
+ start, end = int(match[0]), int(match[1])
88
+ if start <= sample_number <= end:
89
+ return True
90
+
91
+ return False
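+
+ # Illustrative checks (a minimal sketch, assuming the range patterns handled above):
+ # is_sample_in_range("BRU18", "Samples BRU1-BRU20 were sequenced") # -> True
+ # is_sample_in_range("BRU25", "Samples BRU1-BRU20 were sequenced") # -> False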
92
+
93
+ # Step 4: Use coreferee to merge sentences that share a coreference (currently disabled because of package conflicts)
94
+ # ========== HEURISTIC GROUP → LOCATION MAPPERS ==========
95
+ # === Generalized version replacing the earlier extract_sample_to_group_general ===
96
+ # === Generalized version replacing the earlier extract_group_to_location_general ===
97
+ def extract_population_locations(text):
98
+ text = normalize_text(text)
99
+ pattern = r'([A-Za-z ,\-]+)\n([A-Z]+\d*)\n([A-Za-z ,\-]+)\n([A-Za-z ,\-]+)'
100
+ pop_to_location = {}
101
+
102
+ for match in re.finditer(pattern, text, flags=re.IGNORECASE):
103
+ _, pop_code, region, country = match.groups()
104
+ pop_to_location[pop_code.upper()] = f"{region.strip()}\n{country.strip()}"
105
+
106
+ return pop_to_location
107
+
108
+ def extract_sample_ranges(text):
109
+ text = normalize_text(text)
110
+ # Updated pattern to handle punctuation and line breaks
111
+ pattern = r'\b([A-Z0-9]+\d+)[–\-]([A-Z0-9]+\d+)[,:\.\s]*([A-Z0-9]+\d+)\b'
112
+ sample_to_pop = {}
113
+ for match in re.finditer(pattern, text, flags=re.IGNORECASE):
114
+ start_id, end_id, pop_code = match.groups()
115
+ start_prefix = re.match(r'^([A-Z0-9]+?)(?=\d+$)', start_id, re.IGNORECASE).group(1).upper()
116
+ end_prefix = re.match(r'^([A-Z0-9]+?)(?=\d+$)', end_id, re.IGNORECASE).group(1).upper()
117
+ if start_prefix != end_prefix:
118
+ continue
119
+ start_num = int(re.search(r'(\d+)$', start_id).group())
120
+ end_num = int(re.search(r'(\d+)$', end_id).group())
121
+ for i in range(start_num, end_num + 1):
122
+ sample_id = f"{start_prefix}{i:03d}"
123
+ sample_to_pop[sample_id] = pop_code.upper()
124
+
125
+ return sample_to_pop
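+
+ # Illustrative behaviour (a minimal sketch; note the population code must also end in digits to match):
+ # extract_sample_ranges("HG001-HG003: POP1")
+ # # -> {"HG001": "POP1", "HG002": "POP1", "HG003": "POP1"}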
126
+
127
+ def filter_context_for_sample(sample_id, full_text, window_size=2):
128
+
129
+ # Normalize and tokenize
130
+ full_text = normalize_text(full_text)
131
+ sentences = sent_tokenize(full_text)
132
+
133
+ # Step 1: Find indices with direct mention or range match
134
+ match_indices = [
135
+ i for i, s in enumerate(sentences)
136
+ if sample_id in s or is_sample_in_range(sample_id, s)
137
+ ]
138
+
139
+ # Step 2: Get sample → group mapping from full text
140
+ sample_to_group = extract_sample_ranges(full_text)
141
+ group_id = sample_to_group.get(sample_id)
142
+
143
+ # Step 3: Find group-related sentences
144
+ group_indices = []
145
+ if group_id:
146
+ for i, s in enumerate(sentences):
147
+ if group_id in s:
148
+ group_indices.append(i)
149
+
150
+ # Step 4: Collect sentences within window
151
+ selected_indices = set()
152
+ if len(match_indices + group_indices) > 0:
153
+ for i in match_indices + group_indices:
154
+ start = max(0, i - window_size)
155
+ end = min(len(sentences), i + window_size + 1)
156
+ selected_indices.update(range(start, end))
157
+
158
+ filtered_sentences = [sentences[i] for i in sorted(selected_indices)]
159
+ return " ".join(filtered_sentences)
160
+ return full_text
161
+ # Placeholder for coreference-based sentence merging; for now it only preprocesses the text
162
+ def mergeCorefSen(text):
163
+ sen = preprocess_text(text)
164
+ return sen
165
+
166
+ # Before step 5 and beyond, cache the SBERT model so it is not reloaded on every call
167
+ # Global SBERT model cache
168
+ _sbert_models = {}
169
+
170
+ def get_sbert_model(model_name="all-MiniLM-L6-v2"):
171
+ global _sbert_models
172
+ if model_name not in _sbert_models:
173
+ _sbert_models[model_name] = SentenceTransformer(model_name)
174
+ return _sbert_models[model_name]
175
+
176
+ # Step 5: Sentence-BERT retriever → Find top paragraphs related to keyword.
177
+ '''Use sentence transformers to embed the sentence that mentions the sample and
178
+ compare it to sentences that mention locations.'''
179
+
180
+ def find_top_para(sample_id, text,top_k=5):
181
+ sentences = mergeCorefSen(text)
182
+ model = get_sbert_model("all-mpnet-base-v2")
183
+ embeddings = model.encode(sentences, convert_to_tensor=True)
184
+
185
+ # Find the sentence that best matches the sample_id
186
+ sample_matches = [s for s in sentences if sample_id in s or is_sample_in_range(sample_id, s)]
187
+ if not sample_matches:
188
+ return [],"No context found for sample"
189
+
190
+ sample_embedding = model.encode(sample_matches[0], convert_to_tensor=True)
191
+ cos_scores = util.pytorch_cos_sim(sample_embedding, embeddings)[0]
192
+
193
+ # Get top-k most similar sentence indices
194
+ top_indices = cos_scores.argsort(descending=True)[:top_k]
195
+ return top_indices, sentences
196
+
197
+ # Step 6: DBSCAN to cluster the group of similar paragraphs.
198
+ def clusterPara(tokens):
199
+ # Load Sentence-BERT model
200
+ sbert_model = get_sbert_model("all-mpnet-base-v2")
201
+ sentence_embeddings = sbert_model.encode(tokens)
202
+
203
+ # Compute cosine distance matrix
204
+ distance_matrix = cosine_distances(sentence_embeddings)
205
+
206
+ # DBSCAN clustering
207
+ clustering_model = DBSCAN(eps=0.3, min_samples=1, metric="precomputed")
208
+ cluster_labels = clustering_model.fit_predict(distance_matrix)
209
+
210
+ # Group sentences by cluster
211
+ clusters = defaultdict(list)
212
+ cluster_embeddings = defaultdict(list)
213
+ sentence_to_cluster = {}
214
+ for i, label in enumerate(cluster_labels):
215
+ clusters[label].append(tokens[i])
216
+ cluster_embeddings[label].append(sentence_embeddings[i])
217
+ sentence_to_cluster[tokens[i]] = label
218
+ # Compute cluster centroids
219
+ centroids = {
220
+ label: np.mean(embs, axis=0)
221
+ for label, embs in cluster_embeddings.items()
222
+ }
223
+ return clusters, sentence_to_cluster, centroids
224
+
225
+ def rankSenFromCluster(clusters, sentence_to_cluster, centroids, target_sentence):
226
+ target_cluster = sentence_to_cluster[target_sentence]
227
+ target_centroid = centroids[target_cluster]
228
+ sen_rank = []
229
+ sen_order = list(sentence_to_cluster.keys())
230
+ # Compute distances to other cluster centroids
231
+ dists = []
232
+ for label, centroid in centroids.items():
233
+ dist = cosine_distances([target_centroid], [centroid])[0][0]
234
+ dists.append((label, dist))
235
+ dists.sort(key=lambda x: x[1]) # sort by proximity
236
+ for d in dists:
237
+ cluster = clusters[d[0]]
238
+ for sen in cluster:
239
+ if sen != target_sentence:
240
+ sen_rank.append(sen_order.index(sen))
241
+ return sen_rank
242
+ # Step 7: Final Inference Wrapper
243
+ def infer_location_for_sample(sample_id, context_text):
244
+ # Go through each of the top sentences in order
245
+ top_indices, sentences = find_top_para(sample_id, context_text, top_k=5)
246
+ if len(top_indices) == 0 or sentences == "No context found for sample":
247
+ return "No clear location found in top matches"
248
+ clusters, sentence_to_cluster, centroids = clusterPara(sentences)
249
+ topRankSen_DBSCAN = []
250
+ mostTopSen = ""
251
+ locations = ""
252
+ i = 0
253
+ while len(locations) == 0 and i < len(top_indices):
254
+ # Firstly, start with the top-ranked Sentence-BERT result
255
+ idx = top_indices[i]
256
+ best_sentence = sentences[idx]
257
+ if i == 0:
258
+ mostTopSen = best_sentence
259
+ locations, _ = extract_entities(best_sentence, sample_id)
260
+ if locations:
261
+ return locations
262
+ # If no location, then look for sample overlap in the same DBSCAN cluster
263
+ # Compute distances to other cluster centroids
264
+ if len(topRankSen_DBSCAN)==0 and mostTopSen:
265
+ topRankSen_DBSCAN = rankSenFromCluster(clusters, sentence_to_cluster, centroids, mostTopSen)
266
+ if i >= len(topRankSen_DBSCAN): break
267
+ idx_DBSCAN = topRankSen_DBSCAN[i]
268
+ best_sentence_DBSCAN = sentences[idx_DBSCAN]
269
+ locations, _ = extract_entities(best_sentence_DBSCAN, sample_id)
270
+ if locations:
271
+ return locations
272
+ # If not, backtrack to the next-best Sentence-BERT sentence (e.g., the 2nd-ranked one) and repeat steps 1 and 2 until candidates run out
273
+ i += 1
274
+ # Last resort: LLM (e.g. chatGPT, deepseek, etc.)
275
+ #if len(locations) == 0:
276
+ return "No clear location found in top matches"