VyLala committed on
Commit 7fc87fe · verified · 1 Parent(s): 0ac2f54

update new codes

DefaultPackages/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/DefaultPackages/__pycache__/__init__.cpython-310.pyc and b/DefaultPackages/__pycache__/__init__.cpython-310.pyc differ
 
DefaultPackages/__pycache__/__init__.cpython-311.pyc CHANGED
Binary files a/DefaultPackages/__pycache__/__init__.cpython-311.pyc and b/DefaultPackages/__pycache__/__init__.cpython-311.pyc differ
 
DefaultPackages/__pycache__/openFile.cpython-310.pyc CHANGED
Binary files a/DefaultPackages/__pycache__/openFile.cpython-310.pyc and b/DefaultPackages/__pycache__/openFile.cpython-310.pyc differ
 
DefaultPackages/__pycache__/openFile.cpython-311.pyc CHANGED
Binary files a/DefaultPackages/__pycache__/openFile.cpython-311.pyc and b/DefaultPackages/__pycache__/openFile.cpython-311.pyc differ
 
DefaultPackages/__pycache__/saveFile.cpython-310.pyc CHANGED
Binary files a/DefaultPackages/__pycache__/saveFile.cpython-310.pyc and b/DefaultPackages/__pycache__/saveFile.cpython-310.pyc differ
 
DefaultPackages/__pycache__/saveFile.cpython-311.pyc CHANGED
Binary files a/DefaultPackages/__pycache__/saveFile.cpython-311.pyc and b/DefaultPackages/__pycache__/saveFile.cpython-311.pyc differ
 
NER/PDF/__pycache__/pdf.cpython-310.pyc ADDED
Binary file (4.27 kB).
 
NER/PDF/__pycache__/pdf.cpython-311.pyc CHANGED
Binary files a/NER/PDF/__pycache__/pdf.cpython-311.pyc and b/NER/PDF/__pycache__/pdf.cpython-311.pyc differ
 
NER/WordDoc/__pycache__/wordDoc.cpython-310.pyc ADDED
Binary file (3.76 kB).
 
NER/WordDoc/__pycache__/wordDoc.cpython-311.pyc CHANGED
Binary files a/NER/WordDoc/__pycache__/wordDoc.cpython-311.pyc and b/NER/WordDoc/__pycache__/wordDoc.cpython-311.pyc differ
 
NER/__pycache__/cleanText.cpython-310.pyc ADDED
Binary file (3.42 kB).
 
NER/__pycache__/cleanText.cpython-311.pyc CHANGED
Binary files a/NER/__pycache__/cleanText.cpython-311.pyc and b/NER/__pycache__/cleanText.cpython-311.pyc differ
 
NER/html/__pycache__/extractHTML.cpython-310.pyc ADDED
Binary file (5.07 kB).
 
NER/html/__pycache__/extractHTML.cpython-311.pyc CHANGED
Binary files a/NER/html/__pycache__/extractHTML.cpython-311.pyc and b/NER/html/__pycache__/extractHTML.cpython-311.pyc differ
 
NER/word2Vec/__pycache__/word2vec.cpython-310.pyc ADDED
Binary file (7.81 kB).
 
NER/word2Vec/__pycache__/word2vec.cpython-311.pyc CHANGED
Binary files a/NER/word2Vec/__pycache__/word2vec.cpython-311.pyc and b/NER/word2Vec/__pycache__/word2vec.cpython-311.pyc differ
 
NER/word2Vec/heuristic.py ADDED
@@ -0,0 +1,52 @@
+ import logging
+ from datetime import datetime
+
+ class HeuristicManager:
+     def __init__(self, model, log_file="heuristic_log.txt", min_similarity_threshold=0.5, min_new_data_len=50):
+         self.model = model
+         self.min_similarity_threshold = min_similarity_threshold
+         self.min_new_data_len = min_new_data_len
+         self.log_file = log_file
+         logging.basicConfig(filename=self.log_file, level=logging.INFO)
+
+     def log(self, message):
+         timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+         logging.info(f"[{timestamp}] {message}")
+         print(f"[{timestamp}] {message}")
+
+     def check_similarity(self, test_terms):
+         triggers = []
+         for term in test_terms:
+             try:
+                 sim = self.model.wv.most_similar(term)[0][1]
+                 if sim < self.min_similarity_threshold:
+                     triggers.append(f"Low similarity for '{term}': {sim}")
+             except KeyError:
+                 triggers.append(f"'{term}' not in vocabulary")
+         return triggers
+
+     def check_metadata(self, metadata):
+         triggers = []
+         if any(keyword in str(metadata).lower() for keyword in ["haplogroup b", "eastasia", "asian"]):
+             triggers.append("Detected new haplogroup or regional bias: 'Asian' or 'B'")
+         return triggers
+
+     def check_new_data_volume(self, new_data):
+         if len(new_data) < self.min_new_data_len:
+             return ["Not enough new data to justify retraining"]
+         return []
+
+     def should_retrain(self, test_terms, new_data, metadata):
+         triggers = []
+         triggers += self.check_similarity(test_terms)
+         triggers += self.check_metadata(metadata)
+         triggers += self.check_new_data_volume(new_data)
+
+         if triggers:
+             self.log("Retraining triggered due to:")
+             for trigger in triggers:
+                 self.log(f" - {trigger}")
+             return True
+         else:
+             self.log("No retraining needed.")
+             return False
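A minimal usage sketch for the HeuristicManager added above (not part of this commit). It assumes a gensim Word2Vec model such as the one saved by word2Vec.trainWord2Vec() and that NER/word2Vec is importable as a package; the paths, corpus and metadata below are illustrative only:

    # illustrative only: wire HeuristicManager to an existing gensim model
    from gensim.models.word2vec import Word2Vec
    from NER.word2Vec.heuristic import HeuristicManager

    model = Word2Vec.load("NER/word2Vec/testModel/test_model.model")   # hypothetical path
    manager = HeuristicManager(model, min_similarity_threshold=0.5, min_new_data_len=50)

    new_corpus = [["mtdna", "haplogroup", "b", "sample", "eastasia"]]  # hypothetical new sentences
    metadata = {"region": "EastAsia", "haplogroup": "B"}               # hypothetical article metadata
    if manager.should_retrain(["mtdna", "haplogroup"], new_corpus, metadata):
        # incrementally update the existing model with the new sentences
        model.build_vocab(new_corpus, update=True)
        model.train(new_corpus, total_examples=len(new_corpus), epochs=model.epochs)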
NER/word2Vec/testModel/test_model.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:734185116a1d2099dba0d04efc0eb1b7e0e8213fe1259b57bbcb7aaac3cd46ea
+ size 133
NER/word2Vec/testModel/test_model.txt ADDED
@@ -0,0 +1,25 @@
+ 24 100
+ dna -0.0005385255 0.0002430238 0.005111818 0.009016951 -0.009293036 -0.007109866 0.0064572324 0.008987154 -0.0050192317 -0.0037659889 0.0073785 -0.0015431087 -0.0045221853 0.006557529 -0.004854595 -0.0018278129 0.002881375 0.0010002495 -0.00829578 -0.009462763 0.007312361 0.0050688535 0.0067577288 0.0007685764 0.006347226 -0.003397316 -0.0009421973 0.0057741464 -0.007532499 -0.0039303782 -0.0075064874 -0.0009439946 0.009533595 -0.0073319245 -0.002333888 -0.0019326513 0.0080786925 -0.005930193 3.549824e-05 -0.00475331 -0.0095964745 0.005000012 -0.008770563 -0.0043735923 -2.9246534e-05 -0.00030931013 -0.007669701 0.009599569 0.004982613 0.009233704 -0.008148657 0.004488859 -0.0041414667 0.00081141765 0.008487031 -0.00446156 0.0045125154 -0.006793622 -0.0035560841 0.009394251 -0.0015774865 0.00032431752 -0.004129968 -0.0076763057 -0.0015165819 0.0024841889 -0.00088440755 0.0055526863 -0.0027446826 0.002259023 0.0054701897 0.008356409 -0.0014508999 -0.009201209 0.004375452 0.00058271736 0.0074576377 -0.00080706284 -0.0026372937 -0.008752899 -0.00087625836 0.00282087 0.005398569 0.0070530027 -0.0057170955 0.0018605916 0.006099475 -0.0048024287 -0.003104349 0.0067992285 0.0016360026 0.00019302641 0.00348545 0.00021818833 0.009630539 0.0050670514 -0.008908632 -0.007042295 0.0009007676 0.0063867364
+ from -0.00861988 0.0036778022 0.005193427 0.005744547 0.0074751326 -0.0061739217 0.0011082628 0.0060625207 -0.0028567386 -0.006184132 -0.00041290926 -0.008384168 -0.0055893976 0.007104685 0.003362318 0.007228353 0.0068033817 0.007533677 -0.003792071 -0.000581891 0.0023577819 -0.0045196284 0.008395244 -0.009858517 0.006761404 0.0029261683 -0.004930935 0.0043925527 -0.0017370671 0.006713542 0.009974645 -0.0043735756 -0.0006050642 -0.005716478 0.003858548 0.002799571 0.00690247 0.00610934 0.009526547 0.009269763 0.007910428 -0.007008808 -0.00916451 -0.00033672128 -0.0030898354 0.007890073 0.005923819 -0.001552973 0.001516021 0.0017856265 0.007822941 -0.009514211 -0.00020886083 0.0034666678 -0.00094713847 0.008384139 0.009009283 0.0065234327 -0.0007208324 0.007705209 -0.00853289 0.0032079336 -0.004625999 -0.0050743804 0.0035901158 0.005388813 0.007766254 -0.005744939 0.0074327383 0.006626378 -0.003704473 -0.008735958 0.005445474 0.0065230317 -0.000784768 -0.006700798 -0.007075852 -0.002488528 0.0051543443 -0.0036620772 -0.00938257 0.003815971 0.004890136 -0.0064404616 0.0012033634 -0.0020763231 2.994902e-05 -0.0098790005 0.002700701 -0.004756241 0.0011076172 -0.0015674155 0.0022046466 -0.00787344 -0.0027070795 0.002668326 0.0053478787 -0.002396734 -0.009512201 0.0045024394
+ mtdna 8.645293e-05 0.003076037 -0.006815487 -0.0013743688 0.0076927417 0.0073529496 -0.0036715195 0.0026677884 -0.008309281 0.00619759 -0.00463892 -0.0031715294 0.009313415 0.00088058383 0.0074962615 -0.00608139 0.005167896 0.009930803 -0.008471472 -0.0051321597 -0.007057574 -0.0048644566 -0.003772668 -0.008518714 0.0079532955 -0.0048361127 0.008438283 0.005270068 -0.0065578814 0.0039592343 0.005482614 -0.007444929 -0.0074228924 -0.002492343 -0.008628872 -0.0015748737 -0.00038757667 0.0032959366 0.0014325404 -0.00088083016 -0.005591098 0.0017297626 -0.00089552783 0.0068030986 0.0039881677 0.004533183 0.0014284542 -0.0027126821 -0.0043595196 -0.0010315293 0.0014437438 -0.0026617546 -0.0070882514 -0.007825746 -0.009136036 -0.005931676 -0.001850123 -0.004323682 -0.0064626597 -0.0037265678 0.004296681 -0.0037233941 0.008404572 0.001539496 -0.007246572 0.009443451 0.007636867 0.0055208146 -0.0068550883 0.0058190743 0.004034045 0.005188155 0.0042629624 0.0019477821 -0.003167882 0.008342064 0.009619138 0.0038047181 -0.0028461283 5.6938893e-07 0.0012001555 -0.0084682545 -0.008234347 -0.00023238244 0.0012304098 -0.005750644 -0.0047139754 -0.0073490315 0.008316314 0.00010242269 -0.004513882 0.005704978 0.009199796 -0.004097329 0.007985275 0.005386452 0.0058861696 0.0005043713 0.008208188 -0.0070221694
+ in -0.008226077 0.009303831 -0.00018710589 -0.0019704443 0.0046143015 -0.004104392 0.0027394402 0.006979235 0.0060486975 -0.0075411424 0.00939576 0.00465202 0.004012172 -0.006245291 0.008499353 -0.002164537 0.008836197 -0.005347778 -0.008136817 0.006804632 0.0016640095 -0.0022142953 0.009522269 0.009494823 -0.0097868545 0.0025105644 0.0061560757 0.0038842657 0.0020310257 0.00043876152 0.00068163266 -0.0038464246 -0.007141551 -0.0020813115 0.003930752 0.008838634 0.009274302 -0.0059668766 -0.009419525 0.009759848 0.0034291998 0.005158939 0.006265811 -0.0027623416 0.007310359 0.0027998323 0.0028576967 -0.0023982434 -0.003139742 -0.0023701421 0.0042809984 4.8589092e-05 -0.009614385 -0.00968607 -0.006160773 -0.00011437661 0.0019819876 0.009428 0.0056011924 -0.004298171 0.00026028603 0.004974084 0.007744428 -0.001135339 0.004278759 -0.0057750097 -0.0008068469 0.00811882 -0.002369315 -0.009674972 0.0058119837 -0.0039038642 -0.001220125 0.010017389 -0.002241946 -0.0047185957 -0.0053141676 0.0069846674 -0.005741993 0.002120917 -0.0052751247 0.00613608 0.0043662013 0.0026298608 -0.0015129133 -0.002735619 0.008999614 0.0052172863 -0.0021470466 -0.009465257 -0.007413552 -0.0010587372 -0.00078251073 -0.0025414668 0.009710779 -0.00044944565 0.005915 -0.007467981 -0.0024928953 -0.005583053
+ european -0.007147033 0.0012623417 -0.007189088 -0.0022513974 0.0037773554 0.005857864 0.0012027922 0.0021598793 -0.004109796 0.007198152 -0.006319537 0.0046250015 -0.008186181 0.0020334523 -0.0049318667 -0.0042960607 -0.0030848773 0.0056965156 0.0057683894 -0.004991361 0.00076802005 -0.008515792 0.0078122346 0.009295911 -0.002746969 0.0008081935 0.0007694419 0.00550255 -0.008630911 0.0006062931 0.0068933573 0.0021813295 0.0010798875 -0.009366349 0.008471645 -0.006258249 -0.0029761735 0.0035168754 -0.00078163494 0.0014152499 0.0017921324 -0.006839617 -0.009737293 0.009092817 0.0062128166 -0.00694695 0.0033956417 0.00017217748 0.004755041 -0.0071203653 0.004067516 0.004303939 0.009927 -0.0045391554 -0.0014395243 -0.0073114103 -0.009704934 -0.009090646 -0.0010375449 -0.0065315044 0.0048550633 -0.006148244 0.0026037877 0.000752482 -0.0034296552 -0.00092229253 0.010017935 0.009206015 -0.004494388 0.009070265 -0.0055859834 0.0059493524 -0.0030818144 0.0034673577 0.003029479 0.0069394265 -0.0023470228 0.008820008 0.0075530927 -0.009551933 -0.008064042 -0.007652859 0.0029148757 -0.0027951996 -0.00694831 -0.008136711 0.008356287 0.0019903474 -0.00933717 -0.004817203 0.0031394493 -0.0046995636 0.005327329 -0.0042287502 0.0027155946 -0.008033582 0.0062630265 0.0047997306 0.00079031993 0.0029888113
+ common -0.008722234 0.0021272295 -0.0008539916 -0.009321866 -0.0094246445 -0.001412531 0.0044288053 0.00372704 -0.006505282 -0.006894708 -0.0049991854 -0.0023061878 -0.007229156 -0.009607243 -0.0027377736 -0.008360431 -0.0060269493 -0.005675304 -0.00234906 -0.0017278373 -0.008954683 -0.000731004 0.008155364 0.007693106 -0.007208155 -0.003644954 0.0031189725 -0.009568674 0.0014795078 0.0065395026 0.0057490384 -0.008770905 -0.0045228535 -0.008156553 4.5400484e-05 0.00927559 0.005980464 0.0050585535 0.0050439127 -0.0032448657 0.009562716 -0.0073605715 -0.0072781076 -0.002255642 -0.00077679846 -0.0032283778 -0.00060498127 0.007476424 -0.00070291053 -0.0016193221 0.002749461 -0.008367007 0.0078366995 0.008528508 -0.009591924 0.0024459555 0.009891981 -0.007673955 -0.006969234 -0.0077365288 0.008389148 -0.00067644875 0.009162579 -0.008137346 0.0037369097 0.0026538277 0.0007320811 0.002340243 -0.007473436 -0.009367513 0.0023810826 0.0061679846 0.007993824 0.005740968 -0.00078188477 0.008307063 -0.009312772 0.0033975116 0.00027130058 0.003872196 0.007375048 -0.0067289495 0.005584901 -0.0095183 -0.0008194822 -0.008691651 -0.0050952802 0.009296191 -0.0018460032 0.0029113942 0.009088126 0.008946764 -0.008196811 -0.0030016953 0.009896215 0.005113277 -0.0015862831 -0.008699891 0.0029696936 -0.0066840183
+ sequence 0.008134779 -0.0044588344 -0.0010699655 0.001010431 -0.00018677961 0.0011458534 0.0061133304 -1.2402037e-05 -0.0032534893 -0.0015101052 0.0058955555 0.0015073137 -0.0007181427 0.009341042 -0.004917502 -0.0008413052 0.009177319 0.0067567485 0.0015022643 -0.0088886535 0.0011522508 -0.0022903979 0.009365224 0.0012041465 0.0014943897 0.0024040388 -0.0018358674 -0.004996856 0.00023002276 -0.0020175653 0.0066060103 0.008935089 -0.0006746635 0.0029776676 -0.0061099143 0.0017025766 -0.006924371 -0.008690522 -0.005899618 -0.008961226 0.0072769034 -0.005776607 0.00827455 -0.007233702 0.003422895 0.009676102 -0.0077943387 -0.009949275 -0.0043248134 -0.0026828882 -0.0002740396 -0.008833413 -0.008620106 0.0027985822 -0.008205106 -0.009067738 -0.0023404285 -0.00863584 -0.007056119 -0.008398832 -0.0003011976 -0.0045611723 0.006630901 0.0015288803 -0.0033471577 0.006116343 -0.0060124504 -0.004648673 -0.0072044823 -0.0043340866 -0.0018032556 0.00649206 -0.0027680297 0.004921421 0.006912646 -0.007459126 0.004573438 0.006129695 -0.002956148 0.0066218316 0.006121442 -0.0064460207 -0.0067676785 0.002543585 -0.0016248615 -0.006062931 0.009498339 -0.005135456 -0.006549685 -0.000118091535 -0.002699267 0.00044816377 -0.0035289875 -0.00041692218 -0.00070437486 0.00083035015 0.0081978375 -0.005737508 -0.0016556873 0.005569238
+ bru18 0.008155276 -0.0044185193 0.008987652 0.008259665 -0.0044238693 0.00031090993 0.004277394 -0.0039252234 -0.0055654007 -0.006509729 -0.0006656875 -0.00030213682 0.004489389 -0.0024855223 -0.00015437756 0.0024471143 0.0048732683 -2.8606542e-05 -0.0063628056 -0.009279111 1.8654398e-05 0.006667726 0.0014650559 -0.0089674555 -0.007945727 0.006548857 -0.0037690091 0.006254232 -0.0067004655 0.008482541 -0.0065189763 0.0032740948 -0.001067833 -0.0067885593 -0.0032949874 -0.0011434925 -0.005471747 -0.001204045 -0.0075744605 0.0026601462 0.009080238 -0.0023750134 -0.0009867329 0.0035252234 0.008680149 -0.0059299506 -0.006889695 -0.002942458 0.00913801 0.0008666254 -0.008663911 -0.001442217 0.009477263 -0.0075691855 -0.0053729587 0.009308613 -0.008970956 0.0038234547 0.00065334333 0.0066515543 0.008311967 -0.002862157 -0.003982641 0.008891435 0.0020839446 0.0062542376 -0.009450494 0.0095988605 -0.0013514485 -0.006062315 0.0029950105 -0.0004512243 0.0047055846 -0.0022705523 -0.004145877 0.0022992992 0.008370594 -0.004990823 0.0026696166 -0.00798221 -0.0067810714 -0.000469271 -0.008768882 0.0027844147 0.0015907697 -0.0023179457 0.005011737 0.009743466 0.008472866 -0.001870301 0.0020416898 -0.0039901678 -0.008234559 0.0062697986 -0.0019247098 -0.00066059735 -0.0017619281 -0.004536765 0.004069 -0.0042896206
+ bru50 -0.009579504 0.008948466 0.0041579367 0.00923892 0.006649052 0.0029269105 0.009801864 -0.0044190143 -0.0068119396 0.004226486 0.0037328962 -0.005664456 0.009715384 -0.0035591167 0.009558758 0.00083636935 -0.006334789 -0.0019748765 -0.007390546 -0.002990235 0.0010405012 0.009480547 0.009361016 -0.0065955063 0.0034724285 0.0022746115 -0.0024764987 -0.009228658 0.0010185506 -0.008164371 0.0063289437 -0.0058100903 0.005530614 0.009826734 -0.00015984276 0.0045368825 -0.0018012718 0.0073676347 0.0039300686 -0.0090082595 -0.0023973046 0.0036249864 -0.00010732573 -0.0011888575 -0.0010430571 -0.0016724848 0.00059902505 0.0041630277 -0.004250072 -0.0038341933 -5.2427928e-05 0.00026678806 -0.00017553278 -0.0047934647 0.0043008197 -0.002173452 0.0020970574 0.00065915886 0.005959963 -0.0068526124 -0.00680708 -0.004473089 0.009448878 -0.001590459 -0.009438289 -0.000534792 -0.0044530216 0.0060103727 -0.009585406 0.002857136 -0.009246552 0.001258808 0.0059965253 0.0074065947 -0.007623657 -0.0060443347 -0.006831209 -0.007910946 -0.009496376 -0.0021281417 -0.0008362788 -0.007265241 0.0067816544 0.0011141741 0.0058228294 0.0014675015 0.00078702695 -0.007366497 -0.0021715113 0.0043177926 -0.005089294 0.001137756 0.0028883398 -0.0015285894 0.009943532 0.008348668 0.0024183327 0.007110643 0.005890512 -0.005592114
+ vietnam -0.005153963 -0.0066644135 -0.007776157 0.0083126435 -0.0019782323 -0.006856599 -0.004155673 0.0051580225 -0.0028790692 -0.0037560624 0.0016262402 -0.00278304 -0.001570952 0.0010760438 -0.002967586 0.008515032 0.003917556 -0.009953211 0.0062494674 -0.0067655 0.00076895714 0.0043992978 -0.005096968 -0.0021128112 0.00809259 -0.0042428537 -0.0076304777 0.009258844 -0.0021577128 -0.004717085 0.008580298 0.004269408 0.004324098 0.009280228 -0.008452614 0.0052631963 0.0020472223 0.004193831 0.0016919046 0.004460046 0.0044873925 0.0060984488 -0.0032084621 -0.0045590503 -0.0004232687 0.002529075 -0.0032731881 0.006051339 0.0041546253 0.00776509 0.002568826 0.008108382 -0.0013972289 0.008070817 0.003707151 -0.008045609 -0.00393531 -0.0024772724 0.004889826 -0.00087688275 -0.00282919 0.007839672 0.009338199 -0.0016121961 -0.0051723607 -0.0046861414 -0.0048465827 -0.0095901145 0.0013706182 -0.0042283125 0.002539541 0.0056244545 -0.00406352 -0.009583576 0.0015531465 -0.006689678 0.0025049727 -0.0037749638 0.007073151 0.00063951715 0.0035553342 -0.0027433916 -0.001711565 0.007655947 0.0014000075 -0.005851 -0.007834303 0.0012315387 0.006458937 0.0055561876 -0.00897213 0.008598417 0.0040550055 0.007476387 0.00975736 -0.007282407 -0.009030263 0.0058277464 0.009392481 0.0034955258
+ sample 0.007100903 -0.0015709094 0.007947078 -0.00948947 -0.00802812 -0.006650821 -0.004002562 0.00500194 -0.0038224515 -0.008330948 0.00841617 -0.0037529538 0.008619977 -0.004892141 0.003931126 0.004920354 0.0023956115 -0.0028135795 0.0028564015 -0.008257614 -0.0027645228 -0.0026008752 0.007249391 -0.0034709626 -0.0066022277 0.0043369113 -0.0004823991 -0.0035912786 0.006893536 0.003869671 -0.0038965137 0.0007677057 0.009145668 0.0077625574 0.0063656354 0.004670941 0.0023901698 -0.0018358309 -0.006370667 -0.00030689163 -0.0015674513 -0.00057719386 -0.0062623145 0.0074473424 -0.0066001806 -0.007243944 -0.0027626618 -0.0015170419 -0.007635178 0.0006969715 -0.005330137 -0.0012829994 -0.007370956 0.0019601034 0.003276234 -1.4737604e-05 -0.005451358 -0.001723771 0.00709824 0.003738 -0.008888436 -0.0034084066 0.0023648455 0.0021412992 -0.009477984 0.004583573 -0.008656226 -0.007383396 0.0034825006 -0.0034719554 0.0035707187 0.008896884 -0.003571185 0.009332037 0.0017215977 0.009857596 0.005704204 -0.009146731 -0.0033407472 0.0065290304 0.0055978918 0.008714949 0.0069304765 0.008049887 -0.009821734 0.004303451 -0.0050309277 0.0035138857 0.0060621244 0.0043927776 0.007520648 0.0014953684 -0.0012639741 0.0057787485 -0.0056348047 4.0551466e-05 0.009468461 -0.005486985 0.0038199269 -0.008121091
+ collected 0.0097750295 0.008170629 0.0012814446 0.0051154387 0.0014172737 -0.006454876 -0.0014259414 0.0064561926 -0.004619688 -0.0039992593 0.004923175 0.0027045405 -0.0018415204 -0.0028716852 0.006021755 -0.005721393 -0.003250512 -0.0064803455 -0.0042360183 -0.008592084 -0.004467861 -0.008505252 0.0013975133 -0.008609542 -0.009919709 -0.008202052 -0.0067797694 0.006683116 0.0037784956 0.0003495915 -0.002959815 -0.007438984 0.0005348175 0.0005005026 0.00019596443 0.0008583165 0.00078985846 -5.4285138e-05 -0.008013045 -0.005872034 -0.00837931 -0.0013207265 0.0018039295 0.0074345516 -0.001966708 -0.0023440684 0.009481904 7.425008e-05 -0.0023982543 0.008607863 0.0026964454 -0.0053582233 0.0065950346 0.0045082304 -0.0070585674 -0.00031050213 0.00083163293 0.005739447 -0.0017207591 -0.0028131874 0.0017429565 0.00085032795 0.0012085037 -0.002637083 -0.0060016937 0.007339091 0.0075857476 0.00830421 -0.008602928 0.0026385786 -0.0035621128 0.0096288975 0.0029010975 0.004643974 0.0023910597 0.006626162 -0.005746352 0.007899223 -0.0024186398 -0.0045691207 -0.0020768652 0.009735589 -0.0068560173 -0.0021970137 0.006994984 -4.366915e-05 -0.0062879827 -0.006398747 0.008941079 0.0064397687 0.004773856 -0.003261329 -0.009269935 0.0038002136 0.0071752095 -0.0056398017 -0.007860231 -0.0029721109 -0.0049388385 -0.0023143636
+ europe -0.0019466967 -0.005264445 0.009446078 -0.009301849 0.00450806 0.005410841 -0.0014122794 0.009008321 0.009883694 -0.0054709506 -0.0060238987 -0.006749262 -0.007891144 -0.0030501 -0.00559189 -0.008350158 0.000785714 0.002999436 0.0064088805 -0.0026336086 -0.0044599404 0.0012484614 0.00038998463 0.008114584 0.00018636887 0.0072303875 -0.008259172 0.008436813 -0.0018950498 0.008705898 -0.007616939 0.0017924334 0.0010528992 4.4615095e-05 -0.005109563 -0.009249746 -0.0072665187 -0.007951877 0.0019136231 0.00048003704 -0.0018163731 0.007123826 -0.0024782037 -0.0013449806 -0.008898934 -0.0099250255 0.008953352 -0.0057566464 -0.006378906 0.0052002883 0.0066733453 -0.0068328637 0.000956345 -0.0060142023 0.0016413335 -0.004295812 -0.0034417375 0.0021831726 0.008657248 0.0067267795 -0.00967649 -0.0056275628 0.007884859 0.0019889344 -0.0042598336 0.0006024022 0.009526292 -0.0011015745 -0.009430234 0.0016114928 0.0062343916 0.00628738 0.0040935944 -0.0056507527 -0.000374705 -4.9610684e-05 0.004579015 -0.0080420235 -0.008019654 0.0002663556 -0.008607854 0.005816331 -0.00042231655 0.00997148 -0.0053460747 -0.00048954826 0.0077552027 -0.004073562 -0.0050113807 0.0015921831 0.0026467363 -0.0025611357 0.006453244 -0.0076659652 0.003398472 0.00049256504 0.008736541 0.0059848153 0.006820848 0.007819741
+ ancient -0.00949331 0.009558393 -0.0077741044 -0.0026378995 -0.0048897555 -0.0049655624 -0.008022211 -0.007766241 -0.0045622233 -0.0012816157 -0.0051147 0.0061208857 -0.009519694 -0.005296118 0.009434444 0.0069931676 0.0076746074 0.0042455657 0.0005105317 -0.0060022003 0.006030395 0.002638317 0.007692142 0.0063923756 0.0079497155 0.008663229 -0.009898174 -0.006753931 0.0013303582 0.0064388 0.0073839277 0.0055065546 0.007657052 -0.0051452103 0.006578382 -0.004109781 -0.009049926 0.009156881 0.0013312489 -0.0027684697 -0.0024686211 -0.004237798 0.004802247 0.00442113 -0.0026455545 -0.0073452652 -0.0035828727 -0.00034474322 0.006112652 -0.0028318586 -0.00011603545 0.0008713841 -0.007088451 0.0020616641 -0.0014378024 0.0028043352 0.0048393123 -0.0013679614 -0.0027919079 0.0077378284 0.005049118 0.006718327 0.0045309924 0.00867961 0.0074680797 -0.0010581953 0.008750674 0.0046186065 0.0054406407 -0.0013790869 -0.0020325198 -0.0044157715 -0.008505952 0.0030342783 0.008892043 0.0089222565 -0.0019243953 0.0060931933 0.0037896668 -0.0043041655 0.002026212 -0.005454141 0.008199508 0.005422219 0.003183278 0.0041012214 0.008660769 0.007268954 -0.0008326238 -0.0070764753 0.008396081 0.0072427383 0.0017482204 -0.0013339228 -0.0058783586 -0.004530154 0.008643081 -0.003131084 -0.006341318 0.009878559
+ neanderthal 0.007692736 0.009126856 0.001134214 -0.008323363 0.008438394 -0.0036978398 0.005743373 0.0044079996 0.0096743805 -0.009301011 0.009201668 -0.009297726 -0.0068989955 -0.009099583 -0.0055382987 0.0073707746 0.009167804 -0.0033190295 0.0037136457 -0.0036417823 0.007886165 0.0058672884 4.5112392e-06 -0.0036315187 -0.0072244583 0.0047761244 0.0014634884 -0.002615084 0.007832942 -0.004045295 -0.00913638 -0.0022702827 0.00011177889 -0.006659164 -0.0054871286 -0.008484606 0.00924395 0.0074312175 -0.00030530593 0.0073675984 0.0079630045 -0.0007988404 0.0066030715 0.0037836921 0.0050928146 0.0072574555 -0.004751798 -0.0021930316 0.00087973 0.0042327694 0.0033078827 0.0050869007 0.004582786 -0.008444151 -0.0031969673 -0.007233252 0.009679768 0.0049946425 0.0001599608 0.0041068383 -0.0076482734 -0.0062929546 0.003092239 0.006544919 0.0039503933 0.006035828 -0.0019895614 -0.0033235473 0.00020525315 -0.0031931365 -0.005507259 -0.0077802544 0.0065467777 -0.0010795805 -0.0018928167 -0.007799526 0.009349405 0.00087477046 0.0017788016 0.0024914553 -0.0073950374 0.0016234348 0.0029714536 -0.008580277 0.0049522887 0.0024255016 0.0074964412 0.0050449395 -0.0030210917 -0.0071717766 0.007105708 0.0019140064 0.005210298 0.0063858717 0.0019259832 -0.0061174775 -5.528207e-06 0.008260976 -0.0060965912 0.009431074
+ modern -0.0071792696 0.0042354544 0.00216289 0.007438057 -0.0048900596 -0.0045788498 -0.0060949842 0.0033097882 -0.004507435 0.008506253 -0.0042799306 -0.009108578 -0.0047961376 0.0064152437 -0.006351414 -0.0052630682 -0.007296127 0.006024725 0.003365447 0.0028487756 -0.0031356772 0.00602019 -0.0061529716 -0.001984372 -0.0059886468 -0.0009987217 -0.0020279228 0.008489572 9.179515e-05 -0.0085772425 -0.0054273363 -0.0068765874 0.0026914866 0.00946441 -0.0058075436 0.008274624 0.008538083 -0.007054826 -0.008883825 0.009470304 0.008378029 -0.0046964334 -0.0067229234 0.007853816 0.003754884 0.008087255 -0.0075793806 -0.009526273 0.0015759452 -0.009809055 -0.004886255 -0.003462314 0.009610498 0.008620381 -0.002831389 0.005837147 0.008235405 -0.002257783 0.009542199 0.0071611865 0.0020309114 -0.0038430467 -0.005072538 -0.00304804 0.007877576 -0.0061799455 -0.0029184332 0.009190523 0.003460949 0.0060627563 -0.008025261 -0.00075433304 0.0055211782 -0.0046972577 0.0074892025 0.009333807 -0.00041072394 -0.0020574103 -0.00060545607 -0.0057792794 -0.0083910655 -0.0014910942 -0.0025447267 0.0043934747 -0.006866489 0.00542165 -0.006739068 -0.0078106844 0.008480591 0.008917766 -0.0034737175 0.0034897032 -0.005797486 -0.008738294 -0.0055089584 0.0067478465 0.0064329007 0.009427363 0.007059985 0.0067415633
+ human 0.0013073076 -0.009817197 0.0046000797 -0.00054215814 0.0063516907 0.0017917434 -0.0031376705 0.00779152 0.0015605913 4.5087592e-05 -0.004629277 -0.008477088 -0.0077653346 0.00868444 -0.0089293 0.009021215 -0.009282701 -0.00026340262 -0.0019013402 -0.008945062 0.008634705 0.006775237 0.0030073978 0.00484689 0.000119797296 0.009438227 0.007017406 -0.009846283 -0.0044378787 -0.0012810889 0.0030511408 -0.0043373024 0.0014413317 -0.007862512 0.002772104 0.0047001 0.004937028 -0.0031820575 -0.008430869 -0.009233454 -0.00072350266 -0.007335406 -0.0068239835 0.006137866 0.0071648457 0.0021028868 -0.00790615 -0.0057202103 0.008053211 0.0039317366 -0.0052275606 -0.007412702 0.00076265965 0.0034572822 0.002076003 0.0031028383 -0.0056280685 -0.0099016195 -0.0070258062 0.00023322599 0.0046109683 0.004535595 0.0018992841 0.0051839855 -0.000116945404 0.004136494 -0.009110944 0.0077172276 0.0061438708 0.0051303217 0.0072363587 0.0084579345 0.00074768433 -0.0017087719 0.0005303956 -0.009314834 0.008429295 -0.0063797934 0.008425091 -0.0042409054 0.0006248087 -0.009168093 -0.009569658 -0.007833339 -0.0077458574 0.00037962993 -0.0072201644 -0.004963075 -0.0052754995 -0.004289475 0.0070301695 0.004834569 0.008708495 0.0070971223 -0.0056847483 0.007253502 -0.009290819 -0.0025857396 -0.007757146 0.0042008474
+ genome 0.0018013249 0.0070483726 0.002941503 -0.006984167 0.0077269375 -0.005990631 0.008982948 0.0029859466 -0.0040263417 -0.0046959417 -0.004423949 -0.006166649 0.009397486 -0.0026410713 0.00779025 -0.009682492 0.0021134273 -0.001217051 0.007545118 -0.009060286 0.007431912 -0.005112224 -0.006022511 -0.0056468663 -0.0033655176 -0.0034046597 -0.0031906026 -0.007475777 0.0007148267 -0.0005725245 -0.0016790004 0.0037438255 -0.00763313 -0.0032234066 0.00514847 0.00855509 -0.009791086 0.0071872775 0.0052953 -0.003874173 0.008570203 -0.009222292 0.0072385296 0.0053781155 0.0012898272 -0.0051951176 -0.004179599 -0.003369767 0.0015944163 0.001581598 0.007396833 0.0099602975 0.008836587 -0.004008733 0.009636086 -0.00063042255 0.0048575792 0.0025363516 -0.0006256454 0.0036644523 -0.005330011 -0.0057551167 -0.007577021 0.0019176035 0.006513916 0.00090115983 0.0012633507 0.0031810037 0.008123854 -0.007687061 0.0022752027 -0.007455608 0.003715618 0.009514587 0.0075186947 0.006441567 0.008026117 0.006552105 0.0068467325 0.00869257 -0.0049556913 0.009209661 0.0050575286 -0.0021248695 0.008474546 0.005080482 0.009641399 0.0028190457 0.009884555 0.001195692 0.009130684 0.0035973836 0.006580412 -0.00361116 0.0068057566 0.007250423 -0.002115621 -0.0018615718 0.003625693 -0.0070385
+ shows 0.009741375 -0.009785563 -0.006502033 0.0027767855 0.0064354893 -0.005370729 0.0027519849 0.009131747 -0.006819064 -0.0061066505 -0.0049928115 -0.00368126 0.0018522884 0.009683641 0.00644354 0.00039165124 0.0024744181 0.00844649 0.009138178 0.005629969 0.005943013 -0.007629522 -0.0038295696 -0.005683565 0.0061836103 -0.00225932 -0.008786562 0.0076284255 0.008406309 -0.0033179314 0.009119112 -0.00073907804 -0.0036286868 -0.0003802314 0.00019241076 -0.0035078088 0.0028134247 0.005731432 0.006873956 -0.008905951 -0.0021951643 -0.0054816343 0.0075234827 0.0065075015 -0.0043688817 0.002324414 -0.0059516523 0.00023538349 0.00945961 -0.0026105444 -0.0051873005 -0.0074033006 -0.0029152564 -0.0008664178 0.0035291065 0.009743326 -0.0033921245 0.001903681 0.009692432 0.0015337794 0.0009810732 0.009802843 0.00930645 0.007710903 -0.006179333 0.009991138 0.005857104 0.009073708 -0.002001237 0.0033512171 0.0068392376 -0.0038913293 0.006648019 0.0025668114 0.009319553 -0.0030298685 -0.0031094935 0.0062168743 -0.00908894 -0.0072543155 -0.006503641 -0.00074380165 -0.002362113 0.0068256087 0.009239293 -0.00091146474 0.0014132133 0.002020571 -0.0020174456 -0.008035576 0.007445874 -0.004299319 0.004580612 0.009090945 0.0030486963 0.00313993 0.0040727276 -0.0027017219 0.0038345656 0.00033530922
+ variation 0.005626712 0.005497371 0.0018291199 0.0057494068 -0.008968078 0.0065593575 0.009225992 -0.0042071473 0.0016075504 -0.0052338815 0.0010582185 0.0027701687 0.008160736 0.00054401276 0.0025570584 0.001297735 0.008402523 -0.0057077026 -0.00626183 -0.0036275184 -0.0023005498 0.005041063 -0.008120357 -0.0028335357 -0.008197427 0.00514971 -0.0025680638 -0.009067107 0.0040717293 0.009017323 -0.0030376601 -0.0058385395 0.0030198884 -0.00043584823 -0.009979436 0.008417704 -0.0073388875 -0.004930407 -0.002657081 -0.0054523144 0.00171651 0.009712814 0.0045722723 0.008088603 -0.00047045827 0.0006449234 -0.002668352 -0.008779561 0.0034313034 0.0020933736 -0.009421854 -0.004968437 -0.009734099 -0.0057197916 0.0040645422 0.008642861 0.00411165 0.0023884643 0.008144778 -0.0011192096 -0.0013977134 -0.008746823 -0.00012579202 -0.0025675725 0.00038607715 0.007279662 -0.0070414604 -0.0039464748 -0.0066646053 -0.0035441148 -0.0033158315 0.002137121 0.0033281683 -0.004957187 -0.0045462907 0.0011386942 0.0054534827 0.0053736498 -0.0029685367 -0.0042665256 -0.005616647 -0.00054498314 0.001946373 0.0015253461 0.0073525296 -0.0027333724 -6.592393e-05 -0.0055276332 -0.0011700654 -0.0077119637 -0.0009593296 0.0013096749 -0.008594744 0.0087485835 -0.009207866 -0.009624677 -0.008511624 0.0073132683 0.0054655685 0.009249462
+ haplogroup 0.0025659278 0.00085168 -0.0025371916 0.00934742 0.0028080416 0.0041162586 -0.0011815964 0.00096541416 0.0066110776 -0.00074895076 0.0033208325 -0.00070219487 0.0052740807 0.003645613 0.0026175152 -0.0053456044 -0.004693721 0.004352339 -0.0059164464 -0.00020070269 -0.0006396672 0.0034715144 -0.008427317 0.0088428045 -0.0014485243 -0.005307692 0.0040584584 -0.001898596 -0.007778139 -0.0044734394 -0.0003679351 -0.0089815045 0.0005416724 0.002407686 -0.003227299 0.0025667753 0.0024930644 0.009990179 0.0014140693 0.0020159276 0.0027784512 -0.0020868885 -0.008718105 0.008073382 -0.0019698895 -0.009723993 -0.006550278 -0.0039781313 0.003948964 0.0050270366 0.0061098747 -0.006815141 0.00066107995 -0.0028290635 -0.0052407067 0.006984182 0.0039222264 -0.003121762 -0.008263934 -0.0051569464 -0.00065567193 0.0078113875 0.006122021 -0.008424067 -0.0096058855 0.0071855173 -0.0022900787 -0.0036282074 0.005704672 -0.0058300486 0.005136189 -0.00020829153 -0.0068513798 -0.00030139415 0.006364283 0.009325248 0.0022419153 0.0050703404 -0.0050120936 -0.0008110871 -0.005373588 0.0011743606 -0.0017981603 -0.0036161384 -0.0070382343 0.009639485 0.003012655 -0.0022897385 -0.0041911877 0.0076894285 -0.0064663296 0.0031200873 0.0008309826 0.008321212 0.0068888706 -0.0028947534 0.002593874 -0.0016730811 -0.009431767 -0.0026270088
+ h 0.0013225824 0.0065497826 0.009982806 0.009062454 -0.0079781795 0.0065080435 -0.0057147983 -0.0009299061 0.00047654507 0.0065626903 0.0044563343 0.0045750956 0.0095022535 0.00038496728 -0.0060190535 -0.006347197 0.0064362343 -0.005219293 -0.002869563 0.004042792 -0.002286449 -0.006022882 -0.0023193487 0.0012384101 0.0021826315 0.0061027543 -0.005193723 0.003081824 0.0072158594 0.0022087328 0.0054155486 -0.004879429 0.0061283903 -0.007640156 0.0034881763 -0.009306421 -0.0025874602 -0.00905658 -0.0016061858 -0.005364485 -0.0039271545 0.0011356737 0.002771372 -0.0014860439 -0.008151553 -0.0059441784 0.00080055697 -0.0039708167 -0.009422841 -0.0007733177 0.0066586556 0.005949332 -0.0099333245 0.0030846666 -0.006018299 -0.009179041 0.00015740465 -0.0003979007 -0.006993792 -0.0063003623 -0.0024212876 0.0071041975 -0.0074873487 0.0077126683 -0.000499351 0.001135528 0.009489626 0.0047690077 -0.0035878688 0.00373115 0.0035563034 0.0063642766 7.750339e-05 -0.0044055916 0.001321394 -0.005388977 0.0014417345 0.004943775 0.0051506218 0.009180272 -0.0075472356 -0.005428668 0.0064623333 0.0013423576 -0.0066391225 0.0008783591 0.0027003903 -0.0025289776 -0.004963421 0.0049924683 0.009631416 -0.0073435763 -7.912599e-05 -0.0025523733 -0.0063192695 -0.001368983 -0.005227159 0.009048553 -0.005790704 0.003674939
+ is -0.00023357147 0.004226683 0.0021067455 0.009996419 0.0006458492 -0.005461563 -0.0011838758 0.0020920378 -0.0033855627 -0.007853136 -0.005604329 -0.0067612384 0.006366702 0.0039265845 0.008232181 0.0065088123 -0.0061183744 0.002733512 0.008466464 0.0015833755 0.0030677342 0.0058010546 -0.008839754 0.009125629 0.0068226005 0.008512217 -0.0082233 0.0061861346 0.006626654 -0.0013528146 -0.0062799496 0.0053081806 -0.006868758 -0.005337174 0.0035091531 0.008081314 0.008700704 -0.0043939846 -0.0091931205 0.009603682 0.006290027 -0.0039766026 -0.008465367 -0.004691139 -0.0039542373 -0.0032808431 0.0008109401 -0.00030902817 -0.0031103012 -0.005998526 0.009428418 -0.004739384 -0.007274209 0.0076703983 0.0025008747 0.0086274175 -0.004468981 -0.0069012893 0.0009802914 -0.0011801491 -0.009394523 -0.0015968346 0.0030780574 0.006576642 0.0068287384 0.0032347892 -0.0044282703 -0.0018157784 -0.0039494233 0.0057785274 -0.006343468 0.002114367 -0.0013383601 -0.0057999003 -0.007236314 0.0058711045 -0.008345587 -0.00067066104 0.0028193784 0.00773521 -0.007315293 0.003294973 0.009805078 -0.0069755646 -0.003540081 0.005130921 0.005245436 0.0016209023 0.00797557 0.00082546985 0.0018813204 -0.0015988776 -0.008149317 0.0032639706 0.0019852505 -0.008730082 -0.0006569945 7.3046285e-05 -2.6318648e-06 0.008703764
+ mitochondrial -0.002508221 -0.0059015388 0.007485539 -0.007257687 -0.008965709 -0.0017888069 -0.008367486 0.00039139786 0.0019467709 -0.0024699308 -0.00644677 -0.00032192905 -0.0010975264 0.0034935323 0.008127049 0.0058537317 0.008440359 -0.0089677265 0.00944024 -0.002368706 0.008696626 0.0023858226 0.0035850583 -0.0095805535 -0.009488111 0.008984071 -0.002896514 0.0028174375 0.0064166263 -0.00029972216 0.00971954 -0.0010352092 -0.009671927 -0.0070548807 -0.0010439103 -0.008674508 0.0074211163 0.0036188734 -0.00874913 0.008480371 0.008929614 0.0058477637 0.0069070626 -0.009568968 0.0004927428 -0.009223568 -0.0036663204 0.00025142074 -0.0002807199 0.0014672013 0.0032786338 0.0021258853 0.005320648 0.0075189634 -0.005886681 0.007957336 0.005991082 0.009785411 0.0046226517 -0.0033269909 -0.0037473391 -0.00062982703 -0.0016548736 0.009871284 0.0011211695 0.00400867 0.0034179776 -0.008850507 0.006720342 0.008190563 -0.0016650181 0.0023356378 -0.0064802184 -0.006126035 0.0082164975 -0.0030429186 0.0067422306 0.001552869 -0.0019822652 0.0030546081 -0.004023311 -0.0017839139 0.0013798403 0.004887597 -0.0014078929 0.0006583137 -0.007930928 0.00949345 -0.008762073 0.007072499 0.0039040898 -0.0069980817 -0.005295161 -0.007937933 -0.0051285303 0.00707022 0.009641066 0.0021544741 0.0006394228 0.009524309
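The test_model.txt dump above is in the standard word2vec text format: the header "24 100" declares 24 vocabulary entries of 100 dimensions each, followed by one "word value1 ... value100" row per entry. A short, illustrative way to load and query such a file with gensim, mirroring what genSimilar() in word2vec.py does:

    from gensim.models.keyedvectors import KeyedVectors

    # binary=False because the vectors are stored as plain text
    kv = KeyedVectors.load_word2vec_format("NER/word2Vec/testModel/test_model.txt", binary=False)
    print(kv.most_similar(positive=["mtdna"], topn=5))  # nearest neighbours of "mtdna"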
NER/word2Vec/testModel/test_model_updated.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b1b785c79991b857b364ee9863985eaf845087efb1aa40a6b9cfae3b2a50012
+ size 133
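For orientation before the word2vec.py diff below: the intended flow is to train a model from a corpus JSON file and then query it for similar terms. A condensed, illustrative sketch using only the committed method signatures (file paths and names are placeholders; the import assumes the repository layout NER/word2Vec/word2vec.py is importable as a package):

    from NER.word2Vec.word2vec import word2Vec

    wv = word2Vec()
    # nameFile is a corpus JSON file (e.g. built from createCorpusText output)
    wv.trainWord2Vec(nameFile="corpus.json", modelName="test_model",
                     saveFolder="NER/word2Vec/testModel")
    # genSimilar loads the saved text-format vectors and returns the most similar terms
    print(wv.genSimilar("mtdna", "NER/word2Vec/testModel/test_model.txt", n=10))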
NER/word2Vec/word2vec.py CHANGED
@@ -1,3 +1,374 @@
1
  '''WORD TO VECTOR'''
2
  import pandas as pd
3
  import json
@@ -10,8 +381,11 @@ from gensim.test.utils import common_texts
10
  from gensim.models.word2vec import Word2Vec
11
  from gensim.scripts.glove2word2vec import glove2word2vec
12
  from gensim.test.utils import datapath, get_tmpfile
13
  import sys
14
  import subprocess
 
15
  # can try multiprocessing to run quicker
16
  import multiprocessing
17
  import copy
@@ -32,18 +406,19 @@ class word2Vec():
32
  def __init__(self, nameFile=None, modelName=None):
33
  self.nameFile = nameFile
34
  self.modelName = modelName
35
  def spacy_similarity(self, word):
36
  # when use word2vec, try medium or large is better
37
  # maybe try odc similarity?
38
- nlp = spacy.load("en_core_web_lg")
39
- doc = nlp(word)
40
  for token1 in doc:
41
  for token2 in doc:
42
  print(token1.text, token2.text, token1.similarity(token2))
43
  pass
44
  # clean text before transform to corpus
45
  def cleanTextBeforeCorpus(self,oriText, doi=None):
46
- cl = cleanText.cleanGenText()
47
  #cl = cleanGenText()
48
  output = ""
49
  alreadyRemoveDoi = False
@@ -51,7 +426,7 @@ class word2Vec():
51
  # remove DOI
52
  if doi != None and doi in oriText:
53
  if alreadyRemoveDoi == False:
54
- newWord = cl.removeDOI(word,doi)
55
  if len(newWord) > 0 and newWord != word:
56
  alreadyRemoveDoi = True
57
  word = newWord
@@ -59,13 +434,13 @@ class word2Vec():
59
  # split the sticked words
60
  #word = cl.splitStickWords(word)
61
  # remove punctuation
62
- word = cl.removePunct(word,True)
63
  # remove URL
64
- word = cl.removeURL(word)
65
  # remove HTMLTag
66
- word = cl.removeHTMLTag(word)
67
  # remove tab, white space, newline
68
- word = cl.removeTabWhiteSpaceNewLine(word)
69
  # optional: remove stopwords
70
  #word = cl.removeStopWords(word)
71
  if len(word)>0:
@@ -75,16 +450,18 @@ class word2Vec():
75
  cleanOutput = ""
76
  remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
77
  if len(allText) > 0:
78
- corpusText = allText
79
- for pos in range(len(corpusText.split("\n\n"))):
80
- if len(corpusText.split("\n\n")[pos]) > 0:
81
- lines = corpusText.split("\n\n")[pos]
82
  for line in lines.split("\n"):
83
  if remove in line: line = line.replace(remove, "")
84
  clean_text = self.cleanTextBeforeCorpus(line, doi)
85
  cleanOutput += clean_text + "\n"
86
  cleanOutput += "\n\n"
87
  return cleanOutput
88
  def tableTransformToCorpusText(self, df, excelFile=None):
89
  # PDF, Excel, WordDoc
90
  #cl = cleanText.cleanGenText()
@@ -119,10 +496,10 @@ class word2Vec():
119
  try:
120
  df = pd.ExcelFile(excelFile)
121
  except:
122
- if filepath.endswith('.xls'):
123
- df = pd.read_excel(filepath, engine='xlrd')
124
  else:
125
- df = pd.read_excel(filepath, engine='openpyxl')
126
  sheetNames = df.sheet_names
127
  output = []
128
  if len(sheetNames) > 0:
@@ -142,7 +519,7 @@ class word2Vec():
142
  return corpus
143
  def helperRowTableToCorpus(self, textList):
144
  #cl = cleanGenText()
145
- cl = cleanText.cleanGenText()
146
  stopWords = ["NaN","Unnamed:","nan"]
147
  outputDF = []
148
  for line in textList:
@@ -154,9 +531,9 @@ class word2Vec():
154
  # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
155
  if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
156
  #word = cl.splitStickWords(word)
157
- word = cl.removePunct(word)
158
- word = " ".join(cl.removeStopWords(word))
159
- word = cl.removeTabWhiteSpaceNewLine(word)
160
  if len(word) > 1:
161
  if len(word.split(" ")) > 1:
162
  for x in word.split(" "):
@@ -170,7 +547,7 @@ class word2Vec():
170
  return outputDF
171
  def helperColTableToCorpus(self, dfList):
172
  #cl = cleanGenText()
173
- cl = cleanText.cleanGenText()
174
  stopWords = ["NaN","Unnamed:","nan"]
175
  outputDF = []
176
  # use the first length line as the column ref
@@ -186,9 +563,9 @@ class word2Vec():
186
  # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
187
  if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
188
  #word = cl.splitStickWords(word)
189
- word = cl.removePunct(word)
190
- word = " ".join(cl.removeStopWords(word))
191
- word = cl.removeTabWhiteSpaceNewLine(word)
192
  if len(word) > 1:
193
  if len(word.split(" ")) > 1:
194
  for x in word.split(" "):
@@ -216,21 +593,22 @@ class word2Vec():
216
  Mouse is an animal.
217
  Jerry is mouse.'''
218
  texts = {}
219
- cl = cleanText.cleanGenText()
220
  #cl = cleanGenText()
221
- for pos in range(len(corpusText.split("\n\n"))):
222
- if len(corpusText.split("\n\n")[pos]) > 0:
 
223
  texts["Paragraph "+str(pos)] = []
224
- lines = corpusText.split("\n\n")[pos]
225
  for line in lines.split("\n"):
226
  for l in line.split("."):
227
  if len(l) > 0:
228
- cl.removeTabWhiteSpaceNewLine(l)
229
  l = l.lower()
230
  newL = []
231
  for word in l.split(" "):
232
  if len(word) > 0:
233
- word = cl.removeStopWords(word)
234
  for w in word:
235
  if len(w) > 0 and w.isnumeric()==False:
236
  newL.append(w)
@@ -239,49 +617,86 @@ class word2Vec():
239
  if len(texts["Paragraph "+str(pos)]) == 0:
240
  del texts["Paragraph "+str(pos)]
241
  return texts
242
- def selectParaForWC(self,corpus):
243
- ''' corpus should be in the format:
244
- corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
245
- corSize, window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None, None
246
  corSize = len(corpus)
247
- # less than 2000
248
- if 0 < corSize < 2000:
249
- window=3.5
250
- vector_size=75
251
- sample=1e-3
252
- negative=10
253
- epochs=10
254
- sg=1
255
- # 2000 - 100000
256
- elif 2000 <= corSize < 100000:
257
- window=3.5
258
- vector_size=75
259
- sample=1e-5
260
- negative=10
261
- epochs=10
262
- sg=1
263
- elif 100000 <=corSize < 1000000:
264
- window=7.5
265
- vector_size=150
266
- sample=1e-5
267
- negative=10
268
- epochs=6
269
- sg=0
270
  return window, vector_size, sample, negative, epochs, sg
271
- def trainWord2Vec(self,nameFile,modelName,saveFolder,window=3.5,
272
- vector_size=75,sample=1e-3,negative=10,epochs=10,sg=1):
273
- # if you dont have backup file, you can use again the nameFile just to increase the lenght of corpus
 
274
  jsonFile = ""
275
  jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
276
  cores = multiprocessing.cpu_count()
277
  combinedCorpus = []
278
- window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
279
- if len(jsonFile) > 0:
280
- for key in jsonFile:
281
- combinedCorpus.extend(jsonFile[key])
282
  window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
283
- # # min_count=1 ensures all words are included
284
- '''w2vModel = Word2Vec(
285
  min_count=1,
286
  window=window,
287
  vector_size=vector_size,
@@ -291,43 +706,39 @@ class word2Vec():
291
  negative=negative,
292
  workers=cores-1,
293
  epochs = epochs,
294
- sg=sg)'''
295
- #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
296
- accept = False
297
- while not accept:
298
- if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
299
- try:
300
- w2vModel = Word2Vec(
301
- min_count=1,
302
- window=window,
303
- vector_size=vector_size,
304
- sample=sample,
305
- alpha=0.03,
306
- min_alpha=0.0007,
307
- negative=negative,
308
- workers=cores-1,
309
- epochs = epochs,
310
- sg=sg)
311
- w2vModel.build_vocab(combinedCorpus)
312
- w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
313
- accept = True
314
- except:
315
- for key in jsonFile:
316
- combinedCorpus.extend(jsonFile[key])
317
- window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
318
- print("next is " + str(len(combinedCorpus)))
319
- else:
320
- print("no parameter to train")
321
- break
322
- #w2vModel.build_vocab(combinedCorpus)
323
- #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
324
- #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
325
- #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
326
- w2vModel.save(saveFolder+"/"+modelName+".model")
327
- w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
328
- print("done w2v")
329
- else: print("no corpus to train")
330
  #return combinedCorpus
331
  def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
332
  # might not be a meaningful keyword
333
  #stopWords = ["show"]
@@ -354,6 +765,32 @@ class word2Vec():
354
  results.append(moreNewResult)
355
  currN +=1'''
356
  return results
357
  # adding our model into spacy
358
  # this deals with command line; but instead of using it, we write python script to run command line
359
  def loadWordVec(self,modelName,wordVec):
@@ -367,4 +804,5 @@ class word2Vec():
367
  modelName, # this modelName comes from the saved modelName of function trainWord2Vec
368
  "--vectors-loc",
369
  wordVec])
 
370
  print("done")
 
1
+ <<<<<<< HEAD
2
+ '''WORD TO VECTOR'''
3
+ import pandas as pd
4
+ import json
5
+ import gensim
6
+ import spacy
7
+ from DefaultPackages import openFile, saveFile
8
+ from NER import cleanText
9
+ from gensim.models.keyedvectors import KeyedVectors
10
+ from gensim.test.utils import common_texts
11
+ from gensim.models.word2vec import Word2Vec
12
+ from gensim.scripts.glove2word2vec import glove2word2vec
13
+ from gensim.test.utils import datapath, get_tmpfile
14
+ import sys
15
+ import subprocess
16
+ # can try multiprocessing to run quicker
17
+ import multiprocessing
18
+ import copy
19
+ sys.setrecursionlimit(1000)
20
+ # creat folder word2Vec
21
+ #! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
22
+ # create word2vec model
23
+ #model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
24
+ '''Some notes for this model
25
+ sometimes when we do the corpus, there are some adverbs which are unnecessary but might be seen as
26
+ a similar word to the word we are finding, so can we try to preprocess text so that
27
+ we make the corpus more effective and only contains the important words. Then when we
28
+ train the model, the important words will be seen as important. Or
29
+ when we already have the similar list of words, we can remove the words in there
30
+ that are stopwords/unnecessary words.'''
31
+ ### For more complex analysis, consider using sentence embedding models like "Doc2Vec" to represent the meaning of entire sentences instead of just individual words
32
+ class word2Vec():
33
+ def __init__(self, nameFile=None, modelName=None):
34
+ self.nameFile = nameFile
35
+ self.modelName = modelName
36
+ def spacy_similarity(self, word):
37
+ # when use word2vec, try medium or large is better
38
+ # maybe try odc similarity?
39
+ nlp = spacy.load("en_core_web_lg")
40
+ doc = nlp(word)
41
+ for token1 in doc:
42
+ for token2 in doc:
43
+ print(token1.text, token2.text, token1.similarity(token2))
44
+ pass
45
+ # clean text before transform to corpus
46
+ def cleanTextBeforeCorpus(self,oriText, doi=None):
47
+ cl = cleanText.cleanGenText()
48
+ #cl = cleanGenText()
49
+ output = ""
50
+ alreadyRemoveDoi = False
51
+ for word in oriText.split(" "):
52
+ # remove DOI
53
+ if doi != None and doi in oriText:
54
+ if alreadyRemoveDoi == False:
55
+ newWord = cl.removeDOI(word,doi)
56
+ if len(newWord) > 0 and newWord != word:
57
+ alreadyRemoveDoi = True
58
+ word = newWord
59
+ # remove punctuation
60
+ # split the sticked words
61
+ #word = cl.splitStickWords(word)
62
+ # remove punctuation
63
+ word = cl.removePunct(word,True)
64
+ # remove URL
65
+ word = cl.removeURL(word)
66
+ # remove HTMLTag
67
+ word = cl.removeHTMLTag(word)
68
+ # remove tab, white space, newline
69
+ word = cl.removeTabWhiteSpaceNewLine(word)
70
+ # optional: remove stopwords
71
+ #word = cl.removeStopWords(word)
72
+ if len(word)>0:
73
+ output += word + " "
74
+ return output
75
+ def cleanAllTextBeforeCorpus(self, allText, doi=None):
76
+ cleanOutput = ""
77
+ remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
78
+ if len(allText) > 0:
79
+ corpusText = allText
80
+ for pos in range(len(corpusText.split("\n\n"))):
81
+ if len(corpusText.split("\n\n")[pos]) > 0:
82
+ lines = corpusText.split("\n\n")[pos]
83
+ for line in lines.split("\n"):
84
+ if remove in line: line = line.replace(remove, "")
85
+ clean_text = self.cleanTextBeforeCorpus(line, doi)
86
+ cleanOutput += clean_text + "\n"
87
+ cleanOutput += "\n\n"
88
+ return cleanOutput
89
+ def tableTransformToCorpusText(self, df, excelFile=None):
90
+ # PDF, Excel, WordDoc
91
+ #cl = cleanText.cleanGenText()
92
+ corpus = {}
93
+ # PDF or df
94
+ if excelFile == None:
95
+ if len(df) > 0:
96
+ try:
97
+ for i in range(len(df)):
98
+ # each new dimension/page is considered to be a sentence which ends with the period.
99
+ # each new line is a new list, and each new df is a new corpus
100
+ outputDF = []
101
+ text = df[i].values.tolist()
102
+ if len(text) > 0:
103
+ outputRowDF = self.helperRowTableToCorpus(text)
104
+ #outputColDF = self.helperColTableToCorpus(text)
105
+ outputDF.extend(outputRowDF)
106
+ #outputDF.extend(outputColDF)
107
+ if len(outputDF) > 0:
108
+ corpus["corpus" + str(i)] = outputDF
109
+ except:
110
+ outputDF = []
111
+ text = df.values.tolist()
112
+ if len(text) > 0:
113
+ outputRowDF = self.helperRowTableToCorpus(text)
114
+ #outputColDF = self.helperColTableToCorpus(text)
115
+ outputDF.extend(outputRowDF)
116
+ #outputDF.extend(outputColDF)
117
+ if len(outputDF) > 0:
118
+ corpus["corpus0"] = outputDF
119
+ else:
120
+ try:
121
+ df = pd.ExcelFile(excelFile)
122
+ except:
123
+ if filepath.endswith('.xls'):
124
+ df = pd.read_excel(filepath, engine='xlrd')
125
+ else:
126
+ df = pd.read_excel(filepath, engine='openpyxl')
127
+ sheetNames = df.sheet_names
128
+ output = []
129
+ if len(sheetNames) > 0:
130
+ for s in range(len(sheetNames)):
131
+ outputDF = []
132
+ with pd.ExcelFile(excelFile) as xls:
133
+ data = pd.read_excel(xls, sheetNames[s])
134
+ if sheetNames[s] != 'Evaluation Warning':
135
+ text = data.values.tolist()
136
+ if len(text) > 0:
137
+ outputRowDF = self.helperRowTableToCorpus(text)
138
+ #outputColDF = self.helperColTableToCorpus(text)
139
+ outputDF.extend(outputRowDF)
140
+ #outputDF.extend(outputColDF)
141
+ if len(outputDF) > 0:
142
+ corpus["corpus" + str(s)] = outputDF
143
+ return corpus
144
+ def helperRowTableToCorpus(self, textList):
145
+ #cl = cleanGenText()
146
+ cl = cleanText.cleanGenText()
147
+ stopWords = ["NaN","Unnamed:","nan"]
148
+ outputDF = []
149
+ for line in textList:
150
+ outputLine = []
151
+ for words in line:
152
+ words = str(words)
153
+ if len(words) > 0:
154
+ for word in words.split(" "):
155
+ # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
156
+ if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
157
+ #word = cl.splitStickWords(word)
158
+ word = cl.removePunct(word)
159
+ word = " ".join(cl.removeStopWords(word))
160
+ word = cl.removeTabWhiteSpaceNewLine(word)
161
+ if len(word) > 1:
162
+ if len(word.split(" ")) > 1:
163
+ for x in word.split(" "):
164
+ if len(x) > 1 and x.isnumeric()==False:
165
+ outputLine.append(x.lower())
166
+ else:
167
+ if word.isnumeric() == False:
168
+ outputLine.append(word.lower())
169
+ if len(outputLine) > 0:
170
+ outputDF.append(outputLine)
171
+ return outputDF
172
+ def helperColTableToCorpus(self, dfList):
173
+ #cl = cleanGenText()
174
+ cl = cleanText.cleanGenText()
175
+ stopWords = ["NaN","Unnamed:","nan"]
176
+ outputDF = []
177
+ # use the first length line as the column ref
178
+ for pos in range(len(dfList[0])):
179
+ outputLine = []
180
+ for line in dfList:
181
+ if pos < len(line):
182
+ words = line[pos]
183
+ words = str(words)
184
+ else: words = ""
185
+ if len(words) > 0:
186
+ for word in words.split(" "):
187
+ # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
188
+ if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
189
+ #word = cl.splitStickWords(word)
190
+ word = cl.removePunct(word)
191
+ word = " ".join(cl.removeStopWords(word))
192
+ word = cl.removeTabWhiteSpaceNewLine(word)
193
+ if len(word) > 1:
194
+ if len(word.split(" ")) > 1:
195
+ for x in word.split(" "):
196
+ if len(x) > 1 and x.isnumeric()==False:
197
+ outputLine.append(x.lower())
198
+ else:
199
+ if word.isnumeric() == False:
200
+ outputLine.append(word.lower())
201
+ if len(outputLine) > 0:
202
+ outputDF.append(outputLine)
203
+ return outputDF
204
+ # create a corpus
205
+ def createCorpusText(self, corpusText):
206
+ '''ex: "Tom is cat. Jerry is mouse."
207
+ corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
208
+ # the output should be like this:
209
+ '''texts = {
210
+ "Paragraph 1": [["Cat", "is", "an","animal], ["Tom", "is", "cat"]],
211
+ "Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
212
+ }
213
+ '''
214
+ # separate paragraph
215
+ '''Ex: Cat is an animal. Tom is cat.
216
+
217
+ Mouse is an animal.
218
+ Jerry is mouse.'''
219
+ texts = {}
220
+ cl = cleanText.cleanGenText()
221
+ #cl = cleanGenText()
222
+ for pos in range(len(corpusText.split("\n\n"))):
223
+ if len(corpusText.split("\n\n")[pos]) > 0:
224
+ texts["Paragraph "+str(pos)] = []
225
+ lines = corpusText.split("\n\n")[pos]
226
+ for line in lines.split("\n"):
227
+ for l in line.split("."):
228
+ if len(l) > 0:
229
+ cl.removeTabWhiteSpaceNewLine(l)
230
+ l = l.lower()
231
+ newL = []
232
+ for word in l.split(" "):
233
+ if len(word) > 0:
234
+ word = cl.removeStopWords(word)
235
+ for w in word:
236
+ if len(w) > 0 and w.isnumeric()==False:
237
+ newL.append(w)
238
+ if len(newL)>0:
239
+ texts["Paragraph "+str(pos)].append(newL)
240
+ if len(texts["Paragraph "+str(pos)]) == 0:
241
+ del texts["Paragraph "+str(pos)]
242
+ return texts
243
+ def selectParaForWC(self,corpus):
244
+ ''' corpus should be in the format:
245
+ corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
246
+ corSize, window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None, None
247
+ corSize = len(corpus)
248
+ # less than 2000
249
+ if 0 < corSize < 2000:
250
+ window=3.5
251
+ vector_size=75
252
+ sample=1e-3
253
+ negative=10
254
+ epochs=10
255
+ sg=1
256
+ # 2000 - 100000
257
+ elif 2000 <= corSize < 100000:
258
+ window=3.5
259
+ vector_size=75
260
+ sample=1e-5
261
+ negative=10
262
+ epochs=10
263
+ sg=1
264
+ elif 100000 <=corSize < 1000000:
265
+ window=7.5
266
+ vector_size=150
267
+ sample=1e-5
268
+ negative=10
269
+ epochs=6
270
+ sg=0
271
+ return window, vector_size, sample, negative, epochs, sg
272
+ def trainWord2Vec(self,nameFile,modelName,saveFolder,window=3.5,
273
+ vector_size=75,sample=1e-3,negative=10,epochs=10,sg=1):
274
+ # if you don't have a backup file, you can reuse nameFile just to increase the length of the corpus
275
+ jsonFile = ""
276
+ jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
277
+ cores = multiprocessing.cpu_count()
278
+ combinedCorpus = []
279
+ window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
280
+ if len(jsonFile) > 0:
281
+ for key in jsonFile:
282
+ combinedCorpus.extend(jsonFile[key])
283
+ window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
284
+ # # min_count=1 ensures all words are included
285
+ '''w2vModel = Word2Vec(
286
+ min_count=1,
287
+ window=window,
288
+ vector_size=vector_size,
289
+ sample=sample,
290
+ alpha=0.03,
291
+ min_alpha=0.0007,
292
+ negative=negative,
293
+ workers=cores-1,
294
+ epochs = epochs,
295
+ sg=sg)'''
296
+ #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
297
+ accept = False
298
+ while not accept:
299
+ if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
300
+ try:
301
+ w2vModel = Word2Vec(
302
+ min_count=1,
303
+ window=window,
304
+ vector_size=vector_size,
305
+ sample=sample,
306
+ alpha=0.03,
307
+ min_alpha=0.0007,
308
+ negative=negative,
309
+ workers=cores-1,
310
+ epochs = epochs,
311
+ sg=sg)
312
+ w2vModel.build_vocab(combinedCorpus)
313
+ w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
314
+ accept = True
315
+ except:
316
+ for key in jsonFile:
317
+ combinedCorpus.extend(jsonFile[key])
318
+ window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
319
+ print("next is " + str(len(combinedCorpus)))
320
+ else:
321
+ print("no parameter to train")
322
+ break
323
+ #w2vModel.build_vocab(combinedCorpus)
324
+ #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
325
+ #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
326
+ #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
327
+ w2vModel.save(saveFolder+"/"+modelName+".model")
328
+ w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
329
+ print("done w2v")
330
+ else: print("no corpus to train")
331
+ #return combinedCorpus
332
+ def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
333
+ # might not be a meaningful keyword
334
+ #stopWords = ["show"]
335
+ # same word but just plural nouns, tense
336
+ simWords = [word+"s",word+"es",word+"ing",word+"ed"]
337
+ model = KeyedVectors.load_word2vec_format(modelFile, binary = False) # model file in format txt
338
+ results = model.most_similar(positive=[word],topn=n)
339
+ #removeIndex = []
340
+ #currN = copy.deepcopy(n)
341
+ '''for r in range(len(results)):
342
+ if len(results[r][0]) < 2:
343
+ removeIndex.append(results[r])
344
+ # remove the same word but just plural and singular noun and lower than the cos_thres
345
+ elif results[r][0] == word:
346
+ removeIndex.append(results[r])
347
+ elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
348
+ removeIndex.append(results[r])
349
+ for rem in removeIndex:
350
+ results.remove(rem)
351
+ while len(results)!=n and len(results) != 0:
352
+ moreNewResult = model.most_similar(positive=[word],topn=currN+1)[-1]
353
+ if moreNewResult not in results and len(moreNewResult[0])>1:
354
+ if moreNewResult[0] not in stopWords and results[0] != word:
355
+ results.append(moreNewResult)
356
+ currN +=1'''
357
+ return results
358
+ # adding our model into spacy
359
+ # this deals with command line; but instead of using it, we write python script to run command line
360
+ def loadWordVec(self,modelName,wordVec):
361
+ # modelName is the name you want to save into spacy
362
+ # wordVec is the trained word2vec in txt format
363
+ subprocess.run([sys.executable,
364
+ "-m",
365
+ "spacy",
366
+ "init-model",
367
+ "en",
368
+ modelName, # this modelName comes from the saved modelName of function trainWord2Vec
369
+ "--vectors-loc",
370
+ wordVec])
371
+ =======
372
  '''WORD TO VECTOR'''
373
  import pandas as pd
374
  import json
 
381
  from gensim.models.word2vec import Word2Vec
382
  from gensim.scripts.glove2word2vec import glove2word2vec
383
  from gensim.test.utils import datapath, get_tmpfile
384
+ from gensim.models import Phrases
385
+ from gensim.models.phrases import Phraser
386
  import sys
387
  import subprocess
388
+ import os
389
  # can try multiprocessing to run quicker
390
  import multiprocessing
391
  import copy
 
406
  def __init__(self, nameFile=None, modelName=None):
407
  self.nameFile = nameFile
408
  self.modelName = modelName
409
+ #self.nlp = spacy.load("en_core_web_lg")
410
+ self.cl = cleanText.cleanGenText()
411
  def spacy_similarity(self, word):
412
  # when use word2vec, try medium or large is better
413
  # maybe try odc similarity?
414
+ doc = self.nlp(word)
 
415
  for token1 in doc:
416
  for token2 in doc:
417
  print(token1.text, token2.text, token1.similarity(token2))
418
  pass
419
  # clean text before transform to corpus
420
  def cleanTextBeforeCorpus(self,oriText, doi=None):
421
+ #cl = cleanText.cleanGenText()
422
  #cl = cleanGenText()
423
  output = ""
424
  alreadyRemoveDoi = False
 
426
  # remove DOI
427
  if doi != None and doi in oriText:
428
  if alreadyRemoveDoi == False:
429
+ newWord = self.cl.removeDOI(word,doi)
430
  if len(newWord) > 0 and newWord != word:
431
  alreadyRemoveDoi = True
432
  word = newWord
 
434
  # split the sticked words
435
  #word = cl.splitStickWords(word)
436
  # remove punctuation
437
+ word = self.cl.removePunct(word,True)
438
  # remove URL
439
+ word = self.cl.removeURL(word)
440
  # remove HTMLTag
441
+ word = self.cl.removeHTMLTag(word)
442
  # remove tab, white space, newline
443
+ word = self.cl.removeTabWhiteSpaceNewLine(word)
444
  # optional: remove stopwords
445
  #word = cl.removeStopWords(word)
446
  if len(word)>0:
 
450
  cleanOutput = ""
451
  remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
452
  if len(allText) > 0:
453
+ corpusText = allText.split("\n\n")
454
+ for pos in range(len(corpusText)):
455
+ lines = corpusText[pos]
456
+ if len(lines) > 0:
457
  for line in lines.split("\n"):
458
  if remove in line: line = line.replace(remove, "")
459
  clean_text = self.cleanTextBeforeCorpus(line, doi)
460
  cleanOutput += clean_text + "\n"
461
  cleanOutput += "\n\n"
462
  return cleanOutput
463
+ import urllib.parse, requests
464
+
465
  def tableTransformToCorpusText(self, df, excelFile=None):
466
  # PDF, Excel, WordDoc
467
  #cl = cleanText.cleanGenText()
 
496
  try:
497
  df = pd.ExcelFile(excelFile)
498
  except:
499
+ if excelFile.endswith('.xls'):
500
+ df = pd.read_excel(excelFile, engine='xlrd')
501
  else:
502
+ df = pd.read_excel(excelFile, engine='openpyxl')
503
  sheetNames = df.sheet_names
504
  output = []
505
  if len(sheetNames) > 0:
 
519
  return corpus
520
  def helperRowTableToCorpus(self, textList):
521
  #cl = cleanGenText()
522
+ #cl = cleanText.cleanGenText()
523
  stopWords = ["NaN","Unnamed:","nan"]
524
  outputDF = []
525
  for line in textList:
 
531
  # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
532
  if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
533
  #word = cl.splitStickWords(word)
534
+ word = self.cl.removePunct(word)
535
+ word = " ".join(self.cl.removeStopWords(word))
536
+ word = self.cl.removeTabWhiteSpaceNewLine(word)
537
  if len(word) > 1:
538
  if len(word.split(" ")) > 1:
539
  for x in word.split(" "):
 
547
  return outputDF
548
  def helperColTableToCorpus(self, dfList):
549
  #cl = cleanGenText()
550
+ #cl = cleanText.cleanGenText()
551
  stopWords = ["NaN","Unnamed:","nan"]
552
  outputDF = []
553
  # use the first length line as the column ref
 
563
  # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
564
  if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
565
  #word = cl.splitStickWords(word)
566
+ word = self.cl.removePunct(word)
567
+ word = " ".join(self.cl.removeStopWords(word))
568
+ word = self.cl.removeTabWhiteSpaceNewLine(word)
569
  if len(word) > 1:
570
  if len(word.split(" ")) > 1:
571
  for x in word.split(" "):
 
593
  Mouse is an animal.
594
  Jerry is mouse.'''
595
  texts = {}
596
+ #cl = cleanText.cleanGenText()
597
  #cl = cleanGenText()
598
+ corpus = corpusText.split("\n\n")
599
+ for pos in range(len(corpus)):
600
+ if len(corpus[pos]) > 0:
601
  texts["Paragraph "+str(pos)] = []
602
+ lines = corpus[pos]
603
  for line in lines.split("\n"):
604
  for l in line.split("."):
605
  if len(l) > 0:
606
+ l = self.cl.removeTabWhiteSpaceNewLine(l)
607
  l = l.lower()
608
  newL = []
609
  for word in l.split(" "):
610
  if len(word) > 0:
611
+ word = self.cl.removeStopWords(word)
612
  for w in word:
613
  if len(w) > 0 and w.isnumeric()==False:
614
  newL.append(w)
 
617
  if len(texts["Paragraph "+str(pos)]) == 0:
618
  del texts["Paragraph "+str(pos)]
619
  return texts
620
+
621
+ def selectParaForWC(self, corpus):
622
+ """
623
+ corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]
624
+ Heuristically determine Word2Vec parameters.
625
+ """
626
  corSize = len(corpus)
627
+
628
+ if corSize == 0:
629
+ return None, None, None, None, None, None
630
+
631
+ # Adjust parameters based on corpus size
632
+ if corSize < 2000:
633
+ # Small corpus — need high generalization
634
+ window = 3
635
+ vector_size = 100
636
+ sample = 1e-3
637
+ negative = 5
638
+ epochs = 20
639
+ sg = 1 # Skip-gram preferred for rare words
640
+ elif corSize < 10000:
641
+ window = 5
642
+ vector_size = 150
643
+ sample = 1e-4
644
+ negative = 10
645
+ epochs = 20
646
+ sg = 1
647
+ elif corSize < 100000:
648
+ window = 7
649
+ vector_size = 200
650
+ sample = 1e-5
651
+ negative = 15
652
+ epochs = 15
653
+ sg = 1
654
+ elif corSize < 500000:
655
+ window = 10
656
+ vector_size = 250
657
+ sample = 1e-5
658
+ negative = 15
659
+ epochs = 10
660
+ sg = 0 # CBOW is okay when data is large
661
+ else:
662
+ # Very large corpus
663
+ window = 12
664
+ vector_size = 300
665
+ sample = 1e-6
666
+ negative = 20
667
+ epochs = 5
668
+ sg = 0
669
+
670
  return window, vector_size, sample, negative, epochs, sg
671
+
672
+
673
+ def trainWord2Vec(self,nameFile,modelName,saveFolder,window=None,
674
+ vector_size=None,sample=None,negative=None,epochs=None,sg=None):
675
  jsonFile = ""
676
  jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
677
+ if not jsonFile:
678
+ print("No corpus to train")
679
+ return
680
  cores = multiprocessing.cpu_count()
681
  combinedCorpus = []
682
+ for key in jsonFile:
683
+ combinedCorpus.extend(jsonFile[key])
684
+ # detect phrases (bigram collocations) before choosing parameters
685
+ phrases = Phrases(combinedCorpus, min_count=2, threshold=10)
686
+ bigram = Phraser(phrases)
687
+ combinedCorpus = [bigram[sent] for sent in combinedCorpus]
688
+
689
+ if window==None and vector_size==None and sample==None and negative==None and epochs==None and sg==None:
690
  window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
691
+ # # min_count=1 ensures all words are included
692
+ #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
693
+ accept = False
694
+ # add retry limit because if training keeps failing (bad corpus or corrupted input), it’ll keep retrying without limit.
695
+ retries = 0
696
+ while not accept and retries < 3:
697
+ if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
698
+ try:
699
+ w2vModel = Word2Vec(
700
  min_count=1,
701
  window=window,
702
  vector_size=vector_size,
 
706
  negative=negative,
707
  workers=cores-1,
708
  epochs = epochs,
709
+ sg=sg)
710
+ w2vModel.build_vocab(combinedCorpus)
711
+ w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=epochs)
712
+ accept = True
713
+ except Exception as e:
714
+ print(f"Retry #{retries+1} failed: {e}")
715
+ retries +=1
716
+ else:
717
+ print("no parameter to train")
718
+ break
719
+ #w2vModel.build_vocab(combinedCorpus)
720
+ #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
721
+ #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
722
+ #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
723
+ w2vModel.save(saveFolder+"/"+modelName+".model")
724
+ w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
725
+ print("done w2v")
 
 
 
726
  #return combinedCorpus
727
+ def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
728
+ if not newCorpus:
729
+ raise ValueError("New corpus is empty!")
730
+
731
+ model = Word2Vec.load(modelPath)
732
+
733
+ # Phrase detection on new data
734
+ phrases = Phrases(newCorpus, min_count=2, threshold=10)
735
+ bigram = Phraser(phrases)
736
+ newCorpus = [bigram[sent] for sent in newCorpus]
737
+
738
+ # Update vocab & retrain
739
+ model.build_vocab(newCorpus, update=True)
740
+ model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)
741
+
742
  def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
743
  # might not be a meaningful keyword
744
  #stopWords = ["show"]
 
765
  results.append(moreNewResult)
766
  currN +=1'''
767
  return results
768
+ # add more data to existing word2vec model
769
+ def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
770
+ if not newCorpus:
771
+ raise ValueError("New corpus is empty!")
772
+
773
+ model = Word2Vec.load(modelPath)
774
+
775
+ # Phrase detection on new data
776
+ phrases = Phrases(newCorpus, min_count=2, threshold=10)
777
+ bigram = Phraser(phrases)
778
+ newCorpus = [bigram[sent] for sent in newCorpus]
779
+
780
+ # Update vocab & retrain
781
+ model.build_vocab(newCorpus, update=True)
782
+ model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)
783
+
784
+ # Save updated model
785
+ if saveFolder:
786
+ os.makedirs(saveFolder, exist_ok=True)
787
+ name = os.path.basename(modelPath).replace(".model", "_updated.model")
788
+ model.save(f"{saveFolder}/{name}")
789
+ print(f"🔁 Model updated and saved to {saveFolder}/{name}")
790
+ else:
791
+ model.save(modelPath)
792
+ print(f"🔁 Model updated and overwritten at {modelPath}")
793
+
794
  # adding our model into spacy
795
  # this deals with command line; but instead of using it, we write python script to run command line
796
  def loadWordVec(self,modelName,wordVec):
 
804
  modelName, # this modelName comes from the saved modelName of function trainWord2Vec
805
  "--vectors-loc",
806
  wordVec])
807
+ >>>>>>> 597aa7c (WIP: Save local changes which mainly updated appUI before moving to UpdateAppUI)
808
  print("done")
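A minimal usage sketch of the training, update, and similarity helpers defined above. The module path and class name are assumptions (the class definition line is not shown in this diff); the method signatures follow the code above.

```python
# Sketch only: exercising the Word2Vec helpers above (module/class names are assumed).
from NER.word2Vec.word2vec import word2Vec  # assumption: the class shown above lives here

wv = word2Vec()

# nameFile is a corpus JSON built by createCorpusText(); paths below are placeholders.
wv.trainWord2Vec(nameFile="corpus.json", modelName="mtdna_w2v", saveFolder="models")

# Grow the same model later with new sentences instead of retraining from scratch.
wv.updateWord2Vec(modelPath="models/mtdna_w2v.model",
                  newCorpus=[["haplogroup", "b4a1a1", "vietnam"]],
                  saveFolder="models")

# Query nearest neighbours from the exported text-format vectors.
print(wv.genSimilar("haplogroup", "models/mtdna_w2v.txt", n=5))
```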
README.md CHANGED
@@ -1,15 +1,74 @@
1
- ---
2
- setup: bash setup.sh
3
- title: MtDNALocation
4
- emoji: 📊
5
- colorFrom: blue
6
- colorTo: purple
7
- sdk: gradio
8
- sdk_version: 5.25.0
9
- app_file: app.py
10
- pinned: false
11
- license: mit
12
- short_description: mtDNA Location Classification tool
13
- ---
14
-
15
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
+ ---
2
+ setup: bash setup.sh
3
+ title: MtDNALocation
4
+ emoji: 📊
5
+ colorFrom: blue
6
+ colorTo: purple
7
+ sdk: gradio
8
+ sdk_version: 5.25.0
9
+ app_file: app.py
10
+ pinned: false
11
+ license: mit
12
+ short_description: mtDNA Location Classification tool
13
+ ---
14
+
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
16
+
17
+ # Installation
18
+ ## Set up environments and start GUI:
19
+ ```bash
20
+ git clone https://github.com/Open-Access-Bio-Data/mtDNA-Location-Classifier.git
21
+ ```
22
+ If installing with mamba (recommended):
23
+ ```bash
24
+ mamba env create -f env.yaml
25
+ ```
26
+ If not, check your current Python version in the terminal and make sure it is Python 3.10, then run:
27
+ ```bash
28
+ pip install -r requirements.txt
29
+ ```
30
+ To start the programme, run this in terminal:
31
+ ```bash
32
+ python app.py
33
+ ```
34
+ Then follow its instructions.
35
+ # Descriptions:
36
+ mtDNA-Location-Classifier uses [Gradio](https://www.gradio.app/docs) to handle the front-end interactions.
37
+
38
+ The programme takes **an accession number** (an NCBI GenBank/nuccore identifier) as input and returns the likely origin of the sequence through `classify_sample_location_cached(accession=accession_number)`. This function wraps around a pipeline that proceeds as follows:
39
+ ## Steps 1-3: Check and retrieve base materials: the Pubmed ID, isolate, DOI and text:
40
+ - These are, respectively:
41
+
42
+ ### Step 1: pubmed_ids and isolates
43
+ `get_info_from_accession(accession=accession_number)`
44
+ - The current input is a string `accession_number`, and the output is two lists: one of PUBMED IDs and one of isolate(s).
45
+ - This function looks through the metadata of the sequence with `accession_number` and extracts the `PUBMED ID` (if available) and/or the `isolate` information.
46
+ - The presence of a PUBMED ID is currently important for retrieving texts in the next steps, which are eventually used by method 4.1 (question-answering) and method 4.2 (inferring from haplogroup).
47
+ - Some sequences might not have `isolate` info, but its availability is optional (it may be used by methods 4.1 and 4.2 as an alternative).
48
+
49
+ ### Step 2: dois
50
+ `get_doi_from_pubmed_id(pubmed_ids = pubmed_ids)`
51
+ - Input is a list of PUBMED IDs for the sequence with `accession_number` (retrieved from the previous step), and output is a dictionary with keys = PUBMED IDs and values = the corresponding DOIs.
52
+ - The pubmed_ids are retrieved from the `get_info_from_accession(accession=accession_number)` call mentioned above.
53
+ - The DOIs are then passed down to dependent functions to extract the texts of publications, which are passed on to methods 4.1 and 4.2.
54
+
55
+ ### Step 3: get text
56
+ `get_paper_text(dois = dois)`
57
+ - Input is currently a list of DOIs retrieved from the previous step, and output is a dictionary with keys = sources (DOI links or file type) and values = texts obtained from those sources. (We might improve this to accept other inputs besides DOI links, e.g. files.)
58
+ - The output of this step is crucial to methods 4.1 and 4.2 (a usage sketch follows below).
59
+
60
+
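+ A minimal sketch of how these three steps chain together for one accession (the module that exposes these helpers is assumed to be `mtdna_classifier`, as in `app.py`; exact signatures may differ):
+ ```python
+ # Sketch only: chaining steps 1-3 (function names taken from the descriptions above).
+ from mtdna_classifier import get_info_from_accession, get_doi_from_pubmed_id, get_paper_text
+ 
+ pubmed_ids, isolates = get_info_from_accession(accession="KU131308")   # Step 1: two lists
+ dois = get_doi_from_pubmed_id(pubmed_ids=pubmed_ids)                   # Step 2: {pubmed_id: doi}
+ texts = get_paper_text(dois=list(dois.values()))                       # Step 3: {source: text}
+ ```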
61
+ ## Step 4: Prediction of origin:
62
+ ### Method 4.0:
63
+ - The first method attempts to directly look in the metadata for information that was submitted along with the sequence. Thus, it does not require availability of PUBMED IDs/DOIs or isolates.
64
+ - However, this information is not always available in the submission. Thus, we use the other methods (4.1, 4.2) to retrieve publications from which we can extract the source of the mtDNA.
65
+
66
+ ### Method 4.1:
67
+ -
68
+
69
+ ### Method 4.2:
70
+ -
71
+
72
+ ## More in the package
73
+ ### extraction of text from HTML
74
+ ### extraction of text from PDF
accessions.csv ADDED
@@ -0,0 +1,6 @@
 
 
1
+ Accession
2
+ KU131308
3
+ JX123456
4
+ MN908947
5
+ AB123456
6
+ AY123456
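The batch mode in `app.py` reads the first column of a sheet like this one (see `extract_accessions_from_input` further below); a quick way to check what will be picked up:

```python
import pandas as pd

# Mirrors the CSV branch of extract_accessions_from_input in app.py.
df = pd.read_csv("accessions.csv")
accessions = [acc for acc in df.iloc[:, 0].dropna().astype(str).str.strip()]
print(accessions)  # ['KU131308', 'JX123456', 'MN908947', 'AB123456', 'AY123456']
```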
accessions.xlsx ADDED
Binary file (4.98 kB). View file
 
app.py CHANGED
@@ -1,3 +1,177 @@
 
 
1
  # ✅ Optimized mtDNA MVP UI with Faster Pipeline & Required Feedback
2
 
3
  import gradio as gr
@@ -5,9 +179,15 @@ from collections import Counter
5
  import csv
6
  import os
7
  from functools import lru_cache
8
- from mtdna_classifier import classify_sample_location
9
  import subprocess
10
  import json
 
 
 
 
 
 
11
 
12
  @lru_cache(maxsize=128)
13
  def classify_sample_location_cached(accession):
@@ -33,8 +213,6 @@ def compute_final_suggested_location(rows):
33
  return counts, (top_location, count)
34
 
35
  # Store feedback (with required fields)
36
- import gspread
37
- from oauth2client.service_account import ServiceAccountCredentials
38
 
39
  '''creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
40
 
@@ -58,11 +236,6 @@ def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
58
  except Exception as e:
59
  return f"❌ Error submitting feedback: {str(e)}"'''
60
 
61
- import os
62
- import json
63
- from oauth2client.service_account import ServiceAccountCredentials
64
- import gspread
65
-
66
  def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
67
  if not answer1.strip() or not answer2.strip():
68
  return "⚠️ Please answer both questions before submitting."
@@ -84,16 +257,44 @@ def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
84
  except Exception as e:
85
  return f"❌ Error submitting feedback: {e}"
86
 
 
 
87
 
88
  def summarize_results(accession):
89
  try:
90
- output = classify_sample_location_cached(accession)
91
  print(output)
92
  except Exception as e:
93
- return [], f"Error: {e}"
94
 
95
  if accession not in output:
96
- return [], "Accession not found in results."
97
 
98
  isolate = next((k for k in output if k != accession), None)
99
  row_score = []
@@ -110,7 +311,7 @@ def summarize_results(accession):
110
  haplogroup = content.get("haplogroup", "")
111
  inferred = content.get("inferred_location", "")
112
  context = content.get("context_snippet", "")[:300] if "context_snippet" in content else ""
113
-
114
  row = {
115
  "Sample ID": sample_id_label,
116
  "Technique": technique,
@@ -130,43 +331,202 @@ def summarize_results(accession):
130
  summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
131
  summary = "\n".join(summary_lines)
132
 
133
- return rows, summary
 
 
 
 
 
134
  # Gradio UI
135
  with gr.Blocks() as interface:
136
  gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
137
- gr.Markdown("Enter an accession number to infer geographic origin. You'll see predictions, confidence scores, and can submit feedback.")
 
 
138
 
139
  with gr.Row():
140
- accession = gr.Textbox(label="Enter Accession Number (e.g., KU131308)")
141
  run_button = gr.Button("🔍 Submit and Classify")
142
  reset_button = gr.Button("🔄 Reset")
143
 
144
  status = gr.Markdown(visible=False)
145
- headers = ["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"]
146
- output_table = gr.Dataframe(headers=headers, interactive=False)
147
- output_summary = gr.Markdown()
148
-
149
- gr.Markdown("---")
150
- gr.Markdown("### 💬 Feedback (required)")
151
- q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
152
- q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
153
- contact = gr.Textbox(label="📧 Your email or institution (optional)")
154
- submit_feedback = gr.Button("✅ Submit Feedback")
155
- feedback_status = gr.Markdown()
156
-
157
- def classify_with_loading(accession):
 
 
158
  return gr.update(value="⏳ Please wait... processing...", visible=True)
159
 
 
 
 
 
 
 
 
160
  def classify_main(accession):
161
- table, summary = summarize_results(accession)
162
- return table, summary, gr.update(visible=False)
 
 
 
 
 
 
 
163
 
164
  def reset_fields():
165
- return "", "", "", "", "", [], "", gr.update(visible=False)
166
-
167
- run_button.click(fn=classify_with_loading, inputs=accession, outputs=status)
168
- run_button.click(fn=classify_main, inputs=accession, outputs=[output_table, output_summary, status])
169
- submit_feedback.click(fn=store_feedback_to_google_sheets, inputs=[accession, q1, q2, contact], outputs=feedback_status)
170
- reset_button.click(fn=reset_fields, inputs=[], outputs=[accession, q1, q2, contact, feedback_status, output_table, output_summary, status])
 
 
171
 
172
  interface.launch(share=True)
 
 
1
+ <<<<<<< HEAD
2
+ # ✅ Optimized mtDNA MVP UI with Faster Pipeline & Required Feedback
3
+
4
+ import gradio as gr
5
+ from collections import Counter
6
+ import csv
7
+ import os
8
+ from functools import lru_cache
9
+ from mtdna_classifier import classify_sample_location
10
+ import subprocess
11
+ import json
12
+
13
+ @lru_cache(maxsize=128)
14
+ def classify_sample_location_cached(accession):
15
+ return classify_sample_location(accession)
16
+
17
+ # Count and suggest final location
18
+ def compute_final_suggested_location(rows):
19
+ candidates = [
20
+ row.get("Predicted Location", "").strip()
21
+ for row in rows
22
+ if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found", "unknown"]
23
+ ] + [
24
+ row.get("Inferred Region", "").strip()
25
+ for row in rows
26
+ if row.get("Inferred Region", "").strip().lower() not in ["", "sample id not found", "unknown"]
27
+ ]
28
+
29
+ if not candidates:
30
+ return Counter(), ("Unknown", 0)
31
+
32
+ counts = Counter(candidates)
33
+ top_location, count = counts.most_common(1)[0]
34
+ return counts, (top_location, count)
35
+
36
+ # Store feedback (with required fields)
37
+ import gspread
38
+ from oauth2client.service_account import ServiceAccountCredentials
39
+
40
+ '''creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
41
+
42
+ scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
43
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
44
+
45
+ def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
46
+ if not answer1.strip() or not answer2.strip():
47
+ return "⚠️ Please answer both questions before submitting."
48
+
49
+ try:
50
+ # Define the scope and authenticate
51
+ scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
52
+ creds = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", scope)
53
+ client = gspread.authorize(creds)
54
+
55
+ # Open the spreadsheet and worksheet
56
+ sheet = client.open("feedback_mtdna").sheet1 # You can change the name
57
+ sheet.append_row([accession, answer1, answer2, contact])
58
+ return "✅ Feedback submitted. Thank you!"
59
+ except Exception as e:
60
+ return f"❌ Error submitting feedback: {str(e)}"'''
61
+
62
+ import os
63
+ import json
64
+ from oauth2client.service_account import ServiceAccountCredentials
65
+ import gspread
66
+
67
+ def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
68
+ if not answer1.strip() or not answer2.strip():
69
+ return "⚠️ Please answer both questions before submitting."
70
+
71
+ try:
72
+ # ✅ Step: Load credentials from Hugging Face secret
73
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
74
+ scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
75
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
76
+
77
+ # Connect to Google Sheet
78
+ client = gspread.authorize(creds)
79
+ sheet = client.open("feedback_mtdna").sheet1 # make sure sheet name matches
80
+
81
+ # Append feedback
82
+ sheet.append_row([accession, answer1, answer2, contact])
83
+ return "✅ Feedback submitted. Thank you!"
84
+
85
+ except Exception as e:
86
+ return f"❌ Error submitting feedback: {e}"
87
+
88
+
89
+ def summarize_results(accession):
90
+ try:
91
+ output = classify_sample_location_cached(accession)
92
+ print(output)
93
+ except Exception as e:
94
+ return [], f"❌ Error: {e}"
95
+
96
+ if accession not in output:
97
+ return [], "❌ Accession not found in results."
98
+
99
+ isolate = next((k for k in output if k != accession), None)
100
+ row_score = []
101
+ rows = []
102
+
103
+ for key in [accession, isolate]:
104
+ if key not in output:
105
+ continue
106
+ sample_id_label = f"{key} ({'accession number' if key == accession else 'isolate of accession'})"
107
+ for section, techniques in output[key].items():
108
+ for technique, content in techniques.items():
109
+ source = content.get("source", "")
110
+ predicted = content.get("predicted_location", "")
111
+ haplogroup = content.get("haplogroup", "")
112
+ inferred = content.get("inferred_location", "")
113
+ context = content.get("context_snippet", "")[:300] if "context_snippet" in content else ""
114
+
115
+ row = {
116
+ "Sample ID": sample_id_label,
117
+ "Technique": technique,
118
+ "Source": f"The region of haplogroup is inferred\nby using this source: {source}" if technique == "haplogroup" else source,
119
+ "Predicted Location": "" if technique == "haplogroup" else predicted,
120
+ "Haplogroup": haplogroup if technique == "haplogroup" else "",
121
+ "Inferred Region": inferred if technique == "haplogroup" else "",
122
+ "Context Snippet": context
123
+ }
124
+
125
+ row_score.append(row)
126
+ rows.append(list(row.values()))
127
+
128
+ location_counts, (final_location, count) = compute_final_suggested_location(row_score)
129
+ summary_lines = [f"### 🧭 Location Frequency Summary", "After counting all predicted and inferred locations:\n"]
130
+ summary_lines += [f"- **{loc}**: {cnt} times" for loc, cnt in location_counts.items()]
131
+ summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
132
+ summary = "\n".join(summary_lines)
133
+
134
+ return rows, summary
135
+ # Gradio UI
136
+ with gr.Blocks() as interface:
137
+ gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
138
+ gr.Markdown("Enter an accession number to infer geographic origin. You'll see predictions, confidence scores, and can submit feedback.")
139
+
140
+ with gr.Row():
141
+ accession = gr.Textbox(label="Enter Accession Number (e.g., KU131308)")
142
+ run_button = gr.Button("🔍 Submit and Classify")
143
+ reset_button = gr.Button("🔄 Reset")
144
+
145
+ status = gr.Markdown(visible=False)
146
+ headers = ["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"]
147
+ output_table = gr.Dataframe(headers=headers, interactive=False)
148
+ output_summary = gr.Markdown()
149
+
150
+ gr.Markdown("---")
151
+ gr.Markdown("### 💬 Feedback (required)")
152
+ q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
153
+ q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
154
+ contact = gr.Textbox(label="📧 Your email or institution (optional)")
155
+ submit_feedback = gr.Button("✅ Submit Feedback")
156
+ feedback_status = gr.Markdown()
157
+
158
+ def classify_with_loading(accession):
159
+ return gr.update(value="⏳ Please wait... processing...", visible=True)
160
+
161
+ def classify_main(accession):
162
+ table, summary = summarize_results(accession)
163
+ return table, summary, gr.update(visible=False)
164
+
165
+ def reset_fields():
166
+ return "", "", "", "", "", [], "", gr.update(visible=False)
167
+
168
+ run_button.click(fn=classify_with_loading, inputs=accession, outputs=status)
169
+ run_button.click(fn=classify_main, inputs=accession, outputs=[output_table, output_summary, status])
170
+ submit_feedback.click(fn=store_feedback_to_google_sheets, inputs=[accession, q1, q2, contact], outputs=feedback_status)
171
+ reset_button.click(fn=reset_fields, inputs=[], outputs=[accession, q1, q2, contact, feedback_status, output_table, output_summary, status])
172
+
173
+ interface.launch(share=True)
174
+ =======
175
  # ✅ Optimized mtDNA MVP UI with Faster Pipeline & Required Feedback
176
 
177
  import gradio as gr
 
179
  import csv
180
  import os
181
  from functools import lru_cache
182
+ from mtdna_classifier import classify_sample_location
183
  import subprocess
184
  import json
185
+ import pandas as pd
186
+ import io
187
+ import re
188
+ import tempfile
189
+ import gspread
190
+ from oauth2client.service_account import ServiceAccountCredentials
191
 
192
  @lru_cache(maxsize=128)
193
  def classify_sample_location_cached(accession):
 
213
  return counts, (top_location, count)
214
 
215
  # Store feedback (with required fields)
 
 
216
 
217
  '''creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
218
 
 
236
  except Exception as e:
237
  return f"❌ Error submitting feedback: {str(e)}"'''
238
 
 
 
 
 
 
239
  def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
240
  if not answer1.strip() or not answer2.strip():
241
  return "⚠️ Please answer both questions before submitting."
 
257
  except Exception as e:
258
  return f"❌ Error submitting feedback: {e}"
259
 
260
+ # helper function to extract accessions
261
+ def extract_accessions_from_input(file=None, raw_text=""):
262
+ print(f"RAW TEXT RECEIVED: {raw_text}")
263
+ accessions = []
264
+ seen = set()
265
+ if file:
266
+ try:
267
+ if file.name.endswith(".csv"):
268
+ df = pd.read_csv(file)
269
+ elif file.name.endswith(".xlsx"):
270
+ df = pd.read_excel(file)
271
+ else:
272
+ return [], "Unsupported file format. Please upload CSV or Excel."
273
+ for acc in df.iloc[:, 0].dropna().astype(str).str.strip():
274
+ if acc not in seen:
275
+ accessions.append(acc)
276
+ seen.add(acc)
277
+ except Exception as e:
278
+ return [], f"Failed to read file: {e}"
279
+
280
+ if raw_text:
281
+ text_ids = [s.strip() for s in re.split(r"[\n,;\t]", raw_text) if s.strip()]
282
+ for acc in text_ids:
283
+ if acc not in seen:
284
+ accessions.append(acc)
285
+ seen.add(acc)
286
+
287
+ return list(accessions), None
288
 
289
  def summarize_results(accession):
290
  try:
291
+ output, labelAncient_Modern, explain_label = classify_sample_location_cached(accession)
292
  print(output)
293
  except Exception as e:
294
+ return [], f"Error: {e}"
295
 
296
  if accession not in output:
297
+ return [], "Accession not found in results."
298
 
299
  isolate = next((k for k in output if k != accession), None)
300
  row_score = []
 
311
  haplogroup = content.get("haplogroup", "")
312
  inferred = content.get("inferred_location", "")
313
  context = content.get("context_snippet", "")[:300] if "context_snippet" in content else ""
314
+
315
  row = {
316
  "Sample ID": sample_id_label,
317
  "Technique": technique,
 
331
  summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
332
  summary = "\n".join(summary_lines)
333
 
334
+ return rows, summary, labelAncient_Modern, explain_label
335
+
336
+ # save the batch input in excel file
337
+ def save_to_excel(all_rows, summary_text, flag_text, filename):
338
+ with pd.ExcelWriter(filename) as writer:
339
+ # Save table
340
+ df = pd.DataFrame(all_rows, columns=["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"])
341
+ df.to_excel(writer, sheet_name="Detailed Results", index=False)
342
+
343
+ # Save summary
344
+ summary_df = pd.DataFrame({"Summary": [summary_text]})
345
+ summary_df.to_excel(writer, sheet_name="Summary", index=False)
346
+
347
+ # Save flag
348
+ flag_df = pd.DataFrame({"Flag": [flag_text]})
349
+ flag_df.to_excel(writer, sheet_name="Ancient_Modern_Flag", index=False)
350
+
351
+ # save the batch input in JSON file
352
+ def save_to_json(all_rows, summary_text, flag_text, filename):
353
+ output_dict = {
354
+ "Detailed_Results": all_rows,
355
+ "Summary_Text": summary_text,
356
+ "Ancient_Modern_Flag": flag_text
357
+ }
358
+ with open(filename, "w") as f:
359
+ json.dump(output_dict, f, indent=2)
360
+
361
+ # save the batch input in Text file
362
+ def save_to_txt(all_rows, summary_text, flag_text, filename):
363
+ with open(filename, "w") as f:
364
+ f.write("=== Detailed Results ===\n")
365
+ for row in all_rows:
366
+ f.write(", ".join(str(x) for x in row) + "\n")
367
+
368
+ f.write("\n=== Summary ===\n")
369
+ f.write(summary_text + "\n")
370
+
371
+ f.write("\n=== Ancient/Modern Flag ===\n")
372
+ f.write(flag_text + "\n")
373
+
374
+ def save_batch_output(all_rows, summary_text, flag_text, output_type):
375
+ tmp_dir = tempfile.mkdtemp()
376
+
377
+ if output_type == "Excel":
378
+ file_path = f"{tmp_dir}/batch_output.xlsx"
379
+ save_to_excel(all_rows, summary_text, flag_text, file_path)
380
+ elif output_type == "JSON":
381
+ file_path = f"{tmp_dir}/batch_output.json"
382
+ save_to_json(all_rows, summary_text, flag_text, file_path)
383
+ elif output_type == "TXT":
384
+ file_path = f"{tmp_dir}/batch_output.txt"
385
+ save_to_txt(all_rows, summary_text, flag_text, file_path)
386
+ else:
387
+ return None # invalid option
388
+
389
+ return file_path
390
+
391
+ # run the batch
392
+ def summarize_batch(file=None, raw_text=""):
393
+ accessions, error = extract_accessions_from_input(file, raw_text)
394
+ if error:
395
+ return [], "", "", f"Error: {error}"
396
+
397
+ all_rows = []
398
+ all_summaries = []
399
+ all_flags = []
400
+
401
+ for acc in accessions:
402
+ try:
403
+ rows, summary, label, explain = summarize_results(acc)
404
+ all_rows.extend(rows)
405
+ all_summaries.append(f"**{acc}**\n{summary}")
406
+ all_flags.append(f"**{acc}**: {label}\n_Explanation:_ {explain}")
407
+ except Exception as e:
408
+ all_summaries.append(f"**{acc}**: Failed - {e}")
409
+
410
+ summary_text = "\n\n---\n\n".join(all_summaries)
411
+ flag_text = "\n\n".join(all_flags)
412
+
413
+ return all_rows, summary_text, flag_text, gr.update(visible=False)
414
+
415
  # Gradio UI
416
  with gr.Blocks() as interface:
417
  gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
418
+
419
+ inputMode = gr.Radio(choices=["Single Accession", "Batch Input"], value="Single Accession", label="Choose Input Mode")
420
+
421
+ with gr.Group() as single_input_group:
422
+ single_accession = gr.Textbox(label="Enter Single Accession (e.g., KU131308)")
423
+
424
+ with gr.Group(visible=False) as batch_input_group:
425
+ raw_text = gr.Textbox(label="🧬 Paste Accession Numbers")
426
+ file_upload = gr.File(label="📁 Or Upload CSV/Excel File", file_types=[".csv", ".xlsx"], interactive=True, elem_id="file-upload-box")
427
+ print(raw_text)
428
+ # Make the file box smaller
429
+ gr.HTML('<style>#file-upload-box { width: 200px; }</style>')
430
 
431
  with gr.Row():
 
432
  run_button = gr.Button("🔍 Submit and Classify")
433
  reset_button = gr.Button("🔄 Reset")
434
 
435
  status = gr.Markdown(visible=False)
436
+
437
+ with gr.Group(visible=False) as results_group:
438
+ with gr.Row():
439
+ with gr.Column():
440
+ output_summary = gr.Markdown()
441
+ with gr.Column():
442
+ output_flag = gr.Markdown()
443
+
444
+ gr.Markdown("---")
445
+ output_table = gr.Dataframe(
446
+ headers=["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"],
447
+ interactive=False,
448
+ row_count=(5, "dynamic")
449
+ )
450
+
451
+ with gr.Row():
452
+ output_type = gr.Dropdown(choices=["Excel", "JSON", "TXT"], label="Select Output Format", value="Excel")
453
+ download_button = gr.Button("⬇️ Download Output")
454
+ download_file = gr.File(label="Download File Here")
455
+
456
+ gr.Markdown("---")
457
+
458
+ gr.Markdown("### 💬 Feedback (required)")
459
+ q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
460
+ q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
461
+ contact = gr.Textbox(label="📧 Your email or institution (optional)")
462
+ submit_feedback = gr.Button("✅ Submit Feedback")
463
+ feedback_status = gr.Markdown()
464
+
465
+ # Functions
466
+
467
+ def toggle_input_mode(mode):
468
+ if mode == "Single Accession":
469
+ return gr.update(visible=True), gr.update(visible=False)
470
+ else:
471
+ return gr.update(visible=False), gr.update(visible=True)
472
+
473
+ def classify_with_loading():
474
  return gr.update(value="⏳ Please wait... processing...", visible=True)
475
 
476
+ def classify_dynamic(single_accession, file, text, mode):
477
+ print(f"MODE: {mode} | RAW TEXT: {text}")
478
+ if mode == "Single Accession":
479
+ return classify_main(single_accession)
480
+ else:
481
+ return summarize_batch(file, text)
482
+
483
  def classify_main(accession):
484
+ table, summary, labelAncient_Modern, explain_label = summarize_results(accession)
485
+ flag_output = f"### 🏺 Ancient/Modern Flag\n**{labelAncient_Modern}**\n\n_Explanation:_ {explain_label}"
486
+ return (
487
+ table,
488
+ summary,
489
+ flag_output,
490
+ gr.update(visible=True),
491
+ gr.update(visible=False)
492
+ )
493
 
494
  def reset_fields():
495
+ return (
496
+ gr.update(value=""), # single_accession
497
+ gr.update(value=""), # raw_text
498
+ gr.update(value=None), # file_upload
499
+ gr.update(value="Single Accession"), # inputMode
500
+ gr.update(value=[], visible=True), # output_table
501
+ gr.update(value="", visible=True), # output_summary
502
+ gr.update(value="", visible=True), # output_flag
503
+ gr.update(visible=False), # status
504
+ gr.update(visible=False) # results_group
505
+ )
506
+
507
+ inputMode.change(fn=toggle_input_mode, inputs=inputMode, outputs=[single_input_group, batch_input_group])
508
+ run_button.click(fn=classify_with_loading, inputs=[], outputs=status)
509
+ run_button.click(
510
+ fn=classify_dynamic,
511
+ inputs=[single_accession, file_upload, raw_text, inputMode],
512
+ outputs=[output_table, output_summary, output_flag, results_group, status]
513
+ )
514
+ reset_button.click(
515
+ fn=reset_fields,
516
+ inputs=[],
517
+ outputs=[
518
+ single_accession, raw_text, file_upload, inputMode,
519
+ output_table, output_summary, output_flag,
520
+ status, results_group
521
+ ]
522
+ )
523
+
524
+ download_button.click(
525
+ save_batch_output, [output_table, output_summary, output_flag, output_type], download_file
526
+ )
527
+ submit_feedback.click(
528
+ fn=store_feedback_to_google_sheets, inputs=[single_accession, q1, q2, contact], outputs=feedback_status
529
+ )
530
 
531
  interface.launch(share=True)
532
+ >>>>>>> 597aa7c (WIP: Save local changes which mainly updated appUI before moving to UpdateAppUI)
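A small sketch of driving the batch export helpers defined in `app.py` above (assuming they are in scope, e.g. called from within `app.py` itself; the row layout matches the `save_to_excel` columns and the values are placeholders):

```python
# Sketch only: one batch result written with the helpers above (values are placeholders).
rows = [["KU131308 (accession number)", "mtdna", "NCBI metadata", "Vietnam", "", "", "..."]]
summary_text = "**KU131308**\n- Vietnam: 1 times"
flag_text = "**KU131308**: Modern\n_Explanation:_ ..."

path = save_batch_output(rows, summary_text, flag_text, "Excel")  # or "JSON" / "TXT"
print(path)  # a temp-dir path like .../batch_output.xlsx
```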
data/user_fb/feedback_mtdna.xlsx ADDED
Binary file (5.93 kB). View file
 
env.yaml ADDED
@@ -0,0 +1,8 @@
 
 
1
+ name: mtDNA
2
+ channels:
3
+ - conda-forge
4
+ dependencies:
5
+ - python=3.10
6
+ - pip
7
+ - pip:
8
+ - -r requirements.txt
installedAndUsedRequirements.txt ADDED
@@ -0,0 +1,637 @@
 
 
1
+ python_version==3.11.12
2
+ absl-py==1.4.0
3
+ accelerate==1.6.0
4
+ aiofiles==24.1.0
5
+ aiohappyeyeballs==2.6.1
6
+ aiohttp==3.11.15
7
+ aiosignal==1.3.2
8
+ alabaster==1.0.0
9
+ albucore==0.0.24
10
+ albumentations==2.0.6
11
+ ale-py==0.11.0
12
+ altair==5.5.0
13
+ annotated-types==0.7.0
14
+ anyio==4.9.0
15
+ argon2-cffi==23.1.0
16
+ argon2-cffi-bindings==21.2.0
17
+ array_record==0.7.2
18
+ arviz==0.21.0
19
+ astropy==7.0.1
20
+ astropy-iers-data==0.2025.4.28.0.37.27
21
+ astunparse==1.6.3
22
+ atpublic==5.1
23
+ attrs==25.3.0
24
+ audioread==3.0.1
25
+ autograd==1.7.0
26
+ babel==2.17.0
27
+ backcall==0.2.0
28
+ backports.tarfile==1.2.0
29
+ beautifulsoup4==4.13.4
30
+ betterproto==2.0.0b6
31
+ bigframes==2.1.0
32
+ bigquery-magics==0.9.0
33
+ biopython==1.85
34
+ bitarray==3.4.0
35
+ bleach==6.2.0
36
+ blinker==1.9.0
37
+ blis==1.2.1
38
+ blosc2==3.3.2
39
+ bokeh==3.7.2
40
+ Bottleneck==1.4.2
41
+ bqplot==0.12.44
42
+ branca==0.8.1
43
+ bs4==0.0.2
44
+ build==1.2.2.post1
45
+ CacheControl==0.14.3
46
+ cachetools==5.5.2
47
+ catalogue==2.0.10
48
+ certifi==2025.4.26
49
+ cffi==1.17.1
50
+ chardet==5.2.0
51
+ charset-normalizer==3.4.1
52
+ chex==0.1.89
53
+ clarabel==0.10.0
54
+ click==8.1.8
55
+ cloudpathlib==0.21.0
56
+ cloudpickle==3.1.1
57
+ cmake==3.31.6
58
+ cmdstanpy==1.2.5
59
+ colorcet==3.1.0
60
+ colorlover==0.3.0
61
+ colour==0.1.5
62
+ community==1.0.0b1
63
+ confection==0.1.5
64
+ cons==0.4.6
65
+ contourpy==1.3.2
66
+ cramjam==2.10.0
67
+ cryptography==43.0.3
68
+ cuda-python==12.6.2.post1
69
+ cudf-cu12 @ https://pypi.nvidia.com/cudf-cu12/cudf_cu12-25.2.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
70
+ cudf-polars-cu12==25.2.2
71
+ cufflinks==0.17.3
72
+ cuml-cu12==25.2.1
73
+ cupy-cuda12x==13.3.0
74
+ cuvs-cu12==25.2.1
75
+ cvxopt==1.3.2
76
+ cvxpy==1.6.5
77
+ cycler==0.12.1
78
+ cyipopt==1.5.0
79
+ cymem==2.0.11
80
+ Cython==3.0.12
81
+ dask==2024.12.1
82
+ dask-cuda==25.2.0
83
+ dask-cudf-cu12==25.2.2
84
+ dask-expr==1.1.21
85
+ dataproc-spark-connect==0.7.2
86
+ datascience==0.17.6
87
+ db-dtypes==1.4.2
88
+ dbus-python==1.2.18
89
+ debugpy==1.8.0
90
+ decorator==4.4.2
91
+ defusedxml==0.7.1
92
+ Deprecated==1.2.18
93
+ diffusers==0.33.1
94
+ distributed==2024.12.1
95
+ distributed-ucxx-cu12==0.42.0
96
+ distro==1.9.0
97
+ dlib==19.24.6
98
+ dm-tree==0.1.9
99
+ docker-pycreds==0.4.0
100
+ docstring_parser==0.16
101
+ docutils==0.21.2
102
+ dopamine_rl==4.1.2
103
+ duckdb==1.2.2
104
+ earthengine-api==1.5.13
105
+ easydict==1.13
106
+ editdistance==0.8.1
107
+ eerepr==0.1.1
108
+ einops==0.8.1
109
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85
110
+ entrypoints==0.4
111
+ et_xmlfile==2.0.0
112
+ etils==1.12.2
113
+ etuples==0.3.9
114
+ Farama-Notifications==0.0.4
115
+ fastai==2.7.19
116
+ fastapi==0.115.12
117
+ fastcore==1.7.29
118
+ fastdownload==0.0.7
119
+ fastjsonschema==2.21.1
120
+ fastprogress==1.0.3
121
+ fastrlock==0.8.3
122
+ ffmpy==0.5.0
123
+ filelock==3.18.0
124
+ firebase-admin==6.8.0
125
+ Flask==3.1.0
126
+ flatbuffers==25.2.10
127
+ flax==0.10.6
128
+ folium==0.19.5
129
+ fonttools==4.57.0
130
+ frozendict==2.4.6
131
+ frozenlist==1.6.0
132
+ fsspec==2025.3.2
133
+ future==1.0.0
134
+ gast==0.6.0
135
+ gcsfs==2025.3.2
136
+ GDAL==3.6.4
137
+ gdown==5.2.0
138
+ geemap==0.35.3
139
+ gensim==4.3.3
140
+ geocoder==1.38.1
141
+ geographiclib==2.0
142
+ geopandas==1.0.1
143
+ geopy==2.4.1
144
+ gin-config==0.5.0
145
+ gitdb==4.0.12
146
+ GitPython==3.1.44
147
+ glob2==0.7
148
+ google==2.0.3
149
+ google-ai-generativelanguage==0.6.15
150
+ google-api-core==2.24.2
151
+ google-api-python-client==2.169.0
152
+ google-auth==2.38.0
153
+ google-auth-httplib2==0.2.0
154
+ google-auth-oauthlib==1.2.2
155
+ google-cloud-aiplatform==1.91.0
156
+ google-cloud-bigquery==3.31.0
157
+ google-cloud-bigquery-connection==1.18.2
158
+ google-cloud-bigquery-storage==2.31.0
159
+ google-cloud-bigtable==2.30.1
160
+ google-cloud-core==2.4.3
161
+ google-cloud-dataproc==5.18.1
162
+ google-cloud-datastore==2.21.0
163
+ google-cloud-firestore==2.20.2
164
+ google-cloud-functions==1.20.3
165
+ google-cloud-iam==2.19.0
166
+ google-cloud-language==2.17.1
167
+ google-cloud-pubsub==2.25.0
168
+ google-cloud-resource-manager==1.14.2
169
+ google-cloud-spanner==3.54.0
170
+ google-cloud-storage==2.19.0
171
+ google-cloud-translate==3.20.2
172
+ google-colab @ file:///colabtools/dist/google_colab-1.0.0.tar.gz
173
+ google-crc32c==1.7.1
174
+ google-genai==1.13.0
175
+ google-generativeai==0.8.5
176
+ google-pasta==0.2.0
177
+ google-resumable-media==2.7.2
178
+ googleapis-common-protos==1.70.0
179
+ googledrivedownloader==1.1.0
180
+ gradio==5.29.0
181
+ gradio_client==1.10.0
182
+ graphviz==0.20.3
183
+ greenlet==3.2.1
184
+ groovy==0.1.2
185
+ grpc-google-iam-v1==0.14.2
186
+ grpc-interceptor==0.15.4
187
+ grpcio==1.71.0
188
+ grpcio-status==1.71.0
189
+ grpclib==0.4.7
190
+ gspread==6.2.0
191
+ gspread-dataframe==4.0.0
192
+ gym==0.25.2
193
+ gym-notices==0.0.8
194
+ gymnasium==1.1.1
195
+ h11==0.16.0
196
+ h2==4.2.0
197
+ h5netcdf==1.6.1
198
+ h5py==3.13.0
199
+ hdbscan==0.8.40
200
+ highspy==1.10.0
201
+ holidays==0.71
202
+ holoviews==1.20.2
203
+ hpack==4.1.0
204
+ html5lib==1.1
205
+ httpcore==1.0.9
206
+ httpimport==1.4.1
207
+ httplib2==0.22.0
208
+ httpx==0.28.1
209
+ huggingface-hub==0.30.2
210
+ humanize==4.12.3
211
+ hyperframe==6.1.0
212
+ hyperopt==0.2.7
213
+ ibis-framework==9.5.0
214
+ idna==3.10
215
+ imageio==2.37.0
216
+ imageio-ffmpeg==0.6.0
217
+ imagesize==1.4.1
218
+ imbalanced-learn==0.13.0
219
+ immutabledict==4.2.1
220
+ importlib_metadata==8.7.0
221
+ importlib_resources==6.5.2
222
+ imutils==0.5.4
223
+ inflect==7.5.0
224
+ iniconfig==2.1.0
225
+ intel-cmplr-lib-ur==2025.1.1
226
+ intel-openmp==2025.1.1
227
+ ipyevents==2.0.2
228
+ ipyfilechooser==0.6.0
229
+ ipykernel==6.17.1
230
+ ipyleaflet==0.19.2
231
+ ipyparallel==8.8.0
232
+ ipython==7.34.0
233
+ ipython-genutils==0.2.0
234
+ ipython-sql==0.5.0
235
+ ipytree==0.2.2
236
+ ipywidgets==7.7.1
237
+ itsdangerous==2.2.0
238
+ jaraco.classes==3.4.0
239
+ jaraco.context==6.0.1
240
+ jaraco.functools==4.1.0
241
+ jax==0.5.2
242
+ jax-cuda12-pjrt==0.5.1
243
+ jax-cuda12-plugin==0.5.1
244
+ jaxlib==0.5.1
245
+ jeepney==0.9.0
246
+ jieba==0.42.1
247
+ Jinja2==3.1.6
248
+ jiter==0.9.0
249
+ joblib==1.4.2
250
+ jsonpatch==1.33
251
+ jsonpickle==4.0.5
252
+ jsonpointer==3.0.0
253
+ jsonschema==4.23.0
254
+ jsonschema-specifications==2025.4.1
255
+ jupyter-client==6.1.12
256
+ jupyter-console==6.1.0
257
+ jupyter-leaflet==0.19.2
258
+ jupyter-server==1.16.0
259
+ jupyter_core==5.7.2
260
+ jupyter_kernel_gateway @ git+https://github.com/googlecolab/kernel_gateway@b134e9945df25c2dcb98ade9129399be10788671
261
+ jupyterlab_pygments==0.3.0
262
+ jupyterlab_widgets==3.0.14
263
+ kaggle==1.7.4.2
264
+ kagglehub==0.3.12
265
+ keras==3.8.0
266
+ keras-hub==0.18.1
267
+ keras-nlp==0.18.1
268
+ keyring==25.6.0
269
+ keyrings.google-artifactregistry-auth==1.1.2
270
+ kiwisolver==1.4.8
271
+ langchain==0.3.24
272
+ langchain-core==0.3.56
273
+ langchain-text-splitters==0.3.8
274
+ langcodes==3.5.0
275
+ langsmith==0.3.39
276
+ language_data==1.3.0
277
+ launchpadlib==1.10.16
278
+ lazr.restfulclient==0.14.4
279
+ lazr.uri==1.0.6
280
+ lazy_loader==0.4
281
+ libclang==18.1.1
282
+ libcudf-cu12 @ https://pypi.nvidia.com/libcudf-cu12/libcudf_cu12-25.2.1-py3-none-manylinux_2_28_x86_64.whl
283
+ libcugraph-cu12==25.2.0
284
+ libcuml-cu12==25.2.1
285
+ libcuvs-cu12==25.2.1
286
+ libkvikio-cu12==25.2.1
287
+ libraft-cu12==25.2.0
288
+ librosa==0.11.0
289
+ libucx-cu12==1.18.1
290
+ libucxx-cu12==0.42.0
291
+ lightgbm @ file:///tmp/lightgbm/LightGBM/dist/lightgbm-4.5.0-py3-none-linux_x86_64.whl
292
+ linkify-it-py==2.0.3
293
+ llvmlite==0.43.0
294
+ locket==1.0.0
295
+ logical-unification==0.4.6
296
+ lxml==5.4.0
297
+ Mako==1.1.3
298
+ marisa-trie==1.2.1
299
+ Markdown==3.8
300
+ markdown-it-py==3.0.0
301
+ MarkupSafe==3.0.2
302
+ matplotlib==3.10.0
303
+ matplotlib-inline==0.1.7
304
+ matplotlib-venn==1.1.2
305
+ mdit-py-plugins==0.4.2
306
+ mdurl==0.1.2
307
+ miniKanren==1.0.3
308
+ missingno==0.5.2
309
+ mistune==3.1.3
310
+ mizani==0.13.3
311
+ mkl==2025.0.1
312
+ ml-dtypes==0.4.1
313
+ mlxtend==0.23.4
314
+ more-itertools==10.7.0
315
+ moviepy==1.0.3
316
+ mpmath==1.3.0
317
+ msgpack==1.1.0
318
+ multidict==6.4.3
319
+ multipledispatch==1.0.0
320
+ multitasking==0.0.11
321
+ murmurhash==1.0.12
322
+ music21==9.3.0
323
+ namex==0.0.9
324
+ narwhals==1.37.1
325
+ natsort==8.4.0
326
+ nbclassic==1.3.0
327
+ nbclient==0.10.2
328
+ nbconvert==7.16.6
329
+ nbformat==5.10.4
330
+ ndindex==1.9.2
331
+ nest-asyncio==1.6.0
332
+ networkx==3.4.2
333
+ nibabel==5.3.2
334
+ nltk==3.9.1
335
+ notebook==6.5.7
336
+ notebook_shim==0.2.4
337
+ numba==0.60.0
338
+ numba-cuda==0.2.0
339
+ numexpr==2.10.2
340
+ numpy==1.25.2
341
+ nvidia-cublas-cu12==12.4.5.8
342
+ nvidia-cuda-cupti-cu12==12.4.127
343
+ nvidia-cuda-nvcc-cu12==12.5.82
344
+ nvidia-cuda-nvrtc-cu12==12.4.127
345
+ nvidia-cuda-runtime-cu12==12.4.127
346
+ nvidia-cudnn-cu12==9.1.0.70
347
+ nvidia-cufft-cu12==11.2.1.3
348
+ nvidia-curand-cu12==10.3.5.147
349
+ nvidia-cusolver-cu12==11.6.1.9
350
+ nvidia-cusparse-cu12==12.3.1.170
351
+ nvidia-cusparselt-cu12==0.6.2
352
+ nvidia-ml-py==12.570.86
353
+ nvidia-nccl-cu12==2.21.5
354
+ nvidia-nvcomp-cu12==4.2.0.11
355
+ nvidia-nvjitlink-cu12==12.4.127
356
+ nvidia-nvtx-cu12==12.4.127
357
+ nvtx==0.2.11
358
+ nx-cugraph-cu12 @ https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-25.2.0-py3-none-any.whl
359
+ oauth2client==4.1.3
360
+ oauthlib==3.2.2
361
+ openai==1.76.2
362
+ opencv-contrib-python==4.11.0.86
363
+ opencv-python==4.11.0.86
364
+ opencv-python-headless==4.11.0.86
365
+ openpyxl==3.1.5
366
+ opentelemetry-api==1.16.0
367
+ opentelemetry-sdk==1.16.0
368
+ opentelemetry-semantic-conventions==0.37b0
369
+ opt_einsum==3.4.0
370
+ optax==0.2.4
371
+ optree==0.15.0
372
+ orbax-checkpoint==0.11.13
373
+ orjson==3.10.18
374
+ osqp==1.0.3
375
+ packaging==24.2
376
+ pandas==2.2.2
377
+ pandas-datareader==0.10.0
378
+ pandas-gbq==0.28.0
379
+ pandas-stubs==2.2.2.240909
380
+ pandocfilters==1.5.1
381
+ panel==1.6.3
382
+ param==2.2.0
383
+ parso==0.8.4
384
+ parsy==2.1
385
+ partd==1.4.2
386
+ pathlib==1.0.1
387
+ patsy==1.0.1
388
+ pdfreader==0.1.15
389
+ peewee==3.18.1
390
+ peft==0.15.2
391
+ pexpect==4.9.0
392
+ pickleshare==0.7.5
393
+ pillow==11.2.1
394
+ platformdirs==4.3.7
395
+ plotly==5.24.1
396
+ plotnine==0.14.5
397
+ pluggy==1.5.0
398
+ plum-dispatch==1.7.4
399
+ ply==3.11
400
+ polars==1.21.0
401
+ pooch==1.8.2
402
+ portpicker==1.5.2
403
+ preshed==3.0.9
404
+ prettytable==3.16.0
405
+ proglog==0.1.11
406
+ progressbar2==4.5.0
407
+ prometheus_client==0.21.1
408
+ promise==2.3
409
+ prompt_toolkit==3.0.51
410
+ propcache==0.3.1
411
+ prophet==1.1.6
412
+ proto-plus==1.26.1
413
+ protobuf==5.29.4
414
+ psutil==5.9.5
415
+ psycopg2==2.9.10
416
+ ptyprocess==0.7.0
417
+ py-cpuinfo==9.0.0
418
+ py4j==0.10.9.7
419
+ pyarrow==18.1.0
420
+ pyasn1==0.6.1
421
+ pyasn1_modules==0.4.2
422
+ pycairo==1.28.0
423
+ pycocotools==2.0.8
424
+ pycparser==2.22
425
+ pycryptodome==3.22.0
426
+ pydantic==2.11.4
427
+ pydantic_core==2.33.2
428
+ pydata-google-auth==1.9.1
429
+ pydot==3.0.4
430
+ pydotplus==2.0.2
431
+ PyDrive==1.3.1
432
+ PyDrive2==1.21.3
433
+ pydub==0.25.1
434
+ pyerfa==2.0.1.5
435
+ pygame==2.6.1
436
+ pygit2==1.18.0
437
+ Pygments==2.19.1
438
+ PyGObject==3.42.0
439
+ PyJWT==2.10.1
440
+ pylibcudf-cu12 @ https://pypi.nvidia.com/pylibcudf-cu12/pylibcudf_cu12-25.2.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
441
+ pylibcugraph-cu12==25.2.0
442
+ pylibraft-cu12==25.2.0
443
+ pymc==5.22.0
444
+ PyMuPDF==1.25.5
445
+ pymystem3==0.2.0
446
+ pynndescent==0.5.13
447
+ pynvjitlink-cu12==0.5.2
448
+ pynvml==12.0.0
449
+ pyogrio==0.10.0
450
+ pyomo==6.9.2
451
+ PyOpenGL==3.1.9
452
+ pyOpenSSL==24.2.1
453
+ pyparsing==3.2.3
454
+ pyperclip==1.9.0
455
+ pyproj==3.7.1
456
+ pyproject_hooks==1.2.0
457
+ pyshp==2.3.1
458
+ PySocks==1.7.1
459
+ pyspark==3.5.1
460
+ pytensor==2.30.3
461
+ pytest==8.3.5
462
+ python-apt==0.0.0
463
+ python-box==7.3.2
464
+ python-dateutil==2.9.0.post0
465
+ python-louvain==0.16
466
+ python-multipart==0.0.20
467
+ python-slugify==8.0.4
468
+ python-snappy==0.7.3
469
+ python-utils==3.9.1
470
+ pytz==2025.2
471
+ pyviz_comms==3.0.4
472
+ PyYAML==6.0.2
473
+ pyzmq==24.0.1
474
+ raft-dask-cu12==25.2.0
475
+ RapidFuzz==3.13.0
476
+ rapids-dask-dependency==25.2.0
477
+ ratelim==0.1.6
478
+ referencing==0.36.2
479
+ regex==2024.11.6
480
+ requests==2.32.3
481
+ requests-oauthlib==2.0.0
482
+ requests-toolbelt==1.0.0
483
+ requirements-parser==0.9.0
484
+ rich==13.9.4
485
+ rmm-cu12==25.2.0
486
+ roman-numerals-py==3.1.0
487
+ rpds-py==0.24.0
488
+ rpy2==3.5.17
489
+ rsa==4.9.1
490
+ ruff==0.11.9
491
+ safehttpx==0.1.6
492
+ safetensors==0.5.3
493
+ scikit-image==0.25.2
494
+ scikit-learn==1.6.1
495
+ scipy==1.13.1
496
+ scooby==0.10.1
497
+ scs==3.2.7.post2
498
+ seaborn==0.13.2
499
+ SecretStorage==3.3.3
500
+ semantic-version==2.10.0
501
+ Send2Trash==1.8.3
502
+ sentence-transformers==3.4.1
503
+ sentencepiece==0.2.0
504
+ sentry-sdk==2.27.0
505
+ setproctitle==1.3.6
506
+ shap==0.47.2
507
+ shapely==2.1.0
508
+ shellingham==1.5.4
509
+ simple-parsing==0.1.7
510
+ simplejson==3.20.1
511
+ simsimd==6.2.1
512
+ six==1.17.0
513
+ sklearn-compat==0.1.3
514
+ sklearn-pandas==2.2.0
515
+ slicer==0.0.8
516
+ smart-open==7.1.0
517
+ smmap==5.0.2
518
+ sniffio==1.3.1
519
+ snowballstemmer==2.2.0
520
+ sortedcontainers==2.4.0
521
+ soundfile==0.13.1
522
+ soupsieve==2.7
523
+ soxr==0.5.0.post1
524
+ spacy==3.8.5
525
+ spacy-legacy==3.0.12
526
+ spacy-loggers==1.0.5
527
+ spacy-lookups-data==1.0.5
528
+ spanner-graph-notebook==1.1.6
529
+ Sphinx==8.2.3
530
+ sphinxcontrib-applehelp==2.0.0
531
+ sphinxcontrib-devhelp==2.0.0
532
+ sphinxcontrib-htmlhelp==2.1.0
533
+ sphinxcontrib-jsmath==1.0.1
534
+ sphinxcontrib-qthelp==2.0.0
535
+ sphinxcontrib-serializinghtml==2.0.0
536
+ spire-doc==13.4.6
537
+ Spire.Xls==14.12.0
538
+ SQLAlchemy==2.0.40
539
+ sqlglot==25.20.2
540
+ sqlparse==0.5.3
541
+ srsly==2.5.1
542
+ stanio==0.5.1
543
+ starlette==0.46.2
544
+ statsmodels==0.14.4
545
+ stringzilla==3.12.5
546
+ sympy==1.13.1
547
+ tables==3.10.2
548
+ tabula-py==2.10.0
549
+ tabulate==0.9.0
550
+ tbb==2022.1.0
551
+ tblib==3.1.0
552
+ tcmlib==1.3.0
553
+ tenacity==9.1.2
554
+ tensorboard==2.18.0
555
+ tensorboard-data-server==0.7.2
556
+ tensorflow==2.18.0
557
+ tensorflow-datasets==4.9.8
558
+ tensorflow-hub==0.16.1
559
+ tensorflow-io-gcs-filesystem==0.37.1
560
+ tensorflow-metadata==1.17.1
561
+ tensorflow-probability==0.25.0
562
+ tensorflow-text==2.18.1
563
+ tensorflow_decision_forests==1.11.0
564
+ tensorstore==0.1.74
565
+ termcolor==3.1.0
566
+ terminado==0.18.1
567
+ text-unidecode==1.3
568
+ textblob==0.19.0
569
+ tf-slim==1.1.0
570
+ tf_keras==2.18.0
571
+ thefuzz==0.22.1
572
+ thinc==8.3.4
573
+ threadpoolctl==3.6.0
574
+ tifffile==2025.3.30
575
+ timm==1.0.15
576
+ tinycss2==1.4.0
577
+ tokenizers==0.21.1
578
+ toml==0.10.2
579
+ tomlkit==0.13.2
580
+ toolz==0.12.1
581
+ torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl
582
+ torchaudio @ https://download.pytorch.org/whl/cu124/torchaudio-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl
583
+ torchsummary==1.5.1
584
+ torchvision @ https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp311-cp311-linux_x86_64.whl
585
+ tornado==6.4.2
586
+ tqdm==4.67.1
587
+ traitlets==5.7.1
588
+ traittypes==0.2.1
589
+ transformers==4.51.3
590
+ treelite==4.4.1
591
+ treescope==0.1.9
592
+ triton==3.2.0
593
+ tweepy==4.15.0
594
+ typeguard==4.4.2
595
+ typer==0.15.3
596
+ types-pytz==2025.2.0.20250326
597
+ types-setuptools==80.3.0.20250505
598
+ typing-inspection==0.4.0
599
+ typing_extensions==4.13.2
600
+ tzdata==2025.2
601
+ tzlocal==5.3.1
602
+ uc-micro-py==1.0.3
603
+ ucx-py-cu12==0.42.0
604
+ ucxx-cu12==0.42.0
605
+ umap-learn==0.5.7
606
+ umf==0.10.0
607
+ uritemplate==4.1.1
608
+ urllib3==2.4.0
609
+ uvicorn==0.34.2
610
+ vega-datasets==0.9.0
611
+ wadllib==1.3.6
612
+ wandb==0.19.10
613
+ wasabi==1.1.3
614
+ wcwidth==0.2.13
615
+ weasel==0.4.1
616
+ webcolors==24.11.1
617
+ webencodings==0.5.1
618
+ websocket-client==1.8.0
619
+ websockets==15.0.1
620
+ Werkzeug==3.1.3
621
+ widgetsnbextension==3.6.10
622
+ wordcloud==1.9.4
623
+ wordsegment==1.3.1
624
+ wrapt==1.17.2
625
+ wurlitzer==3.1.1
626
+ xarray==2025.3.1
627
+ xarray-einstats==0.8.0
628
+ xgboost==2.1.4
629
+ xlrd==2.0.1
630
+ xyzservices==2025.4.0
631
+ yarl==1.20.0
632
+ ydf==0.11.0
633
+ yellowbrick==1.5
634
+ yfinance==0.2.57
635
+ zict==3.0.0
636
+ zipp==3.21.0
637
+ zstandard==0.23.0
mtdna_backend.py ADDED
@@ -0,0 +1,252 @@
1
+ import gradio as gr
2
+ from collections import Counter
3
+ import csv
4
+ import os
5
+ from functools import lru_cache
6
+ from mtdna_classifier import classify_sample_location
7
+ import subprocess
8
+ import json
9
+ import pandas as pd
10
+ import io
11
+ import re
12
+ import tempfile
13
+ import gspread
14
+ from oauth2client.service_account import ServiceAccountCredentials
15
+ from io import StringIO
16
+
17
+ @lru_cache(maxsize=128)
18
+ def classify_sample_location_cached(accession):
19
+ return classify_sample_location(accession)
20
+
21
+ # Count and suggest final location
22
+ def compute_final_suggested_location(rows):
23
+ candidates = [
24
+ row.get("Predicted Location", "").strip()
25
+ for row in rows
26
+ if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found", "unknown"]
27
+ ] + [
28
+ row.get("Inferred Region", "").strip()
29
+ for row in rows
30
+ if row.get("Inferred Region", "").strip().lower() not in ["", "sample id not found", "unknown"]
31
+ ]
32
+
33
+ if not candidates:
34
+ return Counter(), ("Unknown", 0)
35
+ # Step 1: Combine into one string and split using regex to handle commas, line breaks, etc.
36
+ tokens = []
37
+ for item in candidates:
38
+ # Split by comma, whitespace, and newlines
39
+ parts = re.split(r'[\s,]+', item)
40
+ tokens.extend(parts)
41
+
42
+ # Step 2: Clean and normalize tokens
43
+ tokens = [word.strip() for word in tokens if word.strip().isalpha()] # Keep only alphabetic tokens
44
+
45
+ # Step 3: Count
46
+ counts = Counter(tokens)
47
+
48
+ # Step 4: Get most common
49
+ top_location, count = counts.most_common(1)[0]
50
+ return counts, (top_location, count)
51
+
52
+ # Store feedback (with required fields)
53
+
54
+ def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
55
+ if not answer1.strip() or not answer2.strip():
56
+ return "⚠️ Please answer both questions before submitting."
57
+
58
+ try:
59
+ # ✅ Step: Load credentials from Hugging Face secret
60
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
61
+ scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
62
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
63
+
64
+ # Connect to Google Sheet
65
+ client = gspread.authorize(creds)
66
+ sheet = client.open("feedback_mtdna").sheet1 # make sure sheet name matches
67
+
68
+ # Append feedback
69
+ sheet.append_row([accession, answer1, answer2, contact])
70
+ return "✅ Feedback submitted. Thank you!"
71
+
72
+ except Exception as e:
73
+ return f"❌ Error submitting feedback: {e}"
74
+
75
+ # helper function to extract accessions
76
+ def extract_accessions_from_input(file=None, raw_text=""):
77
+ print(f"RAW TEXT RECEIVED: {raw_text}")
78
+ accessions = []
79
+ seen = set()
80
+ if file:
81
+ try:
82
+ if file.name.endswith(".csv"):
83
+ df = pd.read_csv(file)
84
+ elif file.name.endswith(".xlsx"):
85
+ df = pd.read_excel(file)
86
+ else:
87
+ return [], "Unsupported file format. Please upload CSV or Excel."
88
+ for acc in df.iloc[:, 0].dropna().astype(str).str.strip():
89
+ if acc not in seen:
90
+ accessions.append(acc)
91
+ seen.add(acc)
92
+ except Exception as e:
93
+ return [], f"Failed to read file: {e}"
94
+
95
+ if raw_text:
96
+ text_ids = [s.strip() for s in re.split(r"[\n,;\t]", raw_text) if s.strip()]
97
+ for acc in text_ids:
98
+ if acc not in seen:
99
+ accessions.append(acc)
100
+ seen.add(acc)
101
+
102
+ return list(accessions), None
103
+
104
+ def summarize_results(accession):
105
+ try:
106
+ output, labelAncient_Modern, explain_label = classify_sample_location_cached(accession)
107
+ #print(output)
108
+ except Exception as e:
109
+ return [], f"Error: {e}", f"Error: {e}", f"Error: {e}"
110
+
111
+ if accession not in output:
112
+ return [], "Accession not found in results.", "Accession not found in results.", "Accession not found in results."
113
+
114
+ isolate = next((k for k in output if k != accession), None)
115
+ row_score = []
116
+ rows = []
117
+
118
+ for key in [accession, isolate]:
119
+ if key not in output:
120
+ continue
121
+ sample_id_label = f"{key} ({'accession number' if key == accession else 'isolate of accession'})"
122
+ for section, techniques in output[key].items():
123
+ for technique, content in techniques.items():
124
+ source = content.get("source", "")
125
+ predicted = content.get("predicted_location", "")
126
+ haplogroup = content.get("haplogroup", "")
127
+ inferred = content.get("inferred_location", "")
128
+ context = content.get("context_snippet", "")[:300] if "context_snippet" in content else ""
129
+
130
+ row = {
131
+ "Sample ID": sample_id_label,
132
+ "Technique": technique,
133
+ "Source": f"The region of haplogroup is inferred\nby using this source: {source}" if technique == "haplogroup" else source,
134
+ "Predicted Location": "" if technique == "haplogroup" else predicted,
135
+ "Haplogroup": haplogroup if technique == "haplogroup" else "",
136
+ "Inferred Region": inferred if technique == "haplogroup" else "",
137
+ "Context Snippet": context
138
+ }
139
+
140
+ row_score.append(row)
141
+ rows.append(list(row.values()))
142
+
143
+ location_counts, (final_location, count) = compute_final_suggested_location(row_score)
144
+ summary_lines = [f"### 🧭 Location Frequency Summary", "After counting all predicted and inferred locations:\n"]
145
+ summary_lines += [f"- **{loc}**: {cnt} times" for loc, cnt in location_counts.items()]
146
+ summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
147
+ summary = "\n".join(summary_lines)
148
+ return rows, summary, labelAncient_Modern, explain_label
149
+
150
+ # save the batch input in excel file
151
+ def save_to_excel(all_rows, summary_text, flag_text, filename):
152
+ with pd.ExcelWriter(filename) as writer:
153
+ # Save table
154
+ df = pd.DataFrame(all_rows, columns=["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"])
155
+ df.to_excel(writer, sheet_name="Detailed Results", index=False)
156
+
157
+ # Save summary
158
+ summary_df = pd.DataFrame({"Summary": [summary_text]})
159
+ summary_df.to_excel(writer, sheet_name="Summary", index=False)
160
+
161
+ # Save flag
162
+ flag_df = pd.DataFrame({"Flag": [flag_text]})
163
+ flag_df.to_excel(writer, sheet_name="Ancient_Modern_Flag", index=False)
164
+
165
+ # save the batch input in JSON file
166
+ def save_to_json(all_rows, summary_text, flag_text, filename):
167
+ output_dict = {
168
+ "Detailed_Results": all_rows, # <-- make sure this is a plain list, not a DataFrame
169
+ "Summary_Text": summary_text,
170
+ "Ancient_Modern_Flag": flag_text
171
+ }
172
+
173
+ # If all_rows is a DataFrame, convert it
174
+ if isinstance(all_rows, pd.DataFrame):
175
+ output_dict["Detailed_Results"] = all_rows.to_dict(orient="records")
176
+
177
+ with open(filename, "w") as external_file:
178
+ json.dump(output_dict, external_file, indent=2)
179
+
180
+ # save the batch input in Text file
181
+ def save_to_txt(all_rows, summary_text, flag_text, filename):
182
+ if isinstance(all_rows, pd.DataFrame):
183
+ detailed_results = all_rows.to_dict(orient="records")
184
+ output = ""
185
+ output += ",".join(list(detailed_results[0].keys())) + "\n\n"
186
+ for r in detailed_results:
187
+ output += ",".join([str(v) for v in r.values()]) + "\n\n"
188
+ with open(filename, "w") as f:
189
+ f.write("=== Detailed Results ===\n")
190
+ f.write(output + "\n")
191
+
192
+ f.write("\n=== Summary ===\n")
193
+ f.write(summary_text + "\n")
194
+
195
+ f.write("\n=== Ancient/Modern Flag ===\n")
196
+ f.write(flag_text + "\n")
197
+
198
+ def save_batch_output(all_rows, summary_text, flag_text, output_type):
199
+ tmp_dir = tempfile.mkdtemp()
200
+
201
+ #html_table = all_rows.value # assuming this is stored somewhere
202
+
203
+ # Parse back to DataFrame
204
+ #all_rows = pd.read_html(all_rows)[0] # [0] because read_html returns a list
205
+ all_rows = pd.read_html(StringIO(all_rows))[0]
206
+ print(all_rows)
207
+
208
+ if output_type == "Excel":
209
+ file_path = f"{tmp_dir}/batch_output.xlsx"
210
+ save_to_excel(all_rows, summary_text, flag_text, file_path)
211
+ elif output_type == "JSON":
212
+ file_path = f"{tmp_dir}/batch_output.json"
213
+ save_to_json(all_rows, summary_text, flag_text, file_path)
214
+ print("Done with JSON")
215
+ elif output_type == "TXT":
216
+ file_path = f"{tmp_dir}/batch_output.txt"
217
+ save_to_txt(all_rows, summary_text, flag_text, file_path)
218
+ else:
219
+ return gr.update(visible=False) # invalid option
220
+
221
+ return gr.update(value=file_path, visible=True)
222
+
223
+ # run the batch
224
+ def summarize_batch(file=None, raw_text=""):
225
+ accessions, error = extract_accessions_from_input(file, raw_text)
226
+ if error:
227
+ return [], f"Error: {error}", "", gr.update(visible=True), gr.update(visible=False)
228
+
229
+ all_rows = []
230
+ all_summaries = []
231
+ all_flags = []
232
+
233
+ for acc in accessions:
234
+ try:
235
+ rows, summary, label, explain = summarize_results(acc)
236
+ all_rows.extend(rows)
237
+ all_summaries.append(f"**{acc}**\n{summary}")
238
+ all_flags.append(f"**{acc}**\n### 🏺 Ancient/Modern Flag\n**{label}**\n\n_Explanation:_ {explain}")
239
+ except Exception as e:
240
+ all_summaries.append(f"**{acc}**: Failed - {e}")
241
+
242
+ """for row in all_rows:
243
+ source_column = row[2] # Assuming the "Source" is in the 3rd column (index 2)
244
+
245
+ if source_column.startswith("http"): # Check if the source is a URL
246
+ # Wrap it with HTML anchor tags to make it clickable
247
+ row[2] = f'<a href="{source_column}" target="_blank" style="color: blue; text-decoration: underline;">{source_column}</a>'"""
248
+
249
+
250
+ summary_text = "\n\n---\n\n".join(all_summaries)
251
+ flag_text = "\n\n---\n\n".join(all_flags)
252
+ return all_rows, summary_text, flag_text, gr.update(visible=True), gr.update(visible=False)
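A minimal usage sketch (not part of this commit) of the backend added above, assuming mtdna_backend.py is importable and the Google-Sheets feedback path is not exercised, so no GCP_CREDS_JSON secret is needed:

from mtdna_backend import extract_accessions_from_input, summarize_results

# Parse a comma/newline-separated accession list, deduplicating while preserving order
accessions, error = extract_accessions_from_input(raw_text="KU131308, MW291678")
if error:
    print(error)
else:
    for acc in accessions:
        # rows feed the results table; summary/label/explain feed the markdown panels
        rows, summary, label, explain = summarize_results(acc)
        print(acc, label)
        print(summary)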
mtdna_classifier.py CHANGED
@@ -1,322 +1,519 @@
1
- # mtDNA Location Classifier MVP (Google Colab)
2
- # Accepts accession number → Fetches PubMed ID + isolate name → Gets abstract → Predicts location
3
- import os
4
- import subprocess
5
- import re
6
- from Bio import Entrez
7
- import fitz
8
- import spacy
9
- from spacy.cli import download
10
- from NER.PDF import pdf
11
- from NER.WordDoc import wordDoc
12
- from NER.html import extractHTML
13
- from NER.word2Vec import word2vec
14
- from transformers import pipeline
15
- # Set your email (required by NCBI Entrez)
16
- #Entrez.email = "[email protected]"
17
- import nltk
18
-
19
- nltk.download("stopwords")
20
- nltk.download("punkt")
21
- nltk.download('punkt_tab')
22
- # Step 1: Get PubMed ID from Accession using EDirect
23
-
24
- '''def get_info_from_accession(accession):
25
- cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
26
- result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
27
- output = result.stdout
28
- pubmedID, isolate = "", ""
29
- for line in output.split("\n"):
30
- if len(line) > 0:
31
- if "PUBMED" in line:
32
- pubmedID = line.split()[-1]
33
- if "isolate" in line: # Check for isolate information
34
- # Try direct GenBank annotation: /isolate="XXX"
35
- match1 = re.search(r'/isolate\s*=\s*"([^"]+)"', line) # search on current line
36
- if match1:
37
- isolate = match1.group(1)
38
- else:
39
- # Try from DEFINITION line: ...isolate XXX...
40
- match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on current line
41
- if match2:
42
- isolate = match2.group(1)'''
43
- from Bio import Entrez, Medline
44
- import re
45
-
46
- Entrez.email = "[email protected]"
47
-
48
- def get_info_from_accession(accession):
49
- try:
50
- handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
51
- text = handle.read()
52
- handle.close()
53
-
54
- # Extract PUBMED ID from the Medline text
55
- pubmed_match = re.search(r'PUBMED\s+(\d+)', text)
56
- pubmed_id = pubmed_match.group(1) if pubmed_match else ""
57
-
58
- # Extract isolate if available
59
- isolate_match = re.search(r'/isolate="([^"]+)"', text)
60
- if not isolate_match:
61
- isolate_match = re.search(r'isolate\s+([A-Za-z0-9_-]+)', text)
62
- isolate = isolate_match.group(1) if isolate_match else ""
63
-
64
- if not pubmed_id:
65
- print(f"⚠️ No PubMed ID found for accession {accession}")
66
-
67
- return pubmed_id, isolate
68
-
69
- except Exception as e:
70
- print("❌ Entrez error:", e)
71
- return "", ""
72
- # Step 2: Get doi link to access the paper
73
- '''def get_doi_from_pubmed_id(pubmed_id):
74
- cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {pubmed_id} -format medline | grep -i "AID"'
75
- result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
76
- output = result.stdout
77
-
78
- doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+(?=\s*\[doi\])'
79
- match = re.search(doi_pattern, output, re.IGNORECASE)
80
-
81
- if match:
82
- return match.group(0)
83
- else:
84
- return None # or raise an Exception with a helpful message'''
85
-
86
- def get_doi_from_pubmed_id(pubmed_id):
87
- try:
88
- handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="medline", retmode="text")
89
- records = list(Medline.parse(handle))
90
- handle.close()
91
-
92
- if not records:
93
- return None
94
-
95
- record = records[0]
96
- if "AID" in record:
97
- for aid in record["AID"]:
98
- if "[doi]" in aid:
99
- return aid.split(" ")[0] # extract the DOI
100
-
101
- return None
102
-
103
- except Exception as e:
104
- print(f"❌ Failed to get DOI from PubMed ID {pubmed_id}: {e}")
105
- return None
106
-
107
-
108
- # Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing
109
- # Step 3.1: Extract Text
110
- def get_paper_text(doi,id):
111
- # create the temporary folder to contain the texts
112
- cmd = f'mkdir data/{id}'
113
- result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
114
- saveLinkFolder = "data/"+id
115
-
116
- link = 'https://doi.org/' + doi
117
- '''textsToExtract = { "doiLink":"paperText"
118
- "file1.pdf":"text1",
119
- "file2.doc":"text2",
120
- "file3.xlsx":excelText3'''
121
- textsToExtract = {}
122
- # get the file to create listOfFile for each id
123
- html = extractHTML.HTML("",link)
124
- jsonSM = html.getSupMaterial()
125
- text = ""
126
- links = [link] + sum((jsonSM[key] for key in jsonSM),[])
127
- #print(links)
128
- for l in links:
129
- # get the main paper
130
- if l == link:
131
- text = html.getListSection()
132
- textsToExtract[link] = text
133
- elif l.endswith(".pdf"):
134
- p = pdf.PDF(l,saveLinkFolder,doi)
135
- f = p.openPDFFile()
136
- pdf_path = saveLinkFolder + "/" + l.split("/")[-1]
137
- doc = fitz.open(pdf_path)
138
- text = "\n".join([page.get_text() for page in doc])
139
- textsToExtract[l] = text
140
- elif l.endswith(".doc") or l.endswith(".docx"):
141
- d = wordDoc.wordDoc(l,saveLinkFolder)
142
- text = d.extractTextByPage()
143
- textsToExtract[l] = text
144
- elif l.split(".")[-1].lower() in "xlsx":
145
- wc = word2vec.word2Vec()
146
- corpus = wc.tableTransformToCorpusText([],l)
147
- text = ''
148
- for c in corpus:
149
- para = corpus[c]
150
- for words in para:
151
- text += " ".join(words)
152
- textsToExtract[l] = text
153
- # delete folder after finishing getting text
154
- cmd = f'rm -r data/{id}'
155
- result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
156
- return textsToExtract
157
- # Step 3.2: Extract context
158
- def extract_context(text, keyword, window=500):
159
- idx = text.find(keyword)
160
- if idx == -1:
161
- return "Sample ID not found."
162
- return text[max(0, idx-window): idx+window]
163
- # Step 4: Classification for now (demo purposes)
164
- # 4.1: Using a HuggingFace model (question-answering)
165
- def infer_location_fromQAModel(context, question="Where is the mtDNA sample from?"):
166
- try:
167
- qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
168
- result = qa({"context": context, "question": question})
169
- return result.get("answer", "Unknown")
170
- except Exception as e:
171
- return f"Error: {str(e)}"
172
-
173
- # 4.2: Infer from haplogroup
174
- # Load pre-trained spaCy model for NER
175
- try:
176
- nlp = spacy.load("en_core_web_sm")
177
- except OSError:
178
- download("en_core_web_sm")
179
- nlp = spacy.load("en_core_web_sm")
180
-
181
- nlp = spacy.load("en_core_web_sm")
182
- # Define the haplogroup-to-region mapping (simple rule-based)
183
- import csv
184
-
185
- def load_haplogroup_mapping(csv_path):
186
- mapping = {}
187
- with open(csv_path) as f:
188
- reader = csv.DictReader(f)
189
- for row in reader:
190
- mapping[row["haplogroup"]] = [row["region"],row["source"]]
191
- return mapping
192
-
193
- # Function to extract haplogroup from the text
194
- def extract_haplogroup(text):
195
- match = re.search(r'\bhaplogroup\s+([A-Z][0-9a-z]*)\b', text)
196
- if match:
197
- submatch = re.match(r'^[A-Z][0-9]*', match.group(1))
198
- if submatch:
199
- return submatch.group(0)
200
- else:
201
- return match.group(1) # fallback
202
- fallback = re.search(r'\b([A-Z][0-9a-z]{1,5})\b', text)
203
- if fallback:
204
- return fallback.group(1)
205
- return None
206
-
207
-
208
- # Function to extract location based on NER
209
- def extract_location(text):
210
- doc = nlp(text)
211
- locations = []
212
- for ent in doc.ents:
213
- if ent.label_ == "GPE": # GPE = Geopolitical Entity (location)
214
- locations.append(ent.text)
215
- return locations
216
-
217
- # Function to infer location from haplogroup
218
- def infer_location_from_haplogroup(haplogroup):
219
- haplo_map = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
220
- return haplo_map.get(haplogroup, ["Unknown","Unknown"])
221
-
222
- # Function to classify the mtDNA sample
223
- def classify_mtDNA_sample_from_haplo(text):
224
- # Extract haplogroup
225
- haplogroup = extract_haplogroup(text)
226
- # Extract location based on NER
227
- locations = extract_location(text)
228
- # Infer location based on haplogroup
229
- inferred_location, sourceHaplo = infer_location_from_haplogroup(haplogroup)[0],infer_location_from_haplogroup(haplogroup)[1]
230
- return {
231
- "source":sourceHaplo,
232
- "locations_found_in_context": locations,
233
- "haplogroup": haplogroup,
234
- "inferred_location": inferred_location
235
-
236
- }
237
- # 4.3 Get from available NCBI
238
- def infer_location_fromNCBI(accession):
239
- try:
240
- handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
241
- text = handle.read()
242
- handle.close()
243
- match = re.search(r'/(geo_loc_name|country|location)\s*=\s*"([^"]+)"', text)
244
- if match:
245
- return match.group(2), match.group(0) # This is the value like "Brunei"
246
- return None
247
-
248
- except Exception as e:
249
- print("❌ Entrez error:", e)
250
- return "",""
251
-
252
-
253
- # STEP 5: Main pipeline: accession -> 1. get pubmed id and isolate -> 2. get doi -> 3. get text -> 4. prediction -> 5. output: inferred location + explanation + confidence score
254
- def classify_sample_location(accession):
255
- outputs = {}
256
- keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
257
- # Step 1: get pubmed id and isolate
258
- pubmedID, isolate = get_info_from_accession(accession)
259
- if not pubmedID:
260
- return {"error": f"Could not retrieve PubMed ID for accession {accession}"}
261
- if not isolate:
262
- isolate = "UNKNOWN_ISOLATE"
263
- # Step 2: get doi
264
- doi = get_doi_from_pubmed_id(pubmedID)
265
- if not doi:
266
- return {"error": "DOI not found for this accession. Cannot fetch paper or context."}
267
-
268
- # Step 3: get text
269
- '''textsToExtract = { "doiLink":"paperText"
270
- "file1.pdf":"text1",
271
- "file2.doc":"text2",
272
- "file3.xlsx":excelText3'''
273
- textsToExtract = get_paper_text(doi,pubmedID)
274
- if not textsToExtract:
275
- return {"error": f"No texts extracted for DOI {doi}"}
276
-
277
- # Step 4: prediction
278
- outputs[accession] = {}
279
- outputs[isolate] = {}
280
- # 4.0 Infer from NCBI
281
- location, outputNCBI = infer_location_fromNCBI(accession)
282
- NCBI_result = {
283
- "source": "NCBI",
284
- "sample_id": accession,
285
- "predicted_location": location,
286
- "context_snippet": outputNCBI}
287
- outputs[accession]["NCBI"]= {"NCBI": NCBI_result}
288
- for key in textsToExtract:
289
- text = textsToExtract[key]
290
- # try accession number first
291
- outputs[accession][key] = {}
292
- keyword = accession
293
- context = extract_context(text, keyword, window=500)
294
- # 4.1: Using a HuggingFace model (question-answering)
295
- location = infer_location_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
296
- qa_result = {
297
- "source": key,
298
- "sample_id": keyword,
299
- "predicted_location": location,
300
- "context_snippet": context
301
- }
302
- outputs[keyword][key]["QAModel"] = qa_result
303
- # 4.2: Infer from haplogroup
304
- haplo_result = classify_mtDNA_sample_from_haplo(context)
305
- outputs[keyword][key]["haplogroup"] = haplo_result
306
- # try isolate
307
- keyword = isolate
308
- outputs[isolate][key] = {}
309
- context = extract_context(text, keyword, window=500)
310
- # 4.1.1: Using a HuggingFace model (question-answering)
311
- location = infer_location_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
312
- qa_result = {
313
- "source": key,
314
- "sample_id": keyword,
315
- "predicted_location": location,
316
- "context_snippet": context
317
- }
318
- outputs[keyword][key]["QAModel"] = qa_result
319
- # 4.2.1: Infer from haplogroup
320
- haplo_result = classify_mtDNA_sample_from_haplo(context)
321
- outputs[keyword][key]["haplogroup"] = haplo_result
322
- return outputs
1
+ # mtDNA Location Classifier MVP (Google Colab)
2
+ # Accepts accession number → Fetches PubMed ID + isolate name → Gets abstract → Predicts location
3
+ import os
4
+ import subprocess
5
+ import re
6
+ from Bio import Entrez
7
+ import fitz
8
+ import spacy
9
+ from spacy.cli import download
10
+ from NER.PDF import pdf
11
+ from NER.WordDoc import wordDoc
12
+ from NER.html import extractHTML
13
+ from NER.word2Vec import word2vec
14
+ from transformers import pipeline
15
+ import urllib.parse, requests
16
+ from pathlib import Path
17
+ from upgradeClassify import filter_context_for_sample, infer_location_for_sample
18
+ # Set your email (required by NCBI Entrez)
19
+ #Entrez.email = "[email protected]"
20
+ import nltk
21
+
22
+ nltk.download("stopwords")
23
+ nltk.download("punkt")
24
+ nltk.download('punkt_tab')
25
+ # Step 1: Get PubMed ID from Accession using EDirect
26
+
27
+ '''def get_info_from_accession(accession):
28
+ cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
29
+ result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
30
+ output = result.stdout
31
+ pubmedID, isolate = "", ""
32
+ for line in output.split("\n"):
33
+ if len(line) > 0:
34
+ if "PUBMED" in line:
35
+ pubmedID = line.split()[-1]
36
+ if "isolate" in line: # Check for isolate information
37
+ # Try direct GenBank annotation: /isolate="XXX"
38
+ match1 = re.search(r'/isolate\s*=\s*"([^"]+)"', line) # search on current line
39
+ if match1:
40
+ isolate = match1.group(1)
41
+ else:
42
+ # Try from DEFINITION line: ...isolate XXX...
43
+ match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on current line
44
+ if match2:
45
+ isolate = match2.group(1)'''
46
+ from Bio import Entrez, Medline
47
+ import re
48
+
49
+ Entrez.email = "[email protected]"
50
+
51
+ def get_info_from_accession(accession):
52
+ try:
53
+ handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
54
+ text = handle.read()
55
+ handle.close()
56
+
57
+ # Extract PUBMED ID from the Medline text
58
+ pubmed_match = re.search(r'PUBMED\s+(\d+)', text)
59
+ pubmed_id = pubmed_match.group(1) if pubmed_match else ""
60
+
61
+ # Extract isolate if available
62
+ isolate_match = re.search(r'/isolate="([^"]+)"', text)
63
+ if not isolate_match:
64
+ isolate_match = re.search(r'isolate\s+([A-Za-z0-9_-]+)', text)
65
+ isolate = isolate_match.group(1) if isolate_match else ""
66
+
67
+ if not pubmed_id:
68
+ print(f"⚠️ No PubMed ID found for accession {accession}")
69
+
70
+ return pubmed_id, isolate
71
+
72
+ except Exception as e:
73
+ print("❌ Entrez error:", e)
74
+ return "", ""
75
+ # Step 2: Get doi link to access the paper
76
+ '''def get_doi_from_pubmed_id(pubmed_id):
77
+ cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {pubmed_id} -format medline | grep -i "AID"'
78
+ result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
79
+ output = result.stdout
80
+
81
+ doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+(?=\s*\[doi\])'
82
+ match = re.search(doi_pattern, output, re.IGNORECASE)
83
+
84
+ if match:
85
+ return match.group(0)
86
+ else:
87
+ return None # or raise an Exception with a helpful message'''
88
+
89
+ def get_doi_from_pubmed_id(pubmed_id):
90
+ try:
91
+ handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="medline", retmode="text")
92
+ records = list(Medline.parse(handle))
93
+ handle.close()
94
+
95
+ if not records:
96
+ return None
97
+
98
+ record = records[0]
99
+ if "AID" in record:
100
+ for aid in record["AID"]:
101
+ if "[doi]" in aid:
102
+ return aid.split(" ")[0] # extract the DOI
103
+
104
+ return None
105
+
106
+ except Exception as e:
107
+ print(f"❌ Failed to get DOI from PubMed ID {pubmed_id}: {e}")
108
+ return None
109
+
110
+
111
+ # Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing
112
+ # Step 3.1: Extract Text
113
+ # sub: download excel file
114
+ def download_excel_file(url, save_path="temp.xlsx"):
115
+ if "view.officeapps.live.com" in url:
116
+ parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
117
+ real_url = urllib.parse.unquote(parsed_url["src"][0])
118
+ response = requests.get(real_url)
119
+ with open(save_path, "wb") as f:
120
+ f.write(response.content)
121
+ return save_path
122
+ elif url.startswith("http") and (url.endswith(".xls") or url.endswith(".xlsx")):
123
+ response = requests.get(url)
124
+ response.raise_for_status() # Raises error if download fails
125
+ with open(save_path, "wb") as f:
126
+ f.write(response.content)
127
+ return save_path
128
+ else:
129
+ print("URL must point directly to an .xls or .xlsx file, or it has already been downloaded.")
130
+ return url
131
+ def get_paper_text(doi,id,manualLinks=None):
132
+ # create the temporary folder to contain the texts
133
+ folder_path = Path("data/"+str(id))
134
+ if not folder_path.exists():
135
+ cmd = f'mkdir data/{id}'
136
+ result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
137
+ print("data/"+str(id) +" created.")
138
+ else:
139
+ print("data/"+str(id) +" already exists.")
140
+ saveLinkFolder = "data/"+id
141
+
142
+ link = 'https://doi.org/' + doi
143
+ '''textsToExtract = { "doiLink":"paperText"
144
+ "file1.pdf":"text1",
145
+ "file2.doc":"text2",
146
+ "file3.xlsx":excelText3'''
147
+ textsToExtract = {}
148
+ # get the file to create listOfFile for each id
149
+ html = extractHTML.HTML("",link)
150
+ jsonSM = html.getSupMaterial()
151
+ text = ""
152
+ links = [link] + sum((jsonSM[key] for key in jsonSM),[])
153
+ if manualLinks != None:
154
+ links += manualLinks
155
+ for l in links:
156
+ # get the main paper
157
+ name = l.split("/")[-1]
158
+ file_path = folder_path / name
159
+ if l == link:
160
+ text = html.getListSection()
161
+ textsToExtract[link] = text
162
+ elif l.endswith(".pdf"):
163
+ if file_path.is_file():
164
+ l = saveLinkFolder + "/" + name
165
+ print("File exists.")
166
+ p = pdf.PDF(l,saveLinkFolder,doi)
167
+ f = p.openPDFFile()
168
+ pdf_path = saveLinkFolder + "/" + l.split("/")[-1]
169
+ doc = fitz.open(pdf_path)
170
+ text = "\n".join([page.get_text() for page in doc])
171
+ textsToExtract[l] = text
172
+ elif l.endswith(".doc") or l.endswith(".docx"):
173
+ d = wordDoc.wordDoc(l,saveLinkFolder)
174
+ text = d.extractTextByPage()
175
+ textsToExtract[l] = text
176
+ elif l.split(".")[-1].lower() in "xlsx":
177
+ wc = word2vec.word2Vec()
178
+ # download excel file if it not downloaded yet
179
+ savePath = saveLinkFolder +"/"+ l.split("/")[-1]
180
+ excelPath = download_excel_file(l, savePath)
181
+ corpus = wc.tableTransformToCorpusText([],excelPath)
182
+ text = ''
183
+ for c in corpus:
184
+ para = corpus[c]
185
+ for words in para:
186
+ text += " ".join(words)
187
+ textsToExtract[l] = text
188
+ # delete folder after finishing getting text
189
+ #cmd = f'rm -r data/{id}'
190
+ #result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
191
+ return textsToExtract
192
+ # Step 3.2: Extract context
193
+ def extract_context(text, keyword, window=500):
194
+ # firstly try accession number
195
+ idx = text.find(keyword)
196
+ if idx == -1:
197
+ return "Sample ID not found."
198
+ return text[max(0, idx-window): idx+window]
199
+ def extract_relevant_paragraphs(text, accession, keep_if=None, isolate=None):
200
+ if keep_if is None:
201
+ keep_if = ["sample", "method", "mtdna", "sequence", "collected", "dataset", "supplementary", "table"]
202
+
203
+ outputs = ""
204
+ text = text.lower()
205
+
206
+ # Prioritize paragraphs that mention the accession number
207
+ # and, if provided, the isolate name
208
+ if accession and accession.lower() in text:
209
+ if extract_context(text, accession.lower(), window=700) != "Sample ID not found.":
210
+ outputs += extract_context(text, accession.lower(), window=700)
211
+ if isolate and isolate.lower() in text:
212
+ if extract_context(text, isolate.lower(), window=700) != "Sample ID not found.":
213
+ outputs += extract_context(text, isolate.lower(), window=700)
214
+ for keyword in keep_if:
215
+ para = extract_context(text, keyword)
216
+ if para and para not in outputs:
217
+ outputs += para + "\n"
218
+ return outputs
219
+ # Step 4: Classification for now (demo purposes)
220
+ # 4.1: Using a HuggingFace model (question-answering)
221
+ def infer_fromQAModel(context, question="Where is the mtDNA sample from?"):
222
+ try:
223
+ qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
224
+ result = qa({"context": context, "question": question})
225
+ return result.get("answer", "Unknown")
226
+ except Exception as e:
227
+ return f"Error: {str(e)}"
228
+
229
+ # 4.2: Infer from haplogroup
230
+ # Load pre-trained spaCy model for NER
231
+ try:
232
+ nlp = spacy.load("en_core_web_sm")
233
+ except OSError:
234
+ download("en_core_web_sm")
235
+ nlp = spacy.load("en_core_web_sm")
236
+
237
+ # Define the haplogroup-to-region mapping (simple rule-based)
238
+ import csv
239
+
240
+ def load_haplogroup_mapping(csv_path):
241
+ mapping = {}
242
+ with open(csv_path) as f:
243
+ reader = csv.DictReader(f)
244
+ for row in reader:
245
+ mapping[row["haplogroup"]] = [row["region"],row["source"]]
246
+ return mapping
247
+
248
+ # Function to extract haplogroup from the text
249
+ def extract_haplogroup(text):
250
+ match = re.search(r'\bhaplogroup\s+([A-Z][0-9a-z]*)\b', text)
251
+ if match:
252
+ submatch = re.match(r'^[A-Z][0-9]*', match.group(1))
253
+ if submatch:
254
+ return submatch.group(0)
255
+ else:
256
+ return match.group(1) # fallback
257
+ fallback = re.search(r'\b([A-Z][0-9a-z]{1,5})\b', text)
258
+ if fallback:
259
+ return fallback.group(1)
260
+ return None
261
+
262
+
263
+ # Function to extract location based on NER
264
+ def extract_location(text):
265
+ doc = nlp(text)
266
+ locations = []
267
+ for ent in doc.ents:
268
+ if ent.label_ == "GPE": # GPE = Geopolitical Entity (location)
269
+ locations.append(ent.text)
270
+ return locations
271
+
272
+ # Function to infer location from haplogroup
273
+ def infer_location_from_haplogroup(haplogroup):
274
+ haplo_map = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
275
+ return haplo_map.get(haplogroup, ["Unknown","Unknown"])
276
+
277
+ # Function to classify the mtDNA sample
278
+ def classify_mtDNA_sample_from_haplo(text):
279
+ # Extract haplogroup
280
+ haplogroup = extract_haplogroup(text)
281
+ # Extract location based on NER
282
+ locations = extract_location(text)
283
+ # Infer location based on haplogroup
284
+ inferred_location, sourceHaplo = infer_location_from_haplogroup(haplogroup)[0],infer_location_from_haplogroup(haplogroup)[1]
285
+ return {
286
+ "source":sourceHaplo,
287
+ "locations_found_in_context": locations,
288
+ "haplogroup": haplogroup,
289
+ "inferred_location": inferred_location
290
+
291
+ }
292
+ # 4.3 Get from available NCBI
293
+ def infer_location_fromNCBI(accession):
294
+ try:
295
+ handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
296
+ text = handle.read()
297
+ handle.close()
298
+ match = re.search(r'/(geo_loc_name|country|location)\s*=\s*"([^"]+)"', text)
299
+ if match:
300
+ return match.group(2), match.group(0) # This is the value like "Brunei"
301
+ return "Not found", "Not found"
302
+
303
+ except Exception as e:
304
+ print("❌ Entrez error:", e)
305
+ return "Not found", "Not found"
306
+
307
+ ### ANCIENT/MODERN FLAG
308
+ from Bio import Entrez
309
+ import re
310
+
311
+ def flag_ancient_modern(accession, textsToExtract, isolate=None):
312
+ """
313
+ Try to classify a sample as Ancient or Modern using:
314
+ 1. NCBI accession (if available)
315
+ 2. Supplementary text or context fallback
316
+ """
317
+ context = ""
318
+ label, explain = "", ""
319
+
320
+ try:
321
+ # Check if we can fetch metadata from NCBI using the accession
322
+ handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
323
+ text = handle.read()
324
+ handle.close()
325
+
326
+ isolate_source = re.search(r'/(isolation_source)\s*=\s*"([^"]+)"', text)
327
+ if isolate_source:
328
+ context += isolate_source.group(0) + " "
329
+
330
+ specimen = re.search(r'/(specimen|specimen_voucher)\s*=\s*"([^"]+)"', text)
331
+ if specimen:
332
+ context += specimen.group(0) + " "
333
+
334
+ if context.strip():
335
+ label, explain = detect_ancient_flag(context)
336
+ if label!="Unknown":
337
+ return label, explain + " from NCBI\n(" + context + ")"
338
+
339
+ # If no useful NCBI metadata, check supplementary texts
340
+ if textsToExtract:
341
+ labels = {"modern": [0, ""], "ancient": [0, ""], "unknown": 0}
342
+
343
+ for source in textsToExtract:
344
+ text_block = textsToExtract[source]
345
+ context = extract_relevant_paragraphs(text_block, accession, isolate=isolate) # Reduce to informative paragraph(s)
346
+ label, explain = detect_ancient_flag(context)
347
+
348
+ if label == "Ancient":
349
+ labels["ancient"][0] += 1
350
+ labels["ancient"][1] += f"{source}:\n{explain}\n\n"
351
+ elif label == "Modern":
352
+ labels["modern"][0] += 1
353
+ labels["modern"][1] += f"{source}:\n{explain}\n\n"
354
+ else:
355
+ labels["unknown"] += 1
356
+
357
+ if max(labels["modern"][0],labels["ancient"][0]) > 0:
358
+ if labels["modern"][0] > labels["ancient"][0]:
359
+ return "Modern", labels["modern"][1]
360
+ else:
361
+ return "Ancient", labels["ancient"][1]
362
+ else:
363
+ return "Unknown", "No strong keywords detected"
364
+ else:
365
+ print("No DOI or PubMed ID available for inference.")
366
+ return "", ""
367
+
368
+ except Exception as e:
369
+ print("Error:", e)
370
+ return "", ""
371
+
372
+
373
+ def detect_ancient_flag(context_snippet):
374
+ context = context_snippet.lower()
375
+
376
+ ancient_keywords = [
377
+ "ancient", "archaeological", "prehistoric", "neolithic", "mesolithic", "paleolithic",
378
+ "bronze age", "iron age", "burial", "tomb", "skeleton", "14c", "radiocarbon", "carbon dating",
379
+ "postmortem damage", "udg treatment", "adna", "degradation", "site", "excavation",
380
+ "archaeological context", "temporal transect", "population replacement", "cal bp", "calbp", "carbon dated"
381
+ ]
382
+
383
+ modern_keywords = [
384
+ "modern", "hospital", "clinical", "consent","blood","buccal","unrelated", "blood sample","buccal sample","informed consent", "donor", "healthy", "patient",
385
+ "genotyping", "screening", "medical", "cohort", "sequencing facility", "ethics approval",
386
+ "we analysed", "we analyzed", "dataset includes", "new sequences", "published data",
387
+ "control cohort", "sink population", "genbank accession", "sequenced", "pipeline",
388
+ "bioinformatic analysis", "samples from", "population genetics", "genome-wide data"
389
+ ]
390
+
391
+ ancient_hits = [k for k in ancient_keywords if k in context]
392
+ modern_hits = [k for k in modern_keywords if k in context]
393
+
394
+ if ancient_hits and not modern_hits:
395
+ return "Ancient", f"Flagged as ancient due to keywords: {', '.join(ancient_hits)}"
396
+ elif modern_hits and not ancient_hits:
397
+ return "Modern", f"Flagged as modern due to keywords: {', '.join(modern_hits)}"
398
+ elif ancient_hits and modern_hits:
399
+ if len(ancient_hits) >= len(modern_hits):
400
+ return "Ancient", f"Mixed context, leaning ancient due to: {', '.join(ancient_hits)}"
401
+ else:
402
+ return "Modern", f"Mixed context, leaning modern due to: {', '.join(modern_hits)}"
403
+
404
+ # Fallback to QA
405
+ answer = infer_fromQAModel(context, question="Are the mtDNA samples ancient or modern? Explain why.")
406
+ if answer.startswith("Error"):
407
+ return "Unknown", answer
408
+ if "ancient" in answer.lower():
409
+ return "Ancient", f"Leaning ancient based on QA: {answer}"
410
+ elif "modern" in answer.lower():
411
+ return "Modern", f"Leaning modern based on QA: {answer}"
412
+ else:
413
+ return "Unknown", f"No strong keywords or QA clues. QA said: {answer}"
414
+
415
+ # STEP 5: Main pipeline: accession -> 1. get pubmed id and isolate -> 2. get doi -> 3. get text -> 4. prediction -> 5. output: inferred location + explanation + confidence score
416
+ def classify_sample_location(accession):
417
+ outputs = {}
418
+ keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
419
+ # Step 1: get pubmed id and isolate
420
+ pubmedID, isolate = get_info_from_accession(accession)
421
+ '''if not pubmedID:
422
+ return {"error": f"Could not retrieve PubMed ID for accession {accession}"}'''
423
+ if not isolate:
424
+ isolate = "UNKNOWN_ISOLATE"
425
+ # Step 2: get doi
426
+ doi = get_doi_from_pubmed_id(pubmedID)
427
+ '''if not doi:
428
+ return {"error": "DOI not found for this accession. Cannot fetch paper or context."}'''
429
+ # Step 3: get text
430
+ '''textsToExtract = { "doiLink":"paperText"
431
+ "file1.pdf":"text1",
432
+ "file2.doc":"text2",
433
+ "file3.xlsx":excelText3'''
434
+ if doi and pubmedID:
435
+ textsToExtract = get_paper_text(doi,pubmedID)
436
+ else: textsToExtract = {}
437
+ '''if not textsToExtract:
438
+ return {"error": f"No texts extracted for DOI {doi}"}'''
439
+ if isolate not in [None, "UNKNOWN_ISOLATE"]:
440
+ label, explain = flag_ancient_modern(accession,textsToExtract,isolate)
441
+ else:
442
+ label, explain = flag_ancient_modern(accession,textsToExtract)
443
+ # Step 4: prediction
444
+ outputs[accession] = {}
445
+ outputs[isolate] = {}
446
+ # 4.0 Infer from NCBI
447
+ location, outputNCBI = infer_location_fromNCBI(accession)
448
+ NCBI_result = {
449
+ "source": "NCBI",
450
+ "sample_id": accession,
451
+ "predicted_location": location,
452
+ "context_snippet": outputNCBI}
453
+ outputs[accession]["NCBI"]= {"NCBI": NCBI_result}
454
+ if textsToExtract:
455
+ long_text = ""
456
+ for key in textsToExtract:
457
+ text = textsToExtract[key]
458
+ # try accession number first
459
+ outputs[accession][key] = {}
460
+ keyword = accession
461
+ context = extract_context(text, keyword, window=500)
462
+ # 4.1: Using a HuggingFace model (question-answering)
463
+ location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
464
+ qa_result = {
465
+ "source": key,
466
+ "sample_id": keyword,
467
+ "predicted_location": location,
468
+ "context_snippet": context
469
+ }
470
+ outputs[keyword][key]["QAModel"] = qa_result
471
+ # 4.2: Infer from haplogroup
472
+ haplo_result = classify_mtDNA_sample_from_haplo(context)
473
+ outputs[keyword][key]["haplogroup"] = haplo_result
474
+ # try isolate
475
+ keyword = isolate
476
+ outputs[isolate][key] = {}
477
+ context = extract_context(text, keyword, window=500)
478
+ # 4.1.1: Using a HuggingFace model (question-answering)
479
+ location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
480
+ qa_result = {
481
+ "source": key,
482
+ "sample_id": keyword,
483
+ "predicted_location": location,
484
+ "context_snippet": context
485
+ }
486
+ outputs[keyword][key]["QAModel"] = qa_result
487
+ # 4.2.1: Infer from haplogroup
488
+ haplo_result = classify_mtDNA_sample_from_haplo(context)
489
+ outputs[keyword][key]["haplogroup"] = haplo_result
490
+ # add long text
491
+ long_text += text + ". \n"
492
+ # 4.3: UpgradeClassify
493
+ # try sample_id as accession number
494
+ sample_id = accession
495
+ if sample_id:
496
+ filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
497
+ locations = infer_location_for_sample(sample_id.upper(), filtered_context)
498
+ if locations!="No clear location found in top matches":
499
+ outputs[sample_id]["upgradeClassifier"] = {}
500
+ outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
501
+ "source": "From these sources combined: "+ ", ".join(list(textsToExtract.keys())),
502
+ "sample_id": sample_id,
503
+ "predicted_location": ", ".join(locations),
504
+ "context_snippet": "First 1000 words: \n"+ filtered_context[:1000]
505
+ }
506
+ # try sample_id as isolate name
507
+ sample_id = isolate
508
+ if sample_id:
509
+ filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
510
+ locations = infer_location_for_sample(sample_id.upper(), filtered_context)
511
+ if locations!="No clear location found in top matches":
512
+ outputs[sample_id]["upgradeClassifier"] = {}
513
+ outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
514
+ "source": "From these sources combined: "+ ", ".join(list(textsToExtract.keys())),
515
+ "sample_id": sample_id,
516
+ "predicted_location": ", ".join(locations),
517
+ "context_snippet": "First 1000 words: \n"+ filtered_context[:1000]
518
+ }
519
+ return outputs, label, explain
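A minimal sketch (not part of this commit) of driving the pipeline above directly, outside the Gradio app, assuming NCBI Entrez is reachable and data/haplogroup_regions_extended.csv is present:

from mtdna_classifier import classify_sample_location

outputs, label, explain = classify_sample_location("KU131308")
print("Ancient/Modern flag:", label)
# outputs is keyed by sample id (accession or isolate), then by source, then by technique
for sample_id, sources in outputs.items():
    for source, techniques in sources.items():
        for technique, result in techniques.items():
            location = result.get("predicted_location") or result.get("inferred_location")
            print(sample_id, technique, location)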
mtdna_ui.py ADDED
@@ -0,0 +1,210 @@
1
+ import gradio as gr
2
+ from mtdna_backend import *
3
+ import json
4
+ # Gradio UI
5
+ with gr.Blocks() as interface:
6
+ gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
7
+
8
+ inputMode = gr.Radio(choices=["Single Accession", "Batch Input"], value="Single Accession", label="Choose Input Mode")
9
+
10
+ with gr.Group() as single_input_group:
11
+ single_accession = gr.Textbox(label="Enter Single Accession (e.g., KU131308)")
12
+
13
+ with gr.Group(visible=False) as batch_input_group:
14
+ raw_text = gr.Textbox(label="🧬 Paste Accession Numbers (e.g., MF362736.1,MF362738.1,KU131308,MW291678)")
15
+ gr.HTML("""<a href="https://drive.google.com/file/d/1t-TFeIsGVu5Jh3CUZS-VE9jQWzNFCs_c/view?usp=sharing" download target="_blank">Download Example CSV Format</a>""")
16
+ gr.HTML("""<a href="https://docs.google.com/spreadsheets/d/1lKqPp17EfHsshJGZRWEpcNOZlGo3F5qU/edit?usp=sharing&ouid=112390323314156876153&rtpof=true&sd=true" download target="_blank">Download Example Excel Format</a>""")
17
+ file_upload = gr.File(label="📁 Or Upload CSV/Excel File", file_types=[".csv", ".xlsx"], interactive=True, elem_id="file-upload-box")
18
+
19
+
20
+
21
+ with gr.Row():
22
+ run_button = gr.Button("🔍 Submit and Classify")
23
+ reset_button = gr.Button("🔄 Reset")
24
+
25
+ status = gr.Markdown(visible=False)
26
+
27
+ with gr.Group(visible=False) as results_group:
28
+ with gr.Accordion("Open to See the Result", open=False) as results:
29
+ with gr.Row():
30
+ output_summary = gr.Markdown(elem_id="output-summary")
31
+ output_flag = gr.Markdown(elem_id="output-flag")
32
+
33
+ gr.Markdown("---")
34
+
35
+ with gr.Accordion("Open to See the Output Table", open=False) as table_accordion:
36
+ """output_table = gr.Dataframe(
37
+ headers=["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"],
38
+ interactive=False,
39
+ row_count=(5, "dynamic")
40
+ )"""
41
+ output_table = gr.HTML(render=True)
42
+
43
+
44
+ with gr.Row():
45
+ output_type = gr.Dropdown(choices=["Excel", "JSON", "TXT"], label="Select Output Format", value="Excel")
46
+ download_button = gr.Button("⬇️ Download Output")
47
+ download_file = gr.File(label="Download File Here",visible=False)
48
+
49
+ gr.Markdown("---")
50
+
51
+ gr.Markdown("### 💬 Feedback (required)")
52
+ q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
53
+ q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
54
+ contact = gr.Textbox(label="📧 Your email or institution (optional)")
55
+ submit_feedback = gr.Button("✅ Submit Feedback")
56
+ feedback_status = gr.Markdown()
57
+
58
+ # Functions
59
+
60
+ def toggle_input_mode(mode):
61
+ if mode == "Single Accession":
62
+ return gr.update(visible=True), gr.update(visible=False)
63
+ else:
64
+ return gr.update(visible=False), gr.update(visible=True)
65
+
66
+ def classify_with_loading():
67
+ return gr.update(value="⏳ Please wait... processing...",visible=True) # Show processing message
68
+
69
+ def classify_dynamic(single_accession, file, text, mode):
70
+ if mode == "Single Accession":
71
+ return classify_main(single_accession) + (gr.update(visible=False),)
72
+ else:
73
+ #return summarize_batch(file, text) + (gr.update(visible=False),) # Hide processing message
74
+ return classify_mulAcc(file, text) + (gr.update(visible=False),) # Hide processing message
75
+
76
+ # for single accession
77
+ def classify_main(accession):
78
+ table, summary, labelAncient_Modern, explain_label = summarize_results(accession)
79
+ flag_output = f"### 🏺 Ancient/Modern Flag\n**{labelAncient_Modern}**\n\n_Explanation:_ {explain_label}"
80
+ return (
81
+ #table,
82
+ make_html_table(table),
83
+ summary,
84
+ flag_output,
85
+ gr.update(visible=True),
86
+ gr.update(visible=False)
87
+ )
88
+ # for batch accessions
89
+ def classify_mulAcc(file, text):
90
+ table, summary, flag_output, gr1, gr2 = summarize_batch(file, text)
91
+ #flag_output = f"### 🏺 Ancient/Modern Flag\n**{labelAncient_Modern}**\n\n_Explanation:_ {explain_label}"
92
+ return (
93
+ #table,
94
+ make_html_table(table),
95
+ summary,
96
+ flag_output,
97
+ gr.update(visible=True),
98
+ gr.update(visible=False)
99
+ )
100
+
101
+ def make_html_table(rows):
102
+ html = """
103
+ <div style='overflow-x: auto; padding: 10px;'>
104
+ <div style='max-height: 400px; overflow-y: auto; border: 1px solid #444; border-radius: 8px;'>
105
+ <table style='width:100%; border-collapse: collapse; table-layout: auto; font-size: 14px; color: #f1f1f1; background-color: #1e1e1e;'>
106
+ <thead style='position: sticky; top: 0; background-color: #2c2c2c; z-index: 1;'>
107
+ <tr>
108
+ """
109
+ headers = ["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"]
110
+ html += "".join(
111
+ f"<th style='padding: 10px; border: 1px solid #555; text-align: left; white-space: nowrap;'>{h}</th>"
112
+ for h in headers
113
+ )
114
+ html += "</tr></thead><tbody>"
115
+
116
+ for row in rows:
117
+ html += "<tr>"
118
+ for i, col in enumerate(row):
119
+ header = headers[i]
120
+ style = "padding: 10px; border: 1px solid #555; vertical-align: top;"
121
+
122
+ # For specific columns like Haplogroup, force nowrap
123
+ if header in ["Haplogroup", "Sample ID", "Technique"]:
124
+ style += " white-space: nowrap; text-overflow: ellipsis; max-width: 200px; overflow: hidden;"
125
+
126
+ if header == "Source" and isinstance(col, str) and col.strip().lower().startswith("http"):
127
+ col = f"<a href='{col}' target='_blank' style='color: #4ea1f3; text-decoration: underline;'>{col}</a>"
128
+
129
+ html += f"<td style='{style}'>{col}</td>"
130
+ html += "</tr>"
131
+
132
+ html += "</tbody></table></div></div>"
133
+ return html
134
+
135
+
136
+ def reset_fields():
137
+ return (
138
+ gr.update(value=""), # single_accession
139
+ gr.update(value=""), # raw_text
140
+ gr.update(value=None), # file_upload
141
+ gr.update(value="Single Accession"), # inputMode
142
+ gr.update(value=[], visible=True), # output_table
143
+ gr.update(value="", visible=True), # output_summary
144
+ gr.update(value="", visible=True), # output_flag
145
+ gr.update(visible=False), # status
146
+ gr.update(visible=False) # results_group
147
+ )
148
+
149
+ inputMode.change(fn=toggle_input_mode, inputs=inputMode, outputs=[single_input_group, batch_input_group])
150
+ run_button.click(fn=classify_with_loading, inputs=[], outputs=[status])
151
+ run_button.click(
152
+ fn=classify_dynamic,
153
+ inputs=[single_accession, file_upload, raw_text, inputMode],
154
+ outputs=[output_table, output_summary, output_flag, results_group, status]
155
+ )
156
+ reset_button.click(
157
+ fn=reset_fields,
158
+ inputs=[],
159
+ outputs=[
160
+ single_accession, raw_text, file_upload, inputMode,
161
+ output_table, output_summary, output_flag,
162
+ status, results_group
163
+ ]
164
+ )
165
+
166
+ download_button.click(
167
+ fn=save_batch_output,
168
+ inputs=[output_table, output_summary, output_flag, output_type],
169
+ outputs=[download_file])
170
+
171
+ submit_feedback.click(
172
+ fn=store_feedback_to_google_sheets, inputs=[single_accession, q1, q2, contact], outputs=feedback_status
173
+ )
174
+ # Custom CSS styles
175
+ gr.HTML("""
176
+ <style>
177
+ /* Ensures both sections are equally spaced with the same background size */
178
+ #output-summary, #output-flag {
179
+ background-color: #f0f4f8; /* Light Grey for both */
180
+ padding: 20px;
181
+ border-radius: 10px;
182
+ margin-top: 10px;
183
+ width: 100%; /* Ensure full width */
184
+ min-height: 150px; /* Ensures both have a minimum height */
185
+ box-sizing: border-box; /* Prevents padding from increasing size */
186
+ display: flex;
187
+ flex-direction: column;
188
+ justify-content: space-between;
189
+ }
190
+
191
+ /* Specific background colors */
192
+ #output-summary {
193
+ background-color: #434a4b;
194
+ }
195
+
196
+ #output-flag {
197
+ background-color: #141616;
198
+ }
199
+
200
+ /* Ensuring they are in a row and evenly spaced */
201
+ .gradio-row {
202
+ display: flex;
203
+ justify-content: space-between;
204
+ width: 100%;
205
+ }
206
+ </style>
207
+ """)
208
+
209
+
210
+ interface.launch(share=True, debug=True)
output.json ADDED
@@ -0,0 +1,276 @@
1
+ {
2
+ "Detailed_Results": [
3
+ [
4
+ "MF362736.1 (accession number)",
5
+ "NCBI",
6
+ "NCBI",
7
+ "Armenia",
8
+ "",
9
+ "",
10
+ "/geo_loc_name=\"Armenia\""
11
+ ],
12
+ [
13
+ "MF362736.1 (accession number)",
14
+ "QAModel",
15
+ "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>",
16
+ "Sample ID not found",
17
+ "",
18
+ "",
19
+ "Sample ID not found."
20
+ ],
21
+ [
22
+ "MF362736.1 (accession number)",
23
+ "haplogroup",
24
+ "The region of haplogroup is inferred\nby using this source: Unknown",
25
+ "",
26
+ "Sample",
27
+ "Unknown",
28
+ ""
29
+ ],
30
+ [
31
+ "rise396_mt (isolate of accession)",
32
+ "QAModel",
33
+ "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>",
34
+ "Sample ID not found",
35
+ "",
36
+ "",
37
+ "Sample ID not found."
38
+ ],
39
+ [
40
+ "rise396_mt (isolate of accession)",
41
+ "haplogroup",
42
+ "The region of haplogroup is inferred\nby using this source: Unknown",
43
+ "",
44
+ "Sample",
45
+ "Unknown",
46
+ ""
47
+ ],
48
+ [
49
+ "MF362738.1 (accession number)",
50
+ "NCBI",
51
+ "NCBI",
52
+ "Armenia",
53
+ "",
54
+ "",
55
+ "/geo_loc_name=\"Armenia\""
56
+ ],
57
+ [
58
+ "MF362738.1 (accession number)",
59
+ "QAModel",
60
+ "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>",
61
+ "Sample ID not found",
62
+ "",
63
+ "",
64
+ "Sample ID not found."
65
+ ],
66
+ [
67
+ "MF362738.1 (accession number)",
68
+ "haplogroup",
69
+ "The region of haplogroup is inferred\nby using this source: Unknown",
70
+ "",
71
+ "Sample",
72
+ "Unknown",
73
+ ""
74
+ ],
75
+ [
76
+ "rise407_mt (isolate of accession)",
77
+ "QAModel",
78
+ "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>",
79
+ "Sample ID not found",
80
+ "",
81
+ "",
82
+ "Sample ID not found."
83
+ ],
84
+ [
85
+ "rise407_mt (isolate of accession)",
86
+ "haplogroup",
87
+ "The region of haplogroup is inferred\nby using this source: Unknown",
88
+ "",
89
+ "Sample",
90
+ "Unknown",
91
+ ""
92
+ ],
93
+ [
94
+ "MF362739.1 (accession number)",
95
+ "NCBI",
96
+ "NCBI",
97
+ "Armenia",
98
+ "",
99
+ "",
100
+ "/geo_loc_name=\"Armenia\""
101
+ ],
102
+ [
103
+ "MF362739.1 (accession number)",
104
+ "QAModel",
105
+ "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>",
106
+ "Sample ID not found",
107
+ "",
108
+ "",
109
+ "Sample ID not found."
110
+ ],
111
+ [
112
+ "MF362739.1 (accession number)",
113
+ "haplogroup",
114
+ "The region of haplogroup is inferred\nby using this source: Unknown",
115
+ "",
116
+ "Sample",
117
+ "Unknown",
118
+ ""
119
+ ],
120
+ [
121
+ "rise408_mt (isolate of accession)",
122
+ "QAModel",
123
+ "<a href=\"https://doi.org/10.1016/j.cub.2017.05.087\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1016/j.cub.2017.05.087</a>",
124
+ "Sample ID not found",
125
+ "",
126
+ "",
127
+ "Sample ID not found."
128
+ ],
129
+ [
130
+ "rise408_mt (isolate of accession)",
131
+ "haplogroup",
132
+ "The region of haplogroup is inferred\nby using this source: Unknown",
133
+ "",
134
+ "Sample",
135
+ "Unknown",
136
+ ""
137
+ ],
138
+ [
139
+ "KU131308 (accession number)",
140
+ "NCBI",
141
+ "NCBI",
142
+ "Brunei",
143
+ "",
144
+ "",
145
+ "/geo_loc_name=\"Brunei\""
146
+ ],
147
+ [
148
+ "KU131308 (accession number)",
149
+ "QAModel",
150
+ "<a href=\"https://doi.org/10.1007/s00439-015-1620-z\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1007/s00439-015-1620-z</a>",
151
+ "GenBank",
152
+ "",
153
+ "",
154
+ "t (unavailable at the start of this study). We performed whole-mtDNA sequencing as previously described (Torroni et al. 2001) using an ABI 48-capillary 3730 DNA Analyser (Taipei) an ABI 16-capillary 3130XL DNA Analyser (Leeds) and an ABI 16-capillary 3100 DNA Analyser (Porto). Details on the new and"
155
+ ],
156
+ [
157
+ "KU131308 (accession number)",
158
+ "haplogroup",
159
+ "The region of haplogroup is inferred\nby using this source: EMPOP",
160
+ "",
161
+ "M7",
162
+ "East Asia",
163
+ ""
164
+ ],
165
+ [
166
+ "KU131308 (accession number)",
167
+ "QAModel",
168
+ "<a href=\"https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf</a>",
169
+ "Sample ID not found",
170
+ "",
171
+ "",
172
+ "Sample ID not found."
173
+ ],
174
+ [
175
+ "KU131308 (accession number)",
176
+ "haplogroup",
177
+ "The region of haplogroup is inferred\nby using this source: Unknown",
178
+ "",
179
+ "Sample",
180
+ "Unknown",
181
+ ""
182
+ ],
183
+ [
184
+ "KU131308 (accession number)",
185
+ "QAModel",
186
+ "<a href=\"https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls</a>",
187
+ "Sample ID not found",
188
+ "",
189
+ "",
190
+ "Sample ID not found."
191
+ ],
192
+ [
193
+ "KU131308 (accession number)",
194
+ "haplogroup",
195
+ "The region of haplogroup is inferred\nby using this source: Unknown",
196
+ "",
197
+ "Sample",
198
+ "Unknown",
199
+ ""
200
+ ],
201
+ [
202
+ "BRU18 (isolate of accession)",
203
+ "QAModel",
204
+ "<a href=\"https://doi.org/10.1007/s00439-015-1620-z\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://doi.org/10.1007/s00439-015-1620-z</a>",
205
+ "Sample ID not found",
206
+ "",
207
+ "",
208
+ "Sample ID not found."
209
+ ],
210
+ [
211
+ "BRU18 (isolate of accession)",
212
+ "haplogroup",
213
+ "The region of haplogroup is inferred\nby using this source: Unknown",
214
+ "",
215
+ "Sample",
216
+ "Unknown",
217
+ ""
218
+ ],
219
+ [
220
+ "BRU18 (isolate of accession)",
221
+ "QAModel",
222
+ "<a href=\"https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf</a>",
223
+ "Borneo",
224
+ "",
225
+ "",
226
+ ", NA18138, NA18149, NA18152, \nNA18674, NA18707 \n Chinese in Denver, \nUSA \n[86] \nNA17971, NA18124, NA18550, NA18574, NA18582, \nNA18618, NA18636, NA18638, NA18639, NA18644, \nNA18756, NA18769, NA18771 \nHan Chinese in Beijing \n[86] \nNA18755 \nBeijing Han Chinese \n[86] \nNA18940, NA18943, NA18952, NA18953"
227
+ ],
228
+ [
229
+ "BRU18 (isolate of accession)",
230
+ "haplogroup",
231
+ "The region of haplogroup is inferred\nby using this source: Unknown",
232
+ "",
233
+ "Denver",
234
+ "Unknown",
235
+ ""
236
+ ],
237
+ [
238
+ "BRU18 (isolate of accession)",
239
+ "QAModel",
240
+ "<a href=\"https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls\" target=\"_blank\" style=\"color: blue; text-decoration: underline;\">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls</a>",
241
+ "Sample ID not found",
242
+ "",
243
+ "",
244
+ "Sample ID not found."
245
+ ],
246
+ [
247
+ "BRU18 (isolate of accession)",
248
+ "haplogroup",
249
+ "The region of haplogroup is inferred\nby using this source: Unknown",
250
+ "",
251
+ "Sample",
252
+ "Unknown",
253
+ ""
254
+ ],
255
+ [
256
+ "MW291678 (accession number)",
257
+ "NCBI",
258
+ "NCBI",
259
+ "Argentina",
260
+ "",
261
+ "",
262
+ "/geo_loc_name=\"Argentina\""
263
+ ],
264
+ [
265
+ "MN006856 (accession number)",
266
+ "NCBI",
267
+ "NCBI",
268
+ "Not found",
269
+ "",
270
+ "",
271
+ "Not found"
272
+ ]
273
+ ],
274
+ "Summary_Text": "**MF362736.1**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Armenia**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Armenia** (mentioned 1 times)\n\n---\n\n**MF362738.1**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Armenia**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Armenia** (mentioned 1 times)\n\n---\n\n**MF362739.1**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Armenia**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Armenia** (mentioned 1 times)\n\n---\n\n**KU131308**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Brunei**: 1 times\n- **GenBank**: 1 times\n- **Borneo**: 1 times\n- **East Asia**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Brunei** (mentioned 1 times)\n\n---\n\n**MW291678**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Argentina**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Argentina** (mentioned 1 times)\n\n---\n\n**MN006856**\n### \ud83e\udded Location Frequency Summary\nAfter counting all predicted and inferred locations:\n\n- **Not found**: 1 times\n\n**Final Suggested Location:** \ud83d\uddfa\ufe0f **Not found** (mentioned 1 times)",
275
+ "Ancient_Modern_Flag": "**MF362736.1**\n### \ud83c\udffa Ancient/Modern Flag\n**Ancient**\n\n_Explanation:_ Flagged as ancient due to keywords: tomb, skeleton from NCBI\n(/isolation_source=\"Tomb 6; skeleton 1\" /specimen_voucher=\"Kapan;Tomb 6; skeleton 1\" )\n\n---\n\n**MF362738.1**\n### \ud83c\udffa Ancient/Modern Flag\n**Ancient**\n\n_Explanation:_ https://doi.org/10.1016/j.cub.2017.05.087:\nMixed context, leaning ancient due to: ancient, archaeological, bronze age, iron age, tomb, skeleton, carbon dating, adna, site\n\n\n\n---\n\n**MF362739.1**\n### \ud83c\udffa Ancient/Modern Flag\n**Ancient**\n\n_Explanation:_ https://doi.org/10.1016/j.cub.2017.05.087:\nMixed context, leaning ancient due to: ancient, archaeological, bronze age, iron age, tomb, skeleton, carbon dating, adna, site\n\n\n\n---\n\n**KU131308**\n### \ud83c\udffa Ancient/Modern Flag\n**Modern**\n\n_Explanation:_ https://doi.org/10.1007/s00439-015-1620-z:\nMixed context, leaning modern due to: we analysed, new sequences, published data, sink population, genome-wide data\n\n\n\n---\n\n**MW291678**\n### \ud83c\udffa Ancient/Modern Flag\n**Ancient**\n\n_Explanation:_ Flagged as ancient due to keywords: archaeological from NCBI\n(/isolation_source=\"archaeological human bone\" )\n\n---\n\n**MN006856**\n### \ud83c\udffa Ancient/Modern Flag\n****\n\n_Explanation:_ "
276
+ }
output.txt ADDED
@@ -0,0 +1,176 @@
1
+ === Detailed Results ===
2
+ MF362736.1 (accession number), NCBI, NCBI, Armenia, /geo_loc_name="Armenia"
3
+ MF362736.1 (accession number), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
4
+ MF362736.1 (accession number), haplogroup, The region of haplogroup is inferred
5
+ by using this source: Unknown, Sample, Unknown
6
+ rise396_mt (isolate of accession), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
7
+ rise396_mt (isolate of accession), haplogroup, The region of haplogroup is inferred
8
+ by using this source: Unknown, Sample, Unknown
9
+ MF362738.1 (accession number), NCBI, NCBI, Armenia, /geo_loc_name="Armenia"
10
+ MF362738.1 (accession number), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
11
+ MF362738.1 (accession number), haplogroup, The region of haplogroup is inferred
12
+ by using this source: Unknown, Sample, Unknown
13
+ rise407_mt (isolate of accession), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
14
+ rise407_mt (isolate of accession), haplogroup, The region of haplogroup is inferred
15
+ by using this source: Unknown, Sample, Unknown
16
+ MF362739.1 (accession number), NCBI, NCBI, Armenia, /geo_loc_name="Armenia"
17
+ MF362739.1 (accession number), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
18
+ MF362739.1 (accession number), haplogroup, The region of haplogroup is inferred
19
+ by using this source: Unknown, Sample, Unknown
20
+ rise408_mt (isolate of accession), QAModel, <a href="https://doi.org/10.1016/j.cub.2017.05.087" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1016/j.cub.2017.05.087</a>, Sample ID not found, Sample ID not found.
21
+ rise408_mt (isolate of accession), haplogroup, The region of haplogroup is inferred
22
+ by using this source: Unknown, Sample, Unknown
23
+ KU131308 (accession number), NCBI, NCBI, Brunei, /geo_loc_name="Brunei"
24
+ KU131308 (accession number), QAModel, <a href="https://doi.org/10.1007/s00439-015-1620-z" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1007/s00439-015-1620-z</a>, GenBank, t (unavailable at the start of this study). We performed whole-mtDNA sequencing as previously described (Torroni et al. 2001) using an ABI 48-capillary 3730 DNA Analyser (Taipei) an ABI 16-capillary 3130XL DNA Analyser (Leeds) and an ABI 16-capillary 3100 DNA Analyser (Porto). Details on the new and
25
+ KU131308 (accession number), haplogroup, The region of haplogroup is inferred
26
+ by using this source: EMPOP, M7, East Asia
27
+ KU131308 (accession number), QAModel, <a href="https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf" target="_blank" style="color: blue; text-decoration: underline;">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf</a>, Sample ID not found, Sample ID not found.
28
+ KU131308 (accession number), haplogroup, The region of haplogroup is inferred
29
+ by using this source: Unknown, Sample, Unknown
30
+ KU131308 (accession number), QAModel, <a href="https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls" target="_blank" style="color: blue; text-decoration: underline;">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls</a>, Sample ID not found, Sample ID not found.
31
+ KU131308 (accession number), haplogroup, The region of haplogroup is inferred
32
+ by using this source: Unknown, Sample, Unknown
33
+ BRU18 (isolate of accession), QAModel, <a href="https://doi.org/10.1007/s00439-015-1620-z" target="_blank" style="color: blue; text-decoration: underline;">https://doi.org/10.1007/s00439-015-1620-z</a>, Sample ID not found, Sample ID not found.
34
+ BRU18 (isolate of accession), haplogroup, The region of haplogroup is inferred
35
+ by using this source: Unknown, Sample, Unknown
36
+ BRU18 (isolate of accession), QAModel, <a href="https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf" target="_blank" style="color: blue; text-decoration: underline;">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf</a>, Borneo, , NA18138, NA18149, NA18152,
37
+ NA18674, NA18707
38
+ Chinese in Denver,
39
+ USA
40
+ [86]
41
+ NA17971, NA18124, NA18550, NA18574, NA18582,
42
+ NA18618, NA18636, NA18638, NA18639, NA18644,
43
+ NA18756, NA18769, NA18771
44
+ Han Chinese in Beijing
45
+ [86]
46
+ NA18755
47
+ Beijing Han Chinese
48
+ [86]
49
+ NA18940, NA18943, NA18952, NA18953
50
+ BRU18 (isolate of accession), haplogroup, The region of haplogroup is inferred
51
+ by using this source: Unknown, Denver, Unknown
52
+ BRU18 (isolate of accession), QAModel, <a href="https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls" target="_blank" style="color: blue; text-decoration: underline;">https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls</a>, Sample ID not found, Sample ID not found.
53
+ BRU18 (isolate of accession), haplogroup, The region of haplogroup is inferred
54
+ by using this source: Unknown, Sample, Unknown
55
+ MW291678 (accession number), NCBI, NCBI, Argentina, /geo_loc_name="Argentina"
56
+ MN006856 (accession number), NCBI, NCBI, Not found, Not found
57
+
58
+ === Summary ===
59
+ **MF362736.1**
60
+ ### 🧭 Location Frequency Summary
61
+ After counting all predicted and inferred locations:
62
+
63
+ - **Armenia**: 1 times
64
+
65
+ **Final Suggested Location:** 🗺️ **Armenia** (mentioned 1 times)
66
+
67
+ ---
68
+
69
+ **MF362738.1**
70
+ ### 🧭 Location Frequency Summary
71
+ After counting all predicted and inferred locations:
72
+
73
+ - **Armenia**: 1 times
74
+
75
+ **Final Suggested Location:** 🗺️ **Armenia** (mentioned 1 times)
76
+
77
+ ---
78
+
79
+ **MF362739.1**
80
+ ### 🧭 Location Frequency Summary
81
+ After counting all predicted and inferred locations:
82
+
83
+ - **Armenia**: 1 times
84
+
85
+ **Final Suggested Location:** 🗺️ **Armenia** (mentioned 1 times)
86
+
87
+ ---
88
+
89
+ **KU131308**
90
+ ### 🧭 Location Frequency Summary
91
+ After counting all predicted and inferred locations:
92
+
93
+ - **Brunei**: 1 times
94
+ - **GenBank**: 1 times
95
+ - **Borneo**: 1 times
96
+ - **East Asia**: 1 times
97
+
98
+ **Final Suggested Location:** 🗺️ **Brunei** (mentioned 1 times)
99
+
100
+ ---
101
+
102
+ **MW291678**
103
+ ### 🧭 Location Frequency Summary
104
+ After counting all predicted and inferred locations:
105
+
106
+ - **Argentina**: 1 times
107
+
108
+ **Final Suggested Location:** 🗺️ **Argentina** (mentioned 1 times)
109
+
110
+ ---
111
+
112
+ **MN006856**
113
+ ### 🧭 Location Frequency Summary
114
+ After counting all predicted and inferred locations:
115
+
116
+ - **Not found**: 1 times
117
+
118
+ **Final Suggested Location:** 🗺️ **Not found** (mentioned 1 times)
119
+
120
+ === Ancient/Modern Flag ===
121
+ **MF362736.1**
122
+ ### 🏺 Ancient/Modern Flag
123
+ **Ancient**
124
+
125
+ _Explanation:_ Flagged as ancient due to keywords: tomb, skeleton from NCBI
126
+ (/isolation_source="Tomb 6; skeleton 1" /specimen_voucher="Kapan;Tomb 6; skeleton 1" )
127
+
128
+ ---
129
+
130
+ **MF362738.1**
131
+ ### 🏺 Ancient/Modern Flag
132
+ **Ancient**
133
+
134
+ _Explanation:_ https://doi.org/10.1016/j.cub.2017.05.087:
135
+ Mixed context, leaning ancient due to: ancient, archaeological, bronze age, iron age, tomb, skeleton, carbon dating, adna, site
136
+
137
+
138
+
139
+ ---
140
+
141
+ **MF362739.1**
142
+ ### 🏺 Ancient/Modern Flag
143
+ **Ancient**
144
+
145
+ _Explanation:_ https://doi.org/10.1016/j.cub.2017.05.087:
146
+ Mixed context, leaning ancient due to: ancient, archaeological, bronze age, iron age, tomb, skeleton, carbon dating, adna, site
147
+
148
+
149
+
150
+ ---
151
+
152
+ **KU131308**
153
+ ### 🏺 Ancient/Modern Flag
154
+ **Modern**
155
+
156
+ _Explanation:_ https://doi.org/10.1007/s00439-015-1620-z:
157
+ Mixed context, leaning modern due to: we analysed, new sequences, published data, sink population, genome-wide data
158
+
159
+
160
+
161
+ ---
162
+
163
+ **MW291678**
164
+ ### 🏺 Ancient/Modern Flag
165
+ **Ancient**
166
+
167
+ _Explanation:_ Flagged as ancient due to keywords: archaeological from NCBI
168
+ (/isolation_source="archaeological human bone" )
169
+
170
+ ---
171
+
172
+ **MN006856**
173
+ ### 🏺 Ancient/Modern Flag
174
+ ****
175
+
176
+ _Explanation:_
requirements.txt CHANGED
@@ -1,24 +1,29 @@
1
- gradio
2
- transformers
3
- torch
4
- pandas
5
- scikit-learn
6
- spacy
7
- pymupdf
8
- requests
9
- biopython
10
- bs4
11
- pdfreader
12
- tabula-py
13
- spire.doc
14
- Spire.XLS
15
- thefuzz
16
- wordsegment
17
- spacy
18
- spacy-lookups-data
19
- gensim
20
- xlrd>=2.0.1
21
- openpyxl
22
- gspread
23
- oauth2client
24
- nltk
 
 
 
 
 
 
1
+ biopython==1.85
2
+ bs4==0.0.2
3
+ gensim==4.3.3
4
+ gradio==5.29.0
5
+ gspread==6.2.0
6
+ gspread-dataframe==4.0.0
7
+ huggingface-hub==0.30.2
8
+ nltk==3.9.1
9
+ oauth2client==4.1.3
10
+ openai==1.76.2
11
+ openpyxl==3.1.5
12
+ pandas==2.2.2
13
+ pdfreader==0.1.15
14
+ PyMuPDF==1.25.5
15
+ pytest==8.3.5
16
+ requests==2.32.3
17
+ scikit-learn==1.6.1
18
+ scipy==1.13.1
19
+ spacy==3.8.5
20
+ spacy-lookups-data==1.0.5
21
+ spire-doc==13.4.6
22
+ Spire.Xls==14.12.0
23
+ statsmodels==0.14.4
24
+ tabula-py==2.10.0
25
+ thefuzz==0.22.1
26
+ torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp310-cp310-linux_x86_64.whl
27
+ transformers==4.51.3
28
+ wordsegment==1.3.1
29
+ xlrd==2.0.1
setup.sh CHANGED
@@ -1,8 +1,8 @@
1
- #!/bin/bash
2
-
3
- # Install EDirect tools and set up PATH
4
- yes | sh -c "$(wget -q https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh -O -)"
5
- echo 'export PATH=$HOME/edirect:$PATH' >> ~/.bashrc
6
- export PATH=$HOME/edirect:$PATH
7
-
8
-
 
1
+ #!/bin/bash
2
+
3
+ # Install EDirect tools and set up PATH
4
+ yes | sh -c "$(wget -q https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh -O -)"
5
+ echo 'export PATH=$HOME/edirect:$PATH' >> ~/.bashrc
6
+ export PATH=$HOME/edirect:$PATH
7
+
8
+
standardize_location.py ADDED
@@ -0,0 +1,74 @@
1
+ import requests
2
+ import re
3
+
4
+ # Normalize input
5
+ def normalize_key(text):
6
+ return re.sub(r"[^a-z0-9]", "", text.strip().lower())
7
+
8
+ # Search for city/place (normal flow)
9
+ def get_country_from_geonames(city_name, username="vyphung"):
10
+ url = "http://api.geonames.org/searchJSON"
11
+ params = {
12
+ "q": city_name,
13
+ "maxRows": 1,
14
+ "username": username
15
+ }
16
+ try:
17
+ r = requests.get(url, params=params, timeout=5)
18
+ data = r.json()
19
+ if data.get("geonames"):
20
+ return data["geonames"][0]["countryName"]
21
+ except Exception as e:
22
+ print("GeoNames searchJSON error:", e)
23
+ return None
24
+
25
+ # Search for country info using alpha-2/3 codes or name
26
+ def get_country_from_countryinfo(input_code, username="vyphung"):
27
+ url = "http://api.geonames.org/countryInfoJSON"
28
+ params = {
29
+ "username": username
30
+ }
31
+ try:
32
+ r = requests.get(url, params=params, timeout=5)
33
+ data = r.json()
34
+ if data.get("geonames"):
35
+ input_code = input_code.strip().upper()
36
+ for country in data["geonames"]:
37
+ # Match against country name, country code (alpha-2), iso alpha-3
38
+ if input_code in [
39
+ country.get("countryName", "").upper(),
40
+ country.get("countryCode", "").upper(),
41
+ country.get("isoAlpha3", "").upper()
42
+ ]:
43
+ return country["countryName"]
44
+ except Exception as e:
45
+ print("GeoNames countryInfoJSON error:", e)
46
+ return None
47
+
48
+ # Combined smart lookup
49
+ def smart_country_lookup(user_input, username="vyphung"):
50
+ raw_input = user_input.strip()
51
+ normalized = re.sub(r"[^a-zA-Z0-9]", "", user_input).upper() # normalize for code matching (remove spaces and punctuation)
52
+
53
+ # Special case: if user writes "UK: London" → split and take main country part
54
+ if ":" in raw_input:
55
+ raw_input = raw_input.split(":")[0].strip() # only take "UK"
56
+ # First try as country code (if 2-3 letters or common abbreviation)
57
+ if len(normalized) <= 3:
58
+ if normalized.upper() in ["UK","U.K","U.K."]:
59
+ country = get_country_from_geonames(normalized.upper(), username=username)
60
+ if country:
61
+ return country
62
+ else:
63
+ country = get_country_from_countryinfo(raw_input, username=username)
64
+ if country:
65
+ return country
66
+ country = get_country_from_countryinfo(raw_input, username=username) # try full names
67
+ if country:
68
+ return country
69
+ # Otherwise, treat as city/place
70
+ country = get_country_from_geonames(raw_input, username=username)
71
+ if country:
72
+ return country
73
+
74
+ return "Not found"
upgradeClassify.py ADDED
@@ -0,0 +1,276 @@
1
+ import re
2
+ import spacy
3
+ from nltk.tokenize import sent_tokenize, word_tokenize
4
+ import nltk
5
+ nltk.download('punkt_tab')
6
+ #import coreferee
7
+ import copy
8
+ from sentence_transformers import SentenceTransformer, util
9
+ from sklearn.cluster import DBSCAN
10
+ from sklearn.metrics.pairwise import cosine_distances
11
+ from collections import defaultdict
12
+ import numpy as np
13
+ #from mtdna_classifier import infer_fromQAModel
14
+ # 1. SENTENCE-BERT MODEL
15
+ # Step 1: Preprocess the text
16
+ def normalize_text(text):
17
+ # Normalize various separators to "-"
18
+ text = re.sub(r'\s*(–+|—+|--+>|–>|->|-->|to|→|➝|➔|➡)\s*', '-', text, flags=re.IGNORECASE)
19
+ # Fix GEN10GEN30 → GEN10-GEN30
20
+ text = re.sub(r'\b([a-zA-Z]+)(\d+)(\1)(\d+)\b', r'\1\2-\1\4', text)
21
+ # Fix GEN10-30 → GEN10-GEN30
22
+ text = re.sub(r'\b([a-zA-Z]+)(\d+)-(\d+)\b', r'\1\2-\1\3', text)
23
+ return text
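+
+ # Illustrative behaviour of normalize_text (a minimal sketch based on the regexes above):
+ # normalize_text("GEN10 to GEN30") # -> "GEN10-GEN30" (separator words/arrows collapse to "-")
+ # normalize_text("GEN10GEN30") # -> "GEN10-GEN30" (fused ranges are split)
+ # normalize_text("BRU10-30") # -> "BRU10-BRU30" (missing prefix on the end is restored)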
24
+
25
+ def preprocess_text(text):
26
+ normalized = normalize_text(text)
27
+ sentences = sent_tokenize(normalized)
28
+ return [re.sub(r"[^a-zA-Z0-9\s\-]", "", s).strip() for s in sentences]
29
+
30
+ # Before step 2, cache the spaCy model so it is not loaded multiple times:
31
+ # Global model cache
32
+ _spacy_models = {}
33
+
34
+ def get_spacy_model(model_name, add_coreferee=False):
35
+ global _spacy_models
36
+ if model_name not in _spacy_models:
37
+ nlp = spacy.load(model_name)
38
+ if add_coreferee and "coreferee" not in nlp.pipe_names:
39
+ nlp.add_pipe("coreferee")
40
+ _spacy_models[model_name] = nlp
41
+ return _spacy_models[model_name]
42
+
43
+ # Step 2: NER to Extract Locations and Sample Names
44
+ def extract_entities(text, sample_id=None):
45
+ nlp = get_spacy_model("en_core_web_sm")
46
+ doc = nlp(text)
47
+
48
+ # Filter entities by GPE, but exclude things that match sample ID format
49
+ gpe_candidates = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
50
+
51
+ # Remove entries that match SAMPLE ID patterns like XXX123 or similar
52
+ gpe_filtered = [gpe for gpe in gpe_candidates if not re.fullmatch(r'[A-Z]{2,5}\d{2,4}', gpe.strip())]
53
+
54
+ # Optional: further filter known invalid patterns (e.g., things shorter than 3 chars, numeric only)
55
+ gpe_filtered = [gpe for gpe in gpe_filtered if len(gpe) > 2 and not gpe.strip().isdigit()]
56
+
57
+ if sample_id is None:
58
+ return list(set(gpe_filtered)), []
59
+ else:
60
+ sample_prefix = re.match(r'[A-Z]+', sample_id).group()
61
+ samples = re.findall(rf'{sample_prefix}\d+', text)
62
+ return list(set(gpe_filtered)), list(set(samples))
63
+
64
+ # Step 3: Build a Soft Matching Layer
65
+ # Handle patterns like "BRU1–BRU20" and identify BRU18 as part of it.
66
+ def is_sample_in_range(sample_id, sentence):
67
+ # Match prefix up to digits
68
+ sample_prefix_match = re.match(r'^([A-Z0-9]+?)(?=\d+$)', sample_id)
69
+ sample_number_match = re.search(r'(\d+)$', sample_id)
70
+
71
+ if not sample_prefix_match or not sample_number_match:
72
+ return False
73
+
74
+ sample_prefix = sample_prefix_match.group(1)
75
+ sample_number = int(sample_number_match.group(1))
76
+ sentence = normalize_text(sentence)
77
+ # Case 1: Full prefix on both sides
78
+ pattern1 = rf'{sample_prefix}(\d+)\s*-\s*{sample_prefix}(\d+)'
79
+ for match in re.findall(pattern1, sentence):
80
+ start, end = int(match[0]), int(match[1])
81
+ if start <= sample_number <= end:
82
+ return True
83
+
84
+ # Case 2: Prefix only on first number
85
+ pattern2 = rf'{sample_prefix}(\d+)\s*-\s*(\d+)'
86
+ for match in re.findall(pattern2, sentence):
87
+ start, end = int(match[0]), int(match[1])
88
+ if start <= sample_number <= end:
89
+ return True
90
+
91
+ return False
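+
+ # Illustrative checks (a minimal sketch, assuming the range patterns handled above):
+ # is_sample_in_range("BRU18", "Samples BRU1-BRU20 were sequenced") # -> True
+ # is_sample_in_range("BRU25", "Samples BRU1-BRU20 were sequenced") # -> False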
92
+
93
+ # Step 4: Use coreferee to merge sentences that share a coreference (currently disabled because of package conflicts)
94
+ # ========== HEURISTIC GROUP → LOCATION MAPPERS ==========
95
+ # === Generalized version replacing the earlier extract_sample_to_group_general ===
96
+ # === Generalized version replacing the earlier extract_group_to_location_general ===
97
+ def extract_population_locations(text):
98
+ text = normalize_text(text)
99
+ pattern = r'([A-Za-z ,\-]+)\n([A-Z]+\d*)\n([A-Za-z ,\-]+)\n([A-Za-z ,\-]+)'
100
+ pop_to_location = {}
101
+
102
+ for match in re.finditer(pattern, text, flags=re.IGNORECASE):
103
+ _, pop_code, region, country = match.groups()
104
+ pop_to_location[pop_code.upper()] = f"{region.strip()}\n{country.strip()}"
105
+
106
+ return pop_to_location
107
+
108
+ def extract_sample_ranges(text):
109
+ text = normalize_text(text)
110
+ # Updated pattern to handle punctuation and line breaks
111
+ pattern = r'\b([A-Z0-9]+\d+)[–\-]([A-Z0-9]+\d+)[,:\.\s]*([A-Z0-9]+\d+)\b'
112
+ sample_to_pop = {}
113
+ for match in re.finditer(pattern, text, flags=re.IGNORECASE):
114
+ start_id, end_id, pop_code = match.groups()
115
+ start_prefix = re.match(r'^([A-Z0-9]+?)(?=\d+$)', start_id, re.IGNORECASE).group(1).upper()
116
+ end_prefix = re.match(r'^([A-Z0-9]+?)(?=\d+$)', end_id, re.IGNORECASE).group(1).upper()
117
+ if start_prefix != end_prefix:
118
+ continue
119
+ start_num = int(re.search(r'(\d+)$', start_id).group())
120
+ end_num = int(re.search(r'(\d+)$', end_id).group())
121
+ for i in range(start_num, end_num + 1):
122
+ sample_id = f"{start_prefix}{i:03d}"
123
+ sample_to_pop[sample_id] = pop_code.upper()
124
+
125
+ return sample_to_pop
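+
+ # Illustrative behaviour (a minimal sketch; note the population code must also end in digits to match):
+ # extract_sample_ranges("HG001-HG003: POP1")
+ # # -> {"HG001": "POP1", "HG002": "POP1", "HG003": "POP1"}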
126
+
127
+ def filter_context_for_sample(sample_id, full_text, window_size=2):
128
+
129
+ # Normalize and tokenize
130
+ full_text = normalize_text(full_text)
131
+ sentences = sent_tokenize(full_text)
132
+
133
+ # Step 1: Find indices with direct mention or range match
134
+ match_indices = [
135
+ i for i, s in enumerate(sentences)
136
+ if sample_id in s or is_sample_in_range(sample_id, s)
137
+ ]
138
+
139
+ # Step 2: Get sample → group mapping from full text
140
+ sample_to_group = extract_sample_ranges(full_text)
141
+ group_id = sample_to_group.get(sample_id)
142
+
143
+ # Step 3: Find group-related sentences
144
+ group_indices = []
145
+ if group_id:
146
+ for i, s in enumerate(sentences):
147
+ if group_id in s:
148
+ group_indices.append(i)
149
+
150
+ # Step 4: Collect sentences within window
151
+ selected_indices = set()
152
+ if len(match_indices + group_indices) > 0:
153
+ for i in match_indices + group_indices:
154
+ start = max(0, i - window_size)
155
+ end = min(len(sentences), i + window_size + 1)
156
+ selected_indices.update(range(start, end))
157
+
158
+ filtered_sentences = [sentences[i] for i in sorted(selected_indices)]
159
+ return " ".join(filtered_sentences)
160
+ return full_text
161
+ # Placeholder for coreference-based sentence merging; for now it only preprocesses the text
162
+ def mergeCorefSen(text):
163
+ sen = preprocess_text(text)
164
+ return sen
165
+
166
+ # Before step 5 and beyond, cache the SBERT model so it is not reloaded on every call
167
+ # Global SBERT model cache
168
+ _sbert_models = {}
169
+
170
+ def get_sbert_model(model_name="all-MiniLM-L6-v2"):
171
+ global _sbert_models
172
+ if model_name not in _sbert_models:
173
+ _sbert_models[model_name] = SentenceTransformer(model_name)
174
+ return _sbert_models[model_name]
175
+
176
+ # Step 5: Sentence-BERT retriever → Find top paragraphs related to keyword.
177
+ '''Use sentence transformers to embed the sentence that mentions the sample and
178
+ compare it to sentences that mention locations.'''
179
+
180
+ def find_top_para(sample_id, text,top_k=5):
181
+ sentences = mergeCorefSen(text)
182
+ model = get_sbert_model("all-mpnet-base-v2")
183
+ embeddings = model.encode(sentences, convert_to_tensor=True)
184
+
185
+ # Find the sentence that best matches the sample_id
186
+ sample_matches = [s for s in sentences if sample_id in s or is_sample_in_range(sample_id, s)]
187
+ if not sample_matches:
188
+ return [],"No context found for sample"
189
+
190
+ sample_embedding = model.encode(sample_matches[0], convert_to_tensor=True)
191
+ cos_scores = util.pytorch_cos_sim(sample_embedding, embeddings)[0]
192
+
193
+ # Get top-k most similar sentence indices
194
+ top_indices = cos_scores.argsort(descending=True)[:top_k]
195
+ return top_indices, sentences
196
+
197
+ # Step 6: DBSCAN to cluster the group of similar paragraphs.
198
+ def clusterPara(tokens):
199
+ # Load Sentence-BERT model
200
+ sbert_model = get_sbert_model("all-mpnet-base-v2")
201
+ sentence_embeddings = sbert_model.encode(tokens)
202
+
203
+ # Compute cosine distance matrix
204
+ distance_matrix = cosine_distances(sentence_embeddings)
205
+
206
+ # DBSCAN clustering
207
+ clustering_model = DBSCAN(eps=0.3, min_samples=1, metric="precomputed")
208
+ cluster_labels = clustering_model.fit_predict(distance_matrix)
209
+
210
+ # Group sentences by cluster
211
+ clusters = defaultdict(list)
212
+ cluster_embeddings = defaultdict(list)
213
+ sentence_to_cluster = {}
214
+ for i, label in enumerate(cluster_labels):
215
+ clusters[label].append(tokens[i])
216
+ cluster_embeddings[label].append(sentence_embeddings[i])
217
+ sentence_to_cluster[tokens[i]] = label
218
+ # Compute cluster centroids
219
+ centroids = {
220
+ label: np.mean(embs, axis=0)
221
+ for label, embs in cluster_embeddings.items()
222
+ }
223
+ return clusters, sentence_to_cluster, centroids
224
+
225
+ def rankSenFromCluster(clusters, sentence_to_cluster, centroids, target_sentence):
226
+ target_cluster = sentence_to_cluster[target_sentence]
227
+ target_centroid = centroids[target_cluster]
228
+ sen_rank = []
229
+ sen_order = list(sentence_to_cluster.keys())
230
+ # Compute distances to other cluster centroids
231
+ dists = []
232
+ for label, centroid in centroids.items():
233
+ dist = cosine_distances([target_centroid], [centroid])[0][0]
234
+ dists.append((label, dist))
235
+ dists.sort(key=lambda x: x[1]) # sort by proximity
236
+ for d in dists:
237
+ cluster = clusters[d[0]]
238
+ for sen in cluster:
239
+ if sen != target_sentence:
240
+ sen_rank.append(sen_order.index(sen))
241
+ return sen_rank
242
+ # Step 7: Final Inference Wrapper
243
+ def infer_location_for_sample(sample_id, context_text):
244
+ # Go through each of the top sentences in order
245
+ top_indices, sentences = find_top_para(sample_id, context_text, top_k=5)
246
+ if len(top_indices) == 0 or sentences == "No context found for sample":
247
+ return "No clear location found in top matches"
248
+ clusters, sentence_to_cluster, centroids = clusterPara(sentences)
249
+ topRankSen_DBSCAN = []
250
+ mostTopSen = ""
251
+ locations = ""
252
+ i = 0
253
+ while len(locations) == 0 and i < len(top_indices):
254
+ # Firstly, start with the top-ranked Sentence-BERT result
255
+ idx = top_indices[i]
256
+ best_sentence = sentences[idx]
257
+ if i == 0:
258
+ mostTopSen = best_sentence
259
+ locations, _ = extract_entities(best_sentence, sample_id)
260
+ if locations:
261
+ return locations
262
+ # If no location, then look for sample overlap in the same DBSCAN cluster
263
+ # Compute distances to other cluster centroids
264
+ if len(topRankSen_DBSCAN)==0 and mostTopSen:
265
+ topRankSen_DBSCAN = rankSenFromCluster(clusters, sentence_to_cluster, centroids, mostTopSen)
266
+ if i >= len(topRankSen_DBSCAN): break
267
+ idx_DBSCAN = topRankSen_DBSCAN[i]
268
+ best_sentence_DBSCAN = sentences[idx_DBSCAN]
269
+ locations, _ = extract_entities(best_sentence_DBSCAN, sample_id)
270
+ if locations:
271
+ return locations
272
+ # If not, backtrack to the next-best Sentence-BERT sentence (e.g., the 2nd-ranked one) and repeat steps 1 and 2 until candidates run out
273
+ i += 1
274
+ # Last resort: LLM (e.g. chatGPT, deepseek, etc.)
275
+ #if len(locations) == 0:
276
+ return "No clear location found in top matches"