VyLala committed
Commit 8835144 · verified · 1 Parent(s): f3b0e2e

Upload 52 files

update 28_7_2025

.gitattributes CHANGED
@@ -1,37 +1,37 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- credentials.json filter=crypt diff=crypt
37
-
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ credentials.json filter=crypt diff=crypt
37
+
NER/PDF/__pycache__/pdf.cpython-311.pyc CHANGED
Binary files a/NER/PDF/__pycache__/pdf.cpython-311.pyc and b/NER/PDF/__pycache__/pdf.cpython-311.pyc differ
 
NER/PDF/pdf.py CHANGED
@@ -6,86 +6,128 @@ from bs4 import BeautifulSoup
6
  import requests
7
  from NER import cleanText
8
  #!pip install tabula-py
9
-
10
  import tabula
11
- class PDF(): # using PyPDF2
12
  def __init__(self, pdf, saveFolder, doi=None):
13
  self.pdf = pdf
14
  self.doi = doi
15
  self.saveFolder = saveFolder
 
16
  def openPDFFile(self):
17
  if "https" in self.pdf:
18
  name = self.pdf.split("/")[-1]
19
  name = self.downloadPDF(self.saveFolder)
20
  if name != "no pdfLink to download":
21
- fileToOpen = self.saveFolder + "/" + name
22
- else: fileToOpen = self.pdf
23
- else: fileToOpen = self.pdf
 
 
24
  return open(fileToOpen, "rb")
 
25
  def downloadPDF(self, saveFolder):
26
  pdfLink = ''
27
- if ".pdf" not in self.pdf and "https" not in self.pdf: # the download link is a general URL not pdf link
28
  r = requests.get(self.pdf)
29
  soup = BeautifulSoup(r.content, 'html.parser')
30
  links = soup.find_all("a")
31
  for link in links:
32
- if ".pdf" in link.get("href"):
33
  if self.doi in link.get("href"):
34
  pdfLink = link.get("href")
35
  break
36
  else:
37
  pdfLink = self.pdf
 
38
  if pdfLink != '':
39
  response = requests.get(pdfLink)
40
  name = pdfLink.split("/")[-1]
41
- pdf = open(saveFolder+"/"+name, 'wb')
42
- pdf.write(response.content)
43
- pdf.close()
 
 
44
  print("pdf downloaded")
45
  return name
46
  else:
47
  return "no pdfLink to download"
 
48
  def extractText(self):
 
49
  jsonPage = {}
50
- pdf = self.openPDFFile()
51
- doc = PDFDocument(pdf)
52
- viewer = SimplePDFViewer(pdf)
53
- all_pages = [p for p in doc.pages()]
54
- cl = cleanText.cleanGenText()
55
- for page in range(1,len(all_pages)):
56
- viewer.navigate(page)
57
- viewer.render()
58
- if str(page) not in jsonPage:
59
- jsonPage[str(page)] = {}
60
- # text
61
- text = "".join(viewer.canvas.strings)
62
- clean, filteredWord = cl.textPreprocessing(text) #cleanText.cleanGenText(text).cleanText()
63
- # save the text of filtered words which remove "a", the, "an", "is", etc.
64
- jsonPage[str(page)]["normalText"] = [text]
65
- jsonPage[str(page)]["cleanText"] = [' '.join(filteredWord)]
66
- #image
67
- image = viewer.canvas.images
68
- jsonPage[str(page)]["image"] = [image]
69
- #form
70
- form = viewer.canvas.forms
71
- jsonPage[str(page)]["form"] = [form]
72
- # content based on PDF adobe
73
- content = viewer.canvas.text_content
74
- jsonPage[str(page)]["content"] = [content]
75
- # inline_image:'''
76
- '''Inline images are aligned with the text,
77
- and are usually content images like photos, charts, or graphs.'''
78
- inline_image = viewer.canvas.inline_images
79
- jsonPage[str(page)]["inline_image"] = [inline_image]
80
- pdf.close()
81
- '''Output Format:
82
- jsonPage[str(page)]["normalText"]
83
- jsonPage[str(page)]["cleanText"]
84
- jsonPage[str(page)]["image"]
85
- jsonPage[str(page)]["form"]
86
- jsonPage[str(page)]["content"]'''
87
- return jsonPage
88
- def extractTable(self,pages,saveFile=None,outputFormat=None):
89
  '''pages (str, int, iterable of int, optional) –
90
  An optional values specifying pages to extract from. It allows str,`int`, iterable of :int. Default: 1
91
  Examples: '1-2,3', 'all', [1,2]'''
@@ -106,37 +148,40 @@ class PDF(): # using PyPDF2
106
  df = []
107
  print("No tables found in PDF file")
108
  return df
109
- def mergeTextinJson(self,jsonPDF):
110
- # pdf
111
- #cl = cleanGenText()
112
  cl = cleanText.cleanGenText()
113
  pdfText = ""
114
- for page in jsonPDF:
115
- # page is "\n\n"
116
- if len(jsonPDF[page]["normalText"]) > 0:
117
- for i in range(len(jsonPDF[page]["normalText"])):
118
- text = jsonPDF[page]["normalText"][i]
119
- if len(text)>0:
120
- text = cl.removeTabWhiteSpaceNewLine(text)
121
- text = cl.removeExtraSpaceBetweenWords(text)
122
- jsonPDF[page]["normalText"][i] = text
123
- # same page is just a dot.
124
- if i-1 > 0:
125
- if jsonPDF[page]["normalText"][i-1][-1] != ".":
126
- pdfText += ". "
127
- pdfText += jsonPDF[page]["normalText"][i]
128
- if len(jsonPDF[page]["normalText"][i])>0:
129
- if jsonPDF[page]["normalText"][i][-1]!=".":
130
- pdfText += "."
131
- pdfText += "\n\n"
132
  return pdfText
 
133
  def getReference(self):
134
  pass
 
135
  def getSupMaterial(self):
136
  pass
 
137
  def removeHeaders(self):
138
  pass
 
139
  def removeFooters(self):
140
  pass
 
141
  def removeReference(self):
142
  pass
 
6
  import requests
7
  from NER import cleanText
8
  #!pip install tabula-py
 
9
  import tabula
10
+ import fitz # PyMuPDF
11
+ import os
12
+
13
+ class PDF():
14
  def __init__(self, pdf, saveFolder, doi=None):
15
  self.pdf = pdf
16
  self.doi = doi
17
  self.saveFolder = saveFolder
18
+
19
  def openPDFFile(self):
20
  if "https" in self.pdf:
21
  name = self.pdf.split("/")[-1]
22
  name = self.downloadPDF(self.saveFolder)
23
  if name != "no pdfLink to download":
24
+ fileToOpen = os.path.join(self.saveFolder, name)
25
+ else:
26
+ fileToOpen = self.pdf
27
+ else:
28
+ fileToOpen = self.pdf
29
  return open(fileToOpen, "rb")
30
+
31
  def downloadPDF(self, saveFolder):
32
  pdfLink = ''
33
+ if ".pdf" not in self.pdf and "https" not in self.pdf:
34
  r = requests.get(self.pdf)
35
  soup = BeautifulSoup(r.content, 'html.parser')
36
  links = soup.find_all("a")
37
  for link in links:
38
+ if ".pdf" in link.get("href", ""):
39
  if self.doi in link.get("href"):
40
  pdfLink = link.get("href")
41
  break
42
  else:
43
  pdfLink = self.pdf
44
+
45
  if pdfLink != '':
46
  response = requests.get(pdfLink)
47
  name = pdfLink.split("/")[-1]
48
+ print("inside download PDF and name and link are: ", pdfLink, name)
49
+ print("saveFolder is: ", saveFolder)
50
+ with open(os.path.join(saveFolder, name), 'wb') as pdf:
51
+ print("len of response content: ", len(response.content))
52
+ pdf.write(response.content)
53
  print("pdf downloaded")
54
  return name
55
  else:
56
  return "no pdfLink to download"
57
+
58
  def extractText(self):
59
+ fileToOpen = self.openPDFFile().name
60
+ try:
61
+ doc = fitz.open(fileToOpen)
62
+ text = ""
63
+ for page in doc:
64
+ text += page.get_text("text") + "\n\n"
65
+ doc.close()
66
+
67
+ if len(text.strip()) < 100:
68
+ print("Fallback to PDFReader due to weak text extraction.")
69
+ text = self.extractTextWithPDFReader()
70
+ return text
71
+ except Exception as e:
72
+ print("Failed with PyMuPDF, fallback to PDFReader:", e)
73
+ return self.extractTextWithPDFReader()
74
+ def extract_text_excluding_tables(self):
75
+ fileToOpen = self.openPDFFile().name
76
+ text = ""
77
+ try:
78
+ doc = fitz.open(fileToOpen)
79
+ for page in doc:
80
+ blocks = page.get_text("dict")["blocks"]
81
+
82
+ for block in blocks:
83
+ if block["type"] == 0: # text block
84
+ lines = block.get("lines", [])
85
+
86
+ if not lines:
87
+ continue
88
+ avg_words_per_line = sum(len(l["spans"]) for l in lines) / len(lines)
89
+ if avg_words_per_line > 1: # Heuristic: paragraph-like blocks
90
+ for line in lines:
91
+ text += " ".join(span["text"] for span in line["spans"]) + "\n"
92
+ doc.close()
93
+ if len(text.strip()) < 100:
94
+ print("Fallback to PDFReader due to weak text extraction.")
95
+ text = self.extractTextWithPDFReader()
96
+ return text
97
+ except Exception as e:
98
+ print("Failed with PyMuPDF, fallback to PDFReader:", e)
99
+ return self.extractTextWithPDFReader()
100
+
101
+ def extractTextWithPDFReader(self):
102
  jsonPage = {}
103
+ try:
104
+ pdf = self.openPDFFile()
105
+ print("open pdf file")
106
+ print(pdf)
107
+ doc = PDFDocument(pdf)
108
+ viewer = SimplePDFViewer(pdf)
109
+ all_pages = [p for p in doc.pages()]
110
+ cl = cleanText.cleanGenText()
111
+ pdfText = ""
112
+ for page in range(1, len(all_pages)):
113
+ viewer.navigate(page)
114
+ viewer.render()
115
+ if str(page) not in jsonPage:
116
+ jsonPage[str(page)] = {}
117
+ text = "".join(viewer.canvas.strings)
118
+ clean, filteredWord = cl.textPreprocessing(text)
119
+ jsonPage[str(page)]["normalText"] = [text]
120
+ jsonPage[str(page)]["cleanText"] = [' '.join(filteredWord)]
121
+ jsonPage[str(page)]["image"] = [viewer.canvas.images]
122
+ jsonPage[str(page)]["form"] = [viewer.canvas.forms]
123
+ jsonPage[str(page)]["content"] = [viewer.canvas.text_content]
124
+ jsonPage[str(page)]["inline_image"] = [viewer.canvas.inline_images]
125
+ pdf.close()
126
+ except Exception: # avoid a bare except, which would also swallow KeyboardInterrupt/SystemExit
127
+ jsonPage = {}
128
+ return self.mergeTextinJson(jsonPage)
129
+
130
+ def extractTable(self,pages="all",saveFile=None,outputFormat=None):
131
  '''pages (str, int, iterable of int, optional) –
132
  An optional values specifying pages to extract from. It allows str,`int`, iterable of :int. Default: 1
133
  Examples: '1-2,3', 'all', [1,2]'''
 
148
  df = []
149
  print("No tables found in PDF file")
150
  return df
151
+
152
+ def mergeTextinJson(self, jsonPDF):
 
153
  cl = cleanText.cleanGenText()
154
  pdfText = ""
155
+ if jsonPDF:
156
+ for page in jsonPDF:
157
+ if len(jsonPDF[page]["normalText"]) > 0:
158
+ for i in range(len(jsonPDF[page]["normalText"])):
159
+ text = jsonPDF[page]["normalText"][i]
160
+ if len(text) > 0:
161
+ text = cl.removeTabWhiteSpaceNewLine(text)
162
+ text = cl.removeExtraSpaceBetweenWords(text)
163
+ jsonPDF[page]["normalText"][i] = text
164
+ if i - 1 > 0:
165
+ if jsonPDF[page]["normalText"][i - 1][-1] != ".":
166
+ pdfText += ". "
167
+ pdfText += jsonPDF[page]["normalText"][i]
168
+ if len(jsonPDF[page]["normalText"][i]) > 0:
169
+ if jsonPDF[page]["normalText"][i][-1] != ".":
170
+ pdfText += "."
171
+ pdfText += "\n\n"
 
172
  return pdfText
173
+
174
  def getReference(self):
175
  pass
176
+
177
  def getSupMaterial(self):
178
  pass
179
+
180
  def removeHeaders(self):
181
  pass
182
+
183
  def removeFooters(self):
184
  pass
185
+
186
  def removeReference(self):
187
  pass
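
For context, a minimal usage sketch of the updated PDF class (assumptions: the NER package is importable, a local ./save folder exists, and the URL and DOI below are placeholders). extractText() now tries PyMuPDF (fitz) first and falls back to the pdfreader-based extractTextWithPDFReader() when the extracted text looks too short.

from NER.PDF.pdf import PDF

pdf = PDF("https://example.org/article.pdf", saveFolder="./save", doi="10.1000/example")  # hypothetical inputs
text = pdf.extractText()                         # PyMuPDF first, pdfreader fallback on weak output
body_only = pdf.extract_text_excluding_tables()  # keeps paragraph-like blocks, skips table-like ones
tables = pdf.extractTable(pages="all")           # tabula-based extraction; pages now defaults to "all"
print(len(text), len(body_only), len(tables))
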
NER/WordDoc/__pycache__/wordDoc.cpython-311.pyc CHANGED
Binary files a/NER/WordDoc/__pycache__/wordDoc.cpython-311.pyc and b/NER/WordDoc/__pycache__/wordDoc.cpython-311.pyc differ
 
NER/WordDoc/wordDoc.py CHANGED
@@ -81,6 +81,35 @@ class wordDoc(): # using python-docx
81
  tableData += '\n'
82
  json["Section" + str(s)]["Table"+str(i)] = tableData
83
  return json
84
  def extractTableAsExcel(self):
85
  getDoc = ''
86
  try:
 
81
  tableData += '\n'
82
  json["Section" + str(s)]["Table"+str(i)] = tableData
83
  return json
84
+ def extractTableAsList(self):
85
+ tables = []
86
+ try:
87
+ doc = Document()
88
+ doc.LoadFromFile(self.wordDoc)
89
+ except:
90
+ response = requests.get(self.wordDoc)
91
+ name = self.wordDoc.split("/")[-1]
92
+ with open(os.path.join(self.saveFolder, name), "wb") as f:
93
+ f.write(response.content)
94
+ doc = Document()
95
+ doc.LoadFromFile(os.path.join(self.saveFolder, name))
96
+
97
+ for s in range(doc.Sections.Count):
98
+ section = doc.Sections.get_Item(s)
99
+ for i in range(section.Tables.Count):
100
+ table = section.Tables.get_Item(i)
101
+ table_data = []
102
+ for row in range(table.Rows.Count):
103
+ row_data = []
104
+ for cell in range(table.Rows.get_Item(row).Cells.Count):
105
+ cell_obj = table.Rows.get_Item(row).Cells.get_Item(cell)
106
+ cell_text = ""
107
+ for p in range(cell_obj.Paragraphs.Count):
108
+ cell_text += cell_obj.Paragraphs.get_Item(p).Text.strip() + " "
109
+ row_data.append(cell_text.strip())
110
+ table_data.append(row_data)
111
+ tables.append(table_data)
112
+ return tables
113
  def extractTableAsExcel(self):
114
  getDoc = ''
115
  try:
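
A minimal sketch of the new extractTableAsList() (assumptions: the wordDoc constructor takes the document path and a save folder, Spire.Doc and pandas are installed, and the .docx path is a placeholder). Each returned table is a list of rows, each row a list of cell strings, which maps directly onto a DataFrame.

import pandas as pd
from NER.WordDoc.wordDoc import wordDoc

doc = wordDoc("supplement.docx", "./save")  # hypothetical constructor arguments
tables = doc.extractTableAsList()           # one list of rows per table
for t in tables:
    # treat the first row as a header when there is more than one row
    df = pd.DataFrame(t[1:], columns=t[0]) if len(t) > 1 else pd.DataFrame(t)
    print(df.shape)
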
NER/html/__pycache__/extractHTML.cpython-311.pyc CHANGED
Binary files a/NER/html/__pycache__/extractHTML.cpython-311.pyc and b/NER/html/__pycache__/extractHTML.cpython-311.pyc differ
 
NER/html/extractHTML.py CHANGED
@@ -1,166 +1,222 @@
1
- #!pip install bs4
2
- # reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
3
- from bs4 import BeautifulSoup
4
- import requests
5
- from DefaultPackages import openFile, saveFile
6
- from NER import cleanText
7
- import pandas as pd
8
- import os
9
- class HTML():
10
- def __init__(self, htmlFile, htmlLink):
11
- self.htmlLink = htmlLink
12
- self.htmlFile = htmlFile
13
- def openHTMLFile(self):
14
- try:
15
- if self.htmlLink != "None":
16
- r = requests.get(self.htmlLink)
17
- soup = BeautifulSoup(r.content, 'html.parser')
18
- else:
19
- with open(self.htmlFile) as fp:
20
- soup = BeautifulSoup(fp, 'html.parser')
21
- except:
22
- print("cannot open html file")
23
- soup = None
24
- return soup
25
- def getText(self):
26
- soup = self.openHTMLFile()
27
- text = ""
28
- if soup:
29
- s = soup.find_all("html")
30
- for t in range(len(s)):
31
- text = s[t].get_text()
32
- cl = cleanText.cleanGenText()
33
- text = cl.removeExtraSpaceBetweenWords(text)
34
- return text
35
- def getListSection(self, scienceDirect=None):
36
- json = {}
37
- text = ""
38
- textJson, textHTML = "",""
39
- if scienceDirect == None:
40
- soup = self.openHTMLFile()
41
- # get list of section
42
- json = {}
43
- if soup:
44
- for h2Pos in range(len(soup.find_all('h2'))):
45
- if soup.find_all('h2')[h2Pos].text not in json:
46
- json[soup.find_all('h2')[h2Pos].text] = []
47
- if h2Pos + 1 < len(soup.find_all('h2')):
48
- content = soup.find_all('h2')[h2Pos].find_next("p")
49
- nexth2Content = soup.find_all('h2')[h2Pos+1].find_next("p")
50
- while content.text != nexth2Content.text:
51
- json[soup.find_all('h2')[h2Pos].text].append(content.text)
52
- content = content.find_next("p")
53
- else:
54
- content = soup.find_all('h2')[h2Pos].find_all_next("p",string=True)
55
- json[soup.find_all('h2')[h2Pos].text] = list(i.text for i in content)
56
- # format
57
- '''json = {'Abstract':[], 'Introduction':[], 'Methods'[],
58
- 'Results':[], 'Discussion':[], 'References':[],
59
- 'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
60
- 'Additional information':[], 'Electronic supplementary material':[],
61
- 'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
62
- if scienceDirect!= None or len(json)==0:
63
- # Replace with your actual Elsevier API key
64
- api_key = os.environ["SCIENCE_DIRECT_API"]
65
- # ScienceDirect article DOI or PI (Example DOI)
66
- doi = self.htmlLink.split("https://doi.org/")[-1] #"10.1016/j.ajhg.2011.01.009"
67
- # Base URL for the Elsevier API
68
- base_url = "https://api.elsevier.com/content/article/doi/"
69
- # Set headers with API key
70
- headers = {
71
- "Accept": "application/json",
72
- "X-ELS-APIKey": api_key
73
- }
74
- # Make the API request
75
- response = requests.get(base_url + doi, headers=headers)
76
- # Check if the request was successful
77
- if response.status_code == 200:
78
- data = response.json()
79
- supp_data = data["full-text-retrieval-response"]#["coredata"]["link"]
80
- if "originalText" in list(supp_data.keys()):
81
- if type(supp_data["originalText"])==str:
82
- json["originalText"] = [supp_data["originalText"]]
83
- if type(supp_data["originalText"])==dict:
84
- json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
85
- else:
86
- if type(supp_data)==dict:
87
- for key in supp_data:
88
- json[key] = [supp_data[key]]
89
- if json:
90
- textJson = self.mergeTextInJson(json)
91
- textHTML = self.getText()
92
- if len(textHTML) > len(textJson):
93
- text = textHTML
94
- else: text = textJson
95
- return text #json
96
- def getReference(self):
97
- # get reference to collect more next data
98
- ref = []
99
- json = self.getListSection()
100
- for key in json["References"]:
101
- ct = cleanText.cleanGenText(key)
102
- cleanText, filteredWord = ct.cleanText()
103
- if cleanText not in ref:
104
- ref.append(cleanText)
105
- return ref
106
- def getSupMaterial(self):
107
- # check if there is material or not
108
- json = {}
109
- soup = self.openHTMLFile()
110
- for h2Pos in range(len(soup.find_all('h2'))):
111
- if "supplementary" in soup.find_all('h2')[h2Pos].text.lower() or "material" in soup.find_all('h2')[h2Pos].text.lower() or "additional" in soup.find_all('h2')[h2Pos].text.lower() or "support" in soup.find_all('h2')[h2Pos].text.lower():
112
- #print(soup.find_all('h2')[h2Pos].find_next("a").get("href"))
113
- link, output = [],[]
114
- if soup.find_all('h2')[h2Pos].text not in json:
115
- json[soup.find_all('h2')[h2Pos].text] = []
116
- for l in soup.find_all('h2')[h2Pos].find_all_next("a",href=True):
117
- link.append(l["href"])
118
- if h2Pos + 1 < len(soup.find_all('h2')):
119
- nexth2Link = soup.find_all('h2')[h2Pos+1].find_next("a",href=True)["href"]
120
- if nexth2Link in link:
121
- link = link[:link.index(nexth2Link)]
122
- # only take links having "https" in that
123
- for i in link:
124
- if "https" in i: output.append(i)
125
- json[soup.find_all('h2')[h2Pos].text].extend(output)
126
- return json
127
- def extractTable(self):
128
- soup = self.openHTMLFile()
129
- df = []
130
- try:
131
- df = pd.read_html(str(soup))
132
- except ValueError:
133
- df = []
134
- print("No tables found in HTML file")
135
- return df
136
- def mergeTextInJson(self,jsonHTML):
137
- cl = cleanText.cleanGenText()
138
- #cl = cleanGenText()
139
- htmlText = ""
140
- for sec in jsonHTML:
141
- # section is "\n\n"
142
- if len(jsonHTML[sec]) > 0:
143
- for i in range(len(jsonHTML[sec])):
144
- # same section is just a dot.
145
- text = jsonHTML[sec][i]
146
- if len(text)>0:
147
- #text = cl.removeTabWhiteSpaceNewLine(text)
148
- #text = cl.removeExtraSpaceBetweenWords(text)
149
- text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
150
- jsonHTML[sec][i] = text
151
- if i-1 >= 0:
152
- if len(jsonHTML[sec][i-1])>0:
153
- if jsonHTML[sec][i-1][-1] != ".":
154
- htmlText += ". "
155
- htmlText += jsonHTML[sec][i]
156
- if len(jsonHTML[sec][i]) > 0:
157
- if jsonHTML[sec][i][-1]!=".":
158
- htmlText += "."
159
- htmlText += "\n\n"
160
- return htmlText
161
- def removeHeaders(self):
162
- pass
163
- def removeFooters(self):
164
- pass
165
- def removeReferences(self):
166
  pass
 
1
+ # reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
2
+ from bs4 import BeautifulSoup
3
+ import requests
4
+ from DefaultPackages import openFile, saveFile
5
+ from NER import cleanText
6
+ import pandas as pd
7
+ class HTML():
8
+ def __init__(self, htmlFile, htmlLink):
9
+ self.htmlLink = htmlLink
10
+ self.htmlFile = htmlFile
11
+ # def openHTMLFile(self):
12
+ # headers = {
13
+ # "User-Agent": (
14
+ # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
15
+ # "AppleWebKit/537.36 (KHTML, like Gecko) "
16
+ # "Chrome/114.0.0.0 Safari/537.36"
17
+ # ),
18
+ # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
19
+ # "Referer": self.htmlLink,
20
+ # "Connection": "keep-alive"
21
+ # }
22
+
23
+ # session = requests.Session()
24
+ # session.headers.update(headers)
25
+
26
+ # if self.htmlLink != "None":
27
+ # try:
28
+ # r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
29
+ # if r.status_code != 200:
30
+ # print(f"❌ HTML GET failed: {r.status_code} — {self.htmlLink}")
31
+ # return BeautifulSoup("", 'html.parser')
32
+ # soup = BeautifulSoup(r.content, 'html.parser')
33
+ # except Exception as e:
34
+ # print(f"❌ Exception fetching HTML: {e}")
35
+ # return BeautifulSoup("", 'html.parser')
36
+ # else:
37
+ # with open(self.htmlFile) as fp:
38
+ # soup = BeautifulSoup(fp, 'html.parser')
39
+ # return soup
40
+ from lxml.etree import ParserError, XMLSyntaxError
41
+
42
+ def openHTMLFile(self):
43
+ not_need_domain = ['https://broadinstitute.github.io/picard/',
44
+ 'https://software.broadinstitute.org/gatk/best-practices/',
45
+ 'https://www.ncbi.nlm.nih.gov/genbank/',
46
+ 'https://www.mitomap.org/']
47
+ headers = {
48
+ "User-Agent": (
49
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
50
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
51
+ "Chrome/114.0.0.0 Safari/537.36"
52
+ ),
53
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
54
+ "Referer": self.htmlLink,
55
+ "Connection": "keep-alive"
56
+ }
57
+
58
+ session = requests.Session()
59
+ session.headers.update(headers)
60
+ if self.htmlLink in not_need_domain:
61
+ return BeautifulSoup("", 'html.parser')
62
+ try:
63
+ if self.htmlLink and self.htmlLink != "None":
64
+ r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
65
+ if r.status_code != 200 or not r.text.strip():
66
+ print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
67
+ return BeautifulSoup("", 'html.parser')
68
+ soup = BeautifulSoup(r.content, 'html.parser')
69
+ else:
70
+ with open(self.htmlFile, encoding='utf-8') as fp:
71
+ soup = BeautifulSoup(fp, 'html.parser')
72
+ except (ParserError, XMLSyntaxError, OSError) as e:
73
+ print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
74
+ return BeautifulSoup("", 'html.parser')
75
+ except Exception as e:
76
+ print(f"❌ General exception for {self.htmlLink}: {e}")
77
+ return BeautifulSoup("", 'html.parser')
78
+
79
+ return soup
80
+
81
+ def getText(self):
82
+ soup = self.openHTMLFile()
83
+ s = soup.find_all("html")
84
+ text = ""
85
+ if s:
86
+ for t in range(len(s)):
87
+ text = s[t].get_text()
88
+ cl = cleanText.cleanGenText()
89
+ text = cl.removeExtraSpaceBetweenWords(text)
90
+ return text
91
+ def getListSection(self, scienceDirect=None):
92
+ json = {}
93
+ text = ""
94
+ textJson, textHTML = "",""
95
+ if scienceDirect == None:
96
+ soup = self.openHTMLFile()
97
+ # get list of section
98
+ json = {}
99
+ for h2Pos in range(len(soup.find_all('h2'))):
100
+ if soup.find_all('h2')[h2Pos].text not in json:
101
+ json[soup.find_all('h2')[h2Pos].text] = []
102
+ if h2Pos + 1 < len(soup.find_all('h2')):
103
+ content = soup.find_all('h2')[h2Pos].find_next("p")
104
+ nexth2Content = soup.find_all('h2')[h2Pos+1].find_next("p")
105
+ while content.text != nexth2Content.text:
106
+ json[soup.find_all('h2')[h2Pos].text].append(content.text)
107
+ content = content.find_next("p")
108
+ else:
109
+ content = soup.find_all('h2')[h2Pos].find_all_next("p",string=True)
110
+ json[soup.find_all('h2')[h2Pos].text] = list(i.text for i in content)
111
+ # format
112
+ '''json = {'Abstract':[], 'Introduction':[], 'Methods'[],
113
+ 'Results':[], 'Discussion':[], 'References':[],
114
+ 'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
115
+ 'Additional information':[], 'Electronic supplementary material':[],
116
+ 'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
117
+ if scienceDirect!= None or len(json)==0:
118
+ # Replace with your actual Elsevier API key
119
+ api_key = os.environ.get("SCIENCE_DIRECT_API") # read the Elsevier API key from the environment rather than hard-coding it
120
+ # ScienceDirect article DOI or PI (Example DOI)
121
+ doi = self.htmlLink.split("https://doi.org/")[-1] #"10.1016/j.ajhg.2011.01.009"
122
+ # Base URL for the Elsevier API
123
+ base_url = "https://api.elsevier.com/content/article/doi/"
124
+ # Set headers with API key
125
+ headers = {
126
+ "Accept": "application/json",
127
+ "X-ELS-APIKey": api_key
128
+ }
129
+ # Make the API request
130
+ response = requests.get(base_url + doi, headers=headers)
131
+ # Check if the request was successful
132
+ if response.status_code == 200:
133
+ data = response.json()
134
+ supp_data = data["full-text-retrieval-response"]#["coredata"]["link"]
135
+ if "originalText" in list(supp_data.keys()):
136
+ if type(supp_data["originalText"])==str:
137
+ json["originalText"] = [supp_data["originalText"]]
138
+ if type(supp_data["originalText"])==dict:
139
+ json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
140
+ else:
141
+ if type(supp_data)==dict:
142
+ for key in supp_data:
143
+ json[key] = [supp_data[key]]
144
+
145
+ textJson = self.mergeTextInJson(json)
146
+ textHTML = self.getText()
147
+ if len(textHTML) > len(textJson):
148
+ text = textHTML
149
+ else: text = textJson
150
+ return text #json
151
+ def getReference(self):
152
+ # get reference to collect more next data
153
+ ref = []
154
+ json = self.getListSection()
155
+ for key in json["References"]:
156
+ ct = cleanText.cleanGenText(key)
157
+ cleanText, filteredWord = ct.cleanText()
158
+ if cleanText not in ref:
159
+ ref.append(cleanText)
160
+ return ref
161
+ def getSupMaterial(self):
162
+ # check if there is material or not
163
+ json = {}
164
+ soup = self.openHTMLFile()
165
+ for h2Pos in range(len(soup.find_all('h2'))):
166
+ if "supplementary" in soup.find_all('h2')[h2Pos].text.lower() or "material" in soup.find_all('h2')[h2Pos].text.lower() or "additional" in soup.find_all('h2')[h2Pos].text.lower() or "support" in soup.find_all('h2')[h2Pos].text.lower():
167
+ #print(soup.find_all('h2')[h2Pos].find_next("a").get("href"))
168
+ link, output = [],[]
169
+ if soup.find_all('h2')[h2Pos].text not in json:
170
+ json[soup.find_all('h2')[h2Pos].text] = []
171
+ for l in soup.find_all('h2')[h2Pos].find_all_next("a",href=True):
172
+ link.append(l["href"])
173
+ if h2Pos + 1 < len(soup.find_all('h2')):
174
+ nexth2Link = soup.find_all('h2')[h2Pos+1].find_next("a",href=True)["href"]
175
+ if nexth2Link in link:
176
+ link = link[:link.index(nexth2Link)]
177
+ # only take links having "https" in that
178
+ for i in link:
179
+ if "https" in i: output.append(i)
180
+ json[soup.find_all('h2')[h2Pos].text].extend(output)
181
+ return json
182
+ def extractTable(self):
183
+ soup = self.openHTMLFile()
184
+ df = []
185
+ if len(soup)>0:
186
+ try:
187
+ df = pd.read_html(str(soup))
188
+ except ValueError:
189
+ df = []
190
+ print("No tables found in HTML file")
191
+ return df
192
+ def mergeTextInJson(self,jsonHTML):
193
+ cl = cleanText.cleanGenText()
194
+ #cl = cleanGenText()
195
+ htmlText = ""
196
+ for sec in jsonHTML:
197
+ # section is "\n\n"
198
+ if len(jsonHTML[sec]) > 0:
199
+ for i in range(len(jsonHTML[sec])):
200
+ # same section is just a dot.
201
+ text = jsonHTML[sec][i]
202
+ if len(text)>0:
203
+ #text = cl.removeTabWhiteSpaceNewLine(text)
204
+ #text = cl.removeExtraSpaceBetweenWords(text)
205
+ text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
206
+ jsonHTML[sec][i] = text
207
+ if i-1 >= 0:
208
+ if len(jsonHTML[sec][i-1])>0:
209
+ if jsonHTML[sec][i-1][-1] != ".":
210
+ htmlText += ". "
211
+ htmlText += jsonHTML[sec][i]
212
+ if len(jsonHTML[sec][i]) > 0:
213
+ if jsonHTML[sec][i][-1]!=".":
214
+ htmlText += "."
215
+ htmlText += "\n\n"
216
+ return htmlText
217
+ def removeHeaders(self):
218
+ pass
219
+ def removeFooters(self):
220
+ pass
221
+ def removeReferences(self):
222
  pass
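
A minimal usage sketch of the reworked HTML class (assumptions: the NER package is importable and the DOI link is a placeholder). openHTMLFile() now sends browser-like headers and returns an empty soup instead of None on failure, and getListSection() falls back to the Elsevier full-text API (SCIENCE_DIRECT_API key) when no h2 sections are found.

from NER.html.extractHTML import HTML

page = HTML(htmlFile="", htmlLink="https://doi.org/10.1016/j.example.2024.001")  # hypothetical link
plain = page.getText()          # whole-page text, whitespace-normalized
merged = page.getListSection()  # section-merged text, or Elsevier full text as a fallback
tables = page.extractTable()    # pandas.read_html on the fetched page
print(len(plain), len(merged), len(tables))
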
NER/word2Vec/__pycache__/word2vec.cpython-311.pyc CHANGED
Binary files a/NER/word2Vec/__pycache__/word2vec.cpython-311.pyc and b/NER/word2Vec/__pycache__/word2vec.cpython-311.pyc differ
 
NER/word2Vec/testModel/test_model.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:734185116a1d2099dba0d04efc0eb1b7e0e8213fe1259b57bbcb7aaac3cd46ea
3
- size 133
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:193f58915e5f895c3d00d1012a691b15ad051d2b9eaf83662a2a7e3af326e923
3
+ size 25214
NER/word2Vec/testModel/test_model_updated.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b1b785c79991b857b364ee9863985eaf845087efb1aa40a6b9cfae3b2a50012
3
- size 133
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78fc82c69afe5c74ae631f389025e5195ad613b62a7c5e42fb7f3f0b7cda99c3
3
+ size 30688
NER/word2Vec/word2vec.py CHANGED
@@ -1,369 +1,436 @@
1
- '''WORD TO VECTOR'''
2
- import pandas as pd
3
- import json
4
- import gensim
5
- import spacy
6
- from DefaultPackages import openFile, saveFile
7
- from NER import cleanText
8
- from gensim.models.keyedvectors import KeyedVectors
9
- from gensim.test.utils import common_texts
10
- from gensim.models.word2vec import Word2Vec
11
- from gensim.scripts.glove2word2vec import glove2word2vec
12
- from gensim.test.utils import datapath, get_tmpfile
13
- import sys
14
- import subprocess
15
- # can try multiprocessing to run quicker
16
- import multiprocessing
17
- import copy
18
- sys.setrecursionlimit(1000)
19
- # creat folder word2Vec
20
- #! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
21
- # create word2vec model
22
- #model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
23
- '''Some notes for this model
24
- sometimes when we do the corpus, there are some adverbs which are unnecessary but might be seen as
25
- a similar word to the word we are finding, so can we try to preprocess text so that
26
- we make the corpus more effective and only contains the important words. Then when we
27
- train the model, the important words will be seen as important. Or
28
- when we already have the similar list of words, we can remove the words in there
29
- that are stopwords/unnecessary words.'''
30
- ### For more complex analysis, consider using sentence embedding models like "Doc2Vec" to represent the meaning of entire sentences instead of just individual words
31
- class word2Vec():
32
- def __init__(self, nameFile=None, modelName=None):
33
- self.nameFile = nameFile
34
- self.modelName = modelName
35
- def spacy_similarity(self, word):
36
- # when use word2vec, try medium or large is better
37
- # maybe try odc similarity?
38
- nlp = spacy.load("en_core_web_lg")
39
- doc = nlp(word)
40
- for token1 in doc:
41
- for token2 in doc:
42
- print(token1.text, token2.text, token1.similarity(token2))
43
- pass
44
- # clean text before transform to corpus
45
- def cleanTextBeforeCorpus(self,oriText, doi=None):
46
- cl = cleanText.cleanGenText()
47
- #cl = cleanGenText()
48
- output = ""
49
- alreadyRemoveDoi = False
50
- for word in oriText.split(" "):
51
- # remove DOI
52
- if doi != None and doi in oriText:
53
- if alreadyRemoveDoi == False:
54
- newWord = cl.removeDOI(word,doi)
55
- if len(newWord) > 0 and newWord != word:
56
- alreadyRemoveDoi = True
57
- word = newWord
58
- # remove punctuation
59
- # split the sticked words
60
- #word = cl.splitStickWords(word)
61
- # remove punctuation
62
- word = cl.removePunct(word,True)
63
- # remove URL
64
- word = cl.removeURL(word)
65
- # remove HTMLTag
66
- word = cl.removeHTMLTag(word)
67
- # remove tab, white space, newline
68
- word = cl.removeTabWhiteSpaceNewLine(word)
69
- # optional: remove stopwords
70
- #word = cl.removeStopWords(word)
71
- if len(word)>0:
72
- output += word + " "
73
- return output
74
- def cleanAllTextBeforeCorpus(self, allText, doi=None):
75
- cleanOutput = ""
76
- remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
77
- if len(allText) > 0:
78
- corpusText = allText
79
- for pos in range(len(corpusText.split("\n\n"))):
80
- if len(corpusText.split("\n\n")[pos]) > 0:
81
- lines = corpusText.split("\n\n")[pos]
82
- for line in lines.split("\n"):
83
- if remove in line: line = line.replace(remove, "")
84
- clean_text = self.cleanTextBeforeCorpus(line, doi)
85
- cleanOutput += clean_text + "\n"
86
- cleanOutput += "\n\n"
87
- return cleanOutput
88
- def tableTransformToCorpusText(self, df, excelFile=None):
89
- # PDF, Excel, WordDoc
90
- #cl = cleanText.cleanGenText()
91
- corpus = {}
92
- # PDF or df
93
- if excelFile == None:
94
- if len(df) > 0:
95
- try:
96
- for i in range(len(df)):
97
- # each new dimension/page is considered to be a sentence which ends with the period.
98
- # each new line is a new list, and each new df is a new corpus
99
- outputDF = []
100
- text = df[i].values.tolist()
101
- if len(text) > 0:
102
- outputRowDF = self.helperRowTableToCorpus(text)
103
- #outputColDF = self.helperColTableToCorpus(text)
104
- outputDF.extend(outputRowDF)
105
- #outputDF.extend(outputColDF)
106
- if len(outputDF) > 0:
107
- corpus["corpus" + str(i)] = outputDF
108
- except:
109
- outputDF = []
110
- text = df.values.tolist()
111
- if len(text) > 0:
112
- outputRowDF = self.helperRowTableToCorpus(text)
113
- #outputColDF = self.helperColTableToCorpus(text)
114
- outputDF.extend(outputRowDF)
115
- #outputDF.extend(outputColDF)
116
- if len(outputDF) > 0:
117
- corpus["corpus0"] = outputDF
118
- else:
119
- try:
120
- df = pd.ExcelFile(excelFile)
121
- except:
122
- if filepath.endswith('.xls'):
123
- df = pd.read_excel(filepath, engine='xlrd')
124
- else:
125
- df = pd.read_excel(filepath, engine='openpyxl')
126
- sheetNames = df.sheet_names
127
- output = []
128
- if len(sheetNames) > 0:
129
- for s in range(len(sheetNames)):
130
- outputDF = []
131
- with pd.ExcelFile(excelFile) as xls:
132
- data = pd.read_excel(xls, sheetNames[s])
133
- if sheetNames[s] != 'Evaluation Warning':
134
- text = data.values.tolist()
135
- if len(text) > 0:
136
- outputRowDF = self.helperRowTableToCorpus(text)
137
- #outputColDF = self.helperColTableToCorpus(text)
138
- outputDF.extend(outputRowDF)
139
- #outputDF.extend(outputColDF)
140
- if len(outputDF) > 0:
141
- corpus["corpus" + str(s)] = outputDF
142
- return corpus
143
- def helperRowTableToCorpus(self, textList):
144
- #cl = cleanGenText()
145
- cl = cleanText.cleanGenText()
146
- stopWords = ["NaN","Unnamed:","nan"]
147
- outputDF = []
148
- for line in textList:
149
- outputLine = []
150
- for words in line:
151
- words = str(words)
152
- if len(words) > 0:
153
- for word in words.split(" "):
154
- # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
155
- if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
156
- #word = cl.splitStickWords(word)
157
- word = cl.removePunct(word)
158
- word = " ".join(cl.removeStopWords(word))
159
- word = cl.removeTabWhiteSpaceNewLine(word)
160
- if len(word) > 1:
161
- if len(word.split(" ")) > 1:
162
- for x in word.split(" "):
163
- if len(x) > 1 and x.isnumeric()==False:
164
- outputLine.append(x.lower())
165
- else:
166
- if word.isnumeric() == False:
167
- outputLine.append(word.lower())
168
- if len(outputLine) > 0:
169
- outputDF.append(outputLine)
170
- return outputDF
171
- def helperColTableToCorpus(self, dfList):
172
- #cl = cleanGenText()
173
- cl = cleanText.cleanGenText()
174
- stopWords = ["NaN","Unnamed:","nan"]
175
- outputDF = []
176
- # use the first length line as the column ref
177
- for pos in range(len(dfList[0])):
178
- outputLine = []
179
- for line in dfList:
180
- if pos < len(line):
181
- words = line[pos]
182
- words = str(words)
183
- else: words = ""
184
- if len(words) > 0:
185
- for word in words.split(" "):
186
- # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
187
- if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
188
- #word = cl.splitStickWords(word)
189
- word = cl.removePunct(word)
190
- word = " ".join(cl.removeStopWords(word))
191
- word = cl.removeTabWhiteSpaceNewLine(word)
192
- if len(word) > 1:
193
- if len(word.split(" ")) > 1:
194
- for x in word.split(" "):
195
- if len(x) > 1 and x.isnumeric()==False:
196
- outputLine.append(x.lower())
197
- else:
198
- if word.isnumeric() == False:
199
- outputLine.append(word.lower())
200
- if len(outputLine) > 0:
201
- outputDF.append(outputLine)
202
- return outputDF
203
- # create a corpus
204
- def createCorpusText(self, corpusText):
205
- '''ex: "Tom is cat. Jerry is mouse."
206
- corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
207
- # the output should be like this:
208
- '''texts = {
209
- "Paragraph 1": [["Cat", "is", "an","animal], ["Tom", "is", "cat"]],
210
- "Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
211
- }
212
- '''
213
- # separate paragraph
214
- '''Ex: Cat is an animal. Tom is cat.
215
-
216
- Mouse is an animal.
217
- Jerry is mouse.'''
218
- texts = {}
219
- cl = cleanText.cleanGenText()
220
- #cl = cleanGenText()
221
- for pos in range(len(corpusText.split("\n\n"))):
222
- if len(corpusText.split("\n\n")[pos]) > 0:
223
- texts["Paragraph "+str(pos)] = []
224
- lines = corpusText.split("\n\n")[pos]
225
- for line in lines.split("\n"):
226
- for l in line.split("."):
227
- if len(l) > 0:
228
- cl.removeTabWhiteSpaceNewLine(l)
229
- l = l.lower()
230
- newL = []
231
- for word in l.split(" "):
232
- if len(word) > 0:
233
- word = cl.removeStopWords(word)
234
- for w in word:
235
- if len(w) > 0 and w.isnumeric()==False:
236
- newL.append(w)
237
- if len(newL)>0:
238
- texts["Paragraph "+str(pos)].append(newL)
239
- if len(texts["Paragraph "+str(pos)]) == 0:
240
- del texts["Paragraph "+str(pos)]
241
- return texts
242
- def selectParaForWC(self,corpus):
243
- ''' corpus should be in the format:
244
- corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
245
- corSize, window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None, None
246
- corSize = len(corpus)
247
- # less than 2000
248
- if 0 < corSize < 2000:
249
- window=3.5
250
- vector_size=75
251
- sample=1e-3
252
- negative=10
253
- epochs=10
254
- sg=1
255
- # 2000 - 100000
256
- elif 2000 <= corSize < 100000:
257
- window=3.5
258
- vector_size=75
259
- sample=1e-5
260
- negative=10
261
- epochs=10
262
- sg=1
263
- elif 100000 <=corSize < 1000000:
264
- window=7.5
265
- vector_size=150
266
- sample=1e-5
267
- negative=10
268
- epochs=6
269
- sg=0
270
- return window, vector_size, sample, negative, epochs, sg
271
- def trainWord2Vec(self,nameFile,modelName,saveFolder,window=3.5,
272
- vector_size=75,sample=1e-3,negative=10,epochs=10,sg=1):
273
- # if you dont have backup file, you can use again the nameFile just to increase the lenght of corpus
274
- jsonFile = ""
275
- jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
276
- cores = multiprocessing.cpu_count()
277
- combinedCorpus = []
278
- window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
279
- if len(jsonFile) > 0:
280
- for key in jsonFile:
281
- combinedCorpus.extend(jsonFile[key])
282
- window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
283
- # # min_count=1 ensures all words are included
284
- '''w2vModel = Word2Vec(
285
- min_count=1,
286
- window=window,
287
- vector_size=vector_size,
288
- sample=sample,
289
- alpha=0.03,
290
- min_alpha=0.0007,
291
- negative=negative,
292
- workers=cores-1,
293
- epochs = epochs,
294
- sg=sg)'''
295
- #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
296
- accept = False
297
- while not accept:
298
- if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
299
- try:
300
- w2vModel = Word2Vec(
301
- min_count=1,
302
- window=window,
303
- vector_size=vector_size,
304
- sample=sample,
305
- alpha=0.03,
306
- min_alpha=0.0007,
307
- negative=negative,
308
- workers=cores-1,
309
- epochs = epochs,
310
- sg=sg)
311
- w2vModel.build_vocab(combinedCorpus)
312
- w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
313
- accept = True
314
- except:
315
- for key in jsonFile:
316
- combinedCorpus.extend(jsonFile[key])
317
- window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
318
- print("next is " + str(len(combinedCorpus)))
319
- else:
320
- print("no parameter to train")
321
- break
322
- #w2vModel.build_vocab(combinedCorpus)
323
- #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
324
- #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
325
- #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
326
- w2vModel.save(saveFolder+"/"+modelName+".model")
327
- w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
328
- print("done w2v")
329
- else: print("no corpus to train")
330
- #return combinedCorpus
331
- def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
332
- # might not be a meaningful keyword
333
- #stopWords = ["show"]
334
- # same word but just plural nouns, tense
335
- simWords = [word+"s",word+"es",word+"ing",word+"ed"]
336
- model = KeyedVectors.load_word2vec_format(modelFile, binary = False) # model file in format txt
337
- results = model.most_similar(positive=[word],topn=n)
338
- #removeIndex = []
339
- #currN = copy.deepcopy(n)
340
- '''for r in range(len(results)):
341
- if len(results[r][0]) < 2:
342
- removeIndex.append(results[r])
343
- # remove the same word but just plural and singular noun and lower than the cos_thres
344
- elif results[r][0] == word:
345
- removeIndex.append(results[r])
346
- elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
347
- removeIndex.append(results[r])
348
- for rem in removeIndex:
349
- results.remove(rem)
350
- while len(results)!=n and len(results) != 0:
351
- moreNewResult = model.most_similar(positive=[word],topn=currN+1)[-1]
352
- if moreNewResult not in results and len(moreNewResult[0])>1:
353
- if moreNewResult[0] not in stopWords and results[0] != word:
354
- results.append(moreNewResult)
355
- currN +=1'''
356
- return results
357
- # adding our model into spacy
358
- # this deals with command line; but instead of using it, we write python script to run command line
359
- def loadWordVec(self,modelName,wordVec):
360
- # modelName is the name you want to save into spacy
361
- # wordVec is the trained word2vec in txt format
362
- subprocess.run([sys.executable,
363
- "-m",
364
- "spacy",
365
- "init-model",
366
- "en",
367
- modelName, # this modelName comes from the saved modelName of function trainWord2Vec
368
- "--vectors-loc",
369
- wordVec])
1
+ '''WORD TO VECTOR'''
2
+ import pandas as pd
3
+ import json
4
+ import gensim
5
+ import spacy
6
+ from DefaultPackages import openFile, saveFile
7
+ from NER import cleanText
8
+ from gensim.models.keyedvectors import KeyedVectors
9
+ from gensim.test.utils import common_texts
10
+ from gensim.models.word2vec import Word2Vec
11
+ from gensim.scripts.glove2word2vec import glove2word2vec
12
+ from gensim.test.utils import datapath, get_tmpfile
13
+ from gensim.models import Phrases
14
+ from gensim.models.phrases import Phraser
15
+ import sys
16
+ import subprocess
17
+ import os
18
+ # can try multiprocessing to run quicker
19
+ import multiprocessing
20
+ import copy
21
+ sys.setrecursionlimit(1000)
22
+ # create folder word2Vec
23
+ #! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
24
+ # create word2vec model
25
+ #model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
26
+ '''Some notes for this model
27
+ sometimes when we do the corpus, there are some adverbs which are unnecessary but might be seen as
28
+ a similar word to the word we are finding, so can we try to preprocess text so that
29
+ we make the corpus more effective and only contains the important words. Then when we
30
+ train the model, the important words will be seen as important. Or
31
+ when we already have the similar list of words, we can remove the words in there
32
+ that are stopwords/unnecessary words.'''
33
+ ### For more complex analysis, consider using sentence embedding models like "Doc2Vec" to represent the meaning of entire sentences instead of just individual words
34
+ class word2Vec():
35
+ def __init__(self, nameFile=None, modelName=None):
36
+ self.nameFile = nameFile
37
+ self.modelName = modelName
38
+ #self.nlp = spacy.load("en_core_web_lg")
39
+ self.cl = cleanText.cleanGenText()
40
+ def spacy_similarity(self, word):
41
+ # when use word2vec, try medium or large is better
42
+ # maybe try odc similarity?
43
+ doc = self.nlp(word)
44
+ for token1 in doc:
45
+ for token2 in doc:
46
+ print(token1.text, token2.text, token1.similarity(token2))
47
+ pass
48
+ # clean text before transform to corpus
49
+ def cleanTextBeforeCorpus(self,oriText, doi=None):
50
+ #cl = cleanText.cleanGenText()
51
+ #cl = cleanGenText()
52
+ output = ""
53
+ alreadyRemoveDoi = False
54
+ for word in oriText.split(" "):
55
+ # remove DOI
56
+ if doi != None and doi in oriText:
57
+ if alreadyRemoveDoi == False:
58
+ newWord = self.cl.removeDOI(word,doi)
59
+ if len(newWord) > 0 and newWord != word:
60
+ alreadyRemoveDoi = True
61
+ word = newWord
62
+ # remove punctuation
63
+ # split the sticked words
64
+ #word = cl.splitStickWords(word)
65
+ # remove punctuation
66
+ word = self.cl.removePunct(word,True)
67
+ # remove URL
68
+ word = self.cl.removeURL(word)
69
+ # remove HTMLTag
70
+ word = self.cl.removeHTMLTag(word)
71
+ # remove tab, white space, newline
72
+ word = self.cl.removeTabWhiteSpaceNewLine(word)
73
+ # optional: remove stopwords
74
+ #word = cl.removeStopWords(word)
75
+ if len(word)>0:
76
+ output += word + " "
77
+ return output
78
+ def cleanAllTextBeforeCorpus(self, allText, doi=None):
79
+ cleanOutput = ""
80
+ remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
81
+ if len(allText) > 0:
82
+ corpusText = allText.split("\n\n")
83
+ for pos in range(len(corpusText)):
84
+ lines = corpusText[pos]
85
+ if len(lines) > 0:
86
+ for line in lines.split("\n"):
87
+ if remove in line: line = line.replace(remove, "")
88
+ clean_text = self.cleanTextBeforeCorpus(line, doi)
89
+ cleanOutput += clean_text + "\n"
90
+ cleanOutput += "\n\n"
91
+ return cleanOutput
92
+ import urllib.parse, requests
93
+
94
+ def tableTransformToCorpusText(self, df, excelFile=None):
95
+ # PDF, Excel, WordDoc
96
+ #cl = cleanText.cleanGenText()
97
+ corpus = {}
98
+ # PDF or df
99
+ if excelFile == None:
100
+ if len(df) > 0:
101
+ try:
102
+ for i in range(len(df)):
103
+ # each new dimension/page is considered to be a sentence which ends with the period.
104
+ # each new line is a new list, and each new df is a new corpus
105
+ outputDF = []
106
+ text = df[i].values.tolist()
107
+ if len(text) > 0:
108
+ outputRowDF = self.helperRowTableToCorpus(text)
109
+ #outputColDF = self.helperColTableToCorpus(text)
110
+ outputDF.extend(outputRowDF)
111
+ #outputDF.extend(outputColDF)
112
+ if len(outputDF) > 0:
113
+ corpus["corpus" + str(i)] = outputDF
114
+ except:
115
+ outputDF = []
116
+ text = df.values.tolist()
117
+ if len(text) > 0:
118
+ outputRowDF = self.helperRowTableToCorpus(text)
119
+ #outputColDF = self.helperColTableToCorpus(text)
120
+ outputDF.extend(outputRowDF)
121
+ #outputDF.extend(outputColDF)
122
+ if len(outputDF) > 0:
123
+ corpus["corpus0"] = outputDF
124
+ else:
125
+ try:
126
+ df = pd.ExcelFile(excelFile)
127
+ except:
128
+ if excelFile.endswith('.xls'):
129
+ df = pd.read_excel(excelFile, engine='xlrd')
130
+ else:
131
+ df = pd.read_excel(excelFile, engine='openpyxl')
132
+ sheetNames = df.sheet_names
133
+ output = []
134
+ if len(sheetNames) > 0:
135
+ for s in range(len(sheetNames)):
136
+ outputDF = []
137
+ with pd.ExcelFile(excelFile) as xls:
138
+ data = pd.read_excel(xls, sheetNames[s])
139
+ if sheetNames[s] != 'Evaluation Warning':
140
+ text = data.values.tolist()
141
+ if len(text) > 0:
142
+ outputRowDF = self.helperRowTableToCorpus(text)
143
+ #outputColDF = self.helperColTableToCorpus(text)
144
+ outputDF.extend(outputRowDF)
145
+ #outputDF.extend(outputColDF)
146
+ if len(outputDF) > 0:
147
+ corpus["corpus" + str(s)] = outputDF
148
+ return corpus
149
+ def helperRowTableToCorpus(self, textList):
150
+ #cl = cleanGenText()
151
+ #cl = cleanText.cleanGenText()
152
+ stopWords = ["NaN","Unnamed:","nan"]
153
+ outputDF = []
154
+ for line in textList:
155
+ outputLine = []
156
+ for words in line:
157
+ words = str(words)
158
+ if len(words) > 0:
159
+ for word in words.split(" "):
160
+ # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
161
+ if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
162
+ #word = cl.splitStickWords(word)
163
+ word = self.cl.removePunct(word)
164
+ word = " ".join(self.cl.removeStopWords(word))
165
+ word = self.cl.removeTabWhiteSpaceNewLine(word)
166
+ if len(word) > 1:
167
+ if len(word.split(" ")) > 1:
168
+ for x in word.split(" "):
169
+ if len(x) > 1 and x.isnumeric()==False:
170
+ outputLine.append(x.lower())
171
+ else:
172
+ if word.isnumeric() == False:
173
+ outputLine.append(word.lower())
174
+ if len(outputLine) > 0:
175
+ outputDF.append(outputLine)
176
+ return outputDF
177
+ def helperColTableToCorpus(self, dfList):
178
+ #cl = cleanGenText()
179
+ #cl = cleanText.cleanGenText()
180
+ stopWords = ["NaN","Unnamed:","nan"]
181
+ outputDF = []
182
+ # use the first length line as the column ref
183
+ for pos in range(len(dfList[0])):
184
+ outputLine = []
185
+ for line in dfList:
186
+ if pos < len(line):
187
+ words = line[pos]
188
+ words = str(words)
189
+ else: words = ""
190
+ if len(words) > 0:
191
+ for word in words.split(" "):
192
+ # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
193
+ if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
194
+ #word = cl.splitStickWords(word)
195
+ word = self.cl.removePunct(word)
196
+ word = " ".join(self.cl.removeStopWords(word))
197
+ word = self.cl.removeTabWhiteSpaceNewLine(word)
198
+ if len(word) > 1:
199
+ if len(word.split(" ")) > 1:
200
+ for x in word.split(" "):
201
+ if len(x) > 1 and x.isnumeric()==False:
202
+ outputLine.append(x.lower())
203
+ else:
204
+ if word.isnumeric() == False:
205
+ outputLine.append(word.lower())
206
+ if len(outputLine) > 0:
207
+ outputDF.append(outputLine)
208
+ return outputDF
209
+ # create a corpus
210
+ def createCorpusText(self, corpusText):
211
+ '''ex: "Tom is cat. Jerry is mouse."
212
+ corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
213
+ # the output should be like this:
214
+ '''texts = {
215
+ "Paragraph 1": [["Cat", "is", "an","animal], ["Tom", "is", "cat"]],
216
+ "Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
217
+ }
218
+ '''
219
+ # separate paragraph
220
+ '''Ex: Cat is an animal. Tom is cat.
221
+
222
+ Mouse is an animal.
223
+ Jerry is mouse.'''
224
+ texts = {}
225
+ #cl = cleanText.cleanGenText()
226
+ #cl = cleanGenText()
227
+ corpus = corpusText.split("\n\n")
228
+ for pos in range(len(corpus)):
229
+ if len(corpus[pos]) > 0:
230
+ texts["Paragraph "+str(pos)] = []
231
+ lines = corpus[pos]
232
+ for line in lines.split("\n"):
233
+ for l in line.split("."):
234
+ if len(l) > 0:
235
+ l = self.cl.removeTabWhiteSpaceNewLine(l)
236
+ l = l.lower()
237
+ newL = []
238
+ for word in l.split(" "):
239
+ if len(word) > 0:
240
+ word = self.cl.removeStopWords(word)
241
+ for w in word:
242
+ if len(w) > 0 and w.isnumeric()==False:
243
+ newL.append(w)
244
+ if len(newL)>0:
245
+ texts["Paragraph "+str(pos)].append(newL)
246
+ if len(texts["Paragraph "+str(pos)]) == 0:
247
+ del texts["Paragraph "+str(pos)]
248
+ return texts
249
+
250
+ def selectParaForWC(self, corpus):
251
+ """
252
+ corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]
253
+ Heuristically determine Word2Vec parameters.
254
+ """
255
+ corSize = len(corpus)
256
+
257
+ if corSize == 0:
258
+ return None, None, None, None, None, None
259
+
260
+ # Adjust parameters based on corpus size
261
+ if corSize < 2000:
262
+ # Small corpus — need high generalization
263
+ window = 3
264
+ vector_size = 100
265
+ sample = 1e-3
266
+ negative = 5
267
+ epochs = 20
268
+ sg = 1 # Skip-gram preferred for rare words
269
+ elif corSize < 10000:
270
+ window = 5
271
+ vector_size = 150
272
+ sample = 1e-4
273
+ negative = 10
274
+ epochs = 20
275
+ sg = 1
276
+ elif corSize < 100000:
277
+ window = 7
278
+ vector_size = 200
279
+ sample = 1e-5
280
+ negative = 15
281
+ epochs = 15
282
+ sg = 1
283
+ elif corSize < 500000:
284
+ window = 10
285
+ vector_size = 250
286
+ sample = 1e-5
287
+ negative = 15
288
+ epochs = 10
289
+ sg = 0 # CBOW is okay when data is large
290
+ else:
291
+ # Very large corpus
292
+ window = 12
293
+ vector_size = 300
294
+ sample = 1e-6
295
+ negative = 20
296
+ epochs = 5
297
+ sg = 0
298
+
299
+ return window, vector_size, sample, negative, epochs, sg
300
+
301
+
302
+ def trainWord2Vec(self,nameFile,modelName,saveFolder,window=None,
303
+ vector_size=None,sample=None,negative=None,epochs=None,sg=None):
304
+ jsonFile = ""
305
+ jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
306
+ if not jsonFile:
307
+ print("No corpus to train")
308
+ return
309
+ cores = multiprocessing.cpu_count()
310
+ combinedCorpus = []
311
+ for key in jsonFile:
312
+ combinedCorpus.extend(jsonFile[key])
313
+ # detect phrase before choosing parameters
314
+ phrases = Phrases(combinedCorpus, min_count=2, threshold=10)
315
+ bigram = Phraser(phrases)
316
+ combinedCorpus = [bigram[sent] for sent in combinedCorpus]
317
+
318
+ if window==None and vector_size==None and sample==None and negative==None and epochs==None and sg==None:
319
+ window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
320
+ # # min_count=1 ensures all words are included
321
+ #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
322
+ accept = False
323
+ # add retry limit because if training keeps failing (bad corpus or corrupted input), it’ll keep retrying without limit.
324
+ retries = 0
325
+ while not accept and retries < 3:
326
+ if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
327
+ try:
328
+ w2vModel = Word2Vec(
329
+ min_count=1,
330
+ window=window,
331
+ vector_size=vector_size,
332
+ sample=sample,
333
+ alpha=0.03,
334
+ min_alpha=0.0007,
335
+ negative=negative,
336
+ workers=max(1, cores-1), # avoid workers=0 on single-core machines
337
+ epochs = epochs,
338
+ sg=sg)
339
+ w2vModel.build_vocab(combinedCorpus)
340
+ w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=epochs)
341
+ accept = True
342
+ except Exception as e:
343
+ print(f"Retry #{retries+1} failed: {e}")
344
+ retries +=1
345
+ else:
346
+ print("no parameter to train")
347
+ break
348
+ #w2vModel.build_vocab(combinedCorpus)
349
+ #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
350
+ #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
351
+ #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
352
+ if not accept:
+ print("Word2Vec training did not succeed (or no parameters were given); nothing to save")
+ return
+ w2vModel.save(saveFolder+"/"+modelName+".model")
353
+ w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
354
+ print("done w2v")
355
+ #return combinedCorpus
356
+ def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
357
+ if not newCorpus:
358
+ raise ValueError("New corpus is empty!")
359
+
360
+ model = Word2Vec.load(modelPath)
361
+
362
+ # Phrase detection on new data
363
+ phrases = Phrases(newCorpus, min_count=2, threshold=10)
364
+ bigram = Phraser(phrases)
365
+ newCorpus = [bigram[sent] for sent in newCorpus]
366
+
367
+ # Update vocab & retrain
368
+ model.build_vocab(newCorpus, update=True)
369
+ model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)
370
+
371
+ def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
372
+ # might not be a meaningful keyword
373
+ #stopWords = ["show"]
374
+ # same word but just plural nouns, tense
375
+ simWords = [word+"s",word+"es",word+"ing",word+"ed"]
376
+ model = KeyedVectors.load_word2vec_format(modelFile, binary = False) # model file in format txt
377
+ results = model.most_similar(positive=[word],topn=n)
378
+ #removeIndex = []
379
+ #currN = copy.deepcopy(n)
380
+ '''for r in range(len(results)):
381
+ if len(results[r][0]) < 2:
382
+ removeIndex.append(results[r])
383
+ # remove the same word but just plural and singular noun and lower than the cos_thres
384
+ elif results[r][0] == word:
385
+ removeIndex.append(results[r])
386
+ elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
387
+ removeIndex.append(results[r])
388
+ for rem in removeIndex:
389
+ results.remove(rem)
390
+ while len(results)!=n and len(results) != 0:
391
+ moreNewResult = model.most_similar(positive=[word],topn=currN+1)[-1]
392
+ if moreNewResult not in results and len(moreNewResult[0])>1:
393
+ if moreNewResult[0] not in stopWords and results[0] != word:
394
+ results.append(moreNewResult)
395
+ currN +=1'''
396
+ return results
397
+ # add more data to existing word2vec model
398
+ def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
399
+ if not newCorpus:
400
+ raise ValueError("New corpus is empty!")
401
+
402
+ model = Word2Vec.load(modelPath)
403
+
404
+ # Phrase detection on new data
405
+ phrases = Phrases(newCorpus, min_count=2, threshold=10)
406
+ bigram = Phraser(phrases)
407
+ newCorpus = [bigram[sent] for sent in newCorpus]
408
+
409
+ # Update vocab & retrain
410
+ model.build_vocab(newCorpus, update=True)
411
+ model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)
412
+
413
+ # Save updated model
414
+ if saveFolder:
415
+ os.makedirs(saveFolder, exist_ok=True)
416
+ name = os.path.basename(modelPath).replace(".model", "_updated.model")
417
+ model.save(f"{saveFolder}/{name}")
418
+ print(f"🔁 Model updated and saved to {saveFolder}/{name}")
419
+ else:
420
+ model.save(modelPath)
421
+ print(f"🔁 Model updated and overwritten at {modelPath}")
422
+
423
+ # adding our model into spacy
424
+ # this deals with command line; but instead of using it, we write python script to run command line
425
+ def loadWordVec(self,modelName,wordVec):
426
+ # modelName is the name you want to save into spacy
427
+ # wordVec is the trained word2vec in txt format
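+ # equivalent CLI call (spaCy v2-style command): python -m spacy init-model en <modelName> --vectors-loc <wordVec>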
428
+ subprocess.run([sys.executable,
429
+ "-m",
430
+ "spacy",
431
+ "init-model",
432
+ "en",
433
+ modelName, # this modelName comes from the saved modelName of function trainWord2Vec
434
+ "--vectors-loc",
435
+ wordVec])
436
+ print("done")
README.md CHANGED
@@ -1,74 +1,80 @@
1
  ---
2
- setup: bash setup.sh
3
- title: MtDNALocation
4
- emoji: 📊
5
  colorFrom: blue
6
- colorTo: purple
7
  sdk: gradio
8
- sdk_version: 5.25.0
9
  app_file: app.py
10
  pinned: false
11
- license: mit
12
- short_description: mtDNA Location Classification tool
13
  ---
14
 
15
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
16
-
17
- # Installation
18
- ## Set up environments and start GUI:
19
- ```bash
20
- git clone https://github.com/Open-Access-Bio-Data/mtDNA-Location-Classifier.git
21
- ```
22
- If installed using mamba (recommended):
23
- ```bash
24
- mamba env create -f env.yaml
25
- ```
26
- If not, check current python version in terminal and make sure that it is python version 3.10, then run
27
- ```bash
28
- pip install -r requirements.txt
29
- ```
30
- To start the programme, run this in terminal:
31
- ```bash
32
- python app.py
33
- ```
34
- Then follow its instructions
35
- # Descriptions:
36
- mtDNA-Location-Classifier uses [Gradio](https://www.gradio.app/docs) to handle the front-end interactions.
37
-
38
- The programme takes **an accession number** (an NCBI GenBank/nuccore identifier) as input and returns the likely origin of the sequence through `classify_sample_location_cached(accession=accession_number)`. This function wraps around a pipeline that proceeds as follow:
39
- ## Steps 1-3: Check and retrieve base materials: the Pubmed ID, isolate, DOI and text:
40
- - Which are respectively:
41
-
42
- ### Step 1: pubmed_ids and isolates
43
- `get_info_from accession(accession=accession_number)`
44
- - Current input is a string of `accession_number` and output are two lists, one of PUBMED IDs and one of isolate(s).
45
- - Which look through the metadata of the sequence with `accession_number` and extract `PUBMED ID` if available or `isolate` information.
46
- - The presence of PUBMED ID is currently important for the retrieval of texts in the next steps, which are eventually used by method 4.1 (question-answering) and 4.2 (infer from haplogroup)
47
- - Some sequences might not have `isolate` info but its availibity is optional. (as they might be used by method 4.1 and 4.2 as alternative)
48
-
49
- ### Step 2: dois
50
- `get_doi_from_pubmed_id(pubmed_ids = pubmed_ids)`
51
- - Input is a list of PUBMED IDs of the sequence with `accession_number` (retrieved from previous step) and output is a dictionary with keys = PUBMED IDs and values = according DOIs.
52
- - The pubmed_ids are retrieved from the `get_info_from accession(accession=accession_number)` mentioned above.
53
- - The DOIs will be passed down to dependent functions to extract texts of publications to pass on to method 4.1 and 4.2
54
-
55
- ### Step 3: get text
56
- `get_paper_text(dois = dois)`
57
- - Input is currently a list of dois retrieved from previous step and output is a dictionary with keys = sources (doi links or file type) (We might improve this to have other inputs in addition to just doi links - maybe files); values = texts obtained from sources.
58
- - Output of this step is crucial to method 4.1 and 4.2
59
-
60
-
61
- ## Step 4: Prediction of origin:
62
- ### Method 4.0:
63
- - The first method attempts to directly look in the metadata for information that was submitted along with the sequence. Thus, it does not require availability of PUBMED IDs/DOIs or isolates.
64
- - However, this information is not always available in the submission. Thus, we use other methods (4.1,4.2) to retrieve publications through which we can extract the information of the source of mtDNA
65
-
66
- ### Method 4.1:
67
- -
68
-
69
- ### Method 4.2:
70
- -
71
-
72
- ## More in the package
73
- ### extraction of text from HTML
74
- ### extraction of text from PDF
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: mtDNA Location Classifier 🧬
3
+ emoji: 🧬
 
4
  colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.38.2
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
 
12
+ # mtDNA Location Classifier
13
+
14
+ ## Publish faster. Analyze smarter.
15
+
16
+ Are you dealing with **incomplete mtDNA metadata** (like country, ethnicity, sample type)?
17
+ This tool helps researchers like you generate **clean, structured labels** — ready to use for your paper.
18
+
19
+ ---
20
+
21
+ ## What You’ll Get:
22
+
23
+ - Inference from sequence ID alone
24
+ - Handles hard edge cases (e.g., no clear PubMed ID, Direct Submission, etc.)
25
+ - Clear sample type, country, and more (ethnicity, phenotype, etc.)
26
+ - Excel export with citations
27
+ - Feedback-based refund policy
28
+
29
+ ---
30
+
31
+ ## Free Tier
32
+
33
+ - 30 free samples, no email needed
34
+ - +20 bonus samples + Excel file when you enter your email
35
+ - Don’t like the result? Tell us why, and we won’t count the bad ones (email required)
36
+
37
+ ---
38
+
39
+ ## Pricing: Pay As You Go (DIY)
40
+
41
+ | Case Type | Price/Sample | Output |
42
+ |-------------|--------------|----------------------------------------|
43
+ | Normal | $0.10 | Sample Type + Country |
44
+ | Edge | $1.00 | Sample Type + Country |
45
+ | Niche | $2.00 | Sample Type + Country + 1 Custom Label |
46
+
47
+ ---
48
+
49
+ ## Batch Discount (1000+ Samples)
50
+
51
+ - **Normal Output** → $100 total ($0.10/sample)
52
+ Unsatisfied samples? We’ll refund them.
53
+
54
+ - **Niche Output** → $500 total ($0.50/sample)
55
+ Includes an extra label like ethnicity or phenotype.
56
+
57
+ ---
58
+
59
+ ## Early User Bonus (Limited!)
60
+
61
+ Are you one of our **first 10 paying users**?
62
+ Just type `early_user` in your email.
63
+
64
+ You'll get **20% lifetime discount** on every plan — forever.
65
+ We’ll apply this automatically so you don’t have to calculate anything.
66
+
67
+ ## Our Mission in Building This
68
+
69
+ Provide clean, high-quality, open-access biological datasets that save researchers time and improve scientific reproducibility.
70
+ Build the world's cleanest, AI-driven, open metadata source for biological research.
71
+
72
+ ---
73
+
74
+ ## Try It Now
75
+
76
+ Paste your sequence ID on our demo:
77
+ [Try the Classifier](https://huggingface.co/spaces/VyLala/mtDNALocation)
78
+
79
+ Need help or bulk analysis?
80
README_OLD_VERSION.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ setup: bash setup.sh
3
+ title: MtDNALocation
4
+ emoji: 📊
5
+ colorFrom: blue
6
+ colorTo: purple
7
+ sdk: gradio
8
+ sdk_version: 5.25.0
9
+ app_file: app.py
10
+ pinned: false
11
+ license: mit
12
+ short_description: mtDNA Location Classification tool
13
+ ---
14
+
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
16
+
17
+ # Installation
18
+ ## Set up environments and start GUI:
19
+ ```bash
20
+ git clone https://github.com/Open-Access-Bio-Data/mtDNA-Location-Classifier.git
21
+ ```
22
+ If installed using mamba (recommended):
23
+ ```bash
24
+ mamba env create -f env.yaml
25
+ ```
26
+ If not, check the current Python version in your terminal and make sure it is Python 3.10, then run
27
+ ```bash
28
+ pip install -r requirements.txt
29
+ ```
30
+ To start the programme, run this in terminal:
31
+ ```bash
32
+ python app.py
33
+ ```
34
+ Then follow its instructions
35
+ # Descriptions:
36
+ mtDNA-Location-Classifier uses [Gradio](https://www.gradio.app/docs) to handle the front-end interactions.
37
+
38
+ The programme takes **an accession number** (an NCBI GenBank/nuccore identifier) as input and returns the likely origin of the sequence through `classify_sample_location_cached(accession=accession_number)`. This function wraps around a pipeline that proceeds as follows:
39
+ ## Steps 1-3: Check and retrieve base materials: the Pubmed ID, isolate, DOI and text:
40
+ - These are, respectively:
41
+
42
+ ### Step 1: pubmed_ids and isolates
43
+ `get_info_from_accession(accession=accession_number)`
44
+ - The current input is a string `accession_number`, and the output is two lists: one of PUBMED IDs and one of isolate(s).
45
+ - This step looks through the metadata of the sequence with `accession_number` and extracts the `PUBMED ID` (if available) and `isolate` information.
46
+ - The presence of a PUBMED ID is currently important for the retrieval of texts in the next steps, which are eventually used by method 4.1 (question-answering) and method 4.2 (infer from haplogroup).
47
+ - Some sequences might not have `isolate` info, but its availability is optional (it may be used by methods 4.1 and 4.2 as an alternative).
48
+
49
+ ### Step 2: dois
50
+ `get_doi_from_pubmed_id(pubmed_ids = pubmed_ids)`
51
+ - Input is a list of PUBMED IDs for the sequence with `accession_number` (retrieved from the previous step), and output is a dictionary with keys = PUBMED IDs and values = the corresponding DOIs.
52
+ - The pubmed_ids are retrieved from the `get_info_from_accession(accession=accession_number)` call mentioned above.
53
+ - The DOIs are passed down to dependent functions to extract the texts of publications, which are then passed on to methods 4.1 and 4.2.
54
+
55
+ ### Step 3: get text
56
+ `get_paper_text(dois = dois)`
57
+ - Input is currently a list of DOIs retrieved from the previous step, and output is a dictionary with keys = sources (DOI links or file type) and values = texts obtained from those sources. (We might improve this to accept other inputs besides DOI links, such as files.)
58
+ - The output of this step is crucial to methods 4.1 and 4.2 (a minimal sketch of how steps 1-3 chain together is shown below).
59
+
60
+
61
+ ## Step 4: Prediction of origin:
62
+ ### Method 4.0:
63
+ - The first method attempts to directly look in the metadata for information that was submitted along with the sequence. Thus, it does not require availability of PUBMED IDs/DOIs or isolates.
64
+ - However, this information is not always available in the submission. Thus, we use other methods (4.1,4.2) to retrieve publications through which we can extract the information of the source of mtDNA
65
+
66
+ ### Method 4.1:
67
+ -
68
+
69
+ ### Method 4.2:
70
+ -
71
+
72
+ ## More in the package
73
+ ### extraction of text from HTML
74
+ ### extraction of text from PDF
app.py CHANGED
@@ -1,697 +1,793 @@
1
- import gradio as gr
2
- import mtdna_backend
3
- import json
4
- import data_preprocess, model, pipeline
5
- import os
6
- import hashlib
7
- import threading
8
- # Gradio UI
9
- #stop_flag = gr.State(value=False)
10
- class StopFlag:
11
- def __init__(self):
12
- self.value = False
13
- global_stop_flag = StopFlag() # Shared between run + stop
14
-
15
- with gr.Blocks() as interface:
16
- gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
17
-
18
- #inputMode = gr.Radio(choices=["Single Accession", "Batch Input"], value="Single Accession", label="Choose Input Mode")
19
- user_email = gr.Textbox(label="📧 Your email (used to track free quota)")
20
- usage_display = gr.Markdown("", visible=False)
21
-
22
- # with gr.Group() as single_input_group:
23
- # single_accession = gr.Textbox(label="Enter Single Accession (e.g., KU131308)")
24
-
25
- # with gr.Group(visible=False) as batch_input_group:
26
- # raw_text = gr.Textbox(label="🧬 Paste Accession Numbers (e.g., MF362736.1,MF362738.1,KU131308,MW291678)")
27
- # resume_file = gr.File(label="🗃️ Previously saved Excel output (optional)", file_types=[".xlsx"], interactive=True)
28
- # gr.HTML("""<a href="https://drive.google.com/file/d/1t-TFeIsGVu5Jh3CUZS-VE9jQWzNFCs_c/view?usp=sharing" download target="_blank">Download Example CSV Format</a>""")
29
- # gr.HTML("""<a href="https://docs.google.com/spreadsheets/d/1lKqPp17EfHsshJGZRWEpcNOZlGo3F5qU/edit?usp=sharing&ouid=112390323314156876153&rtpof=true&sd=true" download target="_blank">Download Example Excel Format</a>""")
30
- # file_upload = gr.File(label="📁 Or Upload CSV/Excel File", file_types=[".csv", ".xlsx"], interactive=True, elem_id="file-upload-box")
31
- raw_text = gr.Textbox(label="🧚 Input Accession Number(s) (single (KU131308) or comma-separated (e.g., MF362736.1,MF362738.1,KU131308,MW291678))")
32
- #resume_file = gr.File(label="🗃️ Previously saved Excel output (optional)", file_types=[".xlsx"], interactive=True)
33
- gr.HTML("""<a href="https://docs.google.com/spreadsheets/d/1lKqPp17EfHsshJGZRWEpcNOZlGo3F5qU/edit?usp=sharing" download target="_blank">Download Example Excel Format</a>""")
34
- file_upload = gr.File(label="📁 Or Upload CSV/Excel File", file_types=[".csv", ".xlsx"], interactive=True)
35
-
36
- with gr.Row():
37
- run_button = gr.Button("🔍 Submit and Classify")
38
- stop_button = gr.Button(" Stop Batch", visible=True)
39
- reset_button = gr.Button("🔄 Reset")
40
-
41
- status = gr.Markdown(visible=False)
42
-
43
- with gr.Group(visible=False) as results_group:
44
- # with gr.Accordion("Open to See the Result", open=False) as results:
45
- # with gr.Row():
46
- # output_summary = gr.Markdown(elem_id="output-summary")
47
- # output_flag = gr.Markdown(elem_id="output-flag")
48
-
49
- # gr.Markdown("---")
50
-
51
- with gr.Accordion("Open to See the Output Table", open=False) as table_accordion:
52
- output_table = gr.HTML(render=True)
53
-
54
- #with gr.Row():
55
- #output_type = gr.Dropdown(choices=["Excel", "JSON", "TXT"], label="Select Output Format", value="Excel")
56
- #download_button = gr.Button("⬇️ Download Output")
57
- #download_file = gr.File(label="Download File Here",visible=False)
58
- download_file = gr.File(label="Download File Here", visible=False, interactive=True)
59
- progress_box = gr.Textbox(label="Live Processing Log", lines=20, interactive=False)
60
-
61
- gr.Markdown("---")
62
-
63
- gr.Markdown("### 💬 Feedback (required)")
64
- q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
65
- q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
66
- contact = gr.Textbox(label="📧 Your email or institution (optional)")
67
- submit_feedback = gr.Button("✅ Submit Feedback")
68
- feedback_status = gr.Markdown()
69
-
70
- # Functions
71
- # def toggle_input_mode(mode):
72
- # if mode == "Single Accession":
73
- # return gr.update(visible=True), gr.update(visible=False)
74
- # else:
75
- # return gr.update(visible=False), gr.update(visible=True)
76
-
77
- def classify_with_loading():
78
- return gr.update(value=" Please wait... processing...",visible=True) # Show processing message
79
-
80
- # def classify_dynamic(single_accession, file, text, resume, email, mode):
81
- # if mode == "Single Accession":
82
- # return classify_main(single_accession) + (gr.update(visible=False),)
83
- # else:
84
- # #return summarize_batch(file, text) + (gr.update(visible=False),) # Hide processing message
85
- # return classify_mulAcc(file, text, resume) + (gr.update(visible=False),) # Hide processing message
86
- # Logging helpers defined early to avoid NameError
87
-
88
-
89
- # def classify_dynamic(single_accession, file, text, resume, email, mode):
90
- # if mode == "Single Accession":
91
- # return classify_main(single_accession) + (gr.update(value="", visible=False),)
92
- # else:
93
- # return classify_mulAcc(file, text, resume, email, log_callback=real_time_logger, log_collector=log_collector)
94
-
95
- # for single accession
96
- # def classify_main(accession):
97
- # #table, summary, labelAncient_Modern, explain_label = mtdna_backend.summarize_results(accession)
98
- # table = mtdna_backend.summarize_results(accession)
99
- # #flag_output = f"### 🏺 Ancient/Modern Flag\n**{labelAncient_Modern}**\n\n_Explanation:_ {explain_label}"
100
- # return (
101
- # #table,
102
- # make_html_table(table),
103
- # # summary,
104
- # # flag_output,
105
- # gr.update(visible=True),
106
- # gr.update(visible=False),
107
- # gr.update(visible=False)
108
- # )
109
-
110
- #stop_flag = gr.State(value=False)
111
- #stop_flag = StopFlag()
112
-
113
- # def stop_batch(stop_flag):
114
- # stop_flag.value = True
115
- # return gr.update(value="❌ Stopping...", visible=True), stop_flag
116
- def stop_batch():
117
- global_stop_flag.value = True
118
- return gr.update(value="❌ Stopping...", visible=True)
119
-
120
- # def threaded_batch_runner(file, text, email):
121
- # global_stop_flag.value = False
122
- # log_lines = []
123
-
124
- # def update_log(line):
125
- # log_lines.append(line)
126
- # yield (
127
- # gr.update(visible=False), # output_table (not yet)
128
- # gr.update(visible=False), # results_group
129
- # gr.update(visible=False), # download_file
130
- # gr.update(visible=False), # usage_display
131
- # gr.update(value=" Still processing...", visible=True), # status
132
- # gr.update(value="\n".join(log_lines)) # progress_box
133
- # )
134
-
135
- # # Start a dummy update to say "Starting..."
136
- # yield from update_log("🚀 Starting batch processing...")
137
-
138
- # rows, file_path, count, final_log, warning = mtdna_backend.summarize_batch(
139
- # file=file,
140
- # raw_text=text,
141
- # resume_file=None,
142
- # user_email=email,
143
- # stop_flag=global_stop_flag,
144
- # yield_callback=lambda line: (yield from update_log(line))
145
- # )
146
-
147
- # html = make_html_table(rows)
148
- # file_update = gr.update(value=file_path, visible=True) if os.path.exists(file_path) else gr.update(visible=False)
149
- # usage_or_warning_text = f"**{count}** samples used by this email." if email.strip() else warning
150
-
151
- # yield (
152
- # html,
153
- # gr.update(visible=True), # results_group
154
- # file_update, # download_file
155
- # gr.update(value=usage_or_warning_text, visible=True),
156
- # gr.update(value="✅ Done", visible=True),
157
- # gr.update(value=final_log)
158
- # )
159
-
160
- # def threaded_batch_runner(file=None, text="", email=""):
161
- # print("📧 EMAIL RECEIVED:", email)
162
- # import tempfile
163
- # from mtdna_backend import (
164
- # extract_accessions_from_input,
165
- # summarize_results,
166
- # save_to_excel,
167
- # hash_user_id,
168
- # increment_usage,
169
- # )
170
- # import os
171
-
172
- # global_stop_flag.value = False # reset stop flag
173
-
174
- # tmp_dir = tempfile.mkdtemp()
175
- # output_file_path = os.path.join(tmp_dir, "batch_output_live.xlsx")
176
- # limited_acc = 50 + (10 if email.strip() else 0)
177
-
178
- # # Step 1: Parse input
179
- # accessions, error = extract_accessions_from_input(file, text)
180
- # print(accessions)
181
- # if error:
182
- # yield (
183
- # "", # output_table
184
- # gr.update(visible=False), # results_group
185
- # gr.update(visible=False), # download_file
186
- # "", # usage_display
187
- # "❌ Error", # status
188
- # str(error) # progress_box
189
- # )
190
- # return
191
-
192
- # total = len(accessions)
193
- # if total > limited_acc:
194
- # accessions = accessions[:limited_acc]
195
- # warning = f"⚠️ Only processing first {limited_acc} accessions."
196
- # else:
197
- # warning = f"✅ All {total} accessions will be processed."
198
-
199
- # all_rows = []
200
- # processed_accessions = 0 # ✅ tracks how many accessions were processed
201
- # email_tracked = False
202
- # log_lines = []
203
-
204
- # # Step 2: Loop through accessions
205
- # for i, acc in enumerate(accessions):
206
- # if global_stop_flag.value:
207
- # log_lines.append(f"🛑 Stopped at {acc} ({i+1}/{total})")
208
- # usage_text = ""
209
- # if email.strip() and not email_tracked:
210
- # # user_hash = hash_user_id(email)
211
- # # usage_count = increment_usage(user_hash, len(all_rows))
212
- # print("print(processed_accessions at stop) ",processed_accessions)
213
- # usage_count = increment_usage(email, processed_accessions)
214
- # email_tracked = True
215
- # usage_text = f"**{usage_count}** samples used by this email. Ten more samples are added first (you now have 60 limited accessions), then wait we will contact you via this email."
216
- # else:
217
- # usage_text = f"The limited accession is 50. The user has used {processed_accessions}, and only {50-processed_accessions} left."
218
- # yield (
219
- # make_html_table(all_rows),
220
- # gr.update(visible=True),
221
- # gr.update(value=output_file_path, visible=True),
222
- # gr.update(value=usage_text, visible=True),
223
- # "🛑 Stopped",
224
- # "\n".join(log_lines)
225
- # )
226
- # return
227
-
228
- # log_lines.append(f"[{i+1}/{total}] Processing {acc}")
229
- # yield (
230
- # make_html_table(all_rows),
231
- # gr.update(visible=True),
232
- # gr.update(visible=False),
233
- # "",
234
- # "⏳ Processing...",
235
- # "\n".join(log_lines)
236
- # )
237
-
238
- # try:
239
- # print(acc)
240
- # rows = summarize_results(acc)
241
- # all_rows.extend(rows)
242
- # processed_accessions += 1 # ✅ count only successful accessions
243
- # save_to_excel(all_rows, "", "", output_file_path, is_resume=False)
244
- # log_lines.append(f"✅ Processed {acc} ({i+1}/{total})")
245
- # except Exception as e:
246
- # log_lines.append(f"❌ Failed to process {acc}: {e}")
247
-
248
- # yield (
249
- # make_html_table(all_rows),
250
- # gr.update(visible=True),
251
- # gr.update(visible=False),
252
- # "",
253
- # "⏳ Processing...",
254
- # "\n".join(log_lines)
255
- # )
256
-
257
- # # Final update
258
- # usage_text = ""
259
-
260
- # if email.strip() and not email_tracked:
261
- # # user_hash = hash_user_id(email)
262
- # # usage_count = increment_usage(user_hash, len(all_rows))
263
- # print("print(processed_accessions final) ",processed_accessions)
264
- # usage_count = increment_usage(email, processed_accessions)
265
- # usage_text = f"**{usage_count}** samples used by this email. Ten more samples are added first (you now have 60 limited accessions), then wait we will contact you via this email."
266
- # elif not email.strip():
267
- # usage_text = f"The limited accession is 50. The user has used {processed_accessions}, and only {50-processed_accessions} left."
268
- # yield (
269
- # make_html_table(all_rows),
270
- # gr.update(visible=True),
271
- # gr.update(value=output_file_path, visible=True),
272
- # gr.update(value=usage_text, visible=True),
273
- # "✅ Done",
274
- # "\n".join(log_lines)
275
- # )
276
- def threaded_batch_runner(file=None, text="", email=""):
277
- print("📧 EMAIL RECEIVED:", repr(email))
278
- import tempfile
279
- from mtdna_backend import (
280
- extract_accessions_from_input,
281
- summarize_results,
282
- save_to_excel,
283
- increment_usage,
284
- )
285
- import os
286
-
287
- global_stop_flag.value = False # reset stop flag
288
-
289
- tmp_dir = tempfile.mkdtemp()
290
- output_file_path = os.path.join(tmp_dir, "batch_output_live.xlsx")
291
- limited_acc = 30 + (20 if email.strip() else 0)
292
- # Step 1: Parse input
293
- accessions, error = extract_accessions_from_input(file, text)
294
- print("🧪 Accessions received:", accessions)
295
- if error:
296
- yield (
297
- "", # output_table
298
- gr.update(visible=False), # results_group
299
- gr.update(visible=False), # download_file
300
- "", # usage_display
301
- "❌ Error", # status
302
- str(error) # progress_box
303
- )
304
- return
305
-
306
- total = len(accessions)
307
- if total > limited_acc:
308
- accessions = accessions[:limited_acc]
309
- warning = f"⚠️ Only processing first {limited_acc} accessions."
310
- else:
311
- warning = f" All {total} accessions will be processed."
312
-
313
- all_rows = []
314
- processed_accessions = 0 # ✅ track successful accessions
315
- email_tracked = False
316
- log_lines = []
317
- if not email.strip():
318
- output_file_path = None#"Write your email so that you can download the outputs."
319
- log_lines.append("📥 Provide your email to receive a downloadable Excel report and get 20 more free queries.")
320
- if email.strip():
321
- usage_count = increment_usage(email, processed_accessions)
322
- if int(usage_count) > 50:
323
- log_lines.append("❌ You have reached your quota. Please contact us to unlock more.")
324
-
325
- # Minimal blank yield to trigger UI rendering
326
- yield (
327
- make_html_table([]),
328
- gr.update(visible=True),
329
- gr.update(visible=False),
330
- gr.update(value="", visible=True),
331
- "⛔️ Quota limit",
332
- "⛔️ Quota limit"
333
- )
334
-
335
- # Actual warning frame
336
- yield (
337
- make_html_table([]),
338
- gr.update(visible=False),
339
- gr.update(visible=False),
340
- gr.update(value="❌ You have reached your quota. Please contact us to unlock more.", visible=True),
341
- "❌ Quota Exceeded",
342
- "\n".join(log_lines)
343
- )
344
- return
345
-
346
-
347
- # Step 2: Loop through accessions
348
- for i, acc in enumerate(accessions):
349
- if global_stop_flag.value:
350
- log_lines.append(f"🛑 Stopped at {acc} ({i+1}/{total})")
351
- usage_text = ""
352
-
353
- if email.strip() and not email_tracked:
354
- print(f"🧪 increment_usage at STOP: {email=} {processed_accessions=}")
355
- usage_count = increment_usage(email, processed_accessions)
356
- email_tracked = True
357
- usage_text = f"**{usage_count}**/50 free samples used by this email."
358
- #Ten more samples are added first (you now have 60 limited accessions), then wait we will contact you via this email."
359
- else:
360
- usage_text = f"The limited accession is 30. The user has used {processed_accessions}, and only {30 - processed_accessions} left."
361
-
362
- yield (
363
- make_html_table(all_rows),
364
- gr.update(visible=True),
365
- #gr.update(value=output_file_path, visible=True),
366
- gr.update(value=output_file_path, visible=bool(output_file_path)),
367
- gr.update(value=usage_text, visible=True),
368
- "🛑 Stopped",
369
- "\n".join(log_lines)
370
- )
371
- return
372
-
373
- log_lines.append(f"[{i+1}/{total}] Processing {acc}")
374
- yield (
375
- make_html_table(all_rows),
376
- gr.update(visible=True),
377
- gr.update(visible=False),
378
- "",
379
- " Processing...",
380
- "\n".join(log_lines)
381
- )
382
-
383
- try:
384
- print("📄 Processing accession:", acc)
385
- rows = summarize_results(acc)
386
- all_rows.extend(rows)
387
- processed_accessions += 1 # ✅ only count success
388
- if email.strip():
389
- save_to_excel(all_rows, "", "", output_file_path, is_resume=False)
390
- log_lines.append(f"✅ Processed {acc} ({i+1}/{total})")
391
- except Exception as e:
392
- log_lines.append(f"❌ Failed to process {acc}: {e}")
393
-
394
- yield (
395
- make_html_table(all_rows),
396
- gr.update(visible=True),
397
- gr.update(visible=False),
398
- "",
399
- "⏳ Processing...",
400
- "\n".join(log_lines)
401
- )
402
-
403
- # Step 3: Final usage update
404
- usage_text = ""
405
- if email.strip() and not email_tracked:
406
- print(f"🧪 increment_usage at END: {email=} {processed_accessions=}")
407
- usage_count = increment_usage(email, processed_accessions)
408
- email_tracked = True
409
- usage_text = f"**{usage_count}**/50 free samples used by this email."
410
- #Ten more samples are added first (you now have 60 limited accessions), then wait we will contact you via this email."
411
- elif not email.strip():
412
- usage_text = f"The limited accession is 30. The user has used {processed_accessions}, and only {30 - processed_accessions} left."
413
-
414
- yield (
415
- make_html_table(all_rows),
416
- gr.update(visible=True),
417
- #gr.update(value=output_file_path, visible=True),
418
- gr.update(value=output_file_path, visible=bool(output_file_path)),
419
- gr.update(value=usage_text, visible=True),
420
- "✅ Done",
421
- "\n".join(log_lines)
422
- )
423
-
424
-
425
- # def threaded_batch_runner(file=None, text="", email=""):
426
- # global_stop_flag.value = False
427
-
428
- # # Dummy test output that matches expected schema
429
- # return (
430
- # "<div>✅ Dummy output table</div>", # HTML string
431
- # gr.update(visible=True), # Group visibility
432
- # gr.update(visible=False), # Download file
433
- # "**0** samples used.", # Markdown
434
- # "✅ Done", # Status string
435
- # "Processing finished." # Progress string
436
- # )
437
-
438
-
439
- # def classify_mulAcc(file, text, resume, email, log_callback=None, log_collector=None):
440
- # stop_flag.value = False
441
- # return threaded_batch_runner(file, text, resume, email, status, stop_flag, log_callback=log_callback, log_collector=log_collector)
442
-
443
-
444
- def make_html_table(rows):
445
- # html = """
446
- # <div style='overflow-x: auto; padding: 10px;'>
447
- # <div style='max-height: 400px; overflow-y: auto; border: 1px solid #444; border-radius: 8px;'>
448
- # <table style='width:100%; border-collapse: collapse; table-layout: auto; font-size: 14px; color: #f1f1f1; background-color: #1e1e1e;'>
449
- # <thead style='position: sticky; top: 0; background-color: #2c2c2c; z-index: 1;'>
450
- # <tr>
451
- # """
452
- html = """
453
- <div style='overflow-x: auto; padding: 10px;'>
454
- <div style='max-height: 400px; overflow-y: auto; border: 1px solid #ccc; border-radius: 8px;'>
455
- <table style='width:100%; border-collapse: collapse; table-layout: auto; font-size: 14px; color: inherit; background-color: inherit;'>
456
- """
457
-
458
- headers = ["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Time cost"]
459
- html += "".join(
460
- f"<th style='padding: 10px; border: 1px solid #555; text-align: left; white-space: nowrap;'>{h}</th>"
461
- for h in headers
462
- )
463
- html += "</tr></thead><tbody>"
464
-
465
- for row in rows:
466
- html += "<tr>"
467
- for i, col in enumerate(row):
468
- header = headers[i]
469
- style = "padding: 10px; border: 1px solid #555; vertical-align: top;"
470
-
471
- # For specific columns like Haplogroup, force nowrap
472
- if header in ["Country Explanation", "Sample Type Explanation"]:
473
- style += " max-width: 400px; word-wrap: break-word; white-space: normal;"
474
- elif header in ["Sample ID", "Predicted Country", "Predicted Sample Type", "Time cost"]:
475
- style += " white-space: nowrap; text-overflow: ellipsis; max-width: 200px; overflow: hidden;"
476
-
477
- # if header == "Sources" and isinstance(col, str) and col.strip().lower().startswith("http"):
478
- # col = f"<a href='{col}' target='_blank' style='color: #4ea1f3; text-decoration: underline;'>{col}</a>"
479
-
480
- #html += f"<td style='{style}'>{col}</td>"
481
- if header == "Sources" and isinstance(col, str):
482
- links = [f"<a href='{url.strip()}' target='_blank' style='color: #4ea1f3; text-decoration: underline;'>{url.strip()}</a>" for url in col.strip().split("\n") if url.strip()]
483
- col = "- "+"<br>- ".join(links)
484
- elif isinstance(col, str):
485
- # lines = []
486
- # for line in col.split("\n"):
487
- # line = line.strip()
488
- # if not line:
489
- # continue
490
- # if line.lower().startswith("rag_llm-"):
491
- # content = line[len("rag_llm-"):].strip()
492
- # line = f"{content} (Method: RAG_LLM)"
493
- # lines.append(f"- {line}")
494
- col = col.replace("\n", "<br>")
495
- #col = col.replace("\t", "&nbsp;&nbsp;&nbsp;&nbsp;")
496
- #col = "<br>".join(lines)
497
-
498
- html += f"<td style='{style}'>{col}</td>"
499
- html += "</tr>"
500
-
501
- html += "</tbody></table></div></div>"
502
- return html
503
-
504
-
505
- # def reset_fields():
506
- # global_stop_flag.value = False # 💡 Add this to reset the flag
507
- # return (
508
- # #gr.update(value=""), # single_accession
509
- # gr.update(value=""), # raw_text
510
- # gr.update(value=None), # file_upload
511
- # #gr.update(value=None), # resume_file
512
- # #gr.update(value="Single Accession"), # inputMode
513
- # gr.update(value=[], visible=True), # output_table
514
- # # gr.update(value="", visible=True), # output_summary
515
- # # gr.update(value="", visible=True), # output_flag
516
- # gr.update(visible=False), # status
517
- # gr.update(visible=False), # results_group
518
- # gr.update(value="", visible=False), # usage_display
519
- # gr.update(value="", visible=False), # progress_box
520
- # )
521
- def reset_fields():
522
- global_stop_flag.value = False # Reset the stop flag
523
-
524
- return (
525
- gr.update(value=""), # raw_text
526
- gr.update(value=None), # file_upload
527
- gr.update(value=[], visible=True), # output_table
528
- gr.update(value="", visible=True), # status — reset and make visible again
529
- gr.update(visible=False), # results_group
530
- gr.update(value="", visible=True), # usage_display — reset and make visible again
531
- gr.update(value="", visible=True), # progress_box reset AND visible!
532
- )
533
- #inputMode.change(fn=toggle_input_mode, inputs=inputMode, outputs=[single_input_group, batch_input_group])
534
- #run_button.click(fn=classify_with_loading, inputs=[], outputs=[status])
535
- # run_button.click(
536
- # fn=classify_dynamic,
537
- # inputs=[single_accession, file_upload, raw_text, resume_file,user_email,inputMode],
538
- # outputs=[output_table,
539
- # #output_summary, output_flag,
540
- # results_group, download_file, usage_display,status, progress_box]
541
- # )
542
-
543
- # run_button.click(
544
- # fn=threaded_batch_runner,
545
- # #inputs=[file_upload, raw_text, resume_file, user_email],
546
- # inputs=[file_upload, raw_text, user_email],
547
- # outputs=[output_table, results_group, download_file, usage_display, status, progress_box]
548
- # )
549
- # run_button.click(
550
- # fn=threaded_batch_runner,
551
- # inputs=[file_upload, raw_text, user_email],
552
- # outputs=[output_table, results_group, download_file, usage_display, status, progress_box],
553
- # every=0.5 # <-- this tells Gradio to expect streaming
554
- # )
555
- # output_table = gr.HTML()
556
- # results_group = gr.Group(visible=False)
557
- # download_file = gr.File(visible=False)
558
- # usage_display = gr.Markdown(visible=False)
559
- # status = gr.Markdown(visible=False)
560
- # progress_box = gr.Textbox(visible=False)
561
-
562
- # run_button.click(
563
- # fn=threaded_batch_runner,
564
- # inputs=[file_upload, raw_text, user_email],
565
- # outputs=[output_table, results_group, download_file, usage_display, status, progress_box],
566
- # every=0.5, # streaming enabled
567
- # show_progress="full"
568
- # )
569
-
570
- # interface.stream(
571
- # fn=threaded_batch_runner,
572
- # inputs=[file_upload, raw_text, user_email],
573
- # outputs=[output_table, results_group, download_file, usage_display, status, progress_box],
574
- # trigger=run_button,
575
- # every=0.5,
576
- # show_progress="full",
577
- # )
578
- interface.queue() # No arguments here!
579
-
580
- run_button.click(
581
- fn=threaded_batch_runner,
582
- inputs=[file_upload, raw_text, user_email],
583
- outputs=[output_table, results_group, download_file, usage_display, status, progress_box],
584
- concurrency_limit=1, # correct in Gradio 5.x
585
- queue=True, # ensure the queue is used
586
- #every=0.5
587
- )
588
-
589
-
590
-
591
-
592
- stop_button.click(fn=stop_batch, inputs=[], outputs=[status])
593
-
594
- # reset_button.click(
595
- # #fn=reset_fields,
596
- # fn=lambda: (
597
- # gr.update(value=""), gr.update(value=""), gr.update(value=None), gr.update(value=None), gr.update(value="Single Accession"),
598
- # gr.update(value=[], visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False), gr.update(value="", visible=False)
599
- # ),
600
- # inputs=[],
601
- # outputs=[
602
- # single_accession, raw_text, file_upload, resume_file,inputMode,
603
- # output_table,# output_summary, output_flag,
604
- # status, results_group, usage_display, progress_box
605
- # ]
606
- # )
607
- #stop_button.click(fn=lambda sf: (gr.update(value="❌ Stopping...", visible=True), setattr(sf, "value", True) or sf), inputs=[gr.State(stop_flag)], outputs=[status, gr.State(stop_flag)])
608
-
609
- reset_button.click(
610
- fn=reset_fields,
611
- inputs=[],
612
- #outputs=[raw_text, file_upload, resume_file, output_table, status, results_group, usage_display, progress_box]
613
- outputs=[raw_text, file_upload, output_table, status, results_group, usage_display, progress_box]
614
- )
615
-
616
- # download_button.click(
617
- # fn=mtdna_backend.save_batch_output,
618
- # #inputs=[output_table, output_summary, output_flag, output_type],
619
- # inputs=[output_table, output_type],
620
- # outputs=[download_file])
621
-
622
- # submit_feedback.click(
623
- # fn=mtdna_backend.store_feedback_to_google_sheets,
624
- # inputs=[single_accession, q1, q2, contact], outputs=feedback_status
625
- # )
626
- submit_feedback.click(
627
- fn=mtdna_backend.store_feedback_to_google_sheets,
628
- inputs=[raw_text, q1, q2, contact],
629
- outputs=[feedback_status]
630
- )
631
- gr.HTML("""
632
- <style>
633
- body, html {
634
- background-color: #121212 !important;
635
- color: #ffffff !important;
636
- }
637
-
638
- .gradio-container, .gr-block, .gr-box, textarea, input, select, .prose, .prose * {
639
- background-color: #1e1e1e !important;
640
- color: #ffffff !important;
641
- border-color: #333 !important;
642
- }
643
-
644
- textarea::placeholder,
645
- input::placeholder {
646
- color: #aaa !important;
647
- }
648
-
649
- button {
650
- background-color: #2d2d2d !important;
651
- color: #fff !important;
652
- border: 1px solid #444 !important;
653
- }
654
-
655
- a {
656
- color: #4ea1f3 !important;
657
- }
658
- </style>
659
- """)
660
-
661
- # # Custom CSS styles
662
- # gr.HTML("""
663
- # <style>
664
- # /* Ensures both sections are equally spaced with the same background size */
665
- # #output-summary, #output-flag {
666
- # background-color: #f0f4f8; /* Light Grey for both */
667
- # padding: 20px;
668
- # border-radius: 10px;
669
- # margin-top: 10px;
670
- # width: 100%; /* Ensure full width */
671
- # min-height: 150px; /* Ensures both have a minimum height */
672
- # box-sizing: border-box; /* Prevents padding from increasing size */
673
- # display: flex;
674
- # flex-direction: column;
675
- # justify-content: space-between;
676
- # }
677
-
678
- # /* Specific background colors */
679
- # #output-summary {
680
- # background-color: #434a4b;
681
- # }
682
-
683
- # #output-flag {
684
- # background-color: #141616;
685
- # }
686
-
687
- # /* Ensuring they are in a row and evenly spaced */
688
- # .gradio-row {
689
- # display: flex;
690
- # justify-content: space-between;
691
- # width: 100%;
692
- # }
693
- # </style>
694
- # """)
695
-
696
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
697
  interface.launch(share=True,debug=True)
 
1
+ import gradio as gr
2
+ import mtdna_backend
3
+ import json
4
+ import data_preprocess, model, pipeline
5
+ import os
6
+ import hashlib
7
+ import threading
8
+ # Gradio UI
9
+ #stop_flag = gr.State(value=False)
10
+ class StopFlag:
11
+ def __init__(self):
12
+ self.value = False
13
+ global_stop_flag = StopFlag() # Shared between run + stop
14
+
15
+ with open("offer.html", "r", encoding="utf-8") as f:
16
+ pricing_html = f.read()
17
+
18
+ with gr.Blocks() as interface:
19
+ with gr.Tab("CURIOUS ABOUT THIS PRODUCT?"):
20
+ gr.HTML(value=pricing_html)
21
+ with gr.Tab("🧬 Classifier"):
22
+ gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
23
+ #inputMode = gr.Radio(choices=["Single Accession", "Batch Input"], value="Single Accession", label="Choose Input Mode")
24
+ user_email = gr.Textbox(label="📧 Your email (used to track free quota)")
25
+ usage_display = gr.Markdown("", visible=False)
26
+
27
+ # with gr.Group() as single_input_group:
28
+ # single_accession = gr.Textbox(label="Enter Single Accession (e.g., KU131308)")
29
+
30
+ # with gr.Group(visible=False) as batch_input_group:
31
+ # raw_text = gr.Textbox(label="🧬 Paste Accession Numbers (e.g., MF362736.1,MF362738.1,KU131308,MW291678)")
32
+ # resume_file = gr.File(label="🗃️ Previously saved Excel output (optional)", file_types=[".xlsx"], interactive=True)
33
+ # gr.HTML("""<a href="https://drive.google.com/file/d/1t-TFeIsGVu5Jh3CUZS-VE9jQWzNFCs_c/view?usp=sharing" download target="_blank">Download Example CSV Format</a>""")
34
+ # gr.HTML("""<a href="https://docs.google.com/spreadsheets/d/1lKqPp17EfHsshJGZRWEpcNOZlGo3F5qU/edit?usp=sharing&ouid=112390323314156876153&rtpof=true&sd=true" download target="_blank">Download Example Excel Format</a>""")
35
+ # file_upload = gr.File(label="📁 Or Upload CSV/Excel File", file_types=[".csv", ".xlsx"], interactive=True, elem_id="file-upload-box")
36
+ raw_text = gr.Textbox(label="🧚 Input Accession Number(s) (single (KU131308) or comma-separated (e.g., MF362736.1,MF362738.1,KU131308,MW291678))")
37
+ #resume_file = gr.File(label="🗃️ Previously saved Excel output (optional)", file_types=[".xlsx"], interactive=True)
38
+ gr.HTML("""<a href="https://docs.google.com/spreadsheets/d/1lKqPp17EfHsshJGZRWEpcNOZlGo3F5qU/edit?usp=sharing" download target="_blank">Download Example Excel Format</a>""")
39
+ file_upload = gr.File(label="📁 Or Upload CSV/Excel File", file_types=[".csv", ".xlsx"], interactive=True)
40
+
41
+ with gr.Row():
42
+ run_button = gr.Button("🔍 Submit and Classify")
43
+ stop_button = gr.Button("❌ Stop Batch", visible=True)
44
+ reset_button = gr.Button("🔄 Reset")
45
+
46
+ status = gr.Markdown(visible=False)
47
+
48
+ with gr.Group(visible=False) as results_group:
49
+ # with gr.Accordion("Open to See the Result", open=False) as results:
50
+ # with gr.Row():
51
+ # output_summary = gr.Markdown(elem_id="output-summary")
52
+ # output_flag = gr.Markdown(elem_id="output-flag")
53
+
54
+ # gr.Markdown("---")
55
+
56
+ with gr.Accordion("Open to See the Output Table", open=False) as table_accordion:
57
+ output_table = gr.HTML(render=True)
58
+ #with gr.Row():
59
+ #output_type = gr.Dropdown(choices=["Excel", "JSON", "TXT"], label="Select Output Format", value="Excel")
60
+ #download_button = gr.Button("⬇️ Download Output")
61
+ #download_file = gr.File(label="Download File Here",visible=False)
62
+ report_button = gr.Button("Report")
63
+ report_textbox = gr.Textbox(
64
+ label="Describe the issue",
65
+ lines=4,
66
+ placeholder="e.g. DQ981467: it gives me unknown when I can in fact search it on NCBI \n DQ981467: same reason as above",
67
+ visible=False)
68
+ submit_report_button = gr.Button("Submit", visible=False)
69
+ status_report = gr.Markdown(visible=False)
70
+
71
+ download_file = gr.File(label="Download File Here", visible=False, interactive=True)
72
+ progress_box = gr.Textbox(label="Live Processing Log", lines=20, interactive=False)
73
+
74
+ gr.Markdown("---")
75
+
76
+ gr.Markdown("### 💬 Feedback (required)")
77
+ q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
78
+ q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
79
+ contact = gr.Textbox(label="📧 Your email or institution (optional)")
80
+ submit_feedback = gr.Button("✅ Submit Feedback")
81
+ feedback_status = gr.Markdown()
82
+
83
+ # Functions
84
+ # def toggle_input_mode(mode):
85
+ # if mode == "Single Accession":
86
+ # return gr.update(visible=True), gr.update(visible=False)
87
+ # else:
88
+ # return gr.update(visible=False), gr.update(visible=True)
89
+
90
+ def classify_with_loading():
91
+ return gr.update(value="⏳ Please wait... processing...",visible=True) # Show processing message
92
+
93
+ # def classify_dynamic(single_accession, file, text, resume, email, mode):
94
+ # if mode == "Single Accession":
95
+ # return classify_main(single_accession) + (gr.update(visible=False),)
96
+ # else:
97
+ # #return summarize_batch(file, text) + (gr.update(visible=False),) # Hide processing message
98
+ # return classify_mulAcc(file, text, resume) + (gr.update(visible=False),) # Hide processing message
99
+ # Logging helpers defined early to avoid NameError
100
+
101
+
102
+ # def classify_dynamic(single_accession, file, text, resume, email, mode):
103
+ # if mode == "Single Accession":
104
+ # return classify_main(single_accession) + (gr.update(value="", visible=False),)
105
+ # else:
106
+ # return classify_mulAcc(file, text, resume, email, log_callback=real_time_logger, log_collector=log_collector)
107
+
108
+ # for single accession
109
+ # def classify_main(accession):
110
+ # #table, summary, labelAncient_Modern, explain_label = mtdna_backend.summarize_results(accession)
111
+ # table = mtdna_backend.summarize_results(accession)
112
+ # #flag_output = f"### 🏺 Ancient/Modern Flag\n**{labelAncient_Modern}**\n\n_Explanation:_ {explain_label}"
113
+ # return (
114
+ # #table,
115
+ # make_html_table(table),
116
+ # # summary,
117
+ # # flag_output,
118
+ # gr.update(visible=True),
119
+ # gr.update(visible=False),
120
+ # gr.update(visible=False)
121
+ # )
122
+
123
+ #stop_flag = gr.State(value=False)
124
+ #stop_flag = StopFlag()
125
+
126
+ # def stop_batch(stop_flag):
127
+ # stop_flag.value = True
128
+ # return gr.update(value="❌ Stopping...", visible=True), stop_flag
129
+ def stop_batch():
130
+ global_stop_flag.value = True
131
+ return gr.update(value=" Stopping...", visible=True)
132
+
133
+ # def threaded_batch_runner(file, text, email):
134
+ # global_stop_flag.value = False
135
+ # log_lines = []
136
+
137
+ # def update_log(line):
138
+ # log_lines.append(line)
139
+ # yield (
140
+ # gr.update(visible=False), # output_table (not yet)
141
+ # gr.update(visible=False), # results_group
142
+ # gr.update(visible=False), # download_file
143
+ # gr.update(visible=False), # usage_display
144
+ # gr.update(value="⏳ Still processing...", visible=True), # status
145
+ # gr.update(value="\n".join(log_lines)) # progress_box
146
+ # )
147
+
148
+ # # Start a dummy update to say "Starting..."
149
+ # yield from update_log("🚀 Starting batch processing...")
150
+
151
+ # rows, file_path, count, final_log, warning = mtdna_backend.summarize_batch(
152
+ # file=file,
153
+ # raw_text=text,
154
+ # resume_file=None,
155
+ # user_email=email,
156
+ # stop_flag=global_stop_flag,
157
+ # yield_callback=lambda line: (yield from update_log(line))
158
+ # )
159
+
160
+ # html = make_html_table(rows)
161
+ # file_update = gr.update(value=file_path, visible=True) if os.path.exists(file_path) else gr.update(visible=False)
162
+ # usage_or_warning_text = f"**{count}** samples used by this email." if email.strip() else warning
163
+
164
+ # yield (
165
+ # html,
166
+ # gr.update(visible=True), # results_group
167
+ # file_update, # download_file
168
+ # gr.update(value=usage_or_warning_text, visible=True),
169
+ # gr.update(value="✅ Done", visible=True),
170
+ # gr.update(value=final_log)
171
+ # )
172
+
173
+ # def threaded_batch_runner(file=None, text="", email=""):
174
+ # print("📧 EMAIL RECEIVED:", email)
175
+ # import tempfile
176
+ # from mtdna_backend import (
177
+ # extract_accessions_from_input,
178
+ # summarize_results,
179
+ # save_to_excel,
180
+ # hash_user_id,
181
+ # increment_usage,
182
+ # )
183
+ # import os
184
+
185
+ # global_stop_flag.value = False # reset stop flag
186
+
187
+ # tmp_dir = tempfile.mkdtemp()
188
+ # output_file_path = os.path.join(tmp_dir, "batch_output_live.xlsx")
189
+ # limited_acc = 50 + (10 if email.strip() else 0)
190
+
191
+ # # Step 1: Parse input
192
+ # accessions, error = extract_accessions_from_input(file, text)
193
+ # print(accessions)
194
+ # if error:
195
+ # yield (
196
+ # "", # output_table
197
+ # gr.update(visible=False), # results_group
198
+ # gr.update(visible=False), # download_file
199
+ # "", # usage_display
200
+ # "❌ Error", # status
201
+ # str(error) # progress_box
202
+ # )
203
+ # return
204
+
205
+ # total = len(accessions)
206
+ # if total > limited_acc:
207
+ # accessions = accessions[:limited_acc]
208
+ # warning = f"⚠️ Only processing first {limited_acc} accessions."
209
+ # else:
210
+ # warning = f"✅ All {total} accessions will be processed."
211
+
212
+ # all_rows = []
213
+ # processed_accessions = 0 # ✅ tracks how many accessions were processed
214
+ # email_tracked = False
215
+ # log_lines = []
216
+
217
+ # # Step 2: Loop through accessions
218
+ # for i, acc in enumerate(accessions):
219
+ # if global_stop_flag.value:
220
+ # log_lines.append(f"🛑 Stopped at {acc} ({i+1}/{total})")
221
+ # usage_text = ""
222
+ # if email.strip() and not email_tracked:
223
+ # # user_hash = hash_user_id(email)
224
+ # # usage_count = increment_usage(user_hash, len(all_rows))
225
+ # print("print(processed_accessions at stop) ",processed_accessions)
226
+ # usage_count = increment_usage(email, processed_accessions)
227
+ # email_tracked = True
228
+ # usage_text = f"**{usage_count}** samples used by this email. Ten more samples are added first (you now have 60 limited accessions), then wait we will contact you via this email."
229
+ # else:
230
+ # usage_text = f"The limited accession is 50. The user has used {processed_accessions}, and only {50-processed_accessions} left."
231
+ # yield (
232
+ # make_html_table(all_rows),
233
+ # gr.update(visible=True),
234
+ # gr.update(value=output_file_path, visible=True),
235
+ # gr.update(value=usage_text, visible=True),
236
+ # "🛑 Stopped",
237
+ # "\n".join(log_lines)
238
+ # )
239
+ # return
240
+
241
+ # log_lines.append(f"[{i+1}/{total}] Processing {acc}")
242
+ # yield (
243
+ # make_html_table(all_rows),
244
+ # gr.update(visible=True),
245
+ # gr.update(visible=False),
246
+ # "",
247
+ # "⏳ Processing...",
248
+ # "\n".join(log_lines)
249
+ # )
250
+
251
+ # try:
252
+ # print(acc)
253
+ # rows = summarize_results(acc)
254
+ # all_rows.extend(rows)
255
+ # processed_accessions += 1 # ✅ count only successful accessions
256
+ # save_to_excel(all_rows, "", "", output_file_path, is_resume=False)
257
+ # log_lines.append(f"✅ Processed {acc} ({i+1}/{total})")
258
+ # except Exception as e:
259
+ # log_lines.append(f"❌ Failed to process {acc}: {e}")
260
+
261
+ # yield (
262
+ # make_html_table(all_rows),
263
+ # gr.update(visible=True),
264
+ # gr.update(visible=False),
265
+ # "",
266
+ # "⏳ Processing...",
267
+ # "\n".join(log_lines)
268
+ # )
269
+
270
+ # # Final update
271
+ # usage_text = ""
272
+
273
+ # if email.strip() and not email_tracked:
274
+ # # user_hash = hash_user_id(email)
275
+ # # usage_count = increment_usage(user_hash, len(all_rows))
276
+ # print("print(processed_accessions final) ",processed_accessions)
277
+ # usage_count = increment_usage(email, processed_accessions)
278
+ # usage_text = f"**{usage_count}** samples used by this email. Ten more samples are added first (you now have 60 limited accessions), then wait we will contact you via this email."
279
+ # elif not email.strip():
280
+ # usage_text = f"The limited accession is 50. The user has used {processed_accessions}, and only {50-processed_accessions} left."
281
+ # yield (
282
+ # make_html_table(all_rows),
283
+ # gr.update(visible=True),
284
+ # gr.update(value=output_file_path, visible=True),
285
+ # gr.update(value=usage_text, visible=True),
286
+ # "✅ Done",
287
+ # "\n".join(log_lines)
288
+ # )
289
+
290
+ def threaded_batch_runner(file=None, text="", email=""):
291
+ print("📧 EMAIL RECEIVED:", repr(email))
292
+ import tempfile
293
+ from mtdna_backend import (
294
+ extract_accessions_from_input,
295
+ summarize_results,
296
+ save_to_excel,
297
+ increment_usage,
298
+ )
299
+ import os
300
+
301
+ global_stop_flag.value = False # reset stop flag
302
+
303
+ tmp_dir = tempfile.mkdtemp()
304
+ output_file_path = os.path.join(tmp_dir, "batch_output_live.xlsx")
305
+ #output_file_path = "/mnt/data/batch_output_live.xlsx"
306
+ all_rows = []
307
+ processed_accessions = 0 # ✅ track successful accessions
308
+ email_tracked = False
309
+ log_lines = []
310
+ if not email.strip():
311
+ output_file_path = None#"Write your email so that you can download the outputs."
312
+ log_lines.append("📥 Provide your email to receive a downloadable Excel report and get 20 more free queries.")
313
+ limited_acc = 30
314
+ if email.strip():
315
+ usage_count, max_allowed = increment_usage(email, processed_accessions)
316
+ if int(usage_count) >= int(max_allowed):
317
+ log_lines.append("❌ You have reached your quota. Please contact us to unlock more.")
318
+
319
+ # Minimal blank yield to trigger UI rendering
320
+ yield (
321
+ make_html_table([]),
322
+ gr.update(visible=True),
323
+ gr.update(visible=False),
324
+ gr.update(value="", visible=True),
325
+ "⛔️ Quota limit",
326
+ "⛔️ Quota limit"
327
+ )
328
+
329
+ # Actual warning frame
330
+ yield (
331
+ make_html_table([]),
332
+ gr.update(visible=False),
333
+ gr.update(visible=False),
334
+ gr.update(value="❌ You have reached your quota. Please contact us to unlock more.", visible=True),
335
+ "❌ Quota Exceeded",
336
+ "\n".join(log_lines)
337
+ )
338
+ return
339
+ limited_acc = int(max_allowed-usage_count)
340
+ # Step 1: Parse input
341
+ accessions, error = extract_accessions_from_input(file, text)
342
+ print("🧪 Accessions received:", accessions)
343
+ if error:
344
+ yield (
345
+ "", # output_table
346
+ gr.update(visible=False), # results_group
347
+ gr.update(visible=False), # download_file
348
+ "", # usage_display
349
+ "❌ Error", # status
350
+ str(error) # progress_box
351
+ )
352
+ return
353
+
354
+ total = len(accessions)
355
+ if total > limited_acc:
356
+ accessions = accessions[:limited_acc]
357
+ warning = f"⚠️ Only processing the first {limited_acc} accessions."
358
+ else:
359
+ warning = f"✅ All {total} accessions will be processed."
360
+ log_lines.append(warning)  # surface the notice in the progress log ('warning' was otherwise never shown)
+
361
+ # all_rows = []
362
+ # processed_accessions = 0 # ✅ track successful accessions
363
+ # email_tracked = False
364
+ # log_lines = []
365
+ # if not email.strip():
366
+ # output_file_path = None#"Write your email so that you can download the outputs."
367
+ # log_lines.append("📥 Provide your email to receive a downloadable Excel report and get 20 more free queries.")
368
+ # if email.strip():
369
+ # usage_count, max_allowed = increment_usage(email, processed_accessions)
370
+ # if int(usage_count) > int(max_allowed):
371
+ # log_lines.append("❌ You have reached your quota. Please contact us to unlock more.")
372
+
373
+ # # Minimal blank yield to trigger UI rendering
374
+ # yield (
375
+ # make_html_table([]),
376
+ # gr.update(visible=True),
377
+ # gr.update(visible=False),
378
+ # gr.update(value="", visible=True),
379
+ # "⛔️ Quota limit",
380
+ # "⛔️ Quota limit"
381
+ # )
382
+
383
+ # # Actual warning frame
384
+ # yield (
385
+ # make_html_table([]),
386
+ # gr.update(visible=False),
387
+ # gr.update(visible=False),
388
+ # gr.update(value="❌ You have reached your quota. Please contact us to unlock more.", visible=True),
389
+ # " Quota Exceeded",
390
+ # "\n".join(log_lines)
391
+ # )
392
+ # return
393
+
394
+
395
+ # Step 2: Loop through accessions
396
+ for i, acc in enumerate(accessions):
397
+ if global_stop_flag.value:
398
+ log_lines.append(f"🛑 Stopped at {acc} ({i+1}/{total})")
399
+ usage_text = ""
400
+
401
+ if email.strip() and not email_tracked:
402
+ print(f"🧪 increment_usage at STOP: {email=} {processed_accessions=}")
403
+ usage_count, max_allowed = increment_usage(email, processed_accessions)
404
+ email_tracked = True
405
+ usage_text = f"**{usage_count}** of {max_allowed} allowed samples have been used by this email."
406
+ #Ten more samples are added first (you now have 60 limited accessions), then wait we will contact you via this email."
407
+ else:
408
+ usage_text = f"The accession limit is 30. You have used {processed_accessions}, and only {30 - processed_accessions} remain."
409
+
410
+ yield (
411
+ make_html_table(all_rows),
412
+ gr.update(visible=True),
413
+ #gr.update(value=output_file_path, visible=True),
414
+ gr.update(value=output_file_path, visible=bool(output_file_path)),
415
+ gr.update(value=usage_text, visible=True),
416
+ "🛑 Stopped",
417
+ "\n".join(log_lines)
418
+ )
419
+ return
420
+
421
+ log_lines.append(f"[{i+1}/{total}] Processing {acc}")
422
+ yield (
423
+ make_html_table(all_rows),
424
+ gr.update(visible=True),
425
+ gr.update(visible=False),
426
+ "",
427
+ "⏳ Processing...",
428
+ "\n".join(log_lines)
429
+ )
430
+
431
+ try:
432
+ print("📄 Processing accession:", acc)
433
+ rows = summarize_results(acc)
434
+ all_rows.extend(rows)
435
+ processed_accessions += 1 # only count success
436
+ if email.strip():
437
+ save_to_excel(all_rows, "", "", output_file_path, is_resume=False)
438
+ log_lines.append(f"✅ Processed {acc} ({i+1}/{total})")
439
+ except Exception as e:
440
+ log_lines.append(f"❌ Failed to process {acc}: {e}")
441
+
442
+ yield (
443
+ make_html_table(all_rows),
444
+ gr.update(visible=True),
445
+ gr.update(visible=False),
446
+ "",
447
+ "⏳ Processing...",
448
+ "\n".join(log_lines)
449
+ )
450
+
451
+ # Step 3: Final usage update
452
+ usage_text = ""
453
+ if email.strip() and not email_tracked:
454
+ print(f"🧪 increment_usage at END: {email=} {processed_accessions=}")
455
+ usage_count, max_allowed = increment_usage(email, processed_accessions)
456
+ email_tracked = True
457
+ usage_text = f"**{usage_count}** of {max_allowed} allowed samples have been used by this email."
458
+ #Ten more samples are added first (you now have 60 limited accessions), then wait we will contact you via this email."
459
+ elif not email.strip():
460
+ usage_text = f"The accession limit is 30. You have used {processed_accessions}, and only {30 - processed_accessions} remain."
461
+
462
+ yield (
463
+ make_html_table(all_rows),
464
+ gr.update(visible=True),
465
+ #gr.update(value=output_file_path, visible=True),
466
+ gr.update(value=output_file_path, visible=bool(output_file_path)),
467
+ gr.update(value=usage_text, visible=True),
468
+ "✅ Done",
469
+ "\n".join(log_lines)
470
+ )
471
+
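For readers unfamiliar with the streaming pattern above: threaded_batch_runner is a generator, so every yield pushes one complete frame of UI updates (table HTML, visibility toggles, download path, usage text, status, log) to the bound outputs. A minimal, self-contained sketch of the same idea, with invented component and function names, might look like this:

import time
import gradio as gr

def demo_batch(ids_text):
    # Generator handler: each yield updates (progress log, status) in place.
    ids = [x.strip() for x in ids_text.split(",") if x.strip()]
    log = []
    for i, acc in enumerate(ids, start=1):
        log.append(f"[{i}/{len(ids)}] Processing {acc}")
        yield "\n".join(log), "⏳ Processing..."
        time.sleep(0.2)  # stand-in for the real per-accession work
    yield "\n".join(log), "✅ Done"

with gr.Blocks() as sketch:
    ids_box = gr.Textbox(label="Accessions (comma-separated)")
    run = gr.Button("Run")
    progress = gr.Textbox(label="Progress", lines=8)
    status = gr.Markdown()
    run.click(fn=demo_batch, inputs=ids_box, outputs=[progress, status])

# sketch.queue(); sketch.launch()  # generator handlers only stream when the queue is enabled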
472
+ # SUBMIT REPORT UI
473
+ # 1. Google Sheets setup
474
+ def get_worksheet(sheet_name="Report"):
475
+ import os, json
476
+ import gspread
477
+ from oauth2client.service_account import ServiceAccountCredentials
478
+ try:
479
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
480
+ scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
481
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
482
+ client = gspread.authorize(creds)
483
+ sheet = client.open(sheet_name).sheet1
484
+ return sheet
485
+ except Exception as e:
486
+ print(f"❌ Error loading Google Sheet '{sheet_name}':", e)
487
+ return None
488
+
489
+ # 2. Submit function to send report to the Google Sheet
490
+ def submit_report(report_text,user_email=""):
491
+ try:
492
+ sheet = get_worksheet()
493
+ # ✅ Parse the report_text (each line like 'ACCESSION: message')
494
+ lines = report_text.strip().split('\n')
495
+ user = ""
496
+ if user_email.strip():
497
+ user = user_email
498
+ for line in lines:
499
+ if ':' in line:
500
+ accession, message = line.split(':', 1)
501
+ sheet.append_row([accession.strip(), message.strip(), user.strip()])
502
+ return "✅ Report submitted successfully!"
503
+ except Exception as e:
504
+ return f"❌ Error submitting report: {str(e)}"
505
+ def show_report_ui():
506
+ return gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)
507
+
508
+ def handle_submission(text,user_email):
509
+ msg = submit_report(text, user_email)
510
+ return gr.update(value=msg, visible=True), gr.update(visible=False), gr.update(visible=False)
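As a hypothetical usage sketch (file name, accessions and email invented): the report path only needs the GCP_CREDS_JSON environment variable to hold the full service-account key JSON; submit_report then appends one sheet row per "ACCESSION: message" line.

import os
from pathlib import Path

# Assumption: service_account.json is a Google service-account key with access to the "Report" sheet.
os.environ["GCP_CREDS_JSON"] = Path("service_account.json").read_text()

print(submit_report(
    "AB123456: predicted country looks wrong\nCD789012: sample type should be ancient",
    user_email="user@example.org",
))  # -> "✅ Report submitted successfully!" on success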
511
+ # def threaded_batch_runner(file=None, text="", email=""):
512
+ # global_stop_flag.value = False
513
+
514
+ # # Dummy test output that matches expected schema
515
+ # return (
516
+ # "<div>✅ Dummy output table</div>", # HTML string
517
+ # gr.update(visible=True), # Group visibility
518
+ # gr.update(visible=False), # Download file
519
+ # "**0** samples used.", # Markdown
520
+ # "✅ Done", # Status string
521
+ # "Processing finished." # Progress string
522
+ # )
523
+
524
+
525
+ # def classify_mulAcc(file, text, resume, email, log_callback=None, log_collector=None):
526
+ # stop_flag.value = False
527
+ # return threaded_batch_runner(file, text, resume, email, status, stop_flag, log_callback=log_callback, log_collector=log_collector)
528
+
529
+
530
+ def make_html_table(rows):
531
+ # html = """
532
+ # <div style='overflow-x: auto; padding: 10px;'>
533
+ # <div style='max-height: 400px; overflow-y: auto; border: 1px solid #444; border-radius: 8px;'>
534
+ # <table style='width:100%; border-collapse: collapse; table-layout: auto; font-size: 14px; color: #f1f1f1; background-color: #1e1e1e;'>
535
+ # <thead style='position: sticky; top: 0; background-color: #2c2c2c; z-index: 1;'>
536
+ # <tr>
537
+ # """
538
+ html = """
539
+ <div style='overflow-x: auto; padding: 10px;'>
540
+ <div style='max-height: 400px; overflow-y: auto; border: 1px solid #ccc; border-radius: 8px;'>
541
+ <table style='width:100%; border-collapse: collapse; table-layout: auto; font-size: 14px; color: inherit; background-color: inherit;'>
+ <thead style='position: sticky; top: 0; z-index: 1;'><tr>
542
+ """
543
+
544
+ headers = ["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Time cost"]
545
+ html += "".join(
546
+ f"<th style='padding: 10px; border: 1px solid #555; text-align: left; white-space: nowrap;'>{h}</th>"
547
+ for h in headers
548
+ )
549
+ html += "</tr></thead><tbody>"
550
+
551
+ for row in rows:
552
+ html += "<tr>"
553
+ for i, col in enumerate(row):
554
+ header = headers[i]
555
+ style = "padding: 10px; border: 1px solid #555; vertical-align: top;"
556
+
557
+ # Let long explanation columns wrap; keep ID-like columns on a single line
558
+ if header in ["Country Explanation", "Sample Type Explanation"]:
559
+ style += " max-width: 400px; word-wrap: break-word; white-space: normal;"
560
+ elif header in ["Sample ID", "Predicted Country", "Predicted Sample Type", "Time cost"]:
561
+ style += " white-space: nowrap; text-overflow: ellipsis; max-width: 200px; overflow: hidden;"
562
+
563
+ # if header == "Sources" and isinstance(col, str) and col.strip().lower().startswith("http"):
564
+ # col = f"<a href='{col}' target='_blank' style='color: #4ea1f3; text-decoration: underline;'>{col}</a>"
565
+
566
+ #html += f"<td style='{style}'>{col}</td>"
567
+ if header == "Sources" and isinstance(col, str):
568
+ links = [f"<a href='{url.strip()}' target='_blank' style='color: #4ea1f3; text-decoration: underline;'>{url.strip()}</a>" for url in col.strip().split("\n") if url.strip()]
569
+ col = "- "+"<br>- ".join(links)
570
+ elif isinstance(col, str):
571
+ # lines = []
572
+ # for line in col.split("\n"):
573
+ # line = line.strip()
574
+ # if not line:
575
+ # continue
576
+ # if line.lower().startswith("rag_llm-"):
577
+ # content = line[len("rag_llm-"):].strip()
578
+ # line = f"{content} (Method: RAG_LLM)"
579
+ # lines.append(f"- {line}")
580
+ col = col.replace("\n", "<br>")
581
+ #col = col.replace("\t", "&nbsp;&nbsp;&nbsp;&nbsp;")
582
+ #col = "<br>".join(lines)
583
+
584
+ html += f"<td style='{style}'>{col}</td>"
585
+ html += "</tr>"
586
+
587
+ html += "</tbody></table></div></div>"
588
+ return html
589
+
590
+
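For reference, a made-up call showing the row shape make_html_table expects: one inner list per sample, ordered to match the seven headers above, with multiple source URLs separated by newlines.

example_rows = [[
    "AB123456",                                                            # Sample ID (hypothetical)
    "Vietnam",                                                             # Predicted Country
    "Isolate listed with Vietnamese sampling sites in the supplement.",    # Country Explanation
    "modern",                                                              # Predicted Sample Type
    "Described as a present-day donor in the methods.",                    # Sample Type Explanation
    "https://example.org/paper.pdf\nhttps://example.org/supplement.xlsx",  # Sources
    "12.3 s",                                                              # Time cost
]]
html_snippet = make_html_table(example_rows)  # ready to drop into the gr.HTML output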
591
+ # def reset_fields():
592
+ # global_stop_flag.value = False # 💡 Add this to reset the flag
593
+ # return (
594
+ # #gr.update(value=""), # single_accession
595
+ # gr.update(value=""), # raw_text
596
+ # gr.update(value=None), # file_upload
597
+ # #gr.update(value=None), # resume_file
598
+ # #gr.update(value="Single Accession"), # inputMode
599
+ # gr.update(value=[], visible=True), # output_table
600
+ # # gr.update(value="", visible=True), # output_summary
601
+ # # gr.update(value="", visible=True), # output_flag
602
+ # gr.update(visible=False), # status
603
+ # gr.update(visible=False), # results_group
604
+ # gr.update(value="", visible=False), # usage_display
605
+ # gr.update(value="", visible=False), # progress_box
606
+ # )
607
+ def reset_fields():
608
+ global_stop_flag.value = False # Reset the stop flag
609
+
610
+ return (
611
+ gr.update(value=""), # raw_text
612
+ gr.update(value=None), # file_upload
613
+ gr.update(value=[], visible=True), # output_table
614
+ gr.update(value="", visible=True), # status — reset and make visible again
615
+ gr.update(visible=False), # results_group
616
+ gr.update(value="", visible=True), # usage_display — reset and make visible again
617
+ gr.update(value="", visible=True), # progress_box — reset AND visible!
618
+ # report-related reset below
619
+ gr.update(value="", visible=False), # report_textbox
620
+ gr.update(visible=False), # submit_report_button
621
+ gr.update(value="", visible=False), # status_report
622
+ )
623
+ #inputMode.change(fn=toggle_input_mode, inputs=inputMode, outputs=[single_input_group, batch_input_group])
624
+ #run_button.click(fn=classify_with_loading, inputs=[], outputs=[status])
625
+ # run_button.click(
626
+ # fn=classify_dynamic,
627
+ # inputs=[single_accession, file_upload, raw_text, resume_file,user_email,inputMode],
628
+ # outputs=[output_table,
629
+ # #output_summary, output_flag,
630
+ # results_group, download_file, usage_display,status, progress_box]
631
+ # )
632
+
633
+ # run_button.click(
634
+ # fn=threaded_batch_runner,
635
+ # #inputs=[file_upload, raw_text, resume_file, user_email],
636
+ # inputs=[file_upload, raw_text, user_email],
637
+ # outputs=[output_table, results_group, download_file, usage_display, status, progress_box]
638
+ # )
639
+ # run_button.click(
640
+ # fn=threaded_batch_runner,
641
+ # inputs=[file_upload, raw_text, user_email],
642
+ # outputs=[output_table, results_group, download_file, usage_display, status, progress_box],
643
+ # every=0.5 # <-- this tells Gradio to expect streaming
644
+ # )
645
+ # output_table = gr.HTML()
646
+ # results_group = gr.Group(visible=False)
647
+ # download_file = gr.File(visible=False)
648
+ # usage_display = gr.Markdown(visible=False)
649
+ # status = gr.Markdown(visible=False)
650
+ # progress_box = gr.Textbox(visible=False)
651
+
652
+ # run_button.click(
653
+ # fn=threaded_batch_runner,
654
+ # inputs=[file_upload, raw_text, user_email],
655
+ # outputs=[output_table, results_group, download_file, usage_display, status, progress_box],
656
+ # every=0.5, # streaming enabled
657
+ # show_progress="full"
658
+ # )
659
+
660
+ # interface.stream(
661
+ # fn=threaded_batch_runner,
662
+ # inputs=[file_upload, raw_text, user_email],
663
+ # outputs=[output_table, results_group, download_file, usage_display, status, progress_box],
664
+ # trigger=run_button,
665
+ # every=0.5,
666
+ # show_progress="full",
667
+ # )
668
+ interface.queue() # No arguments here!
669
+
670
+ run_button.click(
671
+ fn=threaded_batch_runner,
672
+ inputs=[file_upload, raw_text, user_email],
673
+ outputs=[output_table, results_group, download_file, usage_display, status, progress_box],
674
+ concurrency_limit=1, # ✅ correct in Gradio 5.x
675
+ queue=True, # ✅ ensure the queue is used
676
+ #every=0.5
677
+ )
678
+
679
+
680
+
681
+
682
+ stop_button.click(fn=stop_batch, inputs=[], outputs=[status])
683
+
684
+ # reset_button.click(
685
+ # #fn=reset_fields,
686
+ # fn=lambda: (
687
+ # gr.update(value=""), gr.update(value=""), gr.update(value=None), gr.update(value=None), gr.update(value="Single Accession"),
688
+ # gr.update(value=[], visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False), gr.update(value="", visible=False)
689
+ # ),
690
+ # inputs=[],
691
+ # outputs=[
692
+ # single_accession, raw_text, file_upload, resume_file,inputMode,
693
+ # output_table,# output_summary, output_flag,
694
+ # status, results_group, usage_display, progress_box
695
+ # ]
696
+ # )
697
+ #stop_button.click(fn=lambda sf: (gr.update(value="❌ Stopping...", visible=True), setattr(sf, "value", True) or sf), inputs=[gr.State(stop_flag)], outputs=[status, gr.State(stop_flag)])
698
+
699
+ reset_button.click(
700
+ fn=reset_fields,
701
+ inputs=[],
702
+ #outputs=[raw_text, file_upload, resume_file, output_table, status, results_group, usage_display, progress_box]
703
+ outputs=[raw_text, file_upload, output_table, status, results_group, usage_display, progress_box,
704
+ report_textbox,
705
+ submit_report_button,
706
+ status_report]
707
+ )
708
+
709
+ # download_button.click(
710
+ # fn=mtdna_backend.save_batch_output,
711
+ # #inputs=[output_table, output_summary, output_flag, output_type],
712
+ # inputs=[output_table, output_type],
713
+ # outputs=[download_file])
714
+
715
+ # submit_feedback.click(
716
+ # fn=mtdna_backend.store_feedback_to_google_sheets,
717
+ # inputs=[single_accession, q1, q2, contact], outputs=feedback_status
718
+ # )
719
+ report_button.click(fn=show_report_ui, outputs=[report_textbox, submit_report_button, status_report])
720
+ submit_report_button.click(fn=handle_submission, inputs=[report_textbox, user_email], outputs=[status_report, report_textbox, submit_report_button])
721
+
722
+ submit_feedback.click(
723
+ fn=mtdna_backend.store_feedback_to_google_sheets,
724
+ inputs=[raw_text, q1, q2, contact],
725
+ outputs=[feedback_status]
726
+ )
727
+ gr.HTML("""
728
+ <style>
729
+ body, html {
730
+ background-color: #121212 !important;
731
+ color: #ffffff !important;
732
+ }
733
+
734
+ .gradio-container, .gr-block, .gr-box, textarea, input, select, .prose, .prose * {
735
+ background-color: #1e1e1e !important;
736
+ color: #ffffff !important;
737
+ border-color: #333 !important;
738
+ }
739
+
740
+ textarea::placeholder,
741
+ input::placeholder {
742
+ color: #aaa !important;
743
+ }
744
+
745
+ button {
746
+ background-color: #2d2d2d !important;
747
+ color: #fff !important;
748
+ border: 1px solid #444 !important;
749
+ }
750
+
751
+ a {
752
+ color: #4ea1f3 !important;
753
+ }
754
+ </style>
755
+ """)
756
+
757
+ # # Custom CSS styles
758
+ # gr.HTML("""
759
+ # <style>
760
+ # /* Ensures both sections are equally spaced with the same background size */
761
+ # #output-summary, #output-flag {
762
+ # background-color: #f0f4f8; /* Light Grey for both */
763
+ # padding: 20px;
764
+ # border-radius: 10px;
765
+ # margin-top: 10px;
766
+ # width: 100%; /* Ensure full width */
767
+ # min-height: 150px; /* Ensures both have a minimum height */
768
+ # box-sizing: border-box; /* Prevents padding from increasing size */
769
+ # display: flex;
770
+ # flex-direction: column;
771
+ # justify-content: space-between;
772
+ # }
773
+
774
+ # /* Specific background colors */
775
+ # #output-summary {
776
+ # background-color: #434a4b;
777
+ # }
778
+
779
+ # #output-flag {
780
+ # background-color: #141616;
781
+ # }
782
+
783
+ # /* Ensuring they are in a row and evenly spaced */
784
+ # .gradio-row {
785
+ # display: flex;
786
+ # justify-content: space-between;
787
+ # width: 100%;
788
+ # }
789
+ # </style>
790
+ # """)
791
+
792
+
793
  interface.launch(share=True,debug=True)
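One detail of the wiring above worth spelling out: stopping is cooperative. stop_button only calls stop_batch, which flips a shared flag, and the generator checks that flag at the top of each accession iteration, so a stop takes effect after the current accession finishes. global_stop_flag and stop_batch are defined earlier in the file; the definitions below are only an illustrative sketch of that pattern.

import multiprocessing

global_stop_flag = multiprocessing.Value("b", False)  # shared boolean flag

def stop_batch():
    global_stop_flag.value = True
    return "🛑 Stop requested — the batch halts after the current accession."

def run_items(items):
    for item in items:
        if global_stop_flag.value:   # polled once per iteration, so stopping is not instantaneous
            break
        print("processing", item)    # stand-in for the real per-accession work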
data_preprocess.py CHANGED
@@ -1,669 +1,746 @@
1
- import re
2
- import os
3
- #import streamlit as st
4
- import subprocess
5
- import re
6
- from Bio import Entrez
7
- from docx import Document
8
- import fitz
9
- import spacy
10
- from spacy.cli import download
11
- from NER.PDF import pdf
12
- from NER.WordDoc import wordDoc
13
- from NER.html import extractHTML
14
- from NER.word2Vec import word2vec
15
- from transformers import pipeline
16
- import urllib.parse, requests
17
- from pathlib import Path
18
- import pandas as pd
19
- import model
20
- import pipeline
21
- import tempfile
22
- import nltk
23
- nltk.download('punkt_tab')
24
- def download_excel_file(url, save_path="temp.xlsx"):
25
- if "view.officeapps.live.com" in url:
26
- parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
27
- real_url = urllib.parse.unquote(parsed_url["src"][0])
28
- response = requests.get(real_url)
29
- with open(save_path, "wb") as f:
30
- f.write(response.content)
31
- return save_path
32
- elif url.startswith("http") and (url.endswith(".xls") or url.endswith(".xlsx")):
33
- response = requests.get(url)
34
- response.raise_for_status() # Raises error if download fails
35
- with open(save_path, "wb") as f:
36
- f.write(response.content)
37
- print(len(response.content))
38
- return save_path
39
- else:
40
- print("URL must point directly to an .xls or .xlsx file\n or it already downloaded.")
41
- return url
42
- def extract_text(link,saveFolder):
43
- text = ""
44
- name = link.split("/")[-1]
45
- #file_path = Path(saveFolder) / name
46
- local_temp_path = os.path.join(tempfile.gettempdir(), name)
47
- pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
48
-
49
- # pdf
50
- if link.endswith(".pdf"):
51
- # if file_path.is_file():
52
- # link = saveFolder + "/" + name
53
- # print("File exists.")
54
- p = pdf.PDF(local_temp_path, saveFolder)
55
- #p = pdf.PDF(link,saveFolder)
56
- text = p.extractTextWithPDFReader()
57
- #text_exclude_table = p.extract_text_excluding_tables()
58
- # worddoc
59
- elif link.endswith(".doc") or link.endswith(".docx"):
60
- d = wordDoc.wordDoc(local_temp_path,saveFolder)
61
- text = d.extractTextByPage()
62
- # html
63
- if link.split(".")[-1].lower() not in "xlsx":
64
- if "http" in link or "html" in link:
65
- html = extractHTML.HTML("",link)
66
- text = html.getListSection() # the text already clean
67
- return text
68
- def extract_table(link,saveFolder):
69
- table = []
70
- name = link.split("/")[-1]
71
- #file_path = Path(saveFolder) / name
72
- local_temp_path = os.path.join(tempfile.gettempdir(), name)
73
- pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
74
- # pdf
75
- if link.endswith(".pdf"):
76
- # if file_path.is_file():
77
- # link = saveFolder + "/" + name
78
- # print("File exists.")
79
- p = pdf.PDF(local_temp_path,saveFolder)
80
- table = p.extractTable()
81
- # worddoc
82
- elif link.endswith(".doc") or link.endswith(".docx"):
83
- d = wordDoc.wordDoc(local_temp_path,saveFolder)
84
- table = d.extractTableAsList()
85
- # excel
86
- elif link.split(".")[-1].lower() in "xlsx":
87
- # download excel file if it not downloaded yet
88
- savePath = saveFolder +"/"+ link.split("/")[-1]
89
- excelPath = download_excel_file(link, savePath)
90
- try:
91
- #xls = pd.ExcelFile(excelPath)
92
- xls = pd.ExcelFile(local_temp_path)
93
- table_list = []
94
- for sheet_name in xls.sheet_names:
95
- df = pd.read_excel(xls, sheet_name=sheet_name)
96
- cleaned_table = df.fillna("").astype(str).values.tolist()
97
- table_list.append(cleaned_table)
98
- table = table_list
99
- except Exception as e:
100
- print("❌ Failed to extract tables from Excel:", e)
101
- # html
102
- elif "http" in link or "html" in link:
103
- html = extractHTML.HTML("",link)
104
- table = html.extractTable() # table is a list
105
- table = clean_tables_format(table)
106
- return table
107
-
108
- def clean_tables_format(tables):
109
- """
110
- Ensures all tables are in consistent format: List[List[List[str]]]
111
- Cleans by:
112
- - Removing empty strings and rows
113
- - Converting all cells to strings
114
- - Handling DataFrames and list-of-lists
115
- """
116
- cleaned = []
117
- if tables:
118
- for table in tables:
119
- standardized = []
120
-
121
- # Case 1: Pandas DataFrame
122
- if isinstance(table, pd.DataFrame):
123
- table = table.fillna("").astype(str).values.tolist()
124
-
125
- # Case 2: List of Lists
126
- if isinstance(table, list) and all(isinstance(row, list) for row in table):
127
- for row in table:
128
- filtered_row = [str(cell).strip() for cell in row if str(cell).strip()]
129
- if filtered_row:
130
- standardized.append(filtered_row)
131
-
132
- if standardized:
133
- cleaned.append(standardized)
134
-
135
- return cleaned
136
-
137
- import json
138
- def normalize_text_for_comparison(s: str) -> str:
139
- """
140
- Normalizes text for robust comparison by:
141
- 1. Converting to lowercase.
142
- 2. Replacing all types of newlines with a single consistent newline (\n).
143
- 3. Removing extra spaces (e.g., multiple spaces, leading/trailing spaces on lines).
144
- 4. Stripping leading/trailing whitespace from the entire string.
145
- """
146
- s = s.lower()
147
- s = s.replace('\r\n', '\n') # Handle Windows newlines
148
- s = s.replace('\r', '\n') # Handle Mac classic newlines
149
-
150
- # Replace sequences of whitespace (including multiple newlines) with a single space
151
- # This might be too aggressive if you need to preserve paragraph breaks,
152
- # but good for exact word-sequence matching.
153
- s = re.sub(r'\s+', ' ', s)
154
-
155
- return s.strip()
156
- def merge_text_and_tables(text, tables, max_tokens=12000, keep_tables=True, tokenizer="cl100k_base", accession_id=None, isolate=None):
157
- """
158
- Merge cleaned text and table into one string for LLM input.
159
- - Avoids duplicating tables already in text
160
- - Extracts only relevant rows from large tables
161
- - Skips or saves oversized tables
162
- """
163
- import importlib
164
- json = importlib.import_module("json")
165
-
166
- def estimate_tokens(text_str):
167
- try:
168
- enc = tiktoken.get_encoding(tokenizer)
169
- return len(enc.encode(text_str))
170
- except:
171
- return len(text_str) // 4 # Fallback estimate
172
-
173
- def is_table_relevant(table, keywords, accession_id=None):
174
- flat = " ".join(" ".join(row).lower() for row in table)
175
- if accession_id and accession_id.lower() in flat:
176
- return True
177
- return any(kw.lower() in flat for kw in keywords)
178
- preview, preview1 = "",""
179
- llm_input = "## Document Text\n" + text.strip() + "\n"
180
- clean_text = normalize_text_for_comparison(text)
181
-
182
- if tables:
183
- for idx, table in enumerate(tables):
184
- keywords = ["province","district","region","village","location", "country", "region", "origin", "ancient", "modern"]
185
- if accession_id: keywords += [accession_id.lower()]
186
- if isolate: keywords += [isolate.lower()]
187
- if is_table_relevant(table, keywords, accession_id):
188
- if len(table) > 0:
189
- for tab in table:
190
- preview = " ".join(tab) if tab else ""
191
- preview1 = "\n".join(tab) if tab else ""
192
- clean_preview = normalize_text_for_comparison(preview)
193
- clean_preview1 = normalize_text_for_comparison(preview1)
194
- if clean_preview not in clean_text:
195
- if clean_preview1 not in clean_text:
196
- table_str = json.dumps([tab], indent=2)
197
- llm_input += f"## Table {idx+1}\n{table_str}\n"
198
- return llm_input.strip()
199
-
200
- def preprocess_document(link, saveFolder, accession=None, isolate=None):
201
- try:
202
- text = extract_text(link, saveFolder)
203
- except: text = ""
204
- try:
205
- tables = extract_table(link, saveFolder)
206
- except: tables = []
207
- if accession: accession = accession
208
- if isolate: isolate = isolate
209
- try:
210
- final_input = merge_text_and_tables(text, tables, max_tokens=12000, accession_id=accession, isolate=isolate)
211
- except: final_input = ""
212
- return text, tables, final_input
213
-
214
- def extract_sentences(text):
215
- sentences = re.split(r'(?<=[.!?])\s+', text)
216
- return [s.strip() for s in sentences if s.strip()]
217
-
218
- def is_irrelevant_number_sequence(text):
219
- if re.search(r'\b[A-Z]{2,}\d+\b|\b[A-Za-z]+\s+\d+\b', text, re.IGNORECASE):
220
- return False
221
- word_count = len(re.findall(r'\b[A-Za-z]{2,}\b', text))
222
- number_count = len(re.findall(r'\b\d[\d\.]*\b', text))
223
- total_tokens = len(re.findall(r'\S+', text))
224
- if total_tokens > 0 and (word_count / total_tokens < 0.2) and (number_count / total_tokens > 0.5):
225
- return True
226
- elif re.fullmatch(r'(\d+(\.\d+)?\s*)+', text.strip()):
227
- return True
228
- return False
229
-
230
- def remove_isolated_single_digits(sentence):
231
- tokens = sentence.split()
232
- filtered_tokens = []
233
- for token in tokens:
234
- if token == '0' or token == '1':
235
- pass
236
- else:
237
- filtered_tokens.append(token)
238
- return ' '.join(filtered_tokens).strip()
239
-
240
- def get_contextual_sentences_BFS(text_content, keyword, depth=2):
241
- def extract_codes(sentence):
242
- # Match codes like 'A1YU101', 'KM1', 'MO6' — at least 2 letters + numbers
243
- return [code for code in re.findall(r'\b[A-Z]{2,}[0-9]+\b', sentence, re.IGNORECASE)]
244
- sentences = extract_sentences(text_content)
245
- relevant_sentences = set()
246
- initial_keywords = set()
247
-
248
- # Define a regex to capture codes like A1YU101 or KM1
249
- # This pattern looks for an alphanumeric sequence followed by digits at the end of the string
250
- code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
251
-
252
- # Attempt to parse the keyword into its prefix and numerical part using re.search
253
- keyword_match = code_pattern.search(keyword)
254
-
255
- keyword_prefix = None
256
- keyword_num = None
257
-
258
- if keyword_match:
259
- keyword_prefix = keyword_match.group(1).lower()
260
- keyword_num = int(keyword_match.group(2))
261
-
262
- for sentence in sentences:
263
- sentence_added = False
264
-
265
- # 1. Check for exact match of the keyword
266
- if re.search(r'\b' + re.escape(keyword) + r'\b', sentence, re.IGNORECASE):
267
- relevant_sentences.add(sentence.strip())
268
- initial_keywords.add(keyword.lower())
269
- sentence_added = True
270
-
271
- # 2. Check for range patterns (e.g., A1YU101-A1YU137)
272
- # The range pattern should be broad enough to capture the full code string within the range.
273
- range_matches = re.finditer(r'([A-Z0-9]+-\d+)', sentence, re.IGNORECASE) # More specific range pattern if needed, or rely on full code pattern below
274
- range_matches = re.finditer(r'([A-Z0-9]+\d+)-([A-Z0-9]+\d+)', sentence, re.IGNORECASE) # This is the more robust range pattern
275
-
276
- for r_match in range_matches:
277
- start_code_str = r_match.group(1)
278
- end_code_str = r_match.group(2)
279
-
280
- # CRITICAL FIX: Use code_pattern.search for start_match and end_match
281
- start_match = code_pattern.search(start_code_str)
282
- end_match = code_pattern.search(end_code_str)
283
-
284
- if keyword_prefix and keyword_num is not None and start_match and end_match:
285
- start_prefix = start_match.group(1).lower()
286
- end_prefix = end_match.group(1).lower()
287
- start_num = int(start_match.group(2))
288
- end_num = int(end_match.group(2))
289
-
290
- # Check if the keyword's prefix matches and its number is within the range
291
- if keyword_prefix == start_prefix and \
292
- keyword_prefix == end_prefix and \
293
- start_num <= keyword_num <= end_num:
294
- relevant_sentences.add(sentence.strip())
295
- initial_keywords.add(start_code_str.lower())
296
- initial_keywords.add(end_code_str.lower())
297
- sentence_added = True
298
- break # Only need to find one matching range per sentence
299
-
300
- # 3. If the sentence was added due to exact match or range, add all its alphanumeric codes
301
- # to initial_keywords to ensure graph traversal from related terms.
302
- if sentence_added:
303
- for word in extract_codes(sentence):
304
- initial_keywords.add(word.lower())
305
-
306
-
307
- # Build word_to_sentences mapping for all sentences
308
- word_to_sentences = {}
309
- for sent in sentences:
310
- codes_in_sent = set(extract_codes(sent))
311
- for code in codes_in_sent:
312
- word_to_sentences.setdefault(code.lower(), set()).add(sent.strip())
313
-
314
-
315
- # Build the graph
316
- graph = {}
317
- for sent in sentences:
318
- codes = set(extract_codes(sent))
319
- for word1 in codes:
320
- word1_lower = word1.lower()
321
- graph.setdefault(word1_lower, set())
322
- for word2 in codes:
323
- word2_lower = word2.lower()
324
- if word1_lower != word2_lower:
325
- graph[word1_lower].add(word2_lower)
326
-
327
-
328
- # Perform BFS/graph traversal
329
- queue = [(k, 0) for k in initial_keywords if k in word_to_sentences]
330
- visited_words = set(initial_keywords)
331
-
332
- while queue:
333
- current_word, level = queue.pop(0)
334
- if level >= depth:
335
- continue
336
-
337
- relevant_sentences.update(word_to_sentences.get(current_word, []))
338
-
339
- for neighbor in graph.get(current_word, []):
340
- if neighbor not in visited_words:
341
- visited_words.add(neighbor)
342
- queue.append((neighbor, level + 1))
343
-
344
- final_sentences = set()
345
- for sentence in relevant_sentences:
346
- if not is_irrelevant_number_sequence(sentence):
347
- processed_sentence = remove_isolated_single_digits(sentence)
348
- if processed_sentence:
349
- final_sentences.add(processed_sentence)
350
-
351
- return "\n".join(sorted(list(final_sentences)))
352
-
353
-
354
-
355
- def get_contextual_sentences_DFS(text_content, keyword, depth=2):
356
- sentences = extract_sentences(text_content)
357
-
358
- # Build word-to-sentences mapping
359
- word_to_sentences = {}
360
- for sent in sentences:
361
- words_in_sent = set(re.findall(r'\b[A-Za-z0-9\-_\/]+\b', sent))
362
- for word in words_in_sent:
363
- word_to_sentences.setdefault(word.lower(), set()).add(sent.strip())
364
-
365
- # Function to extract codes in a sentence
366
- def extract_codes(sentence):
367
- # Only codes like 'KSK1', 'MG272794', not pure numbers
368
- return [code for code in re.findall(r'\b[A-Z]{2,}[0-9]+\b', sentence, re.IGNORECASE)]
369
-
370
- # DFS with priority based on distance to keyword and early stop if country found
371
- def dfs_traverse(current_word, current_depth, max_depth, visited_words, collected_sentences, parent_sentence=None):
372
- country = "unknown"
373
- if current_depth > max_depth:
374
- return country, False
375
-
376
- if current_word not in word_to_sentences:
377
- return country, False
378
-
379
- for sentence in word_to_sentences[current_word]:
380
- if sentence == parent_sentence:
381
- continue # avoid reusing the same sentence
382
-
383
- collected_sentences.add(sentence)
384
-
385
- #print("current_word:", current_word)
386
- small_sen = extract_context(sentence, current_word, int(len(sentence) / 4))
387
- #print(small_sen)
388
- country = model.get_country_from_text(small_sen)
389
- #print("small context country:", country)
390
- if country.lower() != "unknown":
391
- return country, True
392
- else:
393
- country = model.get_country_from_text(sentence)
394
- #print("full sentence country:", country)
395
- if country.lower() != "unknown":
396
- return country, True
397
-
398
- codes_in_sentence = extract_codes(sentence)
399
- idx = next((i for i, code in enumerate(codes_in_sentence) if code.lower() == current_word.lower()), None)
400
- if idx is None:
401
- continue
402
-
403
- sorted_children = sorted(
404
- [code for code in codes_in_sentence if code.lower() not in visited_words],
405
- key=lambda x: (abs(codes_in_sentence.index(x) - idx),
406
- 0 if codes_in_sentence.index(x) > idx else 1)
407
- )
408
-
409
- #print("sorted_children:", sorted_children)
410
- for child in sorted_children:
411
- child_lower = child.lower()
412
- if child_lower not in visited_words:
413
- visited_words.add(child_lower)
414
- country, should_stop = dfs_traverse(
415
- child_lower, current_depth + 1, max_depth,
416
- visited_words, collected_sentences, parent_sentence=sentence
417
- )
418
- if should_stop:
419
- return country, True
420
-
421
- return country, False
422
-
423
- # Begin DFS
424
- collected_sentences = set()
425
- visited_words = set([keyword.lower()])
426
- country, status = dfs_traverse(keyword.lower(), 0, depth, visited_words, collected_sentences)
427
-
428
- # Filter irrelevant sentences
429
- final_sentences = set()
430
- for sentence in collected_sentences:
431
- if not is_irrelevant_number_sequence(sentence):
432
- processed = remove_isolated_single_digits(sentence)
433
- if processed:
434
- final_sentences.add(processed)
435
- if not final_sentences:
436
- return country, text_content
437
- return country, "\n".join(sorted(list(final_sentences)))
438
-
439
- # Helper function for normalizing text for overlap comparison
440
- def normalize_for_overlap(s: str) -> str:
441
- s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s).lower()
442
- s = re.sub(r'\s+', ' ', s).strip()
443
- return s
444
-
445
- def merge_texts_skipping_overlap(text1: str, text2: str) -> str:
446
- if not text1: return text2
447
- if not text2: return text1
448
-
449
- # Case 1: text2 is fully contained in text1 or vice-versa
450
- if text2 in text1:
451
- return text1
452
- if text1 in text2:
453
- return text2
454
-
455
- # --- Option 1: Original behavior (suffix of text1, prefix of text2) ---
456
- # This is what your function was primarily designed for.
457
- # It looks for the overlap at the "junction" of text1 and text2.
458
-
459
- max_junction_overlap = 0
460
- for i in range(min(len(text1), len(text2)), 0, -1):
461
- suffix1 = text1[-i:]
462
- prefix2 = text2[:i]
463
- # Prioritize exact match, then normalized match
464
- if suffix1 == prefix2:
465
- max_junction_overlap = i
466
- break
467
- elif normalize_for_overlap(suffix1) == normalize_for_overlap(prefix2):
468
- max_junction_overlap = i
469
- break # Take the first (longest) normalized match
470
-
471
- if max_junction_overlap > 0:
472
- merged_text = text1 + text2[max_junction_overlap:]
473
- return re.sub(r'\s+', ' ', merged_text).strip()
474
-
475
- # --- Option 2: Longest Common Prefix (for cases like "Hi, I am Vy.") ---
476
- # This addresses your specific test case where the overlap is at the very beginning of both strings.
477
- # This is often used when trying to deduplicate content that shares a common start.
478
-
479
- longest_common_prefix_len = 0
480
- min_len = min(len(text1), len(text2))
481
- for i in range(min_len):
482
- if text1[i] == text2[i]:
483
- longest_common_prefix_len = i + 1
484
- else:
485
- break
486
-
487
- # If a common prefix is found AND it's a significant portion (e.g., more than a few chars)
488
- # AND the remaining parts are distinct, then apply this merge.
489
- # This is a heuristic and might need fine-tuning.
490
- if longest_common_prefix_len > 0 and \
491
- text1[longest_common_prefix_len:].strip() and \
492
- text2[longest_common_prefix_len:].strip():
493
-
494
- # Only merge this way if the remaining parts are not empty (i.e., not exact duplicates)
495
- # For "Hi, I am Vy. Nice to meet you." and "Hi, I am Vy. Goodbye Vy."
496
- # common prefix is "Hi, I am Vy."
497
- # Remaining text1: " Nice to meet you."
498
- # Remaining text2: " Goodbye Vy."
499
- # So we merge common_prefix + remaining_text1 + remaining_text2
500
-
501
- common_prefix_str = text1[:longest_common_prefix_len]
502
- remainder_text1 = text1[longest_common_prefix_len:]
503
- remainder_text2 = text2[longest_common_prefix_len:]
504
-
505
- merged_text = common_prefix_str + remainder_text1 + remainder_text2
506
- return re.sub(r'\s+', ' ', merged_text).strip()
507
-
508
-
509
- # If neither specific overlap type is found, just concatenate
510
- merged_text = text1 + text2
511
- return re.sub(r'\s+', ' ', merged_text).strip()
512
-
513
- from docx import Document
514
- from pipeline import upload_file_to_drive
515
- # def save_text_to_docx(text_content: str, file_path: str):
516
- # """
517
- # Saves a given text string into a .docx file.
518
-
519
- # Args:
520
- # text_content (str): The text string to save.
521
- # file_path (str): The full path including the filename where the .docx file will be saved.
522
- # Example: '/content/drive/MyDrive/CollectData/Examples/test/SEA_1234/merged_document.docx'
523
- # """
524
- # try:
525
- # document = Document()
526
-
527
- # # Add the entire text as a single paragraph, or split by newlines for multiple paragraphs
528
- # for paragraph_text in text_content.split('\n'):
529
- # document.add_paragraph(paragraph_text)
530
-
531
- # document.save(file_path)
532
- # print(f"Text successfully saved to '{file_path}'")
533
- # except Exception as e:
534
- # print(f"Error saving text to docx file: {e}")
535
- # def save_text_to_docx(text_content: str, filename: str, drive_folder_id: str):
536
- # """
537
- # Saves a given text string into a .docx file locally, then uploads to Google Drive.
538
-
539
- # Args:
540
- # text_content (str): The text string to save.
541
- # filename (str): The target .docx file name, e.g. 'BRU18_merged_document.docx'.
542
- # drive_folder_id (str): Google Drive folder ID where to upload the file.
543
- # """
544
- # try:
545
- # # Save to temporary local path first
546
- # print("file name: ", filename)
547
- # print("length text content: ", len(text_content))
548
- # local_path = os.path.join(tempfile.gettempdir(), filename)
549
- # document = Document()
550
- # for paragraph_text in text_content.split('\n'):
551
- # document.add_paragraph(paragraph_text)
552
- # document.save(local_path)
553
- # print(f"✅ Text saved locally to: {local_path}")
554
-
555
- # # ✅ Upload to Drive
556
- # pipeline.upload_file_to_drive(local_path, filename, drive_folder_id)
557
- # print(f"✅ Uploaded '{filename}' to Google Drive folder ID: {drive_folder_id}")
558
-
559
- # except Exception as e:
560
- # print(f"❌ Error saving or uploading DOCX: {e}")
561
- def save_text_to_docx(text_content: str, full_local_path: str):
562
- document = Document()
563
- for paragraph_text in text_content.split('\n'):
564
- document.add_paragraph(paragraph_text)
565
- document.save(full_local_path)
566
- print(f"✅ Saved DOCX locally: {full_local_path}")
567
-
568
-
569
-
570
- '''2 scenerios:
571
- - quick look then found then deepdive and directly get location then stop
572
- - quick look then found then deepdive but not find location then hold the related words then
573
- look another files iteratively for each related word and find location and stop'''
574
- def extract_context(text, keyword, window=500):
575
- # firstly try accession number
576
- code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
577
-
578
- # Attempt to parse the keyword into its prefix and numerical part using re.search
579
- keyword_match = code_pattern.search(keyword)
580
-
581
- keyword_prefix = None
582
- keyword_num = None
583
-
584
- if keyword_match:
585
- keyword_prefix = keyword_match.group(1).lower()
586
- keyword_num = int(keyword_match.group(2))
587
- text = text.lower()
588
- idx = text.find(keyword.lower())
589
- if idx == -1:
590
- if keyword_prefix:
591
- idx = text.find(keyword_prefix)
592
- if idx == -1:
593
- return "Sample ID not found."
594
- return text[max(0, idx-window): idx+window]
595
- return text[max(0, idx-window): idx+window]
596
- def process_inputToken(filePaths, saveLinkFolder,accession=None, isolate=None):
597
- cache = {}
598
- country = "unknown"
599
- output = ""
600
- tem_output, small_output = "",""
601
- keyword_appear = (False,"")
602
- keywords = []
603
- if isolate: keywords.append(isolate)
604
- if accession: keywords.append(accession)
605
- for f in filePaths:
606
- # scenerio 1: direct location: truncate the context and then use qa model?
607
- if keywords:
608
- for keyword in keywords:
609
- text, tables, final_input = preprocess_document(f,saveLinkFolder, isolate=keyword)
610
- if keyword in final_input:
611
- context = extract_context(final_input, keyword)
612
- # quick look if country already in context and if yes then return
613
- country = model.get_country_from_text(context)
614
- if country != "unknown":
615
- return country, context, final_input
616
- else:
617
- country = model.get_country_from_text(final_input)
618
- if country != "unknown":
619
- return country, context, final_input
620
- else: # might be cross-ref
621
- keyword_appear = (True, f)
622
- cache[f] = context
623
- small_output = merge_texts_skipping_overlap(output, context) + "\n"
624
- chunkBFS = get_contextual_sentences_BFS(small_output, keyword)
625
- countryBFS = model.get_country_from_text(chunkBFS)
626
- countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
627
- output = merge_texts_skipping_overlap(output, final_input)
628
- if countryDFS != "unknown" and countryBFS != "unknown":
629
- if len(chunkDFS) <= len(chunkBFS):
630
- return countryDFS, chunkDFS, output
631
- else:
632
- return countryBFS, chunkBFS, output
633
- else:
634
- if countryDFS != "unknown":
635
- return countryDFS, chunkDFS, output
636
- if countryBFS != "unknown":
637
- return countryBFS, chunkBFS, output
638
- else:
639
- # scenerio 2:
640
- '''cross-ref: ex: A1YU101 keyword in file 2 which includes KM1 but KM1 in file 1
641
- but if we look at file 1 first then maybe we can have lookup dict which country
642
- such as Thailand as the key and its re'''
643
- cache[f] = final_input
644
- if keyword_appear[0] == True:
645
- for c in cache:
646
- if c!=keyword_appear[1]:
647
- if cache[c].lower() not in output.lower():
648
- output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
649
- chunkBFS = get_contextual_sentences_BFS(output, keyword)
650
- countryBFS = model.get_country_from_text(chunkBFS)
651
- countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
652
- if countryDFS != "unknown" and countryBFS != "unknown":
653
- if len(chunkDFS) <= len(chunkBFS):
654
- return countryDFS, chunkDFS, output
655
- else:
656
- return countryBFS, chunkBFS, output
657
- else:
658
- if countryDFS != "unknown":
659
- return countryDFS, chunkDFS, output
660
- if countryBFS != "unknown":
661
- return countryBFS, chunkBFS, output
662
- else:
663
- if cache[f].lower() not in output.lower():
664
- output = merge_texts_skipping_overlap(output, cache[f]) + "\n"
665
- if len(output) == 0 or keyword_appear[0]==False:
666
- for c in cache:
667
- if cache[c].lower() not in output.lower():
668
- output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
669
  return country, "", output
 
1
+ import re
2
+ import os
3
+ #import streamlit as st
4
+ import subprocess
5
+ import re
6
+ from Bio import Entrez
7
+ from docx import Document
8
+ import fitz
9
+ import spacy
10
+ from spacy.cli import download
11
+ from NER.PDF import pdf
12
+ from NER.WordDoc import wordDoc
13
+ from NER.html import extractHTML
14
+ from NER.word2Vec import word2vec
15
+ #from transformers import pipeline
16
+ import urllib.parse, requests
17
+ from pathlib import Path
18
+ import pandas as pd
19
+ import model
20
+ import pipeline
21
+ import tempfile
22
+ import nltk
23
+ nltk.download('punkt_tab')
24
+ def download_excel_file(url, save_path="temp.xlsx"):
25
+ if "view.officeapps.live.com" in url:
26
+ parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
27
+ real_url = urllib.parse.unquote(parsed_url["src"][0])
28
+ response = requests.get(real_url)
29
+ with open(save_path, "wb") as f:
30
+ f.write(response.content)
31
+ return save_path
32
+ elif url.startswith("http") and (url.endswith(".xls") or url.endswith(".xlsx")):
33
+ response = requests.get(url)
34
+ response.raise_for_status() # Raises error if download fails
35
+ with open(save_path, "wb") as f:
36
+ f.write(response.content)
37
+ print(len(response.content))
38
+ return save_path
39
+ else:
40
+ print("URL must point directly to an .xls or .xlsx file, or it has already been downloaded.")
41
+ return url
42
+ def extract_text(link,saveFolder):
43
+ try:
44
+ text = ""
45
+ name = link.split("/")[-1]
46
+ print("name: ", name)
47
+ #file_path = Path(saveFolder) / name
48
+ local_temp_path = os.path.join(tempfile.gettempdir(), name)
49
+ print("this is local temp path: ", local_temp_path)
50
+ if os.path.exists(local_temp_path):
51
+ input_to_class = local_temp_path
52
+ print("exist")
53
+ else:
54
+ #input_to_class = link # Let the class handle downloading
55
+ # 1. Check if file exists in shared Google Drive folder
56
+ file_id = pipeline.find_drive_file(name, saveFolder)
57
+ if file_id:
58
+ print("📥 Downloading from Google Drive...")
59
+ pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
60
+ else:
61
+ print("🌐 Downloading from web link...")
62
+ response = requests.get(link)
63
+ with open(local_temp_path, 'wb') as f:
64
+ f.write(response.content)
65
+ print("✅ Saved locally.")
66
+
67
+ # 2. Upload to Drive so it's available for later
68
+ pipeline.upload_file_to_drive(local_temp_path, name, saveFolder)
69
+
70
+ input_to_class = local_temp_path
71
+ print(input_to_class)
72
+ # pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
73
+ # pdf
74
+ if link.endswith(".pdf"):
75
+ # if file_path.is_file():
76
+ # link = saveFolder + "/" + name
77
+ # print("File exists.")
78
+ #p = pdf.PDF(local_temp_path, saveFolder)
79
+ print("inside pdf and input to class: ", input_to_class)
80
+ print("save folder in extract text: ", saveFolder)
81
+ p = pdf.PDF(input_to_class, saveFolder)
82
+ #p = pdf.PDF(link,saveFolder)
83
+ text = p.extractTextWithPDFReader()
84
+ print("text from pdf:")
85
+ print(text)
86
+ #text_exclude_table = p.extract_text_excluding_tables()
87
+ # worddoc
88
+ elif link.endswith(".doc") or link.endswith(".docx"):
89
+ #d = wordDoc.wordDoc(local_temp_path,saveFolder)
90
+ d = wordDoc.wordDoc(input_to_class,saveFolder)
91
+ text = d.extractTextByPage()
92
+ # html
93
+ else:
94
+ if link.split(".")[-1].lower() not in "xlsx":
95
+ if "http" in link or "html" in link:
96
+ print("html link: ", link)
97
+ html = extractHTML.HTML("",link)
98
+ text = html.getListSection() # the text already clean
99
+ print("text html: ")
100
+ print(text)
101
+ # Cleanup: delete the local temp file
102
+ if name:
103
+ if os.path.exists(local_temp_path):
104
+ os.remove(local_temp_path)
105
+ print(f"🧹 Deleted local temp file: {local_temp_path}")
106
+ print("done extract text")
107
+ except Exception as e:
108
+ print(f"❌ extract_text failed for {link}: {e}")
+ text = ""
109
+ return text
110
+
111
+ def extract_table(link,saveFolder):
112
+ try:
113
+ table = []
114
+ name = link.split("/")[-1]
115
+ #file_path = Path(saveFolder) / name
116
+ local_temp_path = os.path.join(tempfile.gettempdir(), name)
117
+ if os.path.exists(local_temp_path):
118
+ input_to_class = local_temp_path
119
+ print("exist")
120
+ else:
121
+ #input_to_class = link # Let the class handle downloading
122
+ # 1. Check if file exists in shared Google Drive folder
123
+ file_id = pipeline.find_drive_file(name, saveFolder)
124
+ if file_id:
125
+ print("📥 Downloading from Google Drive...")
126
+ pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
127
+ else:
128
+ print("🌐 Downloading from web link...")
129
+ response = requests.get(link)
130
+ with open(local_temp_path, 'wb') as f:
131
+ f.write(response.content)
132
+ print("✅ Saved locally.")
133
+
134
+ # 2. Upload to Drive so it's available for later
135
+ pipeline.upload_file_to_drive(local_temp_path, name, saveFolder)
136
+
137
+ input_to_class = local_temp_path
138
+ print(input_to_class)
139
+ #pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
140
+ # pdf
141
+ if link.endswith(".pdf"):
142
+ # if file_path.is_file():
143
+ # link = saveFolder + "/" + name
144
+ # print("File exists.")
145
+ #p = pdf.PDF(local_temp_path,saveFolder)
146
+ p = pdf.PDF(input_to_class,saveFolder)
147
+ table = p.extractTable()
148
+ # worddoc
149
+ elif link.endswith(".doc") or link.endswith(".docx"):
150
+ #d = wordDoc.wordDoc(local_temp_path,saveFolder)
151
+ d = wordDoc.wordDoc(input_to_class,saveFolder)
152
+ table = d.extractTableAsList()
153
+ # excel
154
+ elif link.split(".")[-1].lower() in "xlsx":
155
+ # download excel file if it not downloaded yet
156
+ savePath = saveFolder +"/"+ link.split("/")[-1]
157
+ excelPath = download_excel_file(link, savePath)
158
+ try:
159
+ #xls = pd.ExcelFile(excelPath)
160
+ xls = pd.ExcelFile(local_temp_path)
161
+ table_list = []
162
+ for sheet_name in xls.sheet_names:
163
+ df = pd.read_excel(xls, sheet_name=sheet_name)
164
+ cleaned_table = df.fillna("").astype(str).values.tolist()
165
+ table_list.append(cleaned_table)
166
+ table = table_list
167
+ except Exception as e:
168
+ print("❌ Failed to extract tables from Excel:", e)
169
+ # html
170
+ elif "http" in link or "html" in link:
171
+ html = extractHTML.HTML("",link)
172
+ table = html.extractTable() # table is a list
173
+ table = clean_tables_format(table)
174
+ # Cleanup: delete the local temp file
175
+ if os.path.exists(local_temp_path):
176
+ os.remove(local_temp_path)
177
+ print(f"🧹 Deleted local temp file: {local_temp_path}")
178
+ except Exception as e:
179
+ print(f"❌ extract_table failed for {link}: {e}")
+ table = []
180
+ return table
181
+
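extract_text and extract_table repeat the same fetch-and-cache steps (local temp file, then the shared Drive folder, then the web, with a re-upload to Drive). As a sketch only — not part of this commit — that logic could be factored into one helper built on the same pipeline functions:

def fetch_to_temp(link, drive_folder):
    """Return a local temp path for `link`, preferring the temp dir, then Drive, then the web."""
    name = link.split("/")[-1]
    local_path = os.path.join(tempfile.gettempdir(), name)
    if not os.path.exists(local_path):
        if pipeline.find_drive_file(name, drive_folder):
            pipeline.download_file_from_drive(name, drive_folder, local_path)
        else:
            response = requests.get(link)
            with open(local_path, "wb") as f:
                f.write(response.content)
            # Cache the fresh download in Drive so later runs skip the web fetch.
            pipeline.upload_file_to_drive(local_path, name, drive_folder)
    return local_path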
182
+ def clean_tables_format(tables):
183
+ """
184
+ Ensures all tables are in consistent format: List[List[List[str]]]
185
+ Cleans by:
186
+ - Removing empty strings and rows
187
+ - Converting all cells to strings
188
+ - Handling DataFrames and list-of-lists
189
+ """
190
+ cleaned = []
191
+ if tables:
192
+ for table in tables:
193
+ standardized = []
194
+
195
+ # Case 1: Pandas DataFrame
196
+ if isinstance(table, pd.DataFrame):
197
+ table = table.fillna("").astype(str).values.tolist()
198
+
199
+ # Case 2: List of Lists
200
+ if isinstance(table, list) and all(isinstance(row, list) for row in table):
201
+ for row in table:
202
+ filtered_row = [str(cell).strip() for cell in row if str(cell).strip()]
203
+ if filtered_row:
204
+ standardized.append(filtered_row)
205
+
206
+ if standardized:
207
+ cleaned.append(standardized)
208
+
209
+ return cleaned
210
+
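A quick illustration with made-up data: a DataFrame and a ragged list-of-lists both come out as List[List[List[str]]], with NaNs/empty cells and all-empty rows dropped (note that DataFrame headers are not kept, since only the cell values are used).

import pandas as pd

raw_tables = [
    pd.DataFrame({"Sample": ["KM1", None], "Country": ["Laos", "Thailand"]}),
    [["ID", "", "Region"], ["", "", ""], ["A1YU101", "north", ""]],
]
cleaned = clean_tables_format(raw_tables)
# cleaned == [
#     [['KM1', 'Laos'], ['Thailand']],           # NaN -> '' -> dropped from its row
#     [['ID', 'Region'], ['A1YU101', 'north']],  # empty cells and the all-empty row removed
# ]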
211
+ import json
212
+ def normalize_text_for_comparison(s: str) -> str:
213
+ """
214
+ Normalizes text for robust comparison by:
215
+ 1. Converting to lowercase.
216
+ 2. Replacing all types of newlines with a single consistent newline (\n).
217
+ 3. Removing extra spaces (e.g., multiple spaces, leading/trailing spaces on lines).
218
+ 4. Stripping leading/trailing whitespace from the entire string.
219
+ """
220
+ s = s.lower()
221
+ s = s.replace('\r\n', '\n') # Handle Windows newlines
222
+ s = s.replace('\r', '\n') # Handle Mac classic newlines
223
+
224
+ # Replace sequences of whitespace (including multiple newlines) with a single space
225
+ # This might be too aggressive if you need to preserve paragraph breaks,
226
+ # but good for exact word-sequence matching.
227
+ s = re.sub(r'\s+', ' ', s)
228
+
229
+ return s.strip()
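For instance (hypothetical string), the normalization collapses case, newline styles and whitespace runs, which is what makes the substring checks in merge_text_and_tables below reliable:

normalize_text_for_comparison("Sample  KM1\r\n  from   Laos ")
# -> 'sample km1 from laos'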
230
+ def merge_text_and_tables(text, tables, max_tokens=12000, keep_tables=True, tokenizer="cl100k_base", accession_id=None, isolate=None):
231
+ """
232
+ Merge cleaned text and table into one string for LLM input.
233
+ - Avoids duplicating tables already in text
234
+ - Extracts only relevant rows from large tables
235
+ - Skips or saves oversized tables
236
+ """
237
+ import importlib
238
+ json = importlib.import_module("json")
239
+
240
+ def estimate_tokens(text_str):
241
+ try:
242
+ enc = tiktoken.get_encoding(tokenizer)
243
+ return len(enc.encode(text_str))
244
+ except:
245
+ return len(text_str) // 4 # Fallback estimate
246
+
247
+ def is_table_relevant(table, keywords, accession_id=None):
248
+ flat = " ".join(" ".join(row).lower() for row in table)
249
+ if accession_id and accession_id.lower() in flat:
250
+ return True
251
+ return any(kw.lower() in flat for kw in keywords)
252
+ preview, preview1 = "",""
253
+ llm_input = "## Document Text\n" + text.strip() + "\n"
254
+ clean_text = normalize_text_for_comparison(text)
255
+
256
+ if tables:
257
+ for idx, table in enumerate(tables):
258
+ keywords = ["province", "district", "region", "village", "location", "country", "origin", "ancient", "modern"]
259
+ if accession_id: keywords += [accession_id.lower()]
260
+ if isolate: keywords += [isolate.lower()]
261
+ if is_table_relevant(table, keywords, accession_id):
262
+ if len(table) > 0:
263
+ for tab in table:
264
+ preview = " ".join(tab) if tab else ""
265
+ preview1 = "\n".join(tab) if tab else ""
266
+ clean_preview = normalize_text_for_comparison(preview)
267
+ clean_preview1 = normalize_text_for_comparison(preview1)
268
+ if clean_preview not in clean_text:
269
+ if clean_preview1 not in clean_text:
270
+ table_str = json.dumps([tab], indent=2)
271
+ llm_input += f"## Table {idx+1}\n{table_str}\n"
272
+ return llm_input.strip()
273
+
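A small made-up example of the merge behaviour: the document text always comes first under '## Document Text', and a table is appended (row by row, as JSON under '## Table 1') only when it mentions the accession/isolate or a location keyword and its rows are not already present in the text.

merged = merge_text_and_tables(
    text="The ancient samples were collected in northern Laos in 2004.",
    tables=[[["Sample", "Country"], ["KM1", "Laos"]]],
    accession_id="KM1",
)
# merged starts with '## Document Text' followed by the paragraph above,
# then '## Table 1' blocks containing the JSON-dumped rows of the matching table.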
274
+ def preprocess_document(link, saveFolder, accession=None, isolate=None):
275
+ try:
276
+ text = extract_text(link, saveFolder)
277
+ print("text and link")
278
+ print(link)
279
+ print(text)
280
+ except: text = ""
281
+ try:
282
+ tables = extract_table(link, saveFolder)
283
+ except: tables = []
284
+ if accession: accession = accession
285
+ if isolate: isolate = isolate
286
+ try:
287
+ final_input = merge_text_and_tables(text, tables, max_tokens=12000, accession_id=accession, isolate=isolate)
288
+ except: final_input = ""
289
+ return text, tables, final_input
290
+
291
+ def extract_sentences(text):
292
+ sentences = re.split(r'(?<=[.!?])\s+', text)
293
+ return [s.strip() for s in sentences if s.strip()]
294
+
295
+ def is_irrelevant_number_sequence(text):
296
+ if re.search(r'\b[A-Z]{2,}\d+\b|\b[A-Za-z]+\s+\d+\b', text, re.IGNORECASE):
297
+ return False
298
+ word_count = len(re.findall(r'\b[A-Za-z]{2,}\b', text))
299
+ number_count = len(re.findall(r'\b\d[\d\.]*\b', text))
300
+ total_tokens = len(re.findall(r'\S+', text))
301
+ if total_tokens > 0 and (word_count / total_tokens < 0.2) and (number_count / total_tokens > 0.5):
302
+ return True
303
+ elif re.fullmatch(r'(\d+(\.\d+)?\s*)+', text.strip()):
304
+ return True
305
+ return False
306
+
307
+ def remove_isolated_single_digits(sentence):
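+ """Remove standalone '0' and '1' tokens from the sentence."""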
308
+ tokens = sentence.split()
309
+ filtered_tokens = []
310
+ for token in tokens:
311
+ if token == '0' or token == '1':
312
+ pass
313
+ else:
314
+ filtered_tokens.append(token)
315
+ return ' '.join(filtered_tokens).strip()
316
+
317
+ def get_contextual_sentences_BFS(text_content, keyword, depth=2):
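+ """Gather sentences that mention `keyword` directly or via a code range (e.g. A1YU101-A1YU137),
+ then expand by breadth-first traversal over co-occurring sample codes up to `depth` levels."""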
318
+ def extract_codes(sentence):
319
+ # Match codes like 'A1YU101', 'KM1', 'MO6' — at least 2 letters + numbers
320
+ return [code for code in re.findall(r'\b[A-Z]{2,}[0-9]+\b', sentence, re.IGNORECASE)]
321
+ sentences = extract_sentences(text_content)
322
+ relevant_sentences = set()
323
+ initial_keywords = set()
324
+
325
+ # Define a regex to capture codes like A1YU101 or KM1
326
+ # This pattern looks for an alphanumeric sequence followed by digits at the end of the string
327
+ code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
328
+
329
+ # Attempt to parse the keyword into its prefix and numerical part using re.search
330
+ keyword_match = code_pattern.search(keyword)
331
+
332
+ keyword_prefix = None
333
+ keyword_num = None
334
+
335
+ if keyword_match:
336
+ keyword_prefix = keyword_match.group(1).lower()
337
+ keyword_num = int(keyword_match.group(2))
338
+
339
+ for sentence in sentences:
340
+ sentence_added = False
341
+
342
+ # 1. Check for exact match of the keyword
343
+ if re.search(r'\b' + re.escape(keyword) + r'\b', sentence, re.IGNORECASE):
344
+ relevant_sentences.add(sentence.strip())
345
+ initial_keywords.add(keyword.lower())
346
+ sentence_added = True
347
+
348
+ # 2. Check for range patterns (e.g., A1YU101-A1YU137)
349
+ # The range pattern should be broad enough to capture the full code string within the range.
350
+ range_matches = re.finditer(r'([A-Z0-9]+\d+)-([A-Z0-9]+\d+)', sentence, re.IGNORECASE) # Match ranges like A1YU101-A1YU137
352
+
353
+ for r_match in range_matches:
354
+ start_code_str = r_match.group(1)
355
+ end_code_str = r_match.group(2)
356
+
357
+ # CRITICAL FIX: Use code_pattern.search for start_match and end_match
358
+ start_match = code_pattern.search(start_code_str)
359
+ end_match = code_pattern.search(end_code_str)
360
+
361
+ if keyword_prefix and keyword_num is not None and start_match and end_match:
362
+ start_prefix = start_match.group(1).lower()
363
+ end_prefix = end_match.group(1).lower()
364
+ start_num = int(start_match.group(2))
365
+ end_num = int(end_match.group(2))
366
+
367
+ # Check if the keyword's prefix matches and its number is within the range
368
+ if keyword_prefix == start_prefix and \
369
+ keyword_prefix == end_prefix and \
370
+ start_num <= keyword_num <= end_num:
371
+ relevant_sentences.add(sentence.strip())
372
+ initial_keywords.add(start_code_str.lower())
373
+ initial_keywords.add(end_code_str.lower())
374
+ sentence_added = True
375
+ break # Only need to find one matching range per sentence
376
+
377
+ # 3. If the sentence was added due to exact match or range, add all its alphanumeric codes
378
+ # to initial_keywords to ensure graph traversal from related terms.
379
+ if sentence_added:
380
+ for word in extract_codes(sentence):
381
+ initial_keywords.add(word.lower())
382
+
383
+
384
+ # Build word_to_sentences mapping for all sentences
385
+ word_to_sentences = {}
386
+ for sent in sentences:
387
+ codes_in_sent = set(extract_codes(sent))
388
+ for code in codes_in_sent:
389
+ word_to_sentences.setdefault(code.lower(), set()).add(sent.strip())
390
+
391
+
392
+ # Build the graph
393
+ graph = {}
394
+ for sent in sentences:
395
+ codes = set(extract_codes(sent))
396
+ for word1 in codes:
397
+ word1_lower = word1.lower()
398
+ graph.setdefault(word1_lower, set())
399
+ for word2 in codes:
400
+ word2_lower = word2.lower()
401
+ if word1_lower != word2_lower:
402
+ graph[word1_lower].add(word2_lower)
403
+
404
+
405
+ # Perform BFS/graph traversal
406
+ queue = [(k, 0) for k in initial_keywords if k in word_to_sentences]
407
+ visited_words = set(initial_keywords)
408
+
409
+ while queue:
410
+ current_word, level = queue.pop(0)
411
+ if level >= depth:
412
+ continue
413
+
414
+ relevant_sentences.update(word_to_sentences.get(current_word, []))
415
+
416
+ for neighbor in graph.get(current_word, []):
417
+ if neighbor not in visited_words:
418
+ visited_words.add(neighbor)
419
+ queue.append((neighbor, level + 1))
420
+
421
+ final_sentences = set()
422
+ for sentence in relevant_sentences:
423
+ if not is_irrelevant_number_sequence(sentence):
424
+ processed_sentence = remove_isolated_single_digits(sentence)
425
+ if processed_sentence:
426
+ final_sentences.add(processed_sentence)
427
+
428
+ return "\n".join(sorted(list(final_sentences)))
429
+
430
+
431
+
432
+ def get_contextual_sentences_DFS(text_content, keyword, depth=2):
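+ """Depth-first variant: walk co-occurring sample codes and stop as soon as model.get_country_from_text
+ resolves a country; returns (country, supporting sentences), or (country, original text) if none survive filtering."""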
433
+ sentences = extract_sentences(text_content)
434
+
435
+ # Build word-to-sentences mapping
436
+ word_to_sentences = {}
437
+ for sent in sentences:
438
+ words_in_sent = set(re.findall(r'\b[A-Za-z0-9\-_\/]+\b', sent))
439
+ for word in words_in_sent:
440
+ word_to_sentences.setdefault(word.lower(), set()).add(sent.strip())
441
+
442
+ # Function to extract codes in a sentence
443
+ def extract_codes(sentence):
444
+ # Only codes like 'KSK1', 'MG272794', not pure numbers
445
+ return [code for code in re.findall(r'\b[A-Z]{2,}[0-9]+\b', sentence, re.IGNORECASE)]
446
+
447
+ # DFS with priority based on distance to keyword and early stop if country found
448
+ def dfs_traverse(current_word, current_depth, max_depth, visited_words, collected_sentences, parent_sentence=None):
449
+ country = "unknown"
450
+ if current_depth > max_depth:
451
+ return country, False
452
+
453
+ if current_word not in word_to_sentences:
454
+ return country, False
455
+
456
+ for sentence in word_to_sentences[current_word]:
457
+ if sentence == parent_sentence:
458
+ continue # avoid reusing the same sentence
459
+
460
+ collected_sentences.add(sentence)
461
+
462
+ #print("current_word:", current_word)
463
+ small_sen = extract_context(sentence, current_word, int(len(sentence) / 4))
464
+ #print(small_sen)
465
+ country = model.get_country_from_text(small_sen)
466
+ #print("small context country:", country)
467
+ if country.lower() != "unknown":
468
+ return country, True
469
+ else:
470
+ country = model.get_country_from_text(sentence)
471
+ #print("full sentence country:", country)
472
+ if country.lower() != "unknown":
473
+ return country, True
474
+
475
+ codes_in_sentence = extract_codes(sentence)
476
+ idx = next((i for i, code in enumerate(codes_in_sentence) if code.lower() == current_word.lower()), None)
477
+ if idx is None:
478
+ continue
479
+
480
+ sorted_children = sorted(
481
+ [code for code in codes_in_sentence if code.lower() not in visited_words],
482
+ key=lambda x: (abs(codes_in_sentence.index(x) - idx),
483
+ 0 if codes_in_sentence.index(x) > idx else 1)
484
+ )
485
+
486
+ #print("sorted_children:", sorted_children)
487
+ for child in sorted_children:
488
+ child_lower = child.lower()
489
+ if child_lower not in visited_words:
490
+ visited_words.add(child_lower)
491
+ country, should_stop = dfs_traverse(
492
+ child_lower, current_depth + 1, max_depth,
493
+ visited_words, collected_sentences, parent_sentence=sentence
494
+ )
495
+ if should_stop:
496
+ return country, True
497
+
498
+ return country, False
499
+
500
+ # Begin DFS
501
+ collected_sentences = set()
502
+ visited_words = set([keyword.lower()])
503
+ country, status = dfs_traverse(keyword.lower(), 0, depth, visited_words, collected_sentences)
504
+
505
+ # Filter irrelevant sentences
506
+ final_sentences = set()
507
+ for sentence in collected_sentences:
508
+ if not is_irrelevant_number_sequence(sentence):
509
+ processed = remove_isolated_single_digits(sentence)
510
+ if processed:
511
+ final_sentences.add(processed)
512
+ if not final_sentences:
513
+ return country, text_content
514
+ return country, "\n".join(sorted(list(final_sentences)))
515
+
516
+ # Helper function for normalizing text for overlap comparison
517
+ def normalize_for_overlap(s: str) -> str:
518
+ s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s).lower()
519
+ s = re.sub(r'\s+', ' ', s).strip()
520
+ return s
521
+
522
+ def merge_texts_skipping_overlap(text1: str, text2: str) -> str:
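+ """Concatenate text1 and text2 while skipping duplicated overlap, matched either at the junction
+ (suffix of text1 == prefix of text2) or as a shared leading prefix; otherwise plain concatenation."""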
523
+ if not text1: return text2
524
+ if not text2: return text1
525
+
526
+ # Case 1: text2 is fully contained in text1 or vice-versa
527
+ if text2 in text1:
528
+ return text1
529
+ if text1 in text2:
530
+ return text2
531
+
532
+ # --- Option 1: Original behavior (suffix of text1, prefix of text2) ---
533
+ # This is the primary merge strategy.
534
+ # It looks for the overlap at the "junction" of text1 and text2.
535
+
536
+ max_junction_overlap = 0
537
+ for i in range(min(len(text1), len(text2)), 0, -1):
538
+ suffix1 = text1[-i:]
539
+ prefix2 = text2[:i]
540
+ # Prioritize exact match, then normalized match
541
+ if suffix1 == prefix2:
542
+ max_junction_overlap = i
543
+ break
544
+ elif normalize_for_overlap(suffix1) == normalize_for_overlap(prefix2):
545
+ max_junction_overlap = i
546
+ break # Take the first (longest) normalized match
547
+
548
+ if max_junction_overlap > 0:
549
+ merged_text = text1 + text2[max_junction_overlap:]
550
+ return re.sub(r'\s+', ' ', merged_text).strip()
551
+
552
+ # --- Option 2: Longest Common Prefix (for cases like "Hi, I am Vy.") ---
553
+ # This handles the case where the overlap is at the very beginning of both strings.
554
+ # This is often used when trying to deduplicate content that shares a common start.
555
+
556
+ longest_common_prefix_len = 0
557
+ min_len = min(len(text1), len(text2))
558
+ for i in range(min_len):
559
+ if text1[i] == text2[i]:
560
+ longest_common_prefix_len = i + 1
561
+ else:
562
+ break
563
+
564
+ # If a common prefix is found AND it's a significant portion (e.g., more than a few chars)
565
+ # AND the remaining parts are distinct, then apply this merge.
566
+ # This is a heuristic and might need fine-tuning.
567
+ if longest_common_prefix_len > 0 and \
568
+ text1[longest_common_prefix_len:].strip() and \
569
+ text2[longest_common_prefix_len:].strip():
570
+
571
+ # Only merge this way if the remaining parts are not empty (i.e., not exact duplicates)
572
+ # For "Hi, I am Vy. Nice to meet you." and "Hi, I am Vy. Goodbye Vy."
573
+ # common prefix is "Hi, I am Vy."
574
+ # Remaining text1: " Nice to meet you."
575
+ # Remaining text2: " Goodbye Vy."
576
+ # So we merge common_prefix + remaining_text1 + remaining_text2
577
+
578
+ common_prefix_str = text1[:longest_common_prefix_len]
579
+ remainder_text1 = text1[longest_common_prefix_len:]
580
+ remainder_text2 = text2[longest_common_prefix_len:]
581
+
582
+ merged_text = common_prefix_str + remainder_text1 + remainder_text2
583
+ return re.sub(r'\s+', ' ', merged_text).strip()
584
+
585
+
586
+ # If neither specific overlap type is found, just concatenate
587
+ merged_text = text1 + text2
588
+ return re.sub(r'\s+', ' ', merged_text).strip()
589
+
590
+ from docx import Document
591
+ from pipeline import upload_file_to_drive
592
+ # def save_text_to_docx(text_content: str, file_path: str):
593
+ # """
594
+ # Saves a given text string into a .docx file.
595
+
596
+ # Args:
597
+ # text_content (str): The text string to save.
598
+ # file_path (str): The full path including the filename where the .docx file will be saved.
599
+ # Example: '/content/drive/MyDrive/CollectData/Examples/test/SEA_1234/merged_document.docx'
600
+ # """
601
+ # try:
602
+ # document = Document()
603
+
604
+ # # Add the entire text as a single paragraph, or split by newlines for multiple paragraphs
605
+ # for paragraph_text in text_content.split('\n'):
606
+ # document.add_paragraph(paragraph_text)
607
+
608
+ # document.save(file_path)
609
+ # print(f"Text successfully saved to '{file_path}'")
610
+ # except Exception as e:
611
+ # print(f"Error saving text to docx file: {e}")
612
+ # def save_text_to_docx(text_content: str, filename: str, drive_folder_id: str):
613
+ # """
614
+ # Saves a given text string into a .docx file locally, then uploads to Google Drive.
615
+
616
+ # Args:
617
+ # text_content (str): The text string to save.
618
+ # filename (str): The target .docx file name, e.g. 'BRU18_merged_document.docx'.
619
+ # drive_folder_id (str): Google Drive folder ID where to upload the file.
620
+ # """
621
+ # try:
622
+ # # Save to temporary local path first
623
+ # print("file name: ", filename)
624
+ # print("length text content: ", len(text_content))
625
+ # local_path = os.path.join(tempfile.gettempdir(), filename)
626
+ # document = Document()
627
+ # for paragraph_text in text_content.split('\n'):
628
+ # document.add_paragraph(paragraph_text)
629
+ # document.save(local_path)
630
+ # print(f"✅ Text saved locally to: {local_path}")
631
+
632
+ # # Upload to Drive
633
+ # pipeline.upload_file_to_drive(local_path, filename, drive_folder_id)
634
+ # print(f"✅ Uploaded '{filename}' to Google Drive folder ID: {drive_folder_id}")
635
+
636
+ # except Exception as e:
637
+ # print(f"❌ Error saving or uploading DOCX: {e}")
638
+ def save_text_to_docx(text_content: str, full_local_path: str):
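+ """Write text_content to a .docx file at full_local_path, one paragraph per newline-separated line."""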
639
+ document = Document()
640
+ for paragraph_text in text_content.split('\n'):
641
+ document.add_paragraph(paragraph_text)
642
+ document.save(full_local_path)
643
+ print(f"✅ Saved DOCX locally: {full_local_path}")
644
+
645
+
646
+
647
+ '''Two scenarios:
648
+ - quick look finds the keyword and a deep dive directly yields the location, then stop
649
+ - quick look finds the keyword but the deep dive finds no location, so hold the related words and
650
+ search the other files iteratively for each related word until a location is found'''
651
+ def extract_context(text, keyword, window=500):
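+ """Return roughly ±window characters of the lowercased text around the first occurrence of `keyword`
+ (falling back to its code prefix); returns 'Sample ID not found.' when neither is present."""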
652
+ # firstly try accession number
653
+ code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
654
+
655
+ # Attempt to parse the keyword into its prefix and numerical part using re.search
656
+ keyword_match = code_pattern.search(keyword)
657
+
658
+ keyword_prefix = None
659
+ keyword_num = None
660
+
661
+ if keyword_match:
662
+ keyword_prefix = keyword_match.group(1).lower()
663
+ keyword_num = int(keyword_match.group(2))
664
+ text = text.lower()
665
+ idx = text.find(keyword.lower())
666
+ if idx == -1:
667
+ if keyword_prefix:
668
+ idx = text.find(keyword_prefix)
669
+ if idx == -1:
670
+ return "Sample ID not found."
671
+ return text[max(0, idx-window): idx+window]
672
+ return text[max(0, idx-window): idx+window]
673
+ def process_inputToken(filePaths, saveLinkFolder,accession=None, isolate=None):
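+ """Scan the given files for the accession/isolate keywords and return
+ (predicted country, supporting context, merged document text)."""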
674
+ cache = {}
675
+ country = "unknown"
676
+ output = ""
677
+ tem_output, small_output = "",""
678
+ keyword_appear = (False,"")
679
+ keywords = []
680
+ if isolate: keywords.append(isolate)
681
+ if accession: keywords.append(accession)
682
+ for f in filePaths:
683
+ # scenario 1: direct location: truncate the context and then use the QA model
684
+ if keywords:
685
+ for keyword in keywords:
686
+ text, tables, final_input = preprocess_document(f,saveLinkFolder, isolate=keyword)
687
+ if keyword in final_input:
688
+ context = extract_context(final_input, keyword)
689
+ # quick look if country already in context and if yes then return
690
+ country = model.get_country_from_text(context)
691
+ if country != "unknown":
692
+ return country, context, final_input
693
+ else:
694
+ country = model.get_country_from_text(final_input)
695
+ if country != "unknown":
696
+ return country, context, final_input
697
+ else: # might be cross-ref
698
+ keyword_appear = (True, f)
699
+ cache[f] = context
700
+ small_output = merge_texts_skipping_overlap(output, context) + "\n"
701
+ chunkBFS = get_contextual_sentences_BFS(small_output, keyword)
702
+ countryBFS = model.get_country_from_text(chunkBFS)
703
+ countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
704
+ output = merge_texts_skipping_overlap(output, final_input)
705
+ if countryDFS != "unknown" and countryBFS != "unknown":
706
+ if len(chunkDFS) <= len(chunkBFS):
707
+ return countryDFS, chunkDFS, output
708
+ else:
709
+ return countryBFS, chunkBFS, output
710
+ else:
711
+ if countryDFS != "unknown":
712
+ return countryDFS, chunkDFS, output
713
+ if countryBFS != "unknown":
714
+ return countryBFS, chunkBFS, output
715
+ else:
716
+ # scenario 2:
717
+ '''cross-ref: e.g. the keyword A1YU101 appears in file 2, which also mentions KM1, but KM1 itself
718
+ is described in file 1; if file 1 had been read first, we could build a lookup dict with the country
719
+ (e.g. Thailand) as the key and its related codes as values'''
720
+ cache[f] = final_input
721
+ if keyword_appear[0] == True:
722
+ for c in cache:
723
+ if c!=keyword_appear[1]:
724
+ if cache[c].lower() not in output.lower():
725
+ output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
726
+ chunkBFS = get_contextual_sentences_BFS(output, keyword)
727
+ countryBFS = model.get_country_from_text(chunkBFS)
728
+ countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
729
+ if countryDFS != "unknown" and countryBFS != "unknown":
730
+ if len(chunkDFS) <= len(chunkBFS):
731
+ return countryDFS, chunkDFS, output
732
+ else:
733
+ return countryBFS, chunkBFS, output
734
+ else:
735
+ if countryDFS != "unknown":
736
+ return countryDFS, chunkDFS, output
737
+ if countryBFS != "unknown":
738
+ return countryBFS, chunkBFS, output
739
+ else:
740
+ if cache[f].lower() not in output.lower():
741
+ output = merge_texts_skipping_overlap(output, cache[f]) + "\n"
742
+ if len(output) == 0 or keyword_appear[0]==False:
743
+ for c in cache:
744
+ if cache[c].lower() not in output.lower():
745
+ output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
746
  return country, "", output
model.py CHANGED
The diff for this file is too large to render. See raw diff
 
mtdna_backend.py CHANGED
@@ -1,885 +1,897 @@
1
- import gradio as gr
2
- from collections import Counter
3
- import csv
4
- import os
5
- from functools import lru_cache
6
- #import app
7
- from mtdna_classifier import classify_sample_location
8
- import data_preprocess, model, pipeline
9
- import subprocess
10
- import json
11
- import pandas as pd
12
- import io
13
- import re
14
- import tempfile
15
- import gspread
16
- from oauth2client.service_account import ServiceAccountCredentials
17
- from io import StringIO
18
- import hashlib
19
- import threading
20
-
21
- # @lru_cache(maxsize=3600)
22
- # def classify_sample_location_cached(accession):
23
- # return classify_sample_location(accession)
24
-
25
- @lru_cache(maxsize=3600)
26
- def pipeline_classify_sample_location_cached(accession):
27
- return pipeline.pipeline_with_gemini([accession])
28
-
29
- # Count and suggest final location
30
- # def compute_final_suggested_location(rows):
31
- # candidates = [
32
- # row.get("Predicted Location", "").strip()
33
- # for row in rows
34
- # if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found", "unknown"]
35
- # ] + [
36
- # row.get("Inferred Region", "").strip()
37
- # for row in rows
38
- # if row.get("Inferred Region", "").strip().lower() not in ["", "sample id not found", "unknown"]
39
- # ]
40
-
41
- # if not candidates:
42
- # return Counter(), ("Unknown", 0)
43
- # # Step 1: Combine into one string and split using regex to handle commas, line breaks, etc.
44
- # tokens = []
45
- # for item in candidates:
46
- # # Split by comma, whitespace, and newlines
47
- # parts = re.split(r'[\s,]+', item)
48
- # tokens.extend(parts)
49
-
50
- # # Step 2: Clean and normalize tokens
51
- # tokens = [word.strip() for word in tokens if word.strip().isalpha()] # Keep only alphabetic tokens
52
-
53
- # # Step 3: Count
54
- # counts = Counter(tokens)
55
-
56
- # # Step 4: Get most common
57
- # top_location, count = counts.most_common(1)[0]
58
- # return counts, (top_location, count)
59
-
60
- # Store feedback (with required fields)
61
-
62
- def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
63
- if not answer1.strip() or not answer2.strip():
64
- return "⚠️ Please answer both questions before submitting."
65
-
66
- try:
67
- # ✅ Step: Load credentials from Hugging Face secret
68
- creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
69
- scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
70
- creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
71
-
72
- # Connect to Google Sheet
73
- client = gspread.authorize(creds)
74
- sheet = client.open("feedback_mtdna").sheet1 # make sure sheet name matches
75
-
76
- # Append feedback
77
- sheet.append_row([accession, answer1, answer2, contact])
78
- return "✅ Feedback submitted. Thank you!"
79
-
80
- except Exception as e:
81
- return f"❌ Error submitting feedback: {e}"
82
-
83
- # helper function to extract accessions
84
- def extract_accessions_from_input(file=None, raw_text=""):
85
- print(f"RAW TEXT RECEIVED: {raw_text}")
86
- accessions = []
87
- seen = set()
88
- if file:
89
- try:
90
- if file.name.endswith(".csv"):
91
- df = pd.read_csv(file)
92
- elif file.name.endswith(".xlsx"):
93
- df = pd.read_excel(file)
94
- else:
95
- return [], "Unsupported file format. Please upload CSV or Excel."
96
- for acc in df.iloc[:, 0].dropna().astype(str).str.strip():
97
- if acc not in seen:
98
- accessions.append(acc)
99
- seen.add(acc)
100
- except Exception as e:
101
- return [], f"Failed to read file: {e}"
102
-
103
- if raw_text:
104
- text_ids = [s.strip() for s in re.split(r"[\n,;\t]", raw_text) if s.strip()]
105
- for acc in text_ids:
106
- if acc not in seen:
107
- accessions.append(acc)
108
- seen.add(acc)
109
-
110
- return list(accessions), None
111
- # ✅ Add a new helper to backend: `filter_unprocessed_accessions()`
112
- def get_incomplete_accessions(file_path):
113
- df = pd.read_excel(file_path)
114
-
115
- incomplete_accessions = []
116
- for _, row in df.iterrows():
117
- sample_id = str(row.get("Sample ID", "")).strip()
118
-
119
- # Skip if no sample ID
120
- if not sample_id:
121
- continue
122
-
123
- # Drop the Sample ID and check if the rest is empty
124
- other_cols = row.drop(labels=["Sample ID"], errors="ignore")
125
- if other_cols.isna().all() or (other_cols.astype(str).str.strip() == "").all():
126
- # Extract the accession number from the sample ID using regex
127
- match = re.search(r"\b[A-Z]{2,4}\d{4,}", sample_id)
128
- if match:
129
- incomplete_accessions.append(match.group(0))
130
- print(len(incomplete_accessions))
131
- return incomplete_accessions
132
-
133
- # GOOGLE_SHEET_NAME = "known_samples"
134
- # USAGE_DRIVE_FILENAME = "user_usage_log.json"
135
-
136
- def summarize_results(accession):
137
- # try cache first
138
- cached = check_known_output(accession)
139
- if cached:
140
- print(f"✅ Using cached result for {accession}")
141
- return [[
142
- cached["Sample ID"] or "unknown",
143
- cached["Predicted Country"] or "unknown",
144
- cached["Country Explanation"] or "unknown",
145
- cached["Predicted Sample Type"] or "unknown",
146
- cached["Sample Type Explanation"] or "unknown",
147
- cached["Sources"] or "No Links",
148
- cached["Time cost"]
149
- ]]
150
- # only run when nothing in the cache
151
- try:
152
- print("try gemini pipeline: ",accession)
153
- outputs = pipeline_classify_sample_location_cached(accession)
154
- # outputs = {'KU131308': {'isolate':'BRU18',
155
- # 'country': {'brunei': ['ncbi',
156
- # 'rag_llm-The text mentions "BRU18 Brunei Borneo" in a table listing various samples, and it is not described as ancient or archaeological.']},
157
- # 'sample_type': {'modern':
158
- # ['rag_llm-The text mentions "BRU18 Brunei Borneo" in a table listing various samples, and it is not described as ancient or archaeological.']},
159
- # 'query_cost': 9.754999999999999e-05,
160
- # 'time_cost': '24.776 seconds',
161
- # 'source': ['https://doi.org/10.1007/s00439-015-1620-z',
162
- # 'https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf',
163
- # 'https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls']}}
164
- except Exception as e:
165
- return []#, f"Error: {e}", f"Error: {e}", f"Error: {e}"
166
-
167
- if accession not in outputs:
168
- print("no accession in output ", accession)
169
- return []#, "Accession not found in results.", "Accession not found in results.", "Accession not found in results."
170
-
171
- row_score = []
172
- rows = []
173
- save_rows = []
174
- for key in outputs:
175
- pred_country, pred_sample, country_explanation, sample_explanation = "unknown","unknown","unknown","unknown"
176
- for section, results in outputs[key].items():
177
- if section == "country" or section =="sample_type":
178
- pred_output = []#"\n".join(list(results.keys()))
179
- output_explanation = ""
180
- for result, content in results.items():
181
- if len(result) == 0: result = "unknown"
182
- if len(content) == 0: output_explanation = "unknown"
183
- else:
184
- output_explanation += 'Method: ' + "\nMethod: ".join(content) + "\n"
185
- pred_output.append(result)
186
- pred_output = "\n".join(pred_output)
187
- if section == "country":
188
- pred_country, country_explanation = pred_output, output_explanation
189
- elif section == "sample_type":
190
- pred_sample, sample_explanation = pred_output, output_explanation
191
- if outputs[key]["isolate"].lower()!="unknown":
192
- label = key + "(Isolate: " + outputs[key]["isolate"] + ")"
193
- else: label = key
194
- if len(outputs[key]["source"]) == 0: outputs[key]["source"] = ["No Links"]
195
- row = {
196
- "Sample ID": label or "unknown",
197
- "Predicted Country": pred_country or "unknown",
198
- "Country Explanation": country_explanation or "unknown",
199
- "Predicted Sample Type":pred_sample or "unknown",
200
- "Sample Type Explanation":sample_explanation or "unknown",
201
- "Sources": "\n".join(outputs[key]["source"]) or "No Links",
202
- "Time cost": outputs[key]["time_cost"]
203
- }
204
- #row_score.append(row)
205
- rows.append(list(row.values()))
206
-
207
- save_row = {
208
- "Sample ID": label or "unknown",
209
- "Predicted Country": pred_country or "unknown",
210
- "Country Explanation": country_explanation or "unknown",
211
- "Predicted Sample Type":pred_sample or "unknown",
212
- "Sample Type Explanation":sample_explanation or "unknown",
213
- "Sources": "\n".join(outputs[key]["source"]) or "No Links",
214
- "Query_cost": outputs[key]["query_cost"],
215
- "Time cost": outputs[key]["time_cost"]
216
- }
217
- #row_score.append(row)
218
- save_rows.append(list(save_row.values()))
219
-
220
- # #location_counts, (final_location, count) = compute_final_suggested_location(row_score)
221
- # summary_lines = [f"### 🧭 Location Summary:\n"]
222
- # summary_lines += [f"- **{loc}**: {cnt} times" for loc, cnt in location_counts.items()]
223
- # summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
224
- # summary = "\n".join(summary_lines)
225
-
226
- # save the new running sample to known excel file
227
- # try:
228
- # df_new = pd.DataFrame(save_rows, columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Query_cost","Time cost"])
229
- # if os.path.exists(KNOWN_OUTPUT_PATH):
230
- # df_old = pd.read_excel(KNOWN_OUTPUT_PATH)
231
- # df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
232
- # else:
233
- # df_combined = df_new
234
- # df_combined.to_excel(KNOWN_OUTPUT_PATH, index=False)
235
- # except Exception as e:
236
- # print(f"⚠️ Failed to save known output: {e}")
237
- # try:
238
- # df_new = pd.DataFrame(save_rows, columns=[
239
- # "Sample ID", "Predicted Country", "Country Explanation",
240
- # "Predicted Sample Type", "Sample Type Explanation",
241
- # "Sources", "Query_cost", "Time cost"
242
- # ])
243
-
244
- # # ✅ Google Sheets API setup
245
- # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
246
- # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
247
- # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
248
- # client = gspread.authorize(creds)
249
-
250
- # # ✅ Open the known_samples sheet
251
- # spreadsheet = client.open("known_samples") # Replace with your sheet name
252
- # sheet = spreadsheet.sheet1
253
-
254
- # # ✅ Read old data
255
- # existing_data = sheet.get_all_values()
256
- # if existing_data:
257
- # df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
258
- # else:
259
- # df_old = pd.DataFrame(columns=df_new.columns)
260
-
261
- # # ✅ Combine and remove duplicates
262
- # df_combined = pd.concat([df_old, df_new], ignore_index=True).drop_duplicates(subset="Sample ID")
263
-
264
- # # ✅ Clear and write back
265
- # sheet.clear()
266
- # sheet.update([df_combined.columns.values.tolist()] + df_combined.values.tolist())
267
-
268
- # except Exception as e:
269
- # print(f"⚠️ Failed to save known output to Google Sheets: {e}")
270
- try:
271
- # Prepare as DataFrame
272
- df_new = pd.DataFrame(save_rows, columns=[
273
- "Sample ID", "Predicted Country", "Country Explanation",
274
- "Predicted Sample Type", "Sample Type Explanation",
275
- "Sources", "Query_cost", "Time cost"
276
- ])
277
-
278
- # ✅ Setup Google Sheets
279
- creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
280
- scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
281
- creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
282
- client = gspread.authorize(creds)
283
- spreadsheet = client.open("known_samples")
284
- sheet = spreadsheet.sheet1
285
-
286
- # ✅ Read existing data
287
- existing_data = sheet.get_all_values()
288
- if existing_data:
289
- df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
290
- else:
291
- df_old = pd.DataFrame(columns=[
292
- "Sample ID", "Actual_country", "Actual_sample_type", "Country Explanation",
293
- "Match_country", "Match_sample_type", "Predicted Country", "Predicted Sample Type",
294
- "Query_cost", "Sample Type Explanation", "Sources", "Time cost"
295
- ])
296
-
297
- # ✅ Index by Sample ID
298
- df_old.set_index("Sample ID", inplace=True)
299
- df_new.set_index("Sample ID", inplace=True)
300
-
301
- # ✅ Update only matching fields
302
- update_columns = [
303
- "Predicted Country", "Predicted Sample Type", "Country Explanation",
304
- "Sample Type Explanation", "Sources", "Query_cost", "Time cost"
305
- ]
306
- for idx, row in df_new.iterrows():
307
- if idx not in df_old.index:
308
- df_old.loc[idx] = "" # new row, fill empty first
309
- for col in update_columns:
310
- if pd.notna(row[col]) and row[col] != "":
311
- df_old.at[idx, col] = row[col]
312
-
313
- # ✅ Reset and write back
314
- df_old.reset_index(inplace=True)
315
- sheet.clear()
316
- sheet.update([df_old.columns.values.tolist()] + df_old.values.tolist())
317
- print("✅ Match results saved to known_samples.")
318
-
319
- except Exception as e:
320
- print(f"❌ Failed to update known_samples: {e}")
321
-
322
-
323
- return rows#, summary, labelAncient_Modern, explain_label
324
-
325
- # save the batch input in excel file
326
- # def save_to_excel(all_rows, summary_text, flag_text, filename):
327
- # with pd.ExcelWriter(filename) as writer:
328
- # # Save table
329
- # df_new = pd.DataFrame(all_rows, columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Time cost"])
330
- # df.to_excel(writer, sheet_name="Detailed Results", index=False)
331
- # try:
332
- # df_old = pd.read_excel(filename)
333
- # except:
334
- # df_old = pd.DataFrame([[]], columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Time cost"])
335
- # df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
336
- # # if os.path.exists(filename):
337
- # # df_old = pd.read_excel(filename)
338
- # # df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
339
- # # else:
340
- # # df_combined = df_new
341
- # df_combined.to_excel(filename, index=False)
342
- # # # Save summary
343
- # # summary_df = pd.DataFrame({"Summary": [summary_text]})
344
- # # summary_df.to_excel(writer, sheet_name="Summary", index=False)
345
-
346
- # # # Save flag
347
- # # flag_df = pd.DataFrame({"Flag": [flag_text]})
348
- # # flag_df.to_excel(writer, sheet_name="Ancient_Modern_Flag", index=False)
349
- # def save_to_excel(all_rows, summary_text, flag_text, filename):
350
- # df_new = pd.DataFrame(all_rows, columns=[
351
- # "Sample ID", "Predicted Country", "Country Explanation",
352
- # "Predicted Sample Type", "Sample Type Explanation",
353
- # "Sources", "Time cost"
354
- # ])
355
-
356
- # try:
357
- # if os.path.exists(filename):
358
- # df_old = pd.read_excel(filename)
359
- # else:
360
- # df_old = pd.DataFrame(columns=df_new.columns)
361
- # except Exception as e:
362
- # print(f"⚠️ Warning reading old Excel file: {e}")
363
- # df_old = pd.DataFrame(columns=df_new.columns)
364
-
365
- # #df_combined = pd.concat([df_new, df_old], ignore_index=True).drop_duplicates(subset="Sample ID", keep="first")
366
- # df_old.set_index("Sample ID", inplace=True)
367
- # df_new.set_index("Sample ID", inplace=True)
368
-
369
- # df_old.update(df_new) # <-- update matching rows in df_old with df_new content
370
-
371
- # df_combined = df_old.reset_index()
372
-
373
- # try:
374
- # df_combined.to_excel(filename, index=False)
375
- # except Exception as e:
376
- # print(f"❌ Failed to write Excel file {filename}: {e}")
377
- def save_to_excel(all_rows, summary_text, flag_text, filename, is_resume=False):
378
- df_new = pd.DataFrame(all_rows, columns=[
379
- "Sample ID", "Predicted Country", "Country Explanation",
380
- "Predicted Sample Type", "Sample Type Explanation",
381
- "Sources", "Time cost"
382
- ])
383
-
384
- if is_resume and os.path.exists(filename):
385
- try:
386
- df_old = pd.read_excel(filename)
387
- except Exception as e:
388
- print(f"⚠️ Warning reading old Excel file: {e}")
389
- df_old = pd.DataFrame(columns=df_new.columns)
390
-
391
- # Set index and update existing rows
392
- df_old.set_index("Sample ID", inplace=True)
393
- df_new.set_index("Sample ID", inplace=True)
394
- df_old.update(df_new)
395
-
396
- df_combined = df_old.reset_index()
397
- else:
398
- # If not resuming or file doesn't exist, just use new rows
399
- df_combined = df_new
400
-
401
- try:
402
- df_combined.to_excel(filename, index=False)
403
- except Exception as e:
404
- print(f"❌ Failed to write Excel file {filename}: {e}")
405
-
406
-
407
- # save the batch input in JSON file
408
- def save_to_json(all_rows, summary_text, flag_text, filename):
409
- output_dict = {
410
- "Detailed_Results": all_rows#, # <-- make sure this is a plain list, not a DataFrame
411
- # "Summary_Text": summary_text,
412
- # "Ancient_Modern_Flag": flag_text
413
- }
414
-
415
- # If all_rows is a DataFrame, convert it
416
- if isinstance(all_rows, pd.DataFrame):
417
- output_dict["Detailed_Results"] = all_rows.to_dict(orient="records")
418
-
419
- with open(filename, "w") as external_file:
420
- json.dump(output_dict, external_file, indent=2)
421
-
422
- # save the batch input in Text file
423
- def save_to_txt(all_rows, summary_text, flag_text, filename):
424
- if isinstance(all_rows, pd.DataFrame):
425
- detailed_results = all_rows.to_dict(orient="records")
426
- output = ""
427
- #output += ",".join(list(detailed_results[0].keys())) + "\n\n"
428
- output += ",".join([str(k) for k in detailed_results[0].keys()]) + "\n\n"
429
- for r in detailed_results:
430
- output += ",".join([str(v) for v in r.values()]) + "\n\n"
431
- with open(filename, "w") as f:
432
- f.write("=== Detailed Results ===\n")
433
- f.write(output + "\n")
434
-
435
- # f.write("\n=== Summary ===\n")
436
- # f.write(summary_text + "\n")
437
-
438
- # f.write("\n=== Ancient/Modern Flag ===\n")
439
- # f.write(flag_text + "\n")
440
-
441
- def save_batch_output(all_rows, output_type, summary_text=None, flag_text=None):
442
- tmp_dir = tempfile.mkdtemp()
443
-
444
- #html_table = all_rows.value # assuming this is stored somewhere
445
-
446
- # Parse back to DataFrame
447
- #all_rows = pd.read_html(all_rows)[0] # [0] because read_html returns a list
448
- all_rows = pd.read_html(StringIO(all_rows))[0]
449
- print(all_rows)
450
-
451
- if output_type == "Excel":
452
- file_path = f"{tmp_dir}/batch_output.xlsx"
453
- save_to_excel(all_rows, summary_text, flag_text, file_path)
454
- elif output_type == "JSON":
455
- file_path = f"{tmp_dir}/batch_output.json"
456
- save_to_json(all_rows, summary_text, flag_text, file_path)
457
- print("Done with JSON")
458
- elif output_type == "TXT":
459
- file_path = f"{tmp_dir}/batch_output.txt"
460
- save_to_txt(all_rows, summary_text, flag_text, file_path)
461
- else:
462
- return gr.update(visible=False) # invalid option
463
-
464
- return gr.update(value=file_path, visible=True)
465
- # save cost by checking the known outputs
466
-
467
- # def check_known_output(accession):
468
- # if not os.path.exists(KNOWN_OUTPUT_PATH):
469
- # return None
470
-
471
- # try:
472
- # df = pd.read_excel(KNOWN_OUTPUT_PATH)
473
- # match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
474
- # if match:
475
- # accession = match.group(0)
476
-
477
- # matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
478
- # if not matched.empty:
479
- # return matched.iloc[0].to_dict() # Return the cached row
480
- # except Exception as e:
481
- # print(f"⚠️ Failed to load known samples: {e}")
482
- # return None
483
-
484
- # def check_known_output(accession):
485
- # try:
486
- # # ✅ Load credentials from Hugging Face secret
487
- # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
488
- # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
489
- # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
490
- # client = gspread.authorize(creds)
491
-
492
- # # ✅ Open the known_samples sheet
493
- # spreadsheet = client.open("known_samples") # Replace with your sheet name
494
- # sheet = spreadsheet.sheet1
495
-
496
- # # ✅ Read all rows
497
- # data = sheet.get_all_values()
498
- # if not data:
499
- # return None
500
-
501
- # df = pd.DataFrame(data[1:], columns=data[0]) # Skip header row
502
-
503
- # # ✅ Normalize accession pattern
504
- # match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
505
- # if match:
506
- # accession = match.group(0)
507
-
508
- # matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
509
- # if not matched.empty:
510
- # return matched.iloc[0].to_dict()
511
-
512
- # except Exception as e:
513
- # print(f"⚠️ Failed to load known samples from Google Sheets: {e}")
514
- # return None
515
- def check_known_output(accession):
516
- try:
517
- # ✅ Load credentials from Hugging Face secret
518
- creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
519
- scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
520
- creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
521
- client = gspread.authorize(creds)
522
-
523
- spreadsheet = client.open("known_samples")
524
- sheet = spreadsheet.sheet1
525
-
526
- data = sheet.get_all_values()
527
- if not data:
528
- print("⚠️ Google Sheet 'known_samples' is empty.")
529
- return None
530
-
531
- df = pd.DataFrame(data[1:], columns=data[0])
532
- if "Sample ID" not in df.columns:
533
- print("❌ Column 'Sample ID' not found in Google Sheet.")
534
- return None
535
-
536
- match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
537
- if match:
538
- accession = match.group(0)
539
-
540
- matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
541
- if not matched.empty:
542
- #return matched.iloc[0].to_dict()
543
- row = matched.iloc[0]
544
- country = row.get("Predicted Country", "").strip().lower()
545
- sample_type = row.get("Predicted Sample Type", "").strip().lower()
546
-
547
- if country and country != "unknown" and sample_type and sample_type != "unknown":
548
- return row.to_dict()
549
- else:
550
- print(f"⚠️ Accession {accession} found but country/sample_type is unknown or empty.")
551
- return None
552
- else:
553
- print(f"🔍 Accession {accession} not found in known_samples.")
554
- return None
555
-
556
- except Exception as e:
557
- import traceback
558
- print("❌ Exception occurred during check_known_output:")
559
- traceback.print_exc()
560
- return None
561
-
562
-
563
- def hash_user_id(user_input):
564
- return hashlib.sha256(user_input.encode()).hexdigest()
565
-
566
- # ✅ Load and save usage count
567
-
568
- # def load_user_usage():
569
- # if not os.path.exists(USER_USAGE_TRACK_FILE):
570
- # return {}
571
-
572
- # try:
573
- # with open(USER_USAGE_TRACK_FILE, "r") as f:
574
- # content = f.read().strip()
575
- # if not content:
576
- # return {} # file is empty
577
- # return json.loads(content)
578
- # except (json.JSONDecodeError, ValueError):
579
- # print("⚠️ Warning: user_usage.json is corrupted or invalid. Resetting.")
580
- # return {} # fallback to empty dict
581
- # def load_user_usage():
582
- # try:
583
- # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
584
- # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
585
- # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
586
- # client = gspread.authorize(creds)
587
-
588
- # sheet = client.open("user_usage_log").sheet1
589
- # data = sheet.get_all_records() # Assumes columns: email, usage_count
590
-
591
- # usage = {}
592
- # for row in data:
593
- # email = row.get("email", "").strip().lower()
594
- # count = int(row.get("usage_count", 0))
595
- # if email:
596
- # usage[email] = count
597
- # return usage
598
- # except Exception as e:
599
- # print(f"⚠️ Failed to load user usage from Google Sheets: {e}")
600
- # return {}
601
- # def load_user_usage():
602
- # try:
603
- # parent_id = pipeline.get_or_create_drive_folder("mtDNA-Location-Classifier")
604
- # iterate3_id = pipeline.get_or_create_drive_folder("iterate3", parent_id=parent_id)
605
-
606
- # found = pipeline.find_drive_file("user_usage_log.json", parent_id=iterate3_id)
607
- # if not found:
608
- # return {} # not found, start fresh
609
-
610
- # #file_id = found[0]["id"]
611
- # file_id = found
612
- # content = pipeline.download_drive_file_content(file_id)
613
- # return json.loads(content.strip()) if content.strip() else {}
614
-
615
- # except Exception as e:
616
- # print(f"⚠️ Failed to load user_usage_log.json from Google Drive: {e}")
617
- # return {}
618
- def load_user_usage():
619
- try:
620
- creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
621
- scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
622
- creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
623
- client = gspread.authorize(creds)
624
-
625
- sheet = client.open("user_usage_log").sheet1
626
- data = sheet.get_all_values()
627
- print("data: ", data)
628
- print("🧪 Raw header row from sheet:", data[0])
629
- print("🧪 Character codes in each header:")
630
- for h in data[0]:
631
- print([ord(c) for c in h])
632
-
633
- if not data or len(data) < 2:
634
- print("⚠️ Sheet is empty or missing rows.")
635
- return {}
636
-
637
- headers = [h.strip().lower() for h in data[0]]
638
- if "email" not in headers or "usage_count" not in headers:
639
- print(" Header format incorrect. Must have 'email' and 'usage_count'.")
640
- return {}
641
-
642
- df = pd.DataFrame(data[1:], columns=headers)
643
-
644
- usage = {}
645
- for _, row in df.iterrows():
646
- email = row.get("email", "").strip().lower()
647
- try:
648
- #count = int(row.get("usage_count", 0))
649
- try:
650
- count = int(float(row.get("usage_count", 0)))
651
- except Exception:
652
- print(f"⚠️ Invalid usage_count for {email}: {row.get('usage_count')}")
653
- count = 0
654
-
655
- if email:
656
- usage[email] = count
657
- except ValueError:
658
- print(f"⚠️ Invalid usage_count for {email}: {row.get('usage_count')}")
659
- return usage
660
-
661
- except Exception as e:
662
- print(f" Error in load_user_usage: {e}")
663
- return {}
664
-
665
-
666
-
667
- # def save_user_usage(usage):
668
- # with open(USER_USAGE_TRACK_FILE, "w") as f:
669
- # json.dump(usage, f, indent=2)
670
-
671
- # def save_user_usage(usage_dict):
672
- # try:
673
- # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
674
- # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
675
- # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
676
- # client = gspread.authorize(creds)
677
-
678
- # sheet = client.open("user_usage_log").sheet1
679
- # sheet.clear() # clear old contents first
680
-
681
- # # Write header + rows
682
- # rows = [["email", "usage_count"]] + [[email, count] for email, count in usage_dict.items()]
683
- # sheet.update(rows)
684
- # except Exception as e:
685
- # print(f"❌ Failed to save user usage to Google Sheets: {e}")
686
- # def save_user_usage(usage_dict):
687
- # try:
688
- # parent_id = pipeline.get_or_create_drive_folder("mtDNA-Location-Classifier")
689
- # iterate3_id = pipeline.get_or_create_drive_folder("iterate3", parent_id=parent_id)
690
-
691
- # import tempfile
692
- # tmp_path = os.path.join(tempfile.gettempdir(), "user_usage_log.json")
693
- # print("💾 Saving this usage dict:", usage_dict)
694
- # with open(tmp_path, "w") as f:
695
- # json.dump(usage_dict, f, indent=2)
696
-
697
- # pipeline.upload_file_to_drive(tmp_path, "user_usage_log.json", iterate3_id)
698
-
699
- # except Exception as e:
700
- # print(f"❌ Failed to save user_usage_log.json to Google Drive: {e}")
701
- # def save_user_usage(usage_dict):
702
- # try:
703
- # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
704
- # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
705
- # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
706
- # client = gspread.authorize(creds)
707
-
708
- # spreadsheet = client.open("user_usage_log")
709
- # sheet = spreadsheet.sheet1
710
-
711
- # # Step 1: Convert new usage to DataFrame
712
- # df_new = pd.DataFrame(list(usage_dict.items()), columns=["email", "usage_count"])
713
- # df_new["email"] = df_new["email"].str.strip().str.lower()
714
-
715
- # # Step 2: Load existing data
716
- # existing_data = sheet.get_all_values()
717
- # print("🧪 Sheet existing_data:", existing_data)
718
-
719
- # # Try to load old data
720
- # if existing_data and len(existing_data[0]) >= 1:
721
- # df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
722
-
723
- # # Fix missing columns
724
- # if "email" not in df_old.columns:
725
- # df_old["email"] = ""
726
- # if "usage_count" not in df_old.columns:
727
- # df_old["usage_count"] = 0
728
-
729
- # df_old["email"] = df_old["email"].str.strip().str.lower()
730
- # df_old["usage_count"] = pd.to_numeric(df_old["usage_count"], errors="coerce").fillna(0).astype(int)
731
- # else:
732
- # df_old = pd.DataFrame(columns=["email", "usage_count"])
733
-
734
- # # Step 3: Merge
735
- # df_combined = pd.concat([df_old, df_new], ignore_index=True)
736
- # df_combined = df_combined.groupby("email", as_index=False).sum()
737
-
738
- # # Step 4: Write back
739
- # sheet.clear()
740
- # sheet.update([df_combined.columns.tolist()] + df_combined.astype(str).values.tolist())
741
- # print("✅ Saved user usage to user_usage_log sheet.")
742
-
743
- # except Exception as e:
744
- # print(f"❌ Failed to save user usage to Google Sheets: {e}")
745
- def save_user_usage(usage_dict):
746
- try:
747
- creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
748
- scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
749
- creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
750
- client = gspread.authorize(creds)
751
-
752
- spreadsheet = client.open("user_usage_log")
753
- sheet = spreadsheet.sheet1
754
-
755
- # Build new df
756
- df_new = pd.DataFrame(list(usage_dict.items()), columns=["email", "usage_count"])
757
- df_new["email"] = df_new["email"].str.strip().str.lower()
758
- df_new["usage_count"] = pd.to_numeric(df_new["usage_count"], errors="coerce").fillna(0).astype(int)
759
-
760
- # Read existing data
761
- existing_data = sheet.get_all_values()
762
- if existing_data and len(existing_data[0]) >= 2:
763
- df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
764
- df_old["email"] = df_old["email"].str.strip().str.lower()
765
- df_old["usage_count"] = pd.to_numeric(df_old["usage_count"], errors="coerce").fillna(0).astype(int)
766
- else:
767
- df_old = pd.DataFrame(columns=["email", "usage_count"])
768
-
769
- # ✅ Overwrite specific emails only
770
- df_old = df_old.set_index("email")
771
- for email, count in usage_dict.items():
772
- email = email.strip().lower()
773
- df_old.loc[email, "usage_count"] = count
774
- df_old = df_old.reset_index()
775
-
776
- # Save
777
- sheet.clear()
778
- sheet.update([df_old.columns.tolist()] + df_old.astype(str).values.tolist())
779
- print("Saved user usage to user_usage_log sheet.")
780
-
781
- except Exception as e:
782
- print(f"❌ Failed to save user usage to Google Sheets: {e}")
783
-
784
-
785
-
786
-
787
- # def increment_usage(user_id, num_samples=1):
788
- # usage = load_user_usage()
789
- # if user_id not in usage:
790
- # usage[user_id] = 0
791
- # usage[user_id] += num_samples
792
- # save_user_usage(usage)
793
- # return usage[user_id]
794
- # def increment_usage(email: str, count: int):
795
- # usage = load_user_usage()
796
- # email_key = email.strip().lower()
797
- # usage[email_key] = usage.get(email_key, 0) + count
798
- # save_user_usage(usage)
799
- # return usage[email_key]
800
- def increment_usage(email: str, count: int = 1):
801
- usage = load_user_usage()
802
- email_key = email.strip().lower()
803
- #usage[email_key] = usage.get(email_key, 0) + count
804
- current = usage.get(email_key, 0)
805
- new_value = current + count
806
- usage[email_key] = max(current, new_value) # ✅ Prevent overwrite with lower
807
- print(f"🧪 increment_usage saving: {email_key=} {current=} + {count=} => {usage[email_key]=}")
808
- save_user_usage(usage)
809
- return usage[email_key]
810
-
811
-
812
- # run the batch
813
- def summarize_batch(file=None, raw_text="", resume_file=None, user_email="",
814
- stop_flag=None, output_file_path=None,
815
- limited_acc=50, yield_callback=None):
816
- if user_email:
817
- limited_acc += 10
818
- accessions, error = extract_accessions_from_input(file, raw_text)
819
- if error:
820
- #return [], "", "", f"Error: {error}"
821
- return [], f"Error: {error}", 0, "", ""
822
- if resume_file:
823
- accessions = get_incomplete_accessions(resume_file)
824
- tmp_dir = tempfile.mkdtemp()
825
- if not output_file_path:
826
- if resume_file:
827
- output_file_path = os.path.join(tmp_dir, resume_file)
828
- else:
829
- output_file_path = os.path.join(tmp_dir, "batch_output_live.xlsx")
830
-
831
- all_rows = []
832
- # all_summaries = []
833
- # all_flags = []
834
- progress_lines = []
835
- warning = ""
836
- if len(accessions) > limited_acc:
837
- accessions = accessions[:limited_acc]
838
- warning = f"Your number of accessions is more than the {limited_acc}, only handle first {limited_acc} accessions"
839
- for i, acc in enumerate(accessions):
840
- if stop_flag and stop_flag.value:
841
- line = f"🛑 Stopped at {acc} ({i+1}/{len(accessions)})"
842
- progress_lines.append(line)
843
- if yield_callback:
844
- yield_callback(line)
845
- print("🛑 User requested stop.")
846
- break
847
- print(f"[{i+1}/{len(accessions)}] Processing {acc}")
848
- try:
849
- # rows, summary, label, explain = summarize_results(acc)
850
- rows = summarize_results(acc)
851
- all_rows.extend(rows)
852
- # all_summaries.append(f"**{acc}**\n{summary}")
853
- # all_flags.append(f"**{acc}**\n### 🏺 Ancient/Modern Flag\n**{label}**\n\n_Explanation:_ {explain}")
854
- #save_to_excel(all_rows, summary_text="", flag_text="", filename=output_file_path)
855
- save_to_excel(all_rows, summary_text="", flag_text="", filename=output_file_path, is_resume=bool(resume_file))
856
- line = f"✅ Processed {acc} ({i+1}/{len(accessions)})"
857
- progress_lines.append(line)
858
- if yield_callback:
859
- yield_callback(f"✅ Processed {acc} ({i+1}/{len(accessions)})")
860
- except Exception as e:
861
- print(f"❌ Failed to process {acc}: {e}")
862
- continue
863
- #all_summaries.append(f"**{acc}**: Failed - {e}")
864
- #progress_lines.append(f"✅ Processed {acc} ({i+1}/{len(accessions)})")
865
- limited_acc -= 1
866
- """for row in all_rows:
867
- source_column = row[2] # Assuming the "Source" is in the 3rd column (index 2)
868
-
869
- if source_column.startswith("http"): # Check if the source is a URL
870
- # Wrap it with HTML anchor tags to make it clickable
871
- row[2] = f'<a href="{source_column}" target="_blank" style="color: blue; text-decoration: underline;">{source_column}</a>'"""
872
- if not warning:
873
- warning = f"You only have {limited_acc} left"
874
- if user_email.strip():
875
- user_hash = hash_user_id(user_email)
876
- total_queries = increment_usage(user_hash, len(all_rows))
877
- else:
878
- total_queries = 0
879
- yield_callback(" Finished!")
880
-
881
- # summary_text = "\n\n---\n\n".join(all_summaries)
882
- # flag_text = "\n\n---\n\n".join(all_flags)
883
- #return all_rows, summary_text, flag_text, gr.update(visible=True), gr.update(visible=False)
884
- #return all_rows, gr.update(visible=True), gr.update(visible=False)
 
885
  return all_rows, output_file_path, total_queries, "\n".join(progress_lines), warning
 
1
+ import gradio as gr
2
+ from collections import Counter
3
+ import csv
4
+ import os
5
+ from functools import lru_cache
6
+ #import app
7
+ from mtdna_classifier import classify_sample_location
8
+ import data_preprocess, model, pipeline
9
+ import subprocess
10
+ import json
11
+ import pandas as pd
12
+ import io
13
+ import re
14
+ import tempfile
15
+ import gspread
16
+ from oauth2client.service_account import ServiceAccountCredentials
17
+ from io import StringIO
18
+ import hashlib
19
+ import threading
20
+
21
+ # @lru_cache(maxsize=3600)
22
+ # def classify_sample_location_cached(accession):
23
+ # return classify_sample_location(accession)
24
+
25
+ @lru_cache(maxsize=3600)
26
+ def pipeline_classify_sample_location_cached(accession):
27
+ print("inside pipeline_classify_sample_location_cached, and [accession] is ", [accession])
28
+ return pipeline.pipeline_with_gemini([accession])
29
+
30
+ # Count and suggest final location
31
+ # def compute_final_suggested_location(rows):
32
+ # candidates = [
33
+ # row.get("Predicted Location", "").strip()
34
+ # for row in rows
35
+ # if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found", "unknown"]
36
+ # ] + [
37
+ # row.get("Inferred Region", "").strip()
38
+ # for row in rows
39
+ # if row.get("Inferred Region", "").strip().lower() not in ["", "sample id not found", "unknown"]
40
+ # ]
41
+
42
+ # if not candidates:
43
+ # return Counter(), ("Unknown", 0)
44
+ # # Step 1: Combine into one string and split using regex to handle commas, line breaks, etc.
45
+ # tokens = []
46
+ # for item in candidates:
47
+ # # Split by comma, whitespace, and newlines
48
+ # parts = re.split(r'[\s,]+', item)
49
+ # tokens.extend(parts)
50
+
51
+ # # Step 2: Clean and normalize tokens
52
+ # tokens = [word.strip() for word in tokens if word.strip().isalpha()] # Keep only alphabetic tokens
53
+
54
+ # # Step 3: Count
55
+ # counts = Counter(tokens)
56
+
57
+ # # Step 4: Get most common
58
+ # top_location, count = counts.most_common(1)[0]
59
+ # return counts, (top_location, count)
60
+
61
+ # Store feedback (with required fields)
62
+
63
+ def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
64
+ if not answer1.strip() or not answer2.strip():
65
+ return "⚠️ Please answer both questions before submitting."
66
+
67
+ try:
68
+ # Step: Load credentials from Hugging Face secret
69
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
70
+ scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
71
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
72
+
73
+ # Connect to Google Sheet
74
+ client = gspread.authorize(creds)
75
+ sheet = client.open("feedback_mtdna").sheet1 # make sure sheet name matches
76
+
77
+ # Append feedback
78
+ sheet.append_row([accession, answer1, answer2, contact])
79
+ return "✅ Feedback submitted. Thank you!"
80
+
81
+ except Exception as e:
82
+ return f"❌ Error submitting feedback: {e}"
83
+
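+ # NOTE: this relies on a service-account JSON in the GCP_CREDS_JSON secret and a Google Sheet named
+ # "feedback_mtdna" shared with that service account. Hypothetical call (arguments are placeholders):
+ #   msg = store_feedback_to_google_sheets("KU131308", "yes", "location looks right", contact="user@example.com")
+ #   print(msg)  # "✅ Feedback submitted. Thank you!" on success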
84
+ # helper function to extract accessions
85
+ def extract_accessions_from_input(file=None, raw_text=""):
86
+ print(f"RAW TEXT RECEIVED: {raw_text}")
87
+ accessions = []
88
+ seen = set()
89
+ if file:
90
+ try:
91
+ if file.name.endswith(".csv"):
92
+ df = pd.read_csv(file)
93
+ elif file.name.endswith(".xlsx"):
94
+ df = pd.read_excel(file)
95
+ else:
96
+ return [], "Unsupported file format. Please upload CSV or Excel."
97
+ for acc in df.iloc[:, 0].dropna().astype(str).str.strip():
98
+ if acc not in seen:
99
+ accessions.append(acc)
100
+ seen.add(acc)
101
+ except Exception as e:
102
+ return [], f"Failed to read file: {e}"
103
+
104
+ if raw_text:
105
+ text_ids = [s.strip() for s in re.split(r"[\n,;\t]", raw_text) if s.strip()]
106
+ for acc in text_ids:
107
+ if acc not in seen:
108
+ accessions.append(acc)
109
+ seen.add(acc)
110
+
111
+ return list(accessions), None
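+ # NOTE: accessions are de-duplicated while preserving order, and file input is read before raw text.
+ # Hypothetical call (IDs are placeholders):
+ #   ids, err = extract_accessions_from_input(raw_text="KU131308, MK291745\nKU131308")
+ #   # ids == ["KU131308", "MK291745"], err is None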
112
+ # ✅ Helper for resuming a batch: get_incomplete_accessions() returns accessions whose output rows are still empty
113
+ def get_incomplete_accessions(file_path):
114
+ df = pd.read_excel(file_path)
115
+
116
+ incomplete_accessions = []
117
+ for _, row in df.iterrows():
118
+ sample_id = str(row.get("Sample ID", "")).strip()
119
+
120
+ # Skip if no sample ID
121
+ if not sample_id:
122
+ continue
123
+
124
+ # Drop the Sample ID and check if the rest is empty
125
+ other_cols = row.drop(labels=["Sample ID"], errors="ignore")
126
+ if other_cols.isna().all() or (other_cols.astype(str).str.strip() == "").all():
127
+ # Extract the accession number from the sample ID using regex
128
+ match = re.search(r"\b[A-Z]{2,4}\d{4,}", sample_id)
129
+ if match:
130
+ incomplete_accessions.append(match.group(0))
131
+ print(len(incomplete_accessions))
132
+ return incomplete_accessions
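+ # NOTE: a row counts as incomplete when every column except "Sample ID" is empty, and the accession
+ # is recovered from the Sample ID label (e.g. "KU131308(Isolate: BRU18)") via the regex
+ # \b[A-Z]{2,4}\d{4,}; rows whose Sample ID has no such token are skipped.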
133
+
134
+ # GOOGLE_SHEET_NAME = "known_samples"
135
+ # USAGE_DRIVE_FILENAME = "user_usage_log.json"
136
+
137
+ def summarize_results(accession):
138
+ # try cache first
139
+ cached = check_known_output(accession)
140
+ if cached:
141
+ print(f"✅ Using cached result for {accession}")
142
+ return [[
143
+ cached["Sample ID"] or "unknown",
144
+ cached["Predicted Country"] or "unknown",
145
+ cached["Country Explanation"] or "unknown",
146
+ cached["Predicted Sample Type"] or "unknown",
147
+ cached["Sample Type Explanation"] or "unknown",
148
+ cached["Sources"] or "No Links",
149
+ cached["Time cost"]
150
+ ]]
151
+ # only run when nothing in the cache
152
+ try:
153
+ print("try gemini pipeline: ",accession)
154
+ outputs = pipeline_classify_sample_location_cached(accession)
155
+ # outputs = {'KU131308': {'isolate':'BRU18',
156
+ # 'country': {'brunei': ['ncbi',
157
+ # 'rag_llm-The text mentions "BRU18 Brunei Borneo" in a table listing various samples, and it is not described as ancient or archaeological.']},
158
+ # 'sample_type': {'modern':
159
+ # ['rag_llm-The text mentions "BRU18 Brunei Borneo" in a table listing various samples, and it is not described as ancient or archaeological.']},
160
+ # 'query_cost': 9.754999999999999e-05,
161
+ # 'time_cost': '24.776 seconds',
162
+ # 'source': ['https://doi.org/10.1007/s00439-015-1620-z',
163
+ # 'https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf',
164
+ # 'https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls']}}
165
+ except Exception as e:
166
+ return []#, f"Error: {e}", f"Error: {e}", f"Error: {e}"
167
+
168
+ if accession not in outputs:
169
+ print("no accession in output ", accession)
170
+ return []#, "Accession not found in results.", "Accession not found in results.", "Accession not found in results."
171
+
172
+ row_score = []
173
+ rows = []
174
+ save_rows = []
175
+ for key in outputs:
176
+ pred_country, pred_sample, country_explanation, sample_explanation = "unknown","unknown","unknown","unknown"
177
+ for section, results in outputs[key].items():
178
+ if section == "country" or section =="sample_type":
179
+ pred_output = []#"\n".join(list(results.keys()))
180
+ output_explanation = ""
181
+ for result, content in results.items():
182
+ if len(result) == 0: result = "unknown"
183
+ if len(content) == 0: output_explanation = "unknown"
184
+ else:
185
+ output_explanation += 'Method: ' + "\nMethod: ".join(content) + "\n"
186
+ pred_output.append(result)
187
+ pred_output = "\n".join(pred_output)
188
+ if section == "country":
189
+ pred_country, country_explanation = pred_output, output_explanation
190
+ elif section == "sample_type":
191
+ pred_sample, sample_explanation = pred_output, output_explanation
192
+ if outputs[key]["isolate"].lower()!="unknown":
193
+ label = key + "(Isolate: " + outputs[key]["isolate"] + ")"
194
+ else: label = key
195
+ if len(outputs[key]["source"]) == 0: outputs[key]["source"] = ["No Links"]
196
+ row = {
197
+ "Sample ID": label or "unknown",
198
+ "Predicted Country": pred_country or "unknown",
199
+ "Country Explanation": country_explanation or "unknown",
200
+ "Predicted Sample Type":pred_sample or "unknown",
201
+ "Sample Type Explanation":sample_explanation or "unknown",
202
+ "Sources": "\n".join(outputs[key]["source"]) or "No Links",
203
+ "Time cost": outputs[key]["time_cost"]
204
+ }
205
+ #row_score.append(row)
206
+ rows.append(list(row.values()))
207
+
208
+ save_row = {
209
+ "Sample ID": label or "unknown",
210
+ "Predicted Country": pred_country or "unknown",
211
+ "Country Explanation": country_explanation or "unknown",
212
+ "Predicted Sample Type":pred_sample or "unknown",
213
+ "Sample Type Explanation":sample_explanation or "unknown",
214
+ "Sources": "\n".join(outputs[key]["source"]) or "No Links",
215
+ "Query_cost": outputs[key]["query_cost"],
216
+ "Time cost": outputs[key]["time_cost"]
217
+ }
218
+ #row_score.append(row)
219
+ save_rows.append(list(save_row.values()))
220
+
221
+ # #location_counts, (final_location, count) = compute_final_suggested_location(row_score)
222
+ # summary_lines = [f"### 🧭 Location Summary:\n"]
223
+ # summary_lines += [f"- **{loc}**: {cnt} times" for loc, cnt in location_counts.items()]
224
+ # summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
225
+ # summary = "\n".join(summary_lines)
226
+
227
+ # save the new running sample to known excel file
228
+ # try:
229
+ # df_new = pd.DataFrame(save_rows, columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Query_cost","Time cost"])
230
+ # if os.path.exists(KNOWN_OUTPUT_PATH):
231
+ # df_old = pd.read_excel(KNOWN_OUTPUT_PATH)
232
+ # df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
233
+ # else:
234
+ # df_combined = df_new
235
+ # df_combined.to_excel(KNOWN_OUTPUT_PATH, index=False)
236
+ # except Exception as e:
237
+ # print(f"⚠️ Failed to save known output: {e}")
238
+ # try:
239
+ # df_new = pd.DataFrame(save_rows, columns=[
240
+ # "Sample ID", "Predicted Country", "Country Explanation",
241
+ # "Predicted Sample Type", "Sample Type Explanation",
242
+ # "Sources", "Query_cost", "Time cost"
243
+ # ])
244
+
245
+ # # Google Sheets API setup
246
+ # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
247
+ # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
248
+ # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
249
+ # client = gspread.authorize(creds)
250
+
251
+ # # Open the known_samples sheet
252
+ # spreadsheet = client.open("known_samples") # Replace with your sheet name
253
+ # sheet = spreadsheet.sheet1
254
+
255
+ # # Read old data
256
+ # existing_data = sheet.get_all_values()
257
+ # if existing_data:
258
+ # df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
259
+ # else:
260
+ # df_old = pd.DataFrame(columns=df_new.columns)
261
+
262
+ # # Combine and remove duplicates
263
+ # df_combined = pd.concat([df_old, df_new], ignore_index=True).drop_duplicates(subset="Sample ID")
264
+
265
+ # # ✅ Clear and write back
266
+ # sheet.clear()
267
+ # sheet.update([df_combined.columns.values.tolist()] + df_combined.values.tolist())
268
+
269
+ # except Exception as e:
270
+ # print(f"⚠️ Failed to save known output to Google Sheets: {e}")
271
+ try:
272
+ # Prepare as DataFrame
273
+ df_new = pd.DataFrame(save_rows, columns=[
274
+ "Sample ID", "Predicted Country", "Country Explanation",
275
+ "Predicted Sample Type", "Sample Type Explanation",
276
+ "Sources", "Query_cost", "Time cost"
277
+ ])
278
+
279
+ # Setup Google Sheets
280
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
281
+ scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
282
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
283
+ client = gspread.authorize(creds)
284
+ spreadsheet = client.open("known_samples")
285
+ sheet = spreadsheet.sheet1
286
+
287
+ # Read existing data
288
+ existing_data = sheet.get_all_values()
289
+ if existing_data:
290
+ df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
291
+ else:
292
+ df_old = pd.DataFrame(columns=[
293
+ "Sample ID", "Actual_country", "Actual_sample_type", "Country Explanation",
294
+ "Match_country", "Match_sample_type", "Predicted Country", "Predicted Sample Type",
295
+ "Query_cost", "Sample Type Explanation", "Sources", "Time cost"
296
+ ])
297
+
298
+ # ✅ Index by Sample ID
299
+ df_old.set_index("Sample ID", inplace=True)
300
+ df_new.set_index("Sample ID", inplace=True)
301
+
302
+ # Update only matching fields
303
+ update_columns = [
304
+ "Predicted Country", "Predicted Sample Type", "Country Explanation",
305
+ "Sample Type Explanation", "Sources", "Query_cost", "Time cost"
306
+ ]
307
+ for idx, row in df_new.iterrows():
308
+ if idx not in df_old.index:
309
+ df_old.loc[idx] = "" # new row, fill empty first
310
+ for col in update_columns:
311
+ if pd.notna(row[col]) and row[col] != "":
312
+ df_old.at[idx, col] = row[col]
313
+
314
+ # ✅ Reset and write back
315
+ df_old.reset_index(inplace=True)
316
+ sheet.clear()
317
+ sheet.update([df_old.columns.values.tolist()] + df_old.values.tolist())
318
+ print("✅ Match results saved to known_samples.")
319
+
320
+ except Exception as e:
321
+ print(f"❌ Failed to update known_samples: {e}")
322
+
323
+
324
+ return rows#, summary, labelAncient_Modern, explain_label
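+ # NOTE: summarize_results returns a list of 7-element rows in the order
+ # [Sample ID, Predicted Country, Country Explanation, Predicted Sample Type,
+ #  Sample Type Explanation, Sources, Time cost]; Query_cost is only written to the
+ # known_samples sheet, not returned to the UI table.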
325
+
326
+ # save the batch input in excel file
327
+ # def save_to_excel(all_rows, summary_text, flag_text, filename):
328
+ # with pd.ExcelWriter(filename) as writer:
329
+ # # Save table
330
+ # df_new = pd.DataFrame(all_rows, columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Time cost"])
331
+ # df.to_excel(writer, sheet_name="Detailed Results", index=False)
332
+ # try:
333
+ # df_old = pd.read_excel(filename)
334
+ # except:
335
+ # df_old = pd.DataFrame([[]], columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Time cost"])
336
+ # df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
337
+ # # if os.path.exists(filename):
338
+ # # df_old = pd.read_excel(filename)
339
+ # # df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
340
+ # # else:
341
+ # # df_combined = df_new
342
+ # df_combined.to_excel(filename, index=False)
343
+ # # # Save summary
344
+ # # summary_df = pd.DataFrame({"Summary": [summary_text]})
345
+ # # summary_df.to_excel(writer, sheet_name="Summary", index=False)
346
+
347
+ # # # Save flag
348
+ # # flag_df = pd.DataFrame({"Flag": [flag_text]})
349
+ # # flag_df.to_excel(writer, sheet_name="Ancient_Modern_Flag", index=False)
350
+ # def save_to_excel(all_rows, summary_text, flag_text, filename):
351
+ # df_new = pd.DataFrame(all_rows, columns=[
352
+ # "Sample ID", "Predicted Country", "Country Explanation",
353
+ # "Predicted Sample Type", "Sample Type Explanation",
354
+ # "Sources", "Time cost"
355
+ # ])
356
+
357
+ # try:
358
+ # if os.path.exists(filename):
359
+ # df_old = pd.read_excel(filename)
360
+ # else:
361
+ # df_old = pd.DataFrame(columns=df_new.columns)
362
+ # except Exception as e:
363
+ # print(f"⚠️ Warning reading old Excel file: {e}")
364
+ # df_old = pd.DataFrame(columns=df_new.columns)
365
+
366
+ # #df_combined = pd.concat([df_new, df_old], ignore_index=True).drop_duplicates(subset="Sample ID", keep="first")
367
+ # df_old.set_index("Sample ID", inplace=True)
368
+ # df_new.set_index("Sample ID", inplace=True)
369
+
370
+ # df_old.update(df_new) # <-- update matching rows in df_old with df_new content
371
+
372
+ # df_combined = df_old.reset_index()
373
+
374
+ # try:
375
+ # df_combined.to_excel(filename, index=False)
376
+ # except Exception as e:
377
+ # print(f"❌ Failed to write Excel file {filename}: {e}")
378
+ def save_to_excel(all_rows, summary_text, flag_text, filename, is_resume=False):
379
+ df_new = pd.DataFrame(all_rows, columns=[
380
+ "Sample ID", "Predicted Country", "Country Explanation",
381
+ "Predicted Sample Type", "Sample Type Explanation",
382
+ "Sources", "Time cost"
383
+ ])
384
+
385
+ if is_resume and os.path.exists(filename):
386
+ try:
387
+ df_old = pd.read_excel(filename)
388
+ except Exception as e:
389
+ print(f"⚠️ Warning reading old Excel file: {e}")
390
+ df_old = pd.DataFrame(columns=df_new.columns)
391
+
392
+ # Set index and update existing rows
393
+ df_old.set_index("Sample ID", inplace=True)
394
+ df_new.set_index("Sample ID", inplace=True)
395
+ df_old.update(df_new)
396
+
397
+ df_combined = df_old.reset_index()
398
+ else:
399
+ # If not resuming or file doesn't exist, just use new rows
400
+ df_combined = df_new
401
+
402
+ try:
403
+ df_combined.to_excel(filename, index=False)
404
+ except Exception as e:
405
+ print(f"❌ Failed to write Excel file {filename}: {e}")
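+ # NOTE: with is_resume=True an existing workbook is updated in place by Sample ID; otherwise the
+ # file is overwritten with just the rows accumulated in the current run. Hypothetical call:
+ #   save_to_excel(all_rows, "", "", "batch_output_live.xlsx", is_resume=False)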
406
+
407
+
408
+ # save the batch input in JSON file
409
+ def save_to_json(all_rows, summary_text, flag_text, filename):
410
+ output_dict = {
411
+ "Detailed_Results": all_rows#, # <-- make sure this is a plain list, not a DataFrame
412
+ # "Summary_Text": summary_text,
413
+ # "Ancient_Modern_Flag": flag_text
414
+ }
415
+
416
+ # If all_rows is a DataFrame, convert it
417
+ if isinstance(all_rows, pd.DataFrame):
418
+ output_dict["Detailed_Results"] = all_rows.to_dict(orient="records")
419
+
420
+ with open(filename, "w") as external_file:
421
+ json.dump(output_dict, external_file, indent=2)
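+ # NOTE: the JSON payload currently has a single key, "Detailed_Results", holding the table rows
+ # (converted to records if a DataFrame is passed); the summary/flag arguments are accepted for
+ # API compatibility but are not written.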
422
+
423
+ # save the batch input in Text file
424
+ def save_to_txt(all_rows, summary_text, flag_text, filename):
425
+ if isinstance(all_rows, pd.DataFrame):
426
+ detailed_results = all_rows.to_dict(orient="records")
+ else:
+ # assume all_rows is already a list of dict rows (avoids a NameError when a non-DataFrame is passed)
+ detailed_results = all_rows
427
+ output = ""
428
+ #output += ",".join(list(detailed_results[0].keys())) + "\n\n"
429
+ output += ",".join([str(k) for k in detailed_results[0].keys()]) + "\n\n"
430
+ for r in detailed_results:
431
+ output += ",".join([str(v) for v in r.values()]) + "\n\n"
432
+ with open(filename, "w") as f:
433
+ f.write("=== Detailed Results ===\n")
434
+ f.write(output + "\n")
435
+
436
+ # f.write("\n=== Summary ===\n")
437
+ # f.write(summary_text + "\n")
438
+
439
+ # f.write("\n=== Ancient/Modern Flag ===\n")
440
+ # f.write(flag_text + "\n")
441
+
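+ # NOTE: the TXT export is a loose comma-joined dump (header line, blank line, then one comma-joined
+ # line per record); values are not quoted, so it is not strict CSV.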
442
+ def save_batch_output(all_rows, output_type, summary_text=None, flag_text=None):
443
+ tmp_dir = tempfile.mkdtemp()
444
+
445
+ #html_table = all_rows.value # assuming this is stored somewhere
446
+
447
+ # Parse back to DataFrame
448
+ #all_rows = pd.read_html(all_rows)[0] # [0] because read_html returns a list
449
+ all_rows = pd.read_html(StringIO(all_rows))[0]
450
+ print(all_rows)
451
+
452
+ if output_type == "Excel":
453
+ file_path = f"{tmp_dir}/batch_output.xlsx"
454
+ save_to_excel(all_rows, summary_text, flag_text, file_path)
455
+ elif output_type == "JSON":
456
+ file_path = f"{tmp_dir}/batch_output.json"
457
+ save_to_json(all_rows, summary_text, flag_text, file_path)
458
+ print("Done with JSON")
459
+ elif output_type == "TXT":
460
+ file_path = f"{tmp_dir}/batch_output.txt"
461
+ save_to_txt(all_rows, summary_text, flag_text, file_path)
462
+ else:
463
+ return gr.update(visible=False) # invalid option
464
+
465
+ return gr.update(value=file_path, visible=True)
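+ # NOTE: the returned gr.update(...) is intended for a gr.File component, so the download link only
+ # becomes visible after the file is written; an invalid output_type hides the component instead.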
466
+ # save cost by checking the known outputs
467
+
468
+ # def check_known_output(accession):
469
+ # if not os.path.exists(KNOWN_OUTPUT_PATH):
470
+ # return None
471
+
472
+ # try:
473
+ # df = pd.read_excel(KNOWN_OUTPUT_PATH)
474
+ # match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
475
+ # if match:
476
+ # accession = match.group(0)
477
+
478
+ # matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
479
+ # if not matched.empty:
480
+ # return matched.iloc[0].to_dict() # Return the cached row
481
+ # except Exception as e:
482
+ # print(f"⚠️ Failed to load known samples: {e}")
483
+ # return None
484
+
485
+ # def check_known_output(accession):
486
+ # try:
487
+ # # Load credentials from Hugging Face secret
488
+ # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
489
+ # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
490
+ # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
491
+ # client = gspread.authorize(creds)
492
+
493
+ # # Open the known_samples sheet
494
+ # spreadsheet = client.open("known_samples") # Replace with your sheet name
495
+ # sheet = spreadsheet.sheet1
496
+
497
+ # # Read all rows
498
+ # data = sheet.get_all_values()
499
+ # if not data:
500
+ # return None
501
+
502
+ # df = pd.DataFrame(data[1:], columns=data[0]) # Skip header row
503
+
504
+ # # Normalize accession pattern
505
+ # match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
506
+ # if match:
507
+ # accession = match.group(0)
508
+
509
+ # matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
510
+ # if not matched.empty:
511
+ # return matched.iloc[0].to_dict()
512
+
513
+ # except Exception as e:
514
+ # print(f"⚠️ Failed to load known samples from Google Sheets: {e}")
515
+ # return None
516
+ def check_known_output(accession):
517
+ try:
518
+ # Load credentials from Hugging Face secret
519
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
520
+ scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
521
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
522
+ client = gspread.authorize(creds)
523
+
524
+ spreadsheet = client.open("known_samples")
525
+ sheet = spreadsheet.sheet1
526
+
527
+ data = sheet.get_all_values()
528
+ if not data:
529
+ print("⚠️ Google Sheet 'known_samples' is empty.")
530
+ return None
531
+
532
+ df = pd.DataFrame(data[1:], columns=data[0])
533
+ if "Sample ID" not in df.columns:
534
+ print("❌ Column 'Sample ID' not found in Google Sheet.")
535
+ return None
536
+
537
+ match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
538
+ if match:
539
+ accession = match.group(0)
540
+
541
+ matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
542
+ if not matched.empty:
543
+ #return matched.iloc[0].to_dict()
544
+ row = matched.iloc[0]
545
+ country = row.get("Predicted Country", "").strip().lower()
546
+ sample_type = row.get("Predicted Sample Type", "").strip().lower()
547
+
548
+ if country and country != "unknown" and sample_type and sample_type != "unknown":
549
+ return row.to_dict()
550
+ else:
551
+ print(f"⚠️ Accession {accession} found but country/sample_type is unknown or empty.")
552
+ return None
553
+ else:
554
+ print(f"🔍 Accession {accession} not found in known_samples.")
555
+ return None
556
+
557
+ except Exception as e:
558
+ import traceback
559
+ print("❌ Exception occurred during check_known_output:")
560
+ traceback.print_exc()
561
+ return None
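+ # NOTE: a cached row is only reused when both Predicted Country and Predicted Sample Type are
+ # non-empty and not "unknown"; anything else falls through to the full pipeline in summarize_results.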
562
+
563
+
564
+ def hash_user_id(user_input):
565
+ return hashlib.sha256(user_input.encode()).hexdigest()
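+ # NOTE: summarize_batch passes hash_user_id(user_email) into increment_usage, while load_user_usage()
+ # and save_user_usage() key rows by plain lower-cased email, so the usage sheet ends up keyed by the
+ # hash unless callers agree on one key form. Hypothetical check (email is a placeholder):
+ #   hash_user_id("user@example.com")  # 64-character hex digest, stable across runs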
566
+
567
+ # ✅ Load and save usage count
568
+
569
+ # def load_user_usage():
570
+ # if not os.path.exists(USER_USAGE_TRACK_FILE):
571
+ # return {}
572
+
573
+ # try:
574
+ # with open(USER_USAGE_TRACK_FILE, "r") as f:
575
+ # content = f.read().strip()
576
+ # if not content:
577
+ # return {} # file is empty
578
+ # return json.loads(content)
579
+ # except (json.JSONDecodeError, ValueError):
580
+ # print("⚠️ Warning: user_usage.json is corrupted or invalid. Resetting.")
581
+ # return {} # fallback to empty dict
582
+ # def load_user_usage():
583
+ # try:
584
+ # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
585
+ # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
586
+ # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
587
+ # client = gspread.authorize(creds)
588
+
589
+ # sheet = client.open("user_usage_log").sheet1
590
+ # data = sheet.get_all_records() # Assumes columns: email, usage_count
591
+
592
+ # usage = {}
593
+ # for row in data:
594
+ # email = row.get("email", "").strip().lower()
595
+ # count = int(row.get("usage_count", 0))
596
+ # if email:
597
+ # usage[email] = count
598
+ # return usage
599
+ # except Exception as e:
600
+ # print(f"⚠️ Failed to load user usage from Google Sheets: {e}")
601
+ # return {}
602
+ # def load_user_usage():
603
+ # try:
604
+ # parent_id = pipeline.get_or_create_drive_folder("mtDNA-Location-Classifier")
605
+ # iterate3_id = pipeline.get_or_create_drive_folder("iterate3", parent_id=parent_id)
606
+
607
+ # found = pipeline.find_drive_file("user_usage_log.json", parent_id=iterate3_id)
608
+ # if not found:
609
+ # return {} # not found, start fresh
610
+
611
+ # #file_id = found[0]["id"]
612
+ # file_id = found
613
+ # content = pipeline.download_drive_file_content(file_id)
614
+ # return json.loads(content.strip()) if content.strip() else {}
615
+
616
+ # except Exception as e:
617
+ # print(f"⚠️ Failed to load user_usage_log.json from Google Drive: {e}")
618
+ # return {}
619
+ def load_user_usage():
620
+ try:
621
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
622
+ scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
623
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
624
+ client = gspread.authorize(creds)
625
+
626
+ sheet = client.open("user_usage_log").sheet1
627
+ data = sheet.get_all_values()
628
+ if not data or len(data) < 2:
629
+ print("⚠️ Sheet is empty or missing rows.")
630
+ return {}, {}
631
+ print("data: ", data)
632
+ print("🧪 Raw header row from sheet:", data[0])
633
+ print("🧪 Character codes in each header:")
634
+ for h in data[0]:
635
+ print([ord(c) for c in h])
636
+
637
+
638
+ headers = [h.strip().lower() for h in data[0]]
639
+ if "email" not in headers or "usage_count" not in headers:
640
+ print("❌ Header format incorrect. Must have 'email' and 'usage_count'.")
641
+ return {}, {}
642
+
643
+ permitted_index = headers.index("permitted_samples") if "permitted_samples" in headers else None
644
+ df = pd.DataFrame(data[1:], columns=headers)
645
+
646
+ usage = {}
647
+ permitted = {}
648
+ for _, row in df.iterrows():
649
+ email = row.get("email", "").strip().lower()
650
+ try:
651
+ #count = int(row.get("usage_count", 0))
652
+ try:
653
+ count = int(float(row.get("usage_count", 0)))
654
+ except Exception:
655
+ print(f"⚠️ Invalid usage_count for {email}: {row.get('usage_count')}")
656
+ count = 0
657
+
658
+ if email:
659
+ usage[email] = count
660
+ if permitted_index is not None:
661
+ try:
662
+ permitted_count = int(float(row.get("permitted_samples", 50)))
663
+ permitted[email] = permitted_count
664
+ except:
665
+ permitted[email] = 50
666
+
667
+ except ValueError:
668
+ print(f"⚠️ Invalid usage_count for {email}: {row.get('usage_count')}")
669
+ return usage, permitted
670
+
671
+ except Exception as e:
672
+ print(f"❌ Error in load_user_usage: {e}")
673
+ return {}, {}
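+ # NOTE: load_user_usage always returns a (usage, permitted) pair of dicts keyed by lower-cased email;
+ # permitted stays empty when the sheet has no permitted_samples column, and increment_usage then
+ # falls back to 50. Hypothetical usage (email is a placeholder):
+ #   usage, permitted = load_user_usage()
+ #   print(usage.get("user@example.com", 0), permitted.get("user@example.com", 50))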
674
+
675
+
676
+
677
+ # def save_user_usage(usage):
678
+ # with open(USER_USAGE_TRACK_FILE, "w") as f:
679
+ # json.dump(usage, f, indent=2)
680
+
681
+ # def save_user_usage(usage_dict):
682
+ # try:
683
+ # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
684
+ # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
685
+ # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
686
+ # client = gspread.authorize(creds)
687
+
688
+ # sheet = client.open("user_usage_log").sheet1
689
+ # sheet.clear() # clear old contents first
690
+
691
+ # # Write header + rows
692
+ # rows = [["email", "usage_count"]] + [[email, count] for email, count in usage_dict.items()]
693
+ # sheet.update(rows)
694
+ # except Exception as e:
695
+ # print(f"❌ Failed to save user usage to Google Sheets: {e}")
696
+ # def save_user_usage(usage_dict):
697
+ # try:
698
+ # parent_id = pipeline.get_or_create_drive_folder("mtDNA-Location-Classifier")
699
+ # iterate3_id = pipeline.get_or_create_drive_folder("iterate3", parent_id=parent_id)
700
+
701
+ # import tempfile
702
+ # tmp_path = os.path.join(tempfile.gettempdir(), "user_usage_log.json")
703
+ # print("💾 Saving this usage dict:", usage_dict)
704
+ # with open(tmp_path, "w") as f:
705
+ # json.dump(usage_dict, f, indent=2)
706
+
707
+ # pipeline.upload_file_to_drive(tmp_path, "user_usage_log.json", iterate3_id)
708
+
709
+ # except Exception as e:
710
+ # print(f"❌ Failed to save user_usage_log.json to Google Drive: {e}")
711
+ # def save_user_usage(usage_dict):
712
+ # try:
713
+ # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
714
+ # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
715
+ # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
716
+ # client = gspread.authorize(creds)
717
+
718
+ # spreadsheet = client.open("user_usage_log")
719
+ # sheet = spreadsheet.sheet1
720
+
721
+ # # Step 1: Convert new usage to DataFrame
722
+ # df_new = pd.DataFrame(list(usage_dict.items()), columns=["email", "usage_count"])
723
+ # df_new["email"] = df_new["email"].str.strip().str.lower()
724
+
725
+ # # Step 2: Load existing data
726
+ # existing_data = sheet.get_all_values()
727
+ # print("🧪 Sheet existing_data:", existing_data)
728
+
729
+ # # Try to load old data
730
+ # if existing_data and len(existing_data[0]) >= 1:
731
+ # df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
732
+
733
+ # # Fix missing columns
734
+ # if "email" not in df_old.columns:
735
+ # df_old["email"] = ""
736
+ # if "usage_count" not in df_old.columns:
737
+ # df_old["usage_count"] = 0
738
+
739
+ # df_old["email"] = df_old["email"].str.strip().str.lower()
740
+ # df_old["usage_count"] = pd.to_numeric(df_old["usage_count"], errors="coerce").fillna(0).astype(int)
741
+ # else:
742
+ # df_old = pd.DataFrame(columns=["email", "usage_count"])
743
+
744
+ # # Step 3: Merge
745
+ # df_combined = pd.concat([df_old, df_new], ignore_index=True)
746
+ # df_combined = df_combined.groupby("email", as_index=False).sum()
747
+
748
+ # # Step 4: Write back
749
+ # sheet.clear()
750
+ # sheet.update([df_combined.columns.tolist()] + df_combined.astype(str).values.tolist())
751
+ # print("✅ Saved user usage to user_usage_log sheet.")
752
+
753
+ # except Exception as e:
754
+ # print(f"❌ Failed to save user usage to Google Sheets: {e}")
755
+ def save_user_usage(usage_dict):
756
+ try:
757
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
758
+ scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
759
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
760
+ client = gspread.authorize(creds)
761
+
762
+ spreadsheet = client.open("user_usage_log")
763
+ sheet = spreadsheet.sheet1
764
+
765
+ # Build new df
766
+ df_new = pd.DataFrame(list(usage_dict.items()), columns=["email", "usage_count"])
767
+ df_new["email"] = df_new["email"].str.strip().str.lower()
768
+ df_new["usage_count"] = pd.to_numeric(df_new["usage_count"], errors="coerce").fillna(0).astype(int)
769
+
770
+ # Read existing data
771
+ existing_data = sheet.get_all_values()
772
+ if existing_data and len(existing_data[0]) >= 2:
773
+ df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
774
+ df_old["email"] = df_old["email"].str.strip().str.lower()
775
+ df_old["usage_count"] = pd.to_numeric(df_old["usage_count"], errors="coerce").fillna(0).astype(int)
776
+ else:
777
+ df_old = pd.DataFrame(columns=["email", "usage_count"])
778
+
779
+ # Overwrite specific emails only
780
+ df_old = df_old.set_index("email")
781
+ for email, count in usage_dict.items():
782
+ email = email.strip().lower()
783
+ df_old.loc[email, "usage_count"] = count
784
+ df_old = df_old.reset_index()
785
+
786
+ # Save
787
+ sheet.clear()
788
+ sheet.update([df_old.columns.tolist()] + df_old.astype(str).values.tolist())
789
+ print("✅ Saved user usage to user_usage_log sheet.")
790
+
791
+ except Exception as e:
792
+ print(f"❌ Failed to save user usage to Google Sheets: {e}")
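+ # NOTE: save_user_usage reads the whole sheet back into df_old before clear()+update(), so extra
+ # columns such as permitted_samples survive the rewrite; only usage_count is overwritten for the
+ # emails present in usage_dict.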
793
+
794
+
795
+
796
+
797
+ # def increment_usage(user_id, num_samples=1):
798
+ # usage = load_user_usage()
799
+ # if user_id not in usage:
800
+ # usage[user_id] = 0
801
+ # usage[user_id] += num_samples
802
+ # save_user_usage(usage)
803
+ # return usage[user_id]
804
+ # def increment_usage(email: str, count: int):
805
+ # usage = load_user_usage()
806
+ # email_key = email.strip().lower()
807
+ # usage[email_key] = usage.get(email_key, 0) + count
808
+ # save_user_usage(usage)
809
+ # return usage[email_key]
810
+ def increment_usage(email: str, count: int = 1):
811
+ usage, permitted = load_user_usage()
812
+ email_key = email.strip().lower()
813
+ #usage[email_key] = usage.get(email_key, 0) + count
814
+ current = usage.get(email_key, 0)
815
+ new_value = current + count
816
+ max_allowed = permitted.get(email_key) or 50
817
+ usage[email_key] = max(current, new_value) # ✅ Prevent overwrite with lower
818
+ print(f"🧪 increment_usage saving: {email_key=} {current=} + {count=} => {usage[email_key]=}")
819
+ print("max allow is: ", max_allowed)
820
+ save_user_usage(usage)
821
+ return usage[email_key], max_allowed
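+ # NOTE: increment_usage returns a (new_total, max_allowed) tuple, so callers must unpack it.
+ # Hypothetical usage (email is a placeholder):
+ #   total, max_allowed = increment_usage("user@example.com", 3)
+ #   if total >= max_allowed: print("quota reached")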
822
+
823
+
824
+ # run the batch
825
+ def summarize_batch(file=None, raw_text="", resume_file=None, user_email="",
826
+ stop_flag=None, output_file_path=None,
827
+ limited_acc=50, yield_callback=None):
828
+ if user_email:
829
+ limited_acc += 10
830
+ accessions, error = extract_accessions_from_input(file, raw_text)
831
+ if error:
832
+ #return [], "", "", f"Error: {error}"
833
+ return [], f"Error: {error}", 0, "", ""
834
+ if resume_file:
835
+ accessions = get_incomplete_accessions(resume_file)
836
+ tmp_dir = tempfile.mkdtemp()
837
+ if not output_file_path:
838
+ if resume_file:
839
+ output_file_path = os.path.join(tmp_dir, resume_file)
840
+ else:
841
+ output_file_path = os.path.join(tmp_dir, "batch_output_live.xlsx")
842
+
843
+ all_rows = []
844
+ # all_summaries = []
845
+ # all_flags = []
846
+ progress_lines = []
847
+ warning = ""
848
+ if len(accessions) > limited_acc:
849
+ accessions = accessions[:limited_acc]
850
+ warning = f"You submitted more than {limited_acc} accessions; only the first {limited_acc} will be processed"
851
+ for i, acc in enumerate(accessions):
852
+ if stop_flag and stop_flag.value:
853
+ line = f"🛑 Stopped at {acc} ({i+1}/{len(accessions)})"
854
+ progress_lines.append(line)
855
+ if yield_callback:
856
+ yield_callback(line)
857
+ print("🛑 User requested stop.")
858
+ break
859
+ print(f"[{i+1}/{len(accessions)}] Processing {acc}")
860
+ try:
861
+ # rows, summary, label, explain = summarize_results(acc)
862
+ rows = summarize_results(acc)
863
+ all_rows.extend(rows)
864
+ # all_summaries.append(f"**{acc}**\n{summary}")
865
+ # all_flags.append(f"**{acc}**\n### 🏺 Ancient/Modern Flag\n**{label}**\n\n_Explanation:_ {explain}")
866
+ #save_to_excel(all_rows, summary_text="", flag_text="", filename=output_file_path)
867
+ save_to_excel(all_rows, summary_text="", flag_text="", filename=output_file_path, is_resume=bool(resume_file))
868
+ line = f"✅ Processed {acc} ({i+1}/{len(accessions)})"
869
+ progress_lines.append(line)
870
+ if yield_callback:
871
+ yield_callback(f"✅ Processed {acc} ({i+1}/{len(accessions)})")
872
+ except Exception as e:
873
+ print(f"❌ Failed to process {acc}: {e}")
874
+ continue
875
+ #all_summaries.append(f"**{acc}**: Failed - {e}")
876
+ #progress_lines.append(f"✅ Processed {acc} ({i+1}/{len(accessions)})")
877
+ limited_acc -= 1
878
+ """for row in all_rows:
879
+ source_column = row[2] # Assuming the "Source" is in the 3rd column (index 2)
880
+
881
+ if source_column.startswith("http"): # Check if the source is a URL
882
+ # Wrap it with HTML anchor tags to make it clickable
883
+ row[2] = f'<a href="{source_column}" target="_blank" style="color: blue; text-decoration: underline;">{source_column}</a>'"""
884
+ if not warning:
885
+ warning = f"You only have {limited_acc} accessions left"
886
+ if user_email.strip():
887
+ user_hash = hash_user_id(user_email)
888
+ total_queries, _ = increment_usage(user_hash, len(all_rows))
889
+ else:
890
+ total_queries = 0
891
+ if yield_callback: yield_callback("✅ Finished!")
892
+
893
+ # summary_text = "\n\n---\n\n".join(all_summaries)
894
+ # flag_text = "\n\n---\n\n".join(all_flags)
895
+ #return all_rows, summary_text, flag_text, gr.update(visible=True), gr.update(visible=False)
896
+ #return all_rows, gr.update(visible=True), gr.update(visible=False)
897
  return all_rows, output_file_path, total_queries, "\n".join(progress_lines), warning
mtdna_classifier.py CHANGED
@@ -1,707 +1,714 @@
1
- # mtDNA Location Classifier MVP (Google Colab)
2
- # Accepts accession number → Fetches PubMed ID + isolate name → Gets abstract → Predicts location
3
- import os
4
- #import streamlit as st
5
- import subprocess
6
- import re
7
- from Bio import Entrez
8
- import fitz
9
- import spacy
10
- from spacy.cli import download
11
- from NER.PDF import pdf
12
- from NER.WordDoc import wordDoc
13
- from NER.html import extractHTML
14
- from NER.word2Vec import word2vec
15
- from transformers import pipeline
16
- import urllib.parse, requests
17
- from pathlib import Path
18
- from upgradeClassify import filter_context_for_sample, infer_location_for_sample
19
-
20
- # Set your email (required by NCBI Entrez)
21
- #Entrez.email = "[email protected]"
22
- import nltk
23
-
24
- nltk.download("stopwords")
25
- nltk.download("punkt")
26
- nltk.download('punkt_tab')
27
- # Step 1: Get PubMed ID from Accession using EDirect
28
- from Bio import Entrez, Medline
29
- import re
30
-
31
- Entrez.email = "[email protected]"
32
-
33
- # --- Helper Functions (Re-organized and Upgraded) ---
34
-
35
- def fetch_ncbi_metadata(accession_number):
36
- """
37
- Fetches metadata directly from NCBI GenBank using Entrez.
38
- Includes robust error handling and improved field extraction.
39
- Prioritizes location extraction from geo_loc_name, then notes, then other qualifiers.
40
- Also attempts to extract ethnicity and sample_type (ancient/modern).
41
-
42
- Args:
43
- accession_number (str): The NCBI accession number (e.g., "ON792208").
44
-
45
- Returns:
46
- dict: A dictionary containing 'country', 'specific_location', 'ethnicity',
47
- 'sample_type', 'collection_date', 'isolate', 'title', 'doi', 'pubmed_id'.
48
- """
49
- Entrez.email = "[email protected]" # Required by NCBI, REPLACE WITH YOUR EMAIL
50
-
51
- country = "unknown"
52
- specific_location = "unknown"
53
- ethnicity = "unknown"
54
- sample_type = "unknown"
55
- collection_date = "unknown"
56
- isolate = "unknown"
57
- title = "unknown"
58
- doi = "unknown"
59
- pubmed_id = None
60
- all_feature = "unknown"
61
-
62
- KNOWN_COUNTRIES = [
63
- "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda", "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan",
64
- "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burkina Faso", "Burundi",
65
- "Cabo Verde", "Cambodia", "Cameroon", "Canada", "Central African Republic", "Chad", "Chile", "China", "Colombia", "Comoros", "Congo (Brazzaville)", "Congo (Kinshasa)", "Costa Rica", "Croatia", "Cuba", "Cyprus", "Czechia",
66
- "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Eswatini", "Ethiopia",
67
- "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana",
68
- "Haiti", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Ivory Coast", "Jamaica", "Japan", "Jordan",
69
- "Kazakhstan", "Kenya", "Kiribati", "Kosovo", "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania", "Luxembourg",
70
- "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico", "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar",
71
- "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "North Korea", "North Macedonia", "Norway", "Oman",
72
- "Pakistan", "Palau", "Palestine", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines", "Poland", "Portugal", "Qatar", "Romania", "Russia", "Rwanda",
73
- "Saint Kitts and Nevis", "Saint Lucia", "Saint Vincent and the Grenadines", "Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa", "South Korea", "South Sudan", "Spain", "Sri Lanka", "Sudan", "Suriname", "Sweden", "Switzerland", "Syria",
74
- "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Timor-Leste", "Togo", "Tonga", "Trinidad and Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu",
75
- "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City", "Venezuela", "Vietnam",
76
- "Yemen", "Zambia", "Zimbabwe"
77
- ]
78
- COUNTRY_PATTERN = re.compile(r'\b(' + '|'.join(re.escape(c) for c in KNOWN_COUNTRIES) + r')\b', re.IGNORECASE)
79
-
80
- try:
81
- handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
82
- record = Entrez.read(handle)
83
- handle.close()
84
-
85
- gb_seq = None
86
- # Validate record structure: It should be a list with at least one element (a dict)
87
- if isinstance(record, list) and len(record) > 0:
88
- if isinstance(record[0], dict):
89
- gb_seq = record[0]
90
- else:
91
- print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
92
- else:
93
- print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
94
-
95
- # If gb_seq is still None, return defaults
96
- if gb_seq is None:
97
- return {"country": "unknown", "specific_location": "unknown", "ethnicity": "unknown",
98
- "sample_type": "unknown", "collection_date": "unknown", "isolate": "unknown",
99
- "title": "unknown", "doi": "unknown", "pubmed_id": None}
100
-
101
-
102
- # If gb_seq is valid, proceed with extraction
103
- collection_date = gb_seq.get("GBSeq_create-date","unknown")
104
-
105
- references = gb_seq.get("GBSeq_references", [])
106
- for ref in references:
107
- if not pubmed_id:
108
- pubmed_id = ref.get("GBReference_pubmed",None)
109
- if title == "unknown":
110
- title = ref.get("GBReference_title","unknown")
111
- for xref in ref.get("GBReference_xref", []):
112
- if xref.get("GBXref_dbname") == "doi":
113
- doi = xref.get("GBXref_id")
114
- break
115
-
116
- features = gb_seq.get("GBSeq_feature-table", [])
117
-
118
- context_for_flagging = "" # Accumulate text for ancient/modern detection
119
- features_context = ""
120
- for feature in features:
121
- if feature.get("GBFeature_key") == "source":
122
- feature_context = ""
123
- qualifiers = feature.get("GBFeature_quals", [])
124
- found_country = "unknown"
125
- found_specific_location = "unknown"
126
- found_ethnicity = "unknown"
127
-
128
- temp_geo_loc_name = "unknown"
129
- temp_note_origin_locality = "unknown"
130
- temp_country_qual = "unknown"
131
- temp_locality_qual = "unknown"
132
- temp_collection_location_qual = "unknown"
133
- temp_isolation_source_qual = "unknown"
134
- temp_env_sample_qual = "unknown"
135
- temp_pop_qual = "unknown"
136
- temp_organism_qual = "unknown"
137
- temp_specimen_qual = "unknown"
138
- temp_strain_qual = "unknown"
139
-
140
- for qual in qualifiers:
141
- qual_name = qual.get("GBQualifier_name")
142
- qual_value = qual.get("GBQualifier_value")
143
- feature_context += qual_name + ": " + qual_value +"\n"
144
- if qual_name == "collection_date":
145
- collection_date = qual_value
146
- elif qual_name == "isolate":
147
- isolate = qual_value
148
- elif qual_name == "population":
149
- temp_pop_qual = qual_value
150
- elif qual_name == "organism":
151
- temp_organism_qual = qual_value
152
- elif qual_name == "specimen_voucher" or qual_name == "specimen":
153
- temp_specimen_qual = qual_value
154
- elif qual_name == "strain":
155
- temp_strain_qual = qual_value
156
- elif qual_name == "isolation_source":
157
- temp_isolation_source_qual = qual_value
158
- elif qual_name == "environmental_sample":
159
- temp_env_sample_qual = qual_value
160
-
161
- if qual_name == "geo_loc_name": temp_geo_loc_name = qual_value
162
- elif qual_name == "note":
163
- if qual_value.startswith("origin_locality:"):
164
- temp_note_origin_locality = qual_value
165
- context_for_flagging += qual_value + " " # Capture all notes for flagging
166
- elif qual_name == "country": temp_country_qual = qual_value
167
- elif qual_name == "locality": temp_locality_qual = qual_value
168
- elif qual_name == "collection_location": temp_collection_location_qual = qual_value
169
-
170
-
171
- # --- Aggregate all relevant info into context_for_flagging ---
172
- context_for_flagging += f" {isolate} {temp_isolation_source_qual} {temp_specimen_qual} {temp_strain_qual} {temp_organism_qual} {temp_geo_loc_name} {temp_collection_location_qual} {temp_env_sample_qual}"
173
- context_for_flagging = context_for_flagging.strip()
174
-
175
- # --- Determine final country and specific_location based on priority ---
176
- if temp_geo_loc_name != "unknown":
177
- parts = [p.strip() for p in temp_geo_loc_name.split(':')]
178
- if len(parts) > 1:
179
- found_specific_location = parts[-1]; found_country = parts[0]
180
- else: found_country = temp_geo_loc_name; found_specific_location = "unknown"
181
- elif temp_note_origin_locality != "unknown":
182
- match = re.search(r"origin_locality:\s*(.*)", temp_note_origin_locality, re.IGNORECASE)
183
- if match:
184
- location_string = match.group(1).strip()
185
- parts = [p.strip() for p in location_string.split(':')]
186
- if len(parts) > 1: found_country = parts[-1]; found_specific_location = parts[0]
187
- else: found_country = location_string; found_specific_location = "unknown"
188
- elif temp_locality_qual != "unknown":
189
- found_country_match = COUNTRY_PATTERN.search(temp_locality_qual)
190
- if found_country_match: found_country = found_country_match.group(1); temp_loc = re.sub(re.escape(found_country), '', temp_locality_qual, flags=re.IGNORECASE).strip().replace(',', '').replace(':', '').replace(';', '').strip(); found_specific_location = temp_loc if temp_loc else "unknown"
191
- else: found_specific_location = temp_locality_qual; found_country = "unknown"
192
- elif temp_collection_location_qual != "unknown":
193
- found_country_match = COUNTRY_PATTERN.search(temp_collection_location_qual)
194
- if found_country_match: found_country = found_country_match.group(1); temp_loc = re.sub(re.escape(found_country), '', temp_collection_location_qual, flags=re.IGNORECASE).strip().replace(',', '').replace(':', '').replace(';', '').strip(); found_specific_location = temp_loc if temp_loc else "unknown"
195
- else: found_specific_location = temp_collection_location_qual; found_country = "unknown"
196
- elif temp_isolation_source_qual != "unknown":
197
- found_country_match = COUNTRY_PATTERN.search(temp_isolation_source_qual)
198
- if found_country_match: found_country = found_country_match.group(1); temp_loc = re.sub(re.escape(found_country), '', temp_isolation_source_qual, flags=re.IGNORECASE).strip().replace(',', '').replace(':', '').replace(';', '').strip(); found_specific_location = temp_loc if temp_loc else "unknown"
199
- else: found_specific_location = temp_isolation_source_qual; found_country = "unknown"
200
- elif temp_env_sample_qual != "unknown":
201
- found_country_match = COUNTRY_PATTERN.search(temp_env_sample_qual)
202
- if found_country_match: found_country = found_country_match.group(1); temp_loc = re.sub(re.escape(found_country), '', temp_env_sample_qual, flags=re.IGNORECASE).strip().replace(',', '').replace(':', '').replace(';', '').strip(); found_specific_location = temp_loc if temp_loc else "unknown"
203
- else: found_specific_location = temp_env_sample_qual; found_country = "unknown"
204
- if found_country == "unknown" and temp_country_qual != "unknown":
205
- found_country_match = COUNTRY_PATTERN.search(temp_country_qual)
206
- if found_country_match: found_country = found_country_match.group(1)
207
-
208
- country = found_country
209
- specific_location = found_specific_location
210
- # --- Determine final ethnicity ---
211
- if temp_pop_qual != "unknown":
212
- found_ethnicity = temp_pop_qual
213
- elif isolate != "unknown" and re.fullmatch(r'[A-Za-z\s\-]+', isolate) and get_country_from_text(isolate) == "unknown":
214
- found_ethnicity = isolate
215
- elif context_for_flagging != "unknown": # Use the broader context for ethnicity patterns
216
- eth_match = re.search(r'(?:population|ethnicity|isolate source):\s*([A-Za-z\s\-]+)', context_for_flagging, re.IGNORECASE)
217
- if eth_match:
218
- found_ethnicity = eth_match.group(1).strip()
219
-
220
- ethnicity = found_ethnicity
221
-
222
- # --- Determine sample_type (ancient/modern) ---
223
- if context_for_flagging:
224
- sample_type, explain = detect_ancient_flag(context_for_flagging)
225
- features_context += feature_context + "\n"
226
- break
227
-
228
- if specific_location != "unknown" and specific_location.lower() == country.lower():
229
- specific_location = "unknown"
230
- if not features_context: features_context = "unknown"
231
- return {"country": country.lower(),
232
- "specific_location": specific_location.lower(),
233
- "ethnicity": ethnicity.lower(),
234
- "sample_type": sample_type.lower(),
235
- "collection_date": collection_date,
236
- "isolate": isolate,
237
- "title": title,
238
- "doi": doi,
239
- "pubmed_id": pubmed_id,
240
- "all_features": features_context}
241
-
242
- except Exception as e:
243
- print(f"Error fetching NCBI data for {accession_number}: {e}")
244
- return {"country": "unknown",
245
- "specific_location": "unknown",
246
- "ethnicity": "unknown",
247
- "sample_type": "unknown",
248
- "collection_date": "unknown",
249
- "isolate": "unknown",
250
- "title": "unknown",
251
- "doi": "unknown",
252
- "pubmed_id": None,
253
- "all_features": "unknown"}
254
-
255
- # --- Helper function for country matching (re-defined from main code to be self-contained) ---
256
- _country_keywords = {
257
- "thailand": "Thailand", "laos": "Laos", "cambodia": "Cambodia", "myanmar": "Myanmar",
258
- "philippines": "Philippines", "indonesia": "Indonesia", "malaysia": "Malaysia",
259
- "china": "China", "chinese": "China", "india": "India", "taiwan": "Taiwan",
260
- "vietnam": "Vietnam", "russia": "Russia", "siberia": "Russia", "nepal": "Nepal",
261
- "japan": "Japan", "sumatra": "Indonesia", "borneu": "Indonesia",
262
- "yunnan": "China", "tibet": "China", "northern mindanao": "Philippines",
263
- "west malaysia": "Malaysia", "north thailand": "Thailand", "central thailand": "Thailand",
264
- "northeast thailand": "Thailand", "east myanmar": "Myanmar", "west thailand": "Thailand",
265
- "central india": "India", "east india": "India", "northeast india": "India",
266
- "south sibera": "Russia", "mongolia": "China", "beijing": "China", "south korea": "South Korea",
267
- "north asia": "unknown", "southeast asia": "unknown", "east asia": "unknown"
268
- }
269
-
270
- def get_country_from_text(text):
271
- text_lower = text.lower()
272
- for keyword, country in _country_keywords.items():
273
- if keyword in text_lower:
274
- return country
275
- return "unknown"
276
- # The result will be seen as manualLink for the function get_paper_text
277
- def search_google_custom(query, max_results=3):
278
- # query should be the title from ncbi or paper/source title
279
- GOOGLE_CSE_API_KEY = os.environ["GOOGLE_CSE_API_KEY"]
280
- GOOGLE_CSE_CX = os.environ["GOOGLE_CSE_CX"]
281
- endpoint = os.environ["SEARCH_ENDPOINT"]
282
- params = {
283
- "key": GOOGLE_CSE_API_KEY,
284
- "cx": GOOGLE_CSE_CX,
285
- "q": query,
286
- "num": max_results
287
- }
288
- try:
289
- response = requests.get(endpoint, params=params)
290
- if response.status_code == 429:
291
- print("Rate limit hit. Try again later.")
292
- return []
293
- response.raise_for_status()
294
- data = response.json().get("items", [])
295
- return [item.get("link") for item in data if item.get("link")]
296
- except Exception as e:
297
- print("Google CSE error:", e)
298
- return []
299
- # Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing
300
- # Step 3.1: Extract Text
301
- # sub: download excel file
302
- def download_excel_file(url, save_path="temp.xlsx"):
303
- if "view.officeapps.live.com" in url:
304
- parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
305
- real_url = urllib.parse.unquote(parsed_url["src"][0])
306
- response = requests.get(real_url)
307
- with open(save_path, "wb") as f:
308
- f.write(response.content)
309
- return save_path
310
- elif url.startswith("http") and (url.endswith(".xls") or url.endswith(".xlsx")):
311
- response = requests.get(url)
312
- response.raise_for_status() # Raises error if download fails
313
- with open(save_path, "wb") as f:
314
- f.write(response.content)
315
- return save_path
316
- else:
317
- print("URL must point directly to an .xls or .xlsx file\n or it already downloaded.")
318
- return url
319
- def get_paper_text(doi,id,manualLinks=None):
320
- # create the temporary folder to contain the texts
321
- folder_path = Path("data/"+str(id))
322
- if not folder_path.exists():
323
- cmd = f'mkdir data/{id}'
324
- result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
325
- print("data/"+str(id) +" created.")
326
- else:
327
- print("data/"+str(id) +" already exists.")
328
- saveLinkFolder = "data/"+id
329
-
330
- link = 'https://doi.org/' + doi
331
- '''textsToExtract = { "doiLink":"paperText"
332
- "file1.pdf":"text1",
333
- "file2.doc":"text2",
334
- "file3.xlsx":excelText3'''
335
- textsToExtract = {}
336
- # get the file to create listOfFile for each id
337
- html = extractHTML.HTML("",link)
338
- jsonSM = html.getSupMaterial()
339
- text = ""
340
- links = [link] + sum((jsonSM[key] for key in jsonSM),[])
341
- if manualLinks != None:
342
- links += manualLinks
343
- for l in links:
344
- # get the main paper
345
- name = l.split("/")[-1]
346
- file_path = folder_path / name
347
- if l == link:
348
- text = html.getListSection()
349
- textsToExtract[link] = text
350
- elif l.endswith(".pdf"):
351
- if file_path.is_file():
352
- l = saveLinkFolder + "/" + name
353
- print("File exists.")
354
- p = pdf.PDF(l,saveLinkFolder,doi)
355
- f = p.openPDFFile()
356
- pdf_path = saveLinkFolder + "/" + l.split("/")[-1]
357
- doc = fitz.open(pdf_path)
358
- text = "\n".join([page.get_text() for page in doc])
359
- textsToExtract[l] = text
360
- elif l.endswith(".doc") or l.endswith(".docx"):
361
- d = wordDoc.wordDoc(l,saveLinkFolder)
362
- text = d.extractTextByPage()
363
- textsToExtract[l] = text
364
- elif l.split(".")[-1].lower() in "xlsx":
365
- wc = word2vec.word2Vec()
366
- # download excel file if it not downloaded yet
367
- savePath = saveLinkFolder +"/"+ l.split("/")[-1]
368
- excelPath = download_excel_file(l, savePath)
369
- corpus = wc.tableTransformToCorpusText([],excelPath)
370
- text = ''
371
- for c in corpus:
372
- para = corpus[c]
373
- for words in para:
374
- text += " ".join(words)
375
- textsToExtract[l] = text
376
- # delete folder after finishing getting text
377
- #cmd = f'rm -r data/{id}'
378
- #result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
379
- return textsToExtract
380
- # Step 3.2: Extract context
381
- def extract_context(text, keyword, window=500):
382
- # firstly try accession number
383
- idx = text.find(keyword)
384
- if idx == -1:
385
- return "Sample ID not found."
386
- return text[max(0, idx-window): idx+window]
387
- def extract_relevant_paragraphs(text, accession, keep_if=None, isolate=None):
388
- if keep_if is None:
389
- keep_if = ["sample", "method", "mtdna", "sequence", "collected", "dataset", "supplementary", "table"]
390
-
391
- outputs = ""
392
- text = text.lower()
393
-
394
- # If isolate is provided, prioritize paragraphs that mention it
395
- # If isolate is provided, prioritize paragraphs that mention it
396
- if accession and accession.lower() in text:
397
- if extract_context(text, accession.lower(), window=700) != "Sample ID not found.":
398
- outputs += extract_context(text, accession.lower(), window=700)
399
- if isolate and isolate.lower() in text:
400
- if extract_context(text, isolate.lower(), window=700) != "Sample ID not found.":
401
- outputs += extract_context(text, isolate.lower(), window=700)
402
- for keyword in keep_if:
403
- para = extract_context(text, keyword)
404
- if para and para not in outputs:
405
- outputs += para + "\n"
406
- return outputs
407
- # Step 4: Classification for now (demo purposes)
408
- # 4.1: Using a HuggingFace model (question-answering)
409
- def infer_fromQAModel(context, question="Where is the mtDNA sample from?"):
410
- try:
411
- qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
412
- result = qa({"context": context, "question": question})
413
- return result.get("answer", "Unknown")
414
- except Exception as e:
415
- return f"Error: {str(e)}"
416
-
417
- # 4.2: Infer from haplogroup
418
- # Load pre-trained spaCy model for NER
419
- try:
420
- nlp = spacy.load("en_core_web_sm")
421
- except OSError:
422
- download("en_core_web_sm")
423
- nlp = spacy.load("en_core_web_sm")
424
-
425
- # Define the haplogroup-to-region mapping (simple rule-based)
426
- import csv
427
-
428
- def load_haplogroup_mapping(csv_path):
429
- mapping = {}
430
- with open(csv_path) as f:
431
- reader = csv.DictReader(f)
432
- for row in reader:
433
- mapping[row["haplogroup"]] = [row["region"],row["source"]]
434
- return mapping
435
-
436
- # Function to extract haplogroup from the text
437
- def extract_haplogroup(text):
438
- match = re.search(r'\bhaplogroup\s+([A-Z][0-9a-z]*)\b', text)
439
- if match:
440
- submatch = re.match(r'^[A-Z][0-9]*', match.group(1))
441
- if submatch:
442
- return submatch.group(0)
443
- else:
444
- return match.group(1) # fallback
445
- fallback = re.search(r'\b([A-Z][0-9a-z]{1,5})\b', text)
446
- if fallback:
447
- return fallback.group(1)
448
- return None
449
-
450
-
451
- # Function to extract location based on NER
452
- def extract_location(text):
453
- doc = nlp(text)
454
- locations = []
455
- for ent in doc.ents:
456
- if ent.label_ == "GPE": # GPE = Geopolitical Entity (location)
457
- locations.append(ent.text)
458
- return locations
459
-
460
- # Function to infer location from haplogroup
461
- def infer_location_from_haplogroup(haplogroup):
462
- haplo_map = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
463
- return haplo_map.get(haplogroup, ["Unknown","Unknown"])
464
-
465
- # Function to classify the mtDNA sample
466
- def classify_mtDNA_sample_from_haplo(text):
467
- # Extract haplogroup
468
- haplogroup = extract_haplogroup(text)
469
- # Extract location based on NER
470
- locations = extract_location(text)
471
- # Infer location based on haplogroup
472
- inferred_location, sourceHaplo = infer_location_from_haplogroup(haplogroup)[0],infer_location_from_haplogroup(haplogroup)[1]
473
- return {
474
- "source":sourceHaplo,
475
- "locations_found_in_context": locations,
476
- "haplogroup": haplogroup,
477
- "inferred_location": inferred_location
478
-
479
- }
480
- # 4.3 Get from available NCBI
481
- def infer_location_fromNCBI(accession):
482
- try:
483
- handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
484
- text = handle.read()
485
- handle.close()
486
- match = re.search(r'/(geo_loc_name|country|location)\s*=\s*"([^"]+)"', text)
487
- if match:
488
- return match.group(2), match.group(0) # This is the value like "Brunei"
489
- return "Not found", "Not found"
490
-
491
- except Exception as e:
492
- print("❌ Entrez error:", e)
493
- return "Not found", "Not found"
494
-
495
- ### ANCIENT/MODERN FLAG
496
- from Bio import Entrez
497
- import re
498
-
499
- def flag_ancient_modern(accession, textsToExtract, isolate=None):
500
- """
501
- Try to classify a sample as Ancient or Modern using:
502
- 1. NCBI accession (if available)
503
- 2. Supplementary text or context fallback
504
- """
505
- context = ""
506
- label, explain = "", ""
507
-
508
- try:
509
- # Check if we can fetch metadata from NCBI using the accession
510
- handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
511
- text = handle.read()
512
- handle.close()
513
-
514
- isolate_source = re.search(r'/(isolation_source)\s*=\s*"([^"]+)"', text)
515
- if isolate_source:
516
- context += isolate_source.group(0) + " "
517
-
518
- specimen = re.search(r'/(specimen|specimen_voucher)\s*=\s*"([^"]+)"', text)
519
- if specimen:
520
- context += specimen.group(0) + " "
521
-
522
- if context.strip():
523
- label, explain = detect_ancient_flag(context)
524
- if label!="Unknown":
525
- return label, explain + " from NCBI\n(" + context + ")"
526
-
527
- # If no useful NCBI metadata, check supplementary texts
528
- if textsToExtract:
529
- labels = {"modern": [0, ""], "ancient": [0, ""], "unknown": 0}
530
-
531
- for source in textsToExtract:
532
- text_block = textsToExtract[source]
533
- context = extract_relevant_paragraphs(text_block, accession, isolate=isolate) # Reduce to informative paragraph(s)
534
- label, explain = detect_ancient_flag(context)
535
-
536
- if label == "Ancient":
537
- labels["ancient"][0] += 1
538
- labels["ancient"][1] += f"{source}:\n{explain}\n\n"
539
- elif label == "Modern":
540
- labels["modern"][0] += 1
541
- labels["modern"][1] += f"{source}:\n{explain}\n\n"
542
- else:
543
- labels["unknown"] += 1
544
-
545
- if max(labels["modern"][0],labels["ancient"][0]) > 0:
546
- if labels["modern"][0] > labels["ancient"][0]:
547
- return "Modern", labels["modern"][1]
548
- else:
549
- return "Ancient", labels["ancient"][1]
550
- else:
551
- return "Unknown", "No strong keywords detected"
552
- else:
553
- print("No DOI or PubMed ID available for inference.")
554
- return "", ""
555
-
556
- except Exception as e:
557
- print("Error:", e)
558
- return "", ""
559
-
560
-
561
- def detect_ancient_flag(context_snippet):
562
- context = context_snippet.lower()
563
-
564
- ancient_keywords = [
565
- "ancient", "archaeological", "prehistoric", "neolithic", "mesolithic", "paleolithic",
566
- "bronze age", "iron age", "burial", "tomb", "skeleton", "14c", "radiocarbon", "carbon dating",
567
- "postmortem damage", "udg treatment", "adna", "degradation", "site", "excavation",
568
- "archaeological context", "temporal transect", "population replacement", "cal bp", "calbp", "carbon dated"
569
- ]
570
-
571
- modern_keywords = [
572
- "modern", "hospital", "clinical", "consent","blood","buccal","unrelated", "blood sample","buccal sample","informed consent", "donor", "healthy", "patient",
573
- "genotyping", "screening", "medical", "cohort", "sequencing facility", "ethics approval",
574
- "we analysed", "we analyzed", "dataset includes", "new sequences", "published data",
575
- "control cohort", "sink population", "genbank accession", "sequenced", "pipeline",
576
- "bioinformatic analysis", "samples from", "population genetics", "genome-wide data", "imr collection"
577
- ]
578
-
579
- ancient_hits = [k for k in ancient_keywords if k in context]
580
- modern_hits = [k for k in modern_keywords if k in context]
581
-
582
- if ancient_hits and not modern_hits:
583
- return "Ancient", f"Flagged as ancient due to keywords: {', '.join(ancient_hits)}"
584
- elif modern_hits and not ancient_hits:
585
- return "Modern", f"Flagged as modern due to keywords: {', '.join(modern_hits)}"
586
- elif ancient_hits and modern_hits:
587
- if len(ancient_hits) >= len(modern_hits):
588
- return "Ancient", f"Mixed context, leaning ancient due to: {', '.join(ancient_hits)}"
589
- else:
590
- return "Modern", f"Mixed context, leaning modern due to: {', '.join(modern_hits)}"
591
-
592
- # Fallback to QA
593
- answer = infer_fromQAModel(context, question="Are the mtDNA samples ancient or modern? Explain why.")
594
- if answer.startswith("Error"):
595
- return "Unknown", answer
596
- if "ancient" in answer.lower():
597
- return "Ancient", f"Leaning ancient based on QA: {answer}"
598
- elif "modern" in answer.lower():
599
- return "Modern", f"Leaning modern based on QA: {answer}"
600
- else:
601
- return "Unknown", f"No strong keywords or QA clues. QA said: {answer}"
602
-
603
- # STEP 5: Main pipeline: accession -> 1. get pubmed id and isolate -> 2. get doi -> 3. get text -> 4. prediction -> 5. output: inferred location + explanation + confidence score
604
- def classify_sample_location(accession):
605
- outputs = {}
606
- keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
607
- # Step 1: get pubmed id and isolate
608
- pubmedID, isolate = get_info_from_accession(accession)
609
- '''if not pubmedID:
610
- return {"error": f"Could not retrieve PubMed ID for accession {accession}"}'''
611
- if not isolate:
612
- isolate = "UNKNOWN_ISOLATE"
613
- # Step 2: get doi
614
- doi = get_doi_from_pubmed_id(pubmedID)
615
- '''if not doi:
616
- return {"error": "DOI not found for this accession. Cannot fetch paper or context."}'''
617
- # Step 3: get text
618
- '''textsToExtract = { "doiLink":"paperText"
619
- "file1.pdf":"text1",
620
- "file2.doc":"text2",
621
- "file3.xlsx":excelText3'''
622
- if doi and pubmedID:
623
- textsToExtract = get_paper_text(doi,pubmedID)
624
- else: textsToExtract = {}
625
- '''if not textsToExtract:
626
- return {"error": f"No texts extracted for DOI {doi}"}'''
627
- if isolate not in [None, "UNKNOWN_ISOLATE"]:
628
- label, explain = flag_ancient_modern(accession,textsToExtract,isolate)
629
- else:
630
- label, explain = flag_ancient_modern(accession,textsToExtract)
631
- # Step 4: prediction
632
- outputs[accession] = {}
633
- outputs[isolate] = {}
634
- # 4.0 Infer from NCBI
635
- location, outputNCBI = infer_location_fromNCBI(accession)
636
- NCBI_result = {
637
- "source": "NCBI",
638
- "sample_id": accession,
639
- "predicted_location": location,
640
- "context_snippet": outputNCBI}
641
- outputs[accession]["NCBI"]= {"NCBI": NCBI_result}
642
- if textsToExtract:
643
- long_text = ""
644
- for key in textsToExtract:
645
- text = textsToExtract[key]
646
- # try accession number first
647
- outputs[accession][key] = {}
648
- keyword = accession
649
- context = extract_context(text, keyword, window=500)
650
- # 4.1: Using a HuggingFace model (question-answering)
651
- location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
652
- qa_result = {
653
- "source": key,
654
- "sample_id": keyword,
655
- "predicted_location": location,
656
- "context_snippet": context
657
- }
658
- outputs[keyword][key]["QAModel"] = qa_result
659
- # 4.2: Infer from haplogroup
660
- haplo_result = classify_mtDNA_sample_from_haplo(context)
661
- outputs[keyword][key]["haplogroup"] = haplo_result
662
- # try isolate
663
- keyword = isolate
664
- outputs[isolate][key] = {}
665
- context = extract_context(text, keyword, window=500)
666
- # 4.1.1: Using a HuggingFace model (question-answering)
667
- location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
668
- qa_result = {
669
- "source": key,
670
- "sample_id": keyword,
671
- "predicted_location": location,
672
- "context_snippet": context
673
- }
674
- outputs[keyword][key]["QAModel"] = qa_result
675
- # 4.2.1: Infer from haplogroup
676
- haplo_result = classify_mtDNA_sample_from_haplo(context)
677
- outputs[keyword][key]["haplogroup"] = haplo_result
678
- # add long text
679
- long_text += text + ". \n"
680
- # 4.3: UpgradeClassify
681
- # try sample_id as accession number
682
- sample_id = accession
683
- if sample_id:
684
- filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
685
- locations = infer_location_for_sample(sample_id.upper(), filtered_context)
686
- if locations!="No clear location found in top matches":
687
- outputs[sample_id]["upgradeClassifier"] = {}
688
- outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
689
- "source": "From these sources combined: "+ ", ".join(list(textsToExtract.keys())),
690
- "sample_id": sample_id,
691
- "predicted_location": ", ".join(locations),
692
- "context_snippep": "First 1000 words: \n"+ filtered_context[:1000]
693
- }
694
- # try sample_id as isolate name
695
- sample_id = isolate
696
- if sample_id:
697
- filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
698
- locations = infer_location_for_sample(sample_id.upper(), filtered_context)
699
- if locations!="No clear location found in top matches":
700
- outputs[sample_id]["upgradeClassifier"] = {}
701
- outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
702
- "source": "From these sources combined: "+ ", ".join(list(textsToExtract.keys())),
703
- "sample_id": sample_id,
704
- "predicted_location": ", ".join(locations),
705
- "context_snippep": "First 1000 words: \n"+ filtered_context[:1000]
706
- }
707
  return outputs, label, explain
 
1
+ # mtDNA Location Classifier MVP (Google Colab)
2
+ # Accepts accession number → Fetches PubMed ID + isolate name → Gets abstract → Predicts location
3
+ import os
4
+ #import streamlit as st
5
+ import subprocess
6
+ import re
7
+ from Bio import Entrez
8
+ import fitz
9
+ import spacy
10
+ from spacy.cli import download
11
+ from NER.PDF import pdf
12
+ from NER.WordDoc import wordDoc
13
+ from NER.html import extractHTML
14
+ from NER.word2Vec import word2vec
15
+ from transformers import pipeline
16
+ import urllib.parse, requests
17
+ from pathlib import Path
18
+ from upgradeClassify import filter_context_for_sample, infer_location_for_sample
19
+
20
+ # Set your email (required by NCBI Entrez)
21
+ #Entrez.email = "[email protected]"
22
+ import nltk
23
+
24
+ nltk.download("stopwords")
25
+ nltk.download("punkt")
26
+ nltk.download('punkt_tab')
27
+ # Step 1: Get PubMed ID from Accession using EDirect
28
+ from Bio import Entrez, Medline
29
+ import re
30
+
31
+ Entrez.email = "[email protected]"
32
+
33
+ # --- Helper Functions (Re-organized and Upgraded) ---
34
+
35
+ def fetch_ncbi_metadata(accession_number):
36
+ """
37
+ Fetches metadata directly from NCBI GenBank using Entrez.
38
+ Includes robust error handling and improved field extraction.
39
+ Prioritizes location extraction from geo_loc_name, then notes, then other qualifiers.
40
+ Also attempts to extract ethnicity and sample_type (ancient/modern).
41
+
42
+ Args:
43
+ accession_number (str): The NCBI accession number (e.g., "ON792208").
44
+
45
+ Returns:
46
+ dict: A dictionary containing 'country', 'specific_location', 'ethnicity',
47
+ 'sample_type', 'collection_date', 'isolate', 'title', 'doi', 'pubmed_id'.
48
+ """
49
+ Entrez.email = "[email protected]" # Required by NCBI, REPLACE WITH YOUR EMAIL
50
+
51
+ country = "unknown"
52
+ specific_location = "unknown"
53
+ ethnicity = "unknown"
54
+ sample_type = "unknown"
55
+ collection_date = "unknown"
56
+ isolate = "unknown"
57
+ title = "unknown"
58
+ doi = "unknown"
59
+ pubmed_id = None
60
+ all_feature = "unknown"
61
+
62
+ KNOWN_COUNTRIES = [
63
+ "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda", "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan",
64
+ "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burkina Faso", "Burundi",
65
+ "Cabo Verde", "Cambodia", "Cameroon", "Canada", "Central African Republic", "Chad", "Chile", "China", "Colombia", "Comoros", "Congo (Brazzaville)", "Congo (Kinshasa)", "Costa Rica", "Croatia", "Cuba", "Cyprus", "Czechia",
66
+ "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Eswatini", "Ethiopia",
67
+ "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana",
68
+ "Haiti", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Ivory Coast", "Jamaica", "Japan", "Jordan",
69
+ "Kazakhstan", "Kenya", "Kiribati", "Kosovo", "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania", "Luxembourg",
70
+ "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico", "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar",
71
+ "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "North Korea", "North Macedonia", "Norway", "Oman",
72
+ "Pakistan", "Palau", "Palestine", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines", "Poland", "Portugal", "Qatar", "Romania", "Russia", "Rwanda",
73
+ "Saint Kitts and Nevis", "Saint Lucia", "Saint Vincent and the Grenadines", "Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa", "South Korea", "South Sudan", "Spain", "Sri Lanka", "Sudan", "Suriname", "Sweden", "Switzerland", "Syria",
74
+ "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Timor-Leste", "Togo", "Tonga", "Trinidad and Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu",
75
+ "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City", "Venezuela", "Vietnam",
76
+ "Yemen", "Zambia", "Zimbabwe"
77
+ ]
78
+ COUNTRY_PATTERN = re.compile(r'\b(' + '|'.join(re.escape(c) for c in KNOWN_COUNTRIES) + r')\b', re.IGNORECASE)
79
+
80
+ try:
81
+ handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
82
+ record = Entrez.read(handle)
83
+ handle.close()
84
+
85
+ gb_seq = None
86
+ # Validate record structure: It should be a list with at least one element (a dict)
87
+ if isinstance(record, list) and len(record) > 0:
88
+ if isinstance(record[0], dict):
89
+ gb_seq = record[0]
90
+ else:
91
+ print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
92
+ else:
93
+ print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
94
+
95
+ # If gb_seq is still None, return defaults
96
+ if gb_seq is None:
97
+ return {"country": "unknown",
98
+ "specific_location": "unknown",
99
+ "ethnicity": "unknown",
100
+ "sample_type": "unknown",
101
+ "collection_date": "unknown",
102
+ "isolate": "unknown",
103
+ "title": "unknown",
104
+ "doi": "unknown",
105
+ "pubmed_id": None,
106
+ "all_features": "unknown"}
107
+
108
+
109
+ # If gb_seq is valid, proceed with extraction
110
+ collection_date = gb_seq.get("GBSeq_create-date","unknown")
111
+
112
+ references = gb_seq.get("GBSeq_references", [])
113
+ for ref in references:
114
+ if not pubmed_id:
115
+ pubmed_id = ref.get("GBReference_pubmed",None)
116
+ if title == "unknown":
117
+ title = ref.get("GBReference_title","unknown")
118
+ for xref in ref.get("GBReference_xref", []):
119
+ if xref.get("GBXref_dbname") == "doi":
120
+ doi = xref.get("GBXref_id")
121
+ break
122
+
123
+ features = gb_seq.get("GBSeq_feature-table", [])
124
+
125
+ context_for_flagging = "" # Accumulate text for ancient/modern detection
126
+ features_context = ""
127
+ for feature in features:
128
+ if feature.get("GBFeature_key") == "source":
129
+ feature_context = ""
130
+ qualifiers = feature.get("GBFeature_quals", [])
131
+ found_country = "unknown"
132
+ found_specific_location = "unknown"
133
+ found_ethnicity = "unknown"
134
+
135
+ temp_geo_loc_name = "unknown"
136
+ temp_note_origin_locality = "unknown"
137
+ temp_country_qual = "unknown"
138
+ temp_locality_qual = "unknown"
139
+ temp_collection_location_qual = "unknown"
140
+ temp_isolation_source_qual = "unknown"
141
+ temp_env_sample_qual = "unknown"
142
+ temp_pop_qual = "unknown"
143
+ temp_organism_qual = "unknown"
144
+ temp_specimen_qual = "unknown"
145
+ temp_strain_qual = "unknown"
146
+
147
+ for qual in qualifiers:
148
+ qual_name = qual.get("GBQualifier_name")
149
+ qual_value = qual.get("GBQualifier_value")
150
+ feature_context += qual_name + ": " + qual_value +"\n"
151
+ if qual_name == "collection_date":
152
+ collection_date = qual_value
153
+ elif qual_name == "isolate":
154
+ isolate = qual_value
155
+ elif qual_name == "population":
156
+ temp_pop_qual = qual_value
157
+ elif qual_name == "organism":
158
+ temp_organism_qual = qual_value
159
+ elif qual_name == "specimen_voucher" or qual_name == "specimen":
160
+ temp_specimen_qual = qual_value
161
+ elif qual_name == "strain":
162
+ temp_strain_qual = qual_value
163
+ elif qual_name == "isolation_source":
164
+ temp_isolation_source_qual = qual_value
165
+ elif qual_name == "environmental_sample":
166
+ temp_env_sample_qual = qual_value
167
+
168
+ if qual_name == "geo_loc_name": temp_geo_loc_name = qual_value
169
+ elif qual_name == "note":
170
+ if qual_value.startswith("origin_locality:"):
171
+ temp_note_origin_locality = qual_value
172
+ context_for_flagging += qual_value + " " # Capture all notes for flagging
173
+ elif qual_name == "country": temp_country_qual = qual_value
174
+ elif qual_name == "locality": temp_locality_qual = qual_value
175
+ elif qual_name == "collection_location": temp_collection_location_qual = qual_value
176
+
177
+
178
+ # --- Aggregate all relevant info into context_for_flagging ---
179
+ context_for_flagging += f" {isolate} {temp_isolation_source_qual} {temp_specimen_qual} {temp_strain_qual} {temp_organism_qual} {temp_geo_loc_name} {temp_collection_location_qual} {temp_env_sample_qual}"
180
+ context_for_flagging = context_for_flagging.strip()
181
+
182
+ # --- Determine final country and specific_location based on priority ---
183
+ if temp_geo_loc_name != "unknown":
184
+ parts = [p.strip() for p in temp_geo_loc_name.split(':')]
185
+ if len(parts) > 1:
186
+ found_specific_location = parts[-1]; found_country = parts[0]
187
+ else: found_country = temp_geo_loc_name; found_specific_location = "unknown"
188
+ elif temp_note_origin_locality != "unknown":
189
+ match = re.search(r"origin_locality:\s*(.*)", temp_note_origin_locality, re.IGNORECASE)
190
+ if match:
191
+ location_string = match.group(1).strip()
192
+ parts = [p.strip() for p in location_string.split(':')]
193
+ if len(parts) > 1: found_country = parts[-1]; found_specific_location = parts[0]
194
+ else: found_country = location_string; found_specific_location = "unknown"
195
+ elif temp_locality_qual != "unknown":
196
+ found_country_match = COUNTRY_PATTERN.search(temp_locality_qual)
197
+ if found_country_match: found_country = found_country_match.group(1); temp_loc = re.sub(re.escape(found_country), '', temp_locality_qual, flags=re.IGNORECASE).strip().replace(',', '').replace(':', '').replace(';', '').strip(); found_specific_location = temp_loc if temp_loc else "unknown"
198
+ else: found_specific_location = temp_locality_qual; found_country = "unknown"
199
+ elif temp_collection_location_qual != "unknown":
200
+ found_country_match = COUNTRY_PATTERN.search(temp_collection_location_qual)
201
+ if found_country_match: found_country = found_country_match.group(1); temp_loc = re.sub(re.escape(found_country), '', temp_collection_location_qual, flags=re.IGNORECASE).strip().replace(',', '').replace(':', '').replace(';', '').strip(); found_specific_location = temp_loc if temp_loc else "unknown"
202
+ else: found_specific_location = temp_collection_location_qual; found_country = "unknown"
203
+ elif temp_isolation_source_qual != "unknown":
204
+ found_country_match = COUNTRY_PATTERN.search(temp_isolation_source_qual)
205
+ if found_country_match: found_country = found_country_match.group(1); temp_loc = re.sub(re.escape(found_country), '', temp_isolation_source_qual, flags=re.IGNORECASE).strip().replace(',', '').replace(':', '').replace(';', '').strip(); found_specific_location = temp_loc if temp_loc else "unknown"
206
+ else: found_specific_location = temp_isolation_source_qual; found_country = "unknown"
207
+ elif temp_env_sample_qual != "unknown":
208
+ found_country_match = COUNTRY_PATTERN.search(temp_env_sample_qual)
209
+ if found_country_match: found_country = found_country_match.group(1); temp_loc = re.sub(re.escape(found_country), '', temp_env_sample_qual, flags=re.IGNORECASE).strip().replace(',', '').replace(':', '').replace(';', '').strip(); found_specific_location = temp_loc if temp_loc else "unknown"
210
+ else: found_specific_location = temp_env_sample_qual; found_country = "unknown"
211
+ if found_country == "unknown" and temp_country_qual != "unknown":
212
+ found_country_match = COUNTRY_PATTERN.search(temp_country_qual)
213
+ if found_country_match: found_country = found_country_match.group(1)
214
+
215
+ country = found_country
216
+ specific_location = found_specific_location
217
+ # --- Determine final ethnicity ---
218
+ if temp_pop_qual != "unknown":
219
+ found_ethnicity = temp_pop_qual
220
+ elif isolate != "unknown" and re.fullmatch(r'[A-Za-z\s\-]+', isolate) and get_country_from_text(isolate) == "unknown":
221
+ found_ethnicity = isolate
222
+ elif context_for_flagging != "unknown": # Use the broader context for ethnicity patterns
223
+ eth_match = re.search(r'(?:population|ethnicity|isolate source):\s*([A-Za-z\s\-]+)', context_for_flagging, re.IGNORECASE)
224
+ if eth_match:
225
+ found_ethnicity = eth_match.group(1).strip()
226
+
227
+ ethnicity = found_ethnicity
228
+
229
+ # --- Determine sample_type (ancient/modern) ---
230
+ if context_for_flagging:
231
+ sample_type, explain = detect_ancient_flag(context_for_flagging)
232
+ features_context += feature_context + "\n"
233
+ break
234
+
235
+ if specific_location != "unknown" and specific_location.lower() == country.lower():
236
+ specific_location = "unknown"
237
+ if not features_context: features_context = "unknown"
238
+ return {"country": country.lower(),
239
+ "specific_location": specific_location.lower(),
240
+ "ethnicity": ethnicity.lower(),
241
+ "sample_type": sample_type.lower(),
242
+ "collection_date": collection_date,
243
+ "isolate": isolate,
244
+ "title": title,
245
+ "doi": doi,
246
+ "pubmed_id": pubmed_id,
247
+ "all_features": features_context}
248
+
249
+ except Exception as e:
250
+ print(f"Error fetching NCBI data for {accession_number}: {e}")
251
+ return {"country": "unknown",
252
+ "specific_location": "unknown",
253
+ "ethnicity": "unknown",
254
+ "sample_type": "unknown",
255
+ "collection_date": "unknown",
256
+ "isolate": "unknown",
257
+ "title": "unknown",
258
+ "doi": "unknown",
259
+ "pubmed_id": None,
260
+ "all_features": "unknown"}
261
+
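+ # Usage sketch for fetch_ncbi_metadata (illustrative; assumes NCBI is reachable and
+ # Entrez.email is set to a real address; "ON792208" is the docstring's example accession):
+ #   meta = fetch_ncbi_metadata("ON792208")
+ #   print(meta["country"], meta["sample_type"], meta["pubmed_id"])
+ #   # -> a lower-cased country (or "unknown"), an ancient/modern flag, and a PubMed ID or None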
262
+ # --- Helper function for country matching (re-defined from main code to be self-contained) ---
263
+ _country_keywords = {
264
+ "thailand": "Thailand", "laos": "Laos", "cambodia": "Cambodia", "myanmar": "Myanmar",
265
+ "philippines": "Philippines", "indonesia": "Indonesia", "malaysia": "Malaysia",
266
+ "china": "China", "chinese": "China", "india": "India", "taiwan": "Taiwan",
267
+ "vietnam": "Vietnam", "russia": "Russia", "siberia": "Russia", "nepal": "Nepal",
268
+ "japan": "Japan", "sumatra": "Indonesia", "borneu": "Indonesia",
269
+ "yunnan": "China", "tibet": "China", "northern mindanao": "Philippines",
270
+ "west malaysia": "Malaysia", "north thailand": "Thailand", "central thailand": "Thailand",
271
+ "northeast thailand": "Thailand", "east myanmar": "Myanmar", "west thailand": "Thailand",
272
+ "central india": "India", "east india": "India", "northeast india": "India",
273
+ "south sibera": "Russia", "mongolia": "China", "beijing": "China", "south korea": "South Korea",
274
+ "north asia": "unknown", "southeast asia": "unknown", "east asia": "unknown"
275
+ }
276
+
277
+ def get_country_from_text(text):
278
+ text_lower = text.lower()
279
+ for keyword, country in _country_keywords.items():
280
+ if keyword in text_lower:
281
+ return country
282
+ return "unknown"
283
+ # The result will be seen as manualLink for the function get_paper_text
284
+ def search_google_custom(query, max_results=3):
285
+ # query should be the title from ncbi or paper/source title
286
+ GOOGLE_CSE_API_KEY = os.environ["GOOGLE_CSE_API_KEY"]
287
+ GOOGLE_CSE_CX = os.environ["GOOGLE_CSE_CX"]
288
+ endpoint = os.environ["SEARCH_ENDPOINT"]
289
+ params = {
290
+ "key": GOOGLE_CSE_API_KEY,
291
+ "cx": GOOGLE_CSE_CX,
292
+ "q": query,
293
+ "num": max_results
294
+ }
295
+ try:
296
+ response = requests.get(endpoint, params=params)
297
+ if response.status_code == 429:
298
+ print("Rate limit hit. Try again later.")
299
+ return []
300
+ response.raise_for_status()
301
+ data = response.json().get("items", [])
302
+ return [item.get("link") for item in data if item.get("link")]
303
+ except Exception as e:
304
+ print("Google CSE error:", e)
305
+ return []
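+ # Usage sketch (requires GOOGLE_CSE_API_KEY, GOOGLE_CSE_CX and SEARCH_ENDPOINT in the
+ # environment; the query below is just an example paper title):
+ #   links = search_google_custom("South Asian maternal and paternal lineages in southern Thailand", max_results=3)
+ #   # -> a list of up to 3 result URLs, or [] on error / rate limit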
306
+ # Step 3: Extract text: get the paper (HTML text) and supplementary materials (PDF, DOC, Excel), then do text preprocessing
307
+ # Step 3.1: Extract Text
308
+ # sub: download excel file
309
+ def download_excel_file(url, save_path="temp.xlsx"):
310
+ if "view.officeapps.live.com" in url:
311
+ parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
312
+ real_url = urllib.parse.unquote(parsed_url["src"][0])
313
+ response = requests.get(real_url)
314
+ with open(save_path, "wb") as f:
315
+ f.write(response.content)
316
+ return save_path
317
+ elif url.startswith("http") and (url.endswith(".xls") or url.endswith(".xlsx")):
318
+ response = requests.get(url)
319
+ response.raise_for_status() # Raises error if download fails
320
+ with open(save_path, "wb") as f:
321
+ f.write(response.content)
322
+ return save_path
323
+ else:
324
+ print("URL must point directly to an .xls or .xlsx file\n or it already downloaded.")
325
+ return url
326
+ def get_paper_text(doi,id,manualLinks=None):
327
+ # create the temporary folder to contain the texts
328
+ folder_path = Path("data/"+str(id))
329
+ if not folder_path.exists():
330
+ cmd = f'mkdir data/{id}'
331
+ result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
332
+ print("data/"+str(id) +" created.")
333
+ else:
334
+ print("data/"+str(id) +" already exists.")
335
+ saveLinkFolder = "data/"+id
336
+
337
+ link = 'https://doi.org/' + doi
338
+ '''textsToExtract = { "doiLink":"paperText"
339
+ "file1.pdf":"text1",
340
+ "file2.doc":"text2",
341
+ "file3.xlsx":excelText3'''
342
+ textsToExtract = {}
343
+ # get the file to create listOfFile for each id
344
+ html = extractHTML.HTML("",link)
345
+ jsonSM = html.getSupMaterial()
346
+ text = ""
347
+ links = [link] + sum((jsonSM[key] for key in jsonSM),[])
348
+ if manualLinks != None:
349
+ links += manualLinks
350
+ for l in links:
351
+ # get the main paper
352
+ name = l.split("/")[-1]
353
+ file_path = folder_path / name
354
+ if l == link:
355
+ text = html.getListSection()
356
+ textsToExtract[link] = text
357
+ elif l.endswith(".pdf"):
358
+ if file_path.is_file():
359
+ l = saveLinkFolder + "/" + name
360
+ print("File exists.")
361
+ p = pdf.PDF(l,saveLinkFolder,doi)
362
+ f = p.openPDFFile()
363
+ pdf_path = saveLinkFolder + "/" + l.split("/")[-1]
364
+ doc = fitz.open(pdf_path)
365
+ text = "\n".join([page.get_text() for page in doc])
366
+ textsToExtract[l] = text
367
+ elif l.endswith(".doc") or l.endswith(".docx"):
368
+ d = wordDoc.wordDoc(l,saveLinkFolder)
369
+ text = d.extractTextByPage()
370
+ textsToExtract[l] = text
371
+ elif l.split(".")[-1].lower() in "xlsx":
372
+ wc = word2vec.word2Vec()
373
+ # download excel file if it not downloaded yet
374
+ savePath = saveLinkFolder +"/"+ l.split("/")[-1]
375
+ excelPath = download_excel_file(l, savePath)
376
+ corpus = wc.tableTransformToCorpusText([],excelPath)
377
+ text = ''
378
+ for c in corpus:
379
+ para = corpus[c]
380
+ for words in para:
381
+ text += " ".join(words)
382
+ textsToExtract[l] = text
383
+ # delete folder after finishing getting text
384
+ #cmd = f'rm -r data/{id}'
385
+ #result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
386
+ return textsToExtract
387
+ # Step 3.2: Extract context
388
+ def extract_context(text, keyword, window=500):
389
+ # firstly try accession number
390
+ idx = text.find(keyword)
391
+ if idx == -1:
392
+ return "Sample ID not found."
393
+ return text[max(0, idx-window): idx+window]
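+ # Example: returns a character window around the first occurrence of the keyword, e.g.
+ #   extract_context("... isolate MJ17 was sampled in northern Thailand ...", "MJ17", window=30)
+ #   # -> up to 30 characters on each side of "MJ17"; "Sample ID not found." if the keyword is absent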
394
+ def extract_relevant_paragraphs(text, accession, keep_if=None, isolate=None):
395
+ if keep_if is None:
396
+ keep_if = ["sample", "method", "mtdna", "sequence", "collected", "dataset", "supplementary", "table"]
397
+
398
+ outputs = ""
399
+ text = text.lower()
400
+
401
+ # If the accession is mentioned, prioritize the paragraph around it
402
+ # Likewise, prioritize the paragraph around the isolate name (if provided)
403
+ if accession and accession.lower() in text:
404
+ if extract_context(text, accession.lower(), window=700) != "Sample ID not found.":
405
+ outputs += extract_context(text, accession.lower(), window=700)
406
+ if isolate and isolate.lower() in text:
407
+ if extract_context(text, isolate.lower(), window=700) != "Sample ID not found.":
408
+ outputs += extract_context(text, isolate.lower(), window=700)
409
+ for keyword in keep_if:
410
+ para = extract_context(text, keyword)
411
+ if para and para not in outputs:
412
+ outputs += para + "\n"
413
+ return outputs
414
+ # Step 4: Classification for now (demo purposes)
415
+ # 4.1: Using a HuggingFace model (question-answering)
416
+ def infer_fromQAModel(context, question="Where is the mtDNA sample from?"):
417
+ try:
418
+ qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
419
+ result = qa({"context": context, "question": question})
420
+ return result.get("answer", "Unknown")
421
+ except Exception as e:
422
+ return f"Error: {str(e)}"
423
+
424
+ # 4.2: Infer from haplogroup
425
+ # Load pre-trained spaCy model for NER
426
+ try:
427
+ nlp = spacy.load("en_core_web_sm")
428
+ except OSError:
429
+ download("en_core_web_sm")
430
+ nlp = spacy.load("en_core_web_sm")
431
+
432
+ # Define the haplogroup-to-region mapping (simple rule-based)
433
+ import csv
434
+
435
+ def load_haplogroup_mapping(csv_path):
436
+ mapping = {}
437
+ with open(csv_path) as f:
438
+ reader = csv.DictReader(f)
439
+ for row in reader:
440
+ mapping[row["haplogroup"]] = [row["region"],row["source"]]
441
+ return mapping
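+ # The CSV is expected to have at least the columns haplogroup, region and source,
+ # e.g. a row like "B4,East Asia,SomeReference" (values here are only illustrative):
+ #   haplo_map = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
+ #   haplo_map.get("B4", ["Unknown", "Unknown"])  # -> [region, source]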
442
+
443
+ # Function to extract haplogroup from the text
444
+ def extract_haplogroup(text):
445
+ match = re.search(r'\bhaplogroup\s+([A-Z][0-9a-z]*)\b', text)
446
+ if match:
447
+ submatch = re.match(r'^[A-Z][0-9]*', match.group(1))
448
+ if submatch:
449
+ return submatch.group(0)
450
+ else:
451
+ return match.group(1) # fallback
452
+ fallback = re.search(r'\b([A-Z][0-9a-z]{1,5})\b', text)
453
+ if fallback:
454
+ return fallback.group(1)
455
+ return None
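+ # Example: the inner match trims sub-clade letters down to the major clade, e.g.
+ #   extract_haplogroup("the sample belongs to haplogroup B4a1a")  # -> "B4"
+ #   extract_haplogroup("text with no haplogroup mention")         # -> None (or a loose fallback token)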
456
+
457
+
458
+ # Function to extract location based on NER
459
+ def extract_location(text):
460
+ doc = nlp(text)
461
+ locations = []
462
+ for ent in doc.ents:
463
+ if ent.label_ == "GPE": # GPE = Geopolitical Entity (location)
464
+ locations.append(ent.text)
465
+ return locations
466
+
467
+ # Function to infer location from haplogroup
468
+ def infer_location_from_haplogroup(haplogroup):
469
+ haplo_map = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
470
+ return haplo_map.get(haplogroup, ["Unknown","Unknown"])
471
+
472
+ # Function to classify the mtDNA sample
473
+ def classify_mtDNA_sample_from_haplo(text):
474
+ # Extract haplogroup
475
+ haplogroup = extract_haplogroup(text)
476
+ # Extract location based on NER
477
+ locations = extract_location(text)
478
+ # Infer location based on haplogroup
479
+ inferred_location, sourceHaplo = infer_location_from_haplogroup(haplogroup)[0],infer_location_from_haplogroup(haplogroup)[1]
480
+ return {
481
+ "source":sourceHaplo,
482
+ "locations_found_in_context": locations,
483
+ "haplogroup": haplogroup,
484
+ "inferred_location": inferred_location
485
+
486
+ }
487
+ # 4.3 Get from available NCBI
488
+ def infer_location_fromNCBI(accession):
489
+ try:
490
+ handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
491
+ text = handle.read()
492
+ handle.close()
493
+ match = re.search(r'/(geo_loc_name|country|location)\s*=\s*"([^"]+)"', text)
494
+ if match:
495
+ return match.group(2), match.group(0) # This is the value like "Brunei"
496
+ return "Not found", "Not found"
497
+
498
+ except Exception as e:
499
+ print("❌ Entrez error:", e)
500
+ return "Not found", "Not found"
501
+
502
+ ### ANCIENT/MODERN FLAG
503
+ from Bio import Entrez
504
+ import re
505
+
506
+ def flag_ancient_modern(accession, textsToExtract, isolate=None):
507
+ """
508
+ Try to classify a sample as Ancient or Modern using:
509
+ 1. NCBI accession (if available)
510
+ 2. Supplementary text or context fallback
511
+ """
512
+ context = ""
513
+ label, explain = "", ""
514
+
515
+ try:
516
+ # Check if we can fetch metadata from NCBI using the accession
517
+ handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
518
+ text = handle.read()
519
+ handle.close()
520
+
521
+ isolate_source = re.search(r'/(isolation_source)\s*=\s*"([^"]+)"', text)
522
+ if isolate_source:
523
+ context += isolate_source.group(0) + " "
524
+
525
+ specimen = re.search(r'/(specimen|specimen_voucher)\s*=\s*"([^"]+)"', text)
526
+ if specimen:
527
+ context += specimen.group(0) + " "
528
+
529
+ if context.strip():
530
+ label, explain = detect_ancient_flag(context)
531
+ if label!="Unknown":
532
+ return label, explain + " from NCBI\n(" + context + ")"
533
+
534
+ # If no useful NCBI metadata, check supplementary texts
535
+ if textsToExtract:
536
+ labels = {"modern": [0, ""], "ancient": [0, ""], "unknown": 0}
537
+
538
+ for source in textsToExtract:
539
+ text_block = textsToExtract[source]
540
+ context = extract_relevant_paragraphs(text_block, accession, isolate=isolate) # Reduce to informative paragraph(s)
541
+ label, explain = detect_ancient_flag(context)
542
+
543
+ if label == "Ancient":
544
+ labels["ancient"][0] += 1
545
+ labels["ancient"][1] += f"{source}:\n{explain}\n\n"
546
+ elif label == "Modern":
547
+ labels["modern"][0] += 1
548
+ labels["modern"][1] += f"{source}:\n{explain}\n\n"
549
+ else:
550
+ labels["unknown"] += 1
551
+
552
+ if max(labels["modern"][0],labels["ancient"][0]) > 0:
553
+ if labels["modern"][0] > labels["ancient"][0]:
554
+ return "Modern", labels["modern"][1]
555
+ else:
556
+ return "Ancient", labels["ancient"][1]
557
+ else:
558
+ return "Unknown", "No strong keywords detected"
559
+ else:
560
+ print("No DOI or PubMed ID available for inference.")
561
+ return "", ""
562
+
563
+ except Exception as e:
564
+ print("Error:", e)
565
+ return "", ""
566
+
567
+
568
+ def detect_ancient_flag(context_snippet):
569
+ context = context_snippet.lower()
570
+
571
+ ancient_keywords = [
572
+ "ancient", "archaeological", "prehistoric", "neolithic", "mesolithic", "paleolithic",
573
+ "bronze age", "iron age", "burial", "tomb", "skeleton", "14c", "radiocarbon", "carbon dating",
574
+ "postmortem damage", "udg treatment", "adna", "degradation", "site", "excavation",
575
+ "archaeological context", "temporal transect", "population replacement", "cal bp", "calbp", "carbon dated"
576
+ ]
577
+
578
+ modern_keywords = [
579
+ "modern", "hospital", "clinical", "consent","blood","buccal","unrelated", "blood sample","buccal sample","informed consent", "donor", "healthy", "patient",
580
+ "genotyping", "screening", "medical", "cohort", "sequencing facility", "ethics approval",
581
+ "we analysed", "we analyzed", "dataset includes", "new sequences", "published data",
582
+ "control cohort", "sink population", "genbank accession", "sequenced", "pipeline",
583
+ "bioinformatic analysis", "samples from", "population genetics", "genome-wide data", "imr collection"
584
+ ]
585
+
586
+ ancient_hits = [k for k in ancient_keywords if k in context]
587
+ modern_hits = [k for k in modern_keywords if k in context]
588
+
589
+ if ancient_hits and not modern_hits:
590
+ return "Ancient", f"Flagged as ancient due to keywords: {', '.join(ancient_hits)}"
591
+ elif modern_hits and not ancient_hits:
592
+ return "Modern", f"Flagged as modern due to keywords: {', '.join(modern_hits)}"
593
+ elif ancient_hits and modern_hits:
594
+ if len(ancient_hits) >= len(modern_hits):
595
+ return "Ancient", f"Mixed context, leaning ancient due to: {', '.join(ancient_hits)}"
596
+ else:
597
+ return "Modern", f"Mixed context, leaning modern due to: {', '.join(modern_hits)}"
598
+
599
+ # Fallback to QA
600
+ answer = infer_fromQAModel(context, question="Are the mtDNA samples ancient or modern? Explain why.")
601
+ if answer.startswith("Error"):
602
+ return "Unknown", answer
603
+ if "ancient" in answer.lower():
604
+ return "Ancient", f"Leaning ancient based on QA: {answer}"
605
+ elif "modern" in answer.lower():
606
+ return "Modern", f"Leaning modern based on QA: {answer}"
607
+ else:
608
+ return "Unknown", f"No strong keywords or QA clues. QA said: {answer}"
609
+
610
+ # STEP 5: Main pipeline: accession -> 1. get pubmed id and isolate -> 2. get doi -> 3. get text -> 4. prediction -> 5. output: inferred location + explanation + confidence score
611
+ def classify_sample_location(accession):
612
+ outputs = {}
613
+ keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
614
+ # Step 1: get pubmed id and isolate
615
+ pubmedID, isolate = get_info_from_accession(accession)
616
+ '''if not pubmedID:
617
+ return {"error": f"Could not retrieve PubMed ID for accession {accession}"}'''
618
+ if not isolate:
619
+ isolate = "UNKNOWN_ISOLATE"
620
+ # Step 2: get doi
621
+ doi = get_doi_from_pubmed_id(pubmedID)
622
+ '''if not doi:
623
+ return {"error": "DOI not found for this accession. Cannot fetch paper or context."}'''
624
+ # Step 3: get text
625
+ '''textsToExtract = { "doiLink":"paperText"
626
+ "file1.pdf":"text1",
627
+ "file2.doc":"text2",
628
+ "file3.xlsx":excelText3'''
629
+ if doi and pubmedID:
630
+ textsToExtract = get_paper_text(doi,pubmedID)
631
+ else: textsToExtract = {}
632
+ '''if not textsToExtract:
633
+ return {"error": f"No texts extracted for DOI {doi}"}'''
634
+ if isolate not in [None, "UNKNOWN_ISOLATE"]:
635
+ label, explain = flag_ancient_modern(accession,textsToExtract,isolate)
636
+ else:
637
+ label, explain = flag_ancient_modern(accession,textsToExtract)
638
+ # Step 4: prediction
639
+ outputs[accession] = {}
640
+ outputs[isolate] = {}
641
+ # 4.0 Infer from NCBI
642
+ location, outputNCBI = infer_location_fromNCBI(accession)
643
+ NCBI_result = {
644
+ "source": "NCBI",
645
+ "sample_id": accession,
646
+ "predicted_location": location,
647
+ "context_snippet": outputNCBI}
648
+ outputs[accession]["NCBI"]= {"NCBI": NCBI_result}
649
+ if textsToExtract:
650
+ long_text = ""
651
+ for key in textsToExtract:
652
+ text = textsToExtract[key]
653
+ # try accession number first
654
+ outputs[accession][key] = {}
655
+ keyword = accession
656
+ context = extract_context(text, keyword, window=500)
657
+ # 4.1: Using a HuggingFace model (question-answering)
658
+ location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
659
+ qa_result = {
660
+ "source": key,
661
+ "sample_id": keyword,
662
+ "predicted_location": location,
663
+ "context_snippet": context
664
+ }
665
+ outputs[keyword][key]["QAModel"] = qa_result
666
+ # 4.2: Infer from haplogroup
667
+ haplo_result = classify_mtDNA_sample_from_haplo(context)
668
+ outputs[keyword][key]["haplogroup"] = haplo_result
669
+ # try isolate
670
+ keyword = isolate
671
+ outputs[isolate][key] = {}
672
+ context = extract_context(text, keyword, window=500)
673
+ # 4.1.1: Using a HuggingFace model (question-answering)
674
+ location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
675
+ qa_result = {
676
+ "source": key,
677
+ "sample_id": keyword,
678
+ "predicted_location": location,
679
+ "context_snippet": context
680
+ }
681
+ outputs[keyword][key]["QAModel"] = qa_result
682
+ # 4.2.1: Infer from haplogroup
683
+ haplo_result = classify_mtDNA_sample_from_haplo(context)
684
+ outputs[keyword][key]["haplogroup"] = haplo_result
685
+ # add long text
686
+ long_text += text + ". \n"
687
+ # 4.3: UpgradeClassify
688
+ # try sample_id as accession number
689
+ sample_id = accession
690
+ if sample_id:
691
+ filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
692
+ locations = infer_location_for_sample(sample_id.upper(), filtered_context)
693
+ if locations!="No clear location found in top matches":
694
+ outputs[sample_id]["upgradeClassifier"] = {}
695
+ outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
696
+ "source": "From these sources combined: "+ ", ".join(list(textsToExtract.keys())),
697
+ "sample_id": sample_id,
698
+ "predicted_location": ", ".join(locations),
699
+ "context_snippep": "First 1000 words: \n"+ filtered_context[:1000]
700
+ }
701
+ # try sample_id as isolate name
702
+ sample_id = isolate
703
+ if sample_id:
704
+ filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
705
+ locations = infer_location_for_sample(sample_id.upper(), filtered_context)
706
+ if locations!="No clear location found in top matches":
707
+ outputs[sample_id]["upgradeClassifier"] = {}
708
+ outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
709
+ "source": "From these sources combined: "+ ", ".join(list(textsToExtract.keys())),
710
+ "sample_id": sample_id,
711
+ "predicted_location": ", ".join(locations),
712
+ "context_snippep": "First 1000 words: \n"+ filtered_context[:1000]
713
+ }
714
  return outputs, label, explain
offer.html ADDED
@@ -0,0 +1,77 @@
1
+
2
+ <div style="font-family: sans-serif; line-height: 1.6;">
3
+ <h1>mtDNA Location Classifier</h1>
4
+
5
+ <h2>Publish faster. Analyze smarter.</h2>
6
+ <p>Are you dealing with <strong>incomplete mtDNA metadata</strong> (like country, ethnicity, sample type)?<br>
7
+ This tool helps researchers like you generate <strong>clean, structured labels</strong> — ready to use for your paper.</p>
8
+
9
+ <hr>
10
+
11
+ <h2> What You’ll Get:</h2>
12
+ <ul>
13
+ <li>Inference from sequence ID alone</li>
14
+ <li>Handles hard edge cases</li>
15
+ <li>Clear sample type, country, and more (ethnicity, phenotype, etc.)</li>
16
+ <li>Excel export with citations</li>
17
+ <li>Feedback-based refund policy</li>
18
+ </ul>
19
+
20
+ <hr>
21
+
22
+ <h2>Free Tier</h2>
23
+ <ul>
24
+ <li>30 free samples — no email needed</li>
25
+ <li>+20 bonus samples + Excel file when you enter your email</li>
26
+ <li>Don’t like the result? Tell us why on the report — we won’t count the bad ones (email required)</li>
27
+ </ul>
28
+
29
+ <hr>
30
+
31
+ <h2>Pricing — Pay As You Go (DIY)</h2>
32
+ <table border="1" cellpadding="6" cellspacing="0">
33
+ <thead>
34
+ <tr>
35
+ <th>Case Type</th>
36
+ <th>Price/Sample</th>
37
+ <th>Output</th>
38
+ </tr>
39
+ </thead>
40
+ <tbody>
41
+ <tr><td>Normal</td><td>$0.10</td><td>Sample Type + Country</td></tr>
42
+ <tr><td>Edge</td><td>$1.00</td><td>Sample Type + Country</td></tr>
43
+ <tr><td>Niche</td><td>$2.00</td><td>Sample Type + Country + 1 Custom Label</td></tr>
44
+ </tbody>
45
+ </table>
46
+
47
+ <hr>
48
+
49
+ <h2>Batch Discount (1000+ Samples)</h2>
50
+ <ul>
51
+ <li><strong>Normal Output</strong> → $100 total ($0.10/sample)<br>Unsatisfied samples? We’ll refund them.</li>
52
+ <li><strong>Niche Output</strong> → $500 total ($0.50/sample)<br>Includes an extra label like ethnicity or phenotype.</li>
53
+ </ul>
54
+
55
+ <hr>
56
+
57
+ <h2>Early User Bonus (Limited!)</h2>
58
+ <p>Are you one of our <strong>first 10 paying users</strong>?<br>
59
+ Just type <code>early_user</code> in your email.</p>
60
+ <p>You'll get <strong>20% lifetime discount</strong> on every plan — forever.<br>
61
+ We’ll apply this automatically so you don’t have to calculate anything.</p>
62
+
63
+ <hr>
64
+
65
+ <h2>Our Mission</h2>
66
+ <p>Give clean, high-quality, open-access biological datasets that save researchers time and improve scientific reproducibility.<br>
67
+ Build the world's clean, AI-driven open metadata source for biological research.</p>
68
+
69
+ <hr>
70
+
71
+ <h2>Try It Now</h2>
72
+ <p>Paste your sequence ID on our demo:<br>
73
+ <a href="https://huggingface.co/spaces/VyLala/mtDNALocation" target="_blank">Try the Classifier</a></p>
74
+ <p>Need help or bulk analysis?<br>
75
+ <a href="mailto:[email protected]" target="_blank">Contact Us</a></p>
76
+
77
+ </div>
pipeline.py CHANGED
@@ -1,649 +1,649 @@
1
- # test1: MJ17 direct
2
- # test2: "A1YU101" thailand cross-ref
3
- # test3: "EBK109" thailand cross-ref
4
- # test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and"
5
- import data_preprocess
6
- import model
7
- import mtdna_classifier
8
- #import app
9
- import smart_fallback
10
- import pandas as pd
11
- from pathlib import Path
12
- import subprocess
13
- from NER.html import extractHTML
14
- import os
15
- import google.generativeai as genai
16
- import re
17
- import standardize_location
18
- # Helper functions in for this pipeline
19
- # Track time
20
- import time
21
- import multiprocessing
22
- import gspread
23
- from googleapiclient.discovery import build
24
- from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
25
- from google.oauth2.service_account import Credentials
26
- from oauth2client.service_account import ServiceAccountCredentials
27
- import io
28
- import json
29
- #––– Authentication setup –––
30
- GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
31
- GDRIVE_DATA_FOLDER_NAME = os.environ["GDRIVE_DATA_FOLDER_NAME"]
32
- GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"]) # from HF secrets
33
- GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
34
- drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)
35
-
36
- def get_or_create_drive_folder(name, parent_id=None):
37
- query = f"name='{name}' and mimeType='application/vnd.google-apps.folder'"
38
- if parent_id:
39
- query += f" and '{parent_id}' in parents"
40
- results = drive_service.files().list(q=query, spaces='drive', fields="files(id, name)").execute()
41
- items = results.get("files", [])
42
- if items:
43
- return items[0]["id"]
44
- file_metadata = {
45
- "name": name,
46
- "mimeType": "application/vnd.google-apps.folder"
47
- }
48
- if parent_id:
49
- file_metadata["parents"] = [parent_id]
50
- file = drive_service.files().create(body=file_metadata, fields="id").execute()
51
- return file["id"]
52
- def find_drive_file(filename, parent_id):
53
- """
54
- Checks if a file with the given name exists inside the specified Google Drive folder.
55
- Returns the file ID if found, else None.
56
- """
57
- query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false"
58
- results = drive_service.files().list(q=query, spaces='drive', fields='files(id, name)', pageSize=1).execute()
59
- files = results.get('files', [])
60
- if files:
61
- return files[0]["id"]
62
- return None
63
-
64
-
65
- # def upload_file_to_drive(local_path, remote_name, folder_id):
66
- # file_metadata = {"name": remote_name, "parents": [folder_id]}
67
- # media = MediaFileUpload(local_path, resumable=True)
68
- # existing = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute().get("files", [])
69
- # if existing:
70
- # drive_service.files().delete(fileId=existing[0]["id"]).execute()
71
- # file = drive_service.files().create(body=file_metadata, media_body=media, fields="id").execute()
72
- # result = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute()
73
- # if not result.get("files"):
74
- # print(f"❌ Upload failed: File '{remote_name}' not found in folder after upload.")
75
- # else:
76
- # print(f"✅ Verified upload: {remote_name}")
77
- # return file["id"]
78
- def upload_file_to_drive(local_path, remote_name, folder_id):
79
- try:
80
- if not os.path.exists(local_path):
81
- raise FileNotFoundError(f"❌ Local file does not exist: {local_path}")
82
-
83
- # Delete existing file on Drive if present
84
- existing = drive_service.files().list(
85
- q=f"name='{remote_name}' and '{folder_id}' in parents and trashed = false",
86
- fields="files(id)"
87
- ).execute().get("files", [])
88
-
89
- if existing:
90
- drive_service.files().delete(fileId=existing[0]["id"]).execute()
91
- print(f"🗑️ Deleted existing '{remote_name}' in Drive folder {folder_id}")
92
-
93
- file_metadata = {"name": remote_name, "parents": [folder_id]}
94
- media = MediaFileUpload(local_path, resumable=True)
95
- file = drive_service.files().create(
96
- body=file_metadata,
97
- media_body=media,
98
- fields="id"
99
- ).execute()
100
-
101
- print(f"✅ Uploaded '{remote_name}' to Google Drive folder ID: {folder_id}")
102
- return file["id"]
103
-
104
- except Exception as e:
105
- print(f"❌ Error during upload: {e}")
106
- return None
107
-
108
-
109
- def download_file_from_drive(remote_name, folder_id, local_path):
110
- results = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute()
111
- files = results.get("files", [])
112
- if not files:
113
- return False
114
- file_id = files[0]["id"]
115
- request = drive_service.files().get_media(fileId=file_id)
116
- fh = io.FileIO(local_path, 'wb')
117
- downloader = MediaIoBaseDownload(fh, request)
118
- done = False
119
- while not done:
120
- _, done = downloader.next_chunk()
121
- return True
122
- def download_drive_file_content(file_id):
123
- request = drive_service.files().get_media(fileId=file_id)
124
- fh = io.BytesIO()
125
- downloader = MediaIoBaseDownload(fh, request)
126
- done = False
127
- while not done:
128
- _, done = downloader.next_chunk()
129
- fh.seek(0)
130
- return fh.read().decode("utf-8")
131
-
132
- # def run_with_timeout(func, args=(), kwargs={}, timeout=20):
133
- # """
134
- # Runs `func` with timeout in seconds. Kills if it exceeds.
135
- # Returns: (success, result or None)
136
- # """
137
- # def wrapper(q, *args, **kwargs):
138
- # try:
139
- # q.put(func(*args, **kwargs))
140
- # except Exception as e:
141
- # q.put(e)
142
-
143
- # q = multiprocessing.Queue()
144
- # p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs)
145
- # p.start()
146
- # p.join(timeout)
147
-
148
- # if p.is_alive():
149
- # p.terminate()
150
- # p.join()
151
- # print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.")
152
- # return False, None
153
- # else:
154
- # result = q.get()
155
- # if isinstance(result, Exception):
156
- # raise result
157
- # return True, result
158
- def run_with_timeout(func, args=(), kwargs={}, timeout=30):
159
- import concurrent.futures
160
- with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
161
- future = executor.submit(func, *args, **kwargs)
162
- try:
163
- return True, future.result(timeout=timeout)
164
- except concurrent.futures.TimeoutError:
165
- print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.")
166
- return False, None
167
-
168
- def time_it(func, *args, **kwargs):
169
- """
170
- Measure how long a function takes to run and return its result + time.
171
- """
172
- start = time.time()
173
- result = func(*args, **kwargs)
174
- end = time.time()
175
- elapsed = end - start
176
- print(f"⏱️ '{func.__name__}' took {elapsed:.3f} seconds")
177
- return result, elapsed
178
- # --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) ---
179
- def track_gemini_cost():
180
- # Prices are per 1,000 tokens
181
- PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
182
- PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
183
- PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
184
- return True
185
-
186
- def unique_preserve_order(seq):
187
- seen = set()
188
- return [x for x in seq if not (x in seen or seen.add(x))]
189
- # Main execution
190
- def pipeline_with_gemini(accessions):
191
- # output: country, sample_type, ethnic, location, money_cost, time_cost, explain
192
- # there can be one accession number in the accessions
193
- # Prices are per 1,000 tokens
194
- PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
195
- PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
196
- PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
197
- if not accessions:
198
- print("no input")
199
- return None
200
- else:
201
- accs_output = {}
202
- genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
203
- for acc in accessions:
204
- print("start gemini: ", acc)
205
- start = time.time()
206
- total_cost_title = 0
207
- jsonSM, links, article_text = {},[], ""
208
- acc_score = { "isolate": "",
209
- "country":{},
210
- "sample_type":{},
211
- #"specific_location":{},
212
- #"ethnicity":{},
213
- "query_cost":total_cost_title,
214
- "time_cost":None,
215
- "source":links}
216
- meta = mtdna_classifier.fetch_ncbi_metadata(acc)
217
- country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
218
- acc_score["isolate"] = iso
219
- print(meta)
220
- meta_expand = smart_fallback.fetch_ncbi(acc)
221
- # set up step: create the folder to save document
222
- chunk, all_output = "",""
223
- if pudID:
224
- id = str(pudID)
225
- saveTitle = title
226
- else:
227
- try:
228
- author_name = meta_expand["authors"].split(',')[0] # Use last name only
229
- except:
230
- author_name = meta_expand["authors"]
231
- saveTitle = title + "_" + col_date + "_" + author_name
232
- id = "DirectSubmission"
233
- # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
234
- # if not folder_path.exists():
235
- # cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
236
- # result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
237
- # print("data/"+str(id) +" created.")
238
- # else:
239
- # print("data/"+str(id) +" already exists.")
240
- # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
241
- # parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
242
- # data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
243
- # sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
244
- data_folder_id = GDRIVE_DATA_FOLDER_NAME # Use the shared folder directly
245
- sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
246
- print("sample folder id: ", sample_folder_id)
247
-
248
- # Define document names
249
- if len(saveTitle) > 50:
250
- saveName = saveTitle[:50]
251
- saveName = saveName.replace(" ", "_")
252
- chunk_filename = f"{saveName}_merged_document.docx"
253
- all_filename = f"{saveName}_all_merged_document.docx"
254
- else:
255
- saveName = saveTitle.replace(" ", "_")
256
- chunk_filename = f"{saveName}_merged_document.docx"
257
- all_filename = f"{saveName}_all_merged_document.docx"
258
- print(chunk_filename, all_filename)
259
- # Define local temp paths for reading/writing
260
- # import tempfile
261
- # tmp_dir = tempfile.mkdtemp()
262
- LOCAL_TEMP_DIR = "/mnt/data/generated_docs"
263
- os.makedirs(LOCAL_TEMP_DIR, exist_ok=True)
264
- file_chunk_path = os.path.join(LOCAL_TEMP_DIR, chunk_filename)
265
- file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename)
266
- # file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename)
267
- # file_all_path = os.path.join(tempfile.gettempdir(), all_filename)
268
- print(file_chunk_path)
269
- chunk_id = find_drive_file(chunk_filename, sample_folder_id)
270
- all_id = find_drive_file(all_filename, sample_folder_id)
271
-
272
- if chunk_id and all_id:
273
- print("✅ Files already exist in Google Drive. Downloading them...")
274
- chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
275
- all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
276
-
277
- # Read and parse these into `chunk` and `all_output`
278
- else:
279
- # 🔥 Remove any stale local copies
280
- if os.path.exists(file_chunk_path):
281
- os.remove(file_chunk_path)
282
- print(f"🗑️ Removed stale: {file_chunk_path}")
283
- if os.path.exists(file_all_path):
284
- os.remove(file_all_path)
285
- print(f"🗑️ Removed stale: {file_all_path}")
286
- # 🔥 Remove the local file first if it exists
287
- # if os.path.exists(file_chunk_path):
288
- # os.remove(file_chunk_path)
289
- # print("remove chunk path")
290
- # if os.path.exists(file_all_path):
291
- # os.remove(file_all_path)
292
- # print("remove all path")
293
- # Try to download if already exists on Drive
294
- chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
295
- all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
296
- print("chunk exist: ", chunk_exists)
297
- # first way: ncbi method
298
- print("country.lower: ",country.lower())
299
- if country.lower() != "unknown":
300
- stand_country = standardize_location.smart_country_lookup(country.lower())
301
- print("stand_country: ", stand_country)
302
- if stand_country.lower() != "not found":
303
- acc_score["country"][stand_country.lower()] = ["ncbi"]
304
- else: acc_score["country"][country.lower()] = ["ncbi"]
305
- # if spe_loc.lower() != "unknown":
306
- # acc_score["specific_location"][spe_loc.lower()] = ["ncbi"]
307
- # if ethnic.lower() != "unknown":
308
- # acc_score["ethnicity"][ethnic.lower()] = ["ncbi"]
309
- if sample_type.lower() != "unknown":
310
- acc_score["sample_type"][sample_type.lower()] = ["ncbi"]
311
- # second way: LLM model
312
- # Preprocess the input token
313
- print(acc_score)
314
- accession, isolate = None, None
315
- if acc != "unknown": accession = acc
316
- if iso != "unknown": isolate = iso
317
- # check doi first
318
- if doi != "unknown":
319
- link = 'https://doi.org/' + doi
320
- # get the file to create listOfFile for each id
321
- print("link of doi: ", link)
322
- html = extractHTML.HTML("",link)
323
- jsonSM = html.getSupMaterial()
324
- article_text = html.getListSection()
325
- if article_text:
326
- if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
327
- links.append(link)
328
- if jsonSM:
329
- links += sum((jsonSM[key] for key in jsonSM),[])
330
- # no doi then google custom search api
331
- if len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
332
- # might find the article
333
- print("no article text")
334
- #tem_links = mtdna_classifier.search_google_custom(title, 2)
335
- tem_links = smart_fallback.smart_google_search(meta_expand)
336
- # get supplementary of that article
337
- print("tem links length ", len(tem_links))
338
- for link in tem_links:
339
- print("link in tem: ", link)
340
- html = extractHTML.HTML("",link)
341
- print("html yeh")
342
- jsonSM = html.getSupMaterial()
343
- print("jsonsm yeah")
344
- article_text_tem = html.getListSection()
345
- print(article_text_tem, jsonSM)
346
- if article_text_tem:
347
- if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text_tem.lower() or "403 Forbidden Request".lower() not in article_text_tem.lower():
348
- links.append(link)
349
- print("tem link appended ", link)
350
- if jsonSM:
351
- links += sum((jsonSM[key] for key in jsonSM),[])
352
- print("this is links: ",links)
353
- links = unique_preserve_order(links)
354
- acc_score["source"] = links
355
- # chunk_path = "/"+saveTitle+"_merged_document.docx"
356
- # all_path = "/"+saveTitle+"_all_merged_document.docx"
357
- # # if chunk and all output not exist yet
358
- # file_chunk_path = saveLinkFolder + chunk_path
359
- # file_all_path = saveLinkFolder + all_path
360
- # if os.path.exists(file_chunk_path):
361
- # print("File chunk exists!")
362
- # if not chunk:
363
- # text, table, document_title = model.read_docx_text(file_chunk_path)
364
- # chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
365
- # if os.path.exists(file_all_path):
366
- # print("File all output exists!")
367
- # if not all_output:
368
- # text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
369
- # all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
370
- if chunk_exists:
371
- print("File chunk exists!")
372
- if not chunk:
373
- print("start to get chunk")
374
- text, table, document_title = model.read_docx_text(file_chunk_path)
375
- chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
376
- if all_exists:
377
- print("File all output exists!")
378
- if not all_output:
379
- text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
380
- all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
381
- if not chunk and not all_output:
382
- print("not chunk and all output")
383
- # else: check if we can reuse these chunk and all output of existed accession to find another
384
- if links:
385
- for link in links:
386
- print(link)
387
- # if len(all_output) > 1000*1000:
388
- # all_output = data_preprocess.normalize_for_overlap(all_output)
389
- # print("after normalizing all output: ", len(all_output))
390
- if len(data_preprocess.normalize_for_overlap(all_output)) > 600000:
391
- print("break here")
392
- break
393
- if iso != "unknown": query_kw = iso
394
- else: query_kw = acc
395
- #text_link, tables_link, final_input_link = data_preprocess.preprocess_document(link,saveLinkFolder, isolate=query_kw)
396
- success_process, output_process = run_with_timeout(data_preprocess.preprocess_document,args=(link,sample_folder_id),kwargs={"isolate":query_kw},timeout=180)
397
- if success_process:
398
- text_link, tables_link, final_input_link = output_process[0], output_process[1], output_process[2]
399
- print("yes succeed for process document")
400
- else: text_link, tables_link, final_input_link = "", "", ""
401
- context = data_preprocess.extract_context(final_input_link, query_kw)
402
- if context != "Sample ID not found.":
403
- if len(data_preprocess.normalize_for_overlap(chunk)) < 1000*1000:
404
- success_chunk, the_output_chunk = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(chunk, context))
405
- if success_chunk:
406
- chunk = the_output_chunk#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
407
- print("yes succeed for chunk")
408
- else:
409
- chunk += context
410
- print("len context: ", len(context))
411
- print("basic fall back")
412
- print("len chunk after: ", len(chunk))
413
- if len(final_input_link) > 1000*1000:
414
- if context != "Sample ID not found.":
415
- final_input_link = context
416
- else:
417
- final_input_link = data_preprocess.normalize_for_overlap(final_input_link)
418
- if len(final_input_link) > 1000 *1000:
419
- final_input_link = final_input_link[:100000]
420
- if len(data_preprocess.normalize_for_overlap(all_output)) < 1000*1000:
421
- success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link))
422
- if success:
423
- all_output = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
424
- print("yes succeed")
425
- else:
426
- all_output += final_input_link
427
- print("len final input: ", len(final_input_link))
428
- print("basic fall back")
429
- print("len all output after: ", len(all_output))
430
- #country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate)
431
-
432
- else:
433
- chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
434
- all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
435
- if not chunk: chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
436
- if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
437
- if len(all_output) > 1*1024*1024:
438
- all_output = data_preprocess.normalize_for_overlap(all_output)
439
- if len(all_output) > 1*1024*1024:
440
- all_output = all_output[:1*1024*1024]
441
- print("chunk len: ", len(chunk))
442
- print("all output len: ", len(all_output))
443
- data_preprocess.save_text_to_docx(chunk, file_chunk_path)
444
- data_preprocess.save_text_to_docx(all_output, file_all_path)
445
- # Later when saving new files
446
- # data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id)
447
- # data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
448
-
449
- # Upload to Drive
450
- result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
451
- result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
452
- print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload)
453
- print(f"🔗 Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view")
454
- print("here 1")
455
-
456
- # else:
457
- # final_input = ""
458
- # if all_output:
459
- # final_input = all_output
460
- # else:
461
- # if chunk: final_input = chunk
462
- # #data_preprocess.merge_texts_skipping_overlap(final_input, all_output)
463
- # if final_input:
464
- # keywords = []
465
- # if iso != "unknown": keywords.append(iso)
466
- # if acc != "unknown": keywords.append(acc)
467
- # for keyword in keywords:
468
- # chunkBFS = data_preprocess.get_contextual_sentences_BFS(final_input, keyword)
469
- # countryDFS, chunkDFS = data_preprocess.get_contextual_sentences_DFS(final_input, keyword)
470
- # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkDFS)
471
- # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS)
472
-
473
- # Define paths for cached RAG assets
474
- # faiss_index_path = saveLinkFolder+"/faiss_index.bin"
475
- # document_chunks_path = saveLinkFolder+"/document_chunks.json"
476
- # structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
477
- print("here 2")
478
- faiss_filename = "faiss_index.bin"
479
- chunks_filename = "document_chunks.json"
480
- lookup_filename = "structured_lookup.json"
481
- print("name of faiss: ", faiss_filename)
482
-
483
- faiss_index_path = os.path.join(LOCAL_TEMP_DIR, faiss_filename)
484
- document_chunks_path = os.path.join(LOCAL_TEMP_DIR, chunks_filename)
485
- structured_lookup_path = os.path.join(LOCAL_TEMP_DIR, lookup_filename)
486
- print("name if faiss path: ", faiss_index_path)
487
- # 🔥 Remove the local file first if it exists
488
- faiss_id = find_drive_file(faiss_filename, sample_folder_id)
489
- document_id = find_drive_file(chunks_filename, sample_folder_id)
490
- structure_id = find_drive_file(lookup_filename, sample_folder_id)
491
- if faiss_id and document_id and structure_id:
492
- print("✅ 3 Files already exist in Google Drive. Downloading them...")
493
- download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
494
- download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
495
- download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
496
- # Read and parse these into `chunk` and `all_output`
497
- else:
498
- if os.path.exists(faiss_index_path):
499
- os.remove(faiss_index_path)
500
- if os.path.exists(document_chunks_path):
501
- os.remove(document_chunks_path)
502
- if os.path.exists(structured_lookup_path):
503
- os.remove(structured_lookup_path)
504
- download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
505
- download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
506
- download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
507
-
508
- print("move to load rag")
509
- master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
510
- faiss_index_path, document_chunks_path, structured_lookup_path
511
- )
512
-
513
- global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest')
514
- if not all_output:
515
- if chunk: all_output = chunk
516
- else: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
517
- if faiss_index is None:
518
- print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...")
519
- total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(
520
- all_output
521
- ).total_tokens
522
-
523
- initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
524
- total_cost_title += initial_embedding_cost
525
- print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}")
526
-
527
-
528
- master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data(
529
- file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path
530
- )
531
- else:
532
- print("\nRAG assets loaded from file. No re-embedding of entire document will occur.")
533
- plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path)
534
- master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all)
535
-
536
- primary_word = iso
537
- alternative_word = acc
538
- print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
539
- if features.lower() not in all_output.lower():
540
- all_output += ". NCBI Features: " + features
541
- # country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info(
542
- # primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
543
- # model.call_llm_api, chunk=chunk, all_output=all_output)
544
- print("this is chunk for the model")
545
- print(chunk)
546
- print("this is all output for the model")
547
- print(all_output)
548
- country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
549
- primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
550
- model.call_llm_api, chunk=chunk, all_output=all_output)
551
- print("country using ai: ", country)
552
- print("sample type using ai: ", sample_type)
553
- if len(country) == 0: country = "unknown"
554
- if len(sample_type) == 0: sample_type = "unknown"
555
- if country_explanation: country_explanation = "-"+country_explanation
556
- else: country_explanation = ""
557
- if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
558
- else: sample_type_explanation = ""
559
- if method_used == "unknown": method_used = ""
560
- if country.lower() != "unknown":
561
- stand_country = standardize_location.smart_country_lookup(country.lower())
562
- if stand_country.lower() != "not found":
563
- if stand_country.lower() in acc_score["country"]:
564
- if country_explanation:
565
- acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
566
- else:
567
- acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
568
- else:
569
- if country.lower() in acc_score["country"]:
570
- if country_explanation:
571
- if len(method_used + country_explanation) > 0:
572
- acc_score["country"][country.lower()].append(method_used + country_explanation)
573
- else:
574
- if len(method_used + country_explanation) > 0:
575
- acc_score["country"][country.lower()] = [method_used + country_explanation]
576
- # if spe_loc.lower() != "unknown":
577
- # if spe_loc.lower() in acc_score["specific_location"]:
578
- # acc_score["specific_location"][spe_loc.lower()].append(method_used)
579
- # else:
580
- # acc_score["specific_location"][spe_loc.lower()] = [method_used]
581
- # if ethnic.lower() != "unknown":
582
- # if ethnic.lower() in acc_score["ethnicity"]:
583
- # acc_score["ethnicity"][ethnic.lower()].append(method_used)
584
- # else:
585
- # acc_score["ethnicity"][ethnic.lower()] = [method_used]
586
- if sample_type.lower() != "unknown":
587
- if sample_type.lower() in acc_score["sample_type"]:
588
- if len(method_used + sample_type_explanation) > 0:
589
- acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
590
- else:
591
- if len(method_used + sample_type_explanation)> 0:
592
- acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
593
- # last resort: combine all information to give all output otherwise unknown
594
- if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0:
595
- text = ""
596
- for key in meta_expand:
597
- text += str(key) + ": " + meta_expand[key] + "\n"
598
- if len(data_preprocess.normalize_for_overlap(all_output)) > 0:
599
- text += data_preprocess.normalize_for_overlap(all_output)
600
- if len(data_preprocess.normalize_for_overlap(chunk)) > 0:
601
- text += data_preprocess.normalize_for_overlap(chunk)
602
- text += ". NCBI Features: " + features
603
- print("this is text for the last resort model")
604
- print(text)
605
- country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
606
- primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
607
- model.call_llm_api, chunk=text, all_output=text)
608
- print("this is last resort results: ")
609
- print("country: ", country)
610
- print("sample type: ", sample_type)
611
- if len(country) == 0: country = "unknown"
612
- if len(sample_type) == 0: sample_type = "unknown"
613
- if country_explanation: country_explanation = "-"+country_explanation
614
- else: country_explanation = ""
615
- if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
616
- else: sample_type_explanation = ""
617
- if method_used == "unknown": method_used = ""
618
- if country.lower() != "unknown":
619
- stand_country = standardize_location.smart_country_lookup(country.lower())
620
- if stand_country.lower() != "not found":
621
- if stand_country.lower() in acc_score["country"]:
622
- if country_explanation:
623
- acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
624
- else:
625
- acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
626
- else:
627
- if country.lower() in acc_score["country"]:
628
- if country_explanation:
629
- if len(method_used + country_explanation) > 0:
630
- acc_score["country"][country.lower()].append(method_used + country_explanation)
631
- else:
632
- if len(method_used + country_explanation) > 0:
633
- acc_score["country"][country.lower()] = [method_used + country_explanation]
634
- if sample_type.lower() != "unknown":
635
- if sample_type.lower() in acc_score["sample_type"]:
636
- if len(method_used + sample_type_explanation) > 0:
637
- acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
638
- else:
639
- if len(method_used + sample_type_explanation)> 0:
640
- acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
641
- end = time.time()
642
- total_cost_title += total_query_cost
643
- acc_score["query_cost"] = f"{total_cost_title:.6f}"
644
- elapsed = end - start
645
- acc_score["time_cost"] = f"{elapsed:.3f} seconds"
646
- accs_output[acc] = acc_score
647
- print(accs_output[acc])
648
-
649
  return accs_output
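For context on the pricing constants declared in this pipeline, here is a minimal sketch of how the per-1,000-token prices translate into the query_cost figure the pipeline reports. The token counts and the helper name estimate_cost are made-up placeholders, not values taken from the repository.

# Minimal sketch: how the per-1K-token prices yield the reported cost.
# Token counts below are placeholders, not real measurements.
PRICE_PER_1K_INPUT_LLM = 0.000075        # $0.075 per 1M tokens
PRICE_PER_1K_OUTPUT_LLM = 0.0003         # $0.30 per 1M tokens
PRICE_PER_1K_EMBEDDING_INPUT = 0.000025  # $0.025 per 1M tokens

def estimate_cost(input_tokens, output_tokens, embedding_tokens):
    # Same formula as the pipeline: tokens / 1000 * price-per-1K-tokens.
    return (
        (input_tokens / 1000) * PRICE_PER_1K_INPUT_LLM
        + (output_tokens / 1000) * PRICE_PER_1K_OUTPUT_LLM
        + (embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
    )

# Example: 120k input, 2k output, 500k embedded tokens -> about $0.022.
print(f"{estimate_cost(120_000, 2_000, 500_000):.6f}")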
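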
 
1
+ # test1: MJ17 direct
2
+ # test2: "A1YU101" thailand cross-ref
3
+ # test3: "EBK109" thailand cross-ref
4
+ # test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and"
5
+ import data_preprocess
6
+ import model
7
+ import mtdna_classifier
8
+ #import app
9
+ import smart_fallback
10
+ import pandas as pd
11
+ from pathlib import Path
12
+ import subprocess
13
+ from NER.html import extractHTML
14
+ import os
15
+ import google.generativeai as genai
16
+ import re
17
+ import standardize_location
18
+ # Helper functions for this pipeline
19
+ # Track time
20
+ import time
21
+ import multiprocessing
22
+ import gspread
23
+ from googleapiclient.discovery import build
24
+ from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
25
+ from google.oauth2.service_account import Credentials
26
+ from oauth2client.service_account import ServiceAccountCredentials
27
+ import io
28
+ import json
29
+ #––– Authentication setup –––
30
+ GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
31
+ GDRIVE_DATA_FOLDER_NAME = os.environ["GDRIVE_DATA_FOLDER_NAME"]
32
+ GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"]) # from HF secrets
33
+ GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
34
+ drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)
35
+
36
+ def get_or_create_drive_folder(name, parent_id=None):
37
+ query = f"name='{name}' and mimeType='application/vnd.google-apps.folder'"
38
+ if parent_id:
39
+ query += f" and '{parent_id}' in parents"
40
+ results = drive_service.files().list(q=query, spaces='drive', fields="files(id, name)").execute()
41
+ items = results.get("files", [])
42
+ if items:
43
+ return items[0]["id"]
44
+ file_metadata = {
45
+ "name": name,
46
+ "mimeType": "application/vnd.google-apps.folder"
47
+ }
48
+ if parent_id:
49
+ file_metadata["parents"] = [parent_id]
50
+ file = drive_service.files().create(body=file_metadata, fields="id").execute()
51
+ return file["id"]
52
+ def find_drive_file(filename, parent_id):
53
+ """
54
+ Checks if a file with the given name exists inside the specified Google Drive folder.
55
+ Returns the file ID if found, else None.
56
+ """
57
+ query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false"
58
+ results = drive_service.files().list(q=query, spaces='drive', fields='files(id, name)', pageSize=1).execute()
59
+ files = results.get('files', [])
60
+ if files:
61
+ return files[0]["id"]
62
+ return None
63
+
64
+
65
+ # def upload_file_to_drive(local_path, remote_name, folder_id):
66
+ # file_metadata = {"name": remote_name, "parents": [folder_id]}
67
+ # media = MediaFileUpload(local_path, resumable=True)
68
+ # existing = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute().get("files", [])
69
+ # if existing:
70
+ # drive_service.files().delete(fileId=existing[0]["id"]).execute()
71
+ # file = drive_service.files().create(body=file_metadata, media_body=media, fields="id").execute()
72
+ # result = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute()
73
+ # if not result.get("files"):
74
+ # print(f"❌ Upload failed: File '{remote_name}' not found in folder after upload.")
75
+ # else:
76
+ # print(f"✅ Verified upload: {remote_name}")
77
+ # return file["id"]
78
+ def upload_file_to_drive(local_path, remote_name, folder_id):
79
+ try:
80
+ if not os.path.exists(local_path):
81
+ raise FileNotFoundError(f"❌ Local file does not exist: {local_path}")
82
+
83
+ # Delete existing file on Drive if present
84
+ existing = drive_service.files().list(
85
+ q=f"name='{remote_name}' and '{folder_id}' in parents and trashed = false",
86
+ fields="files(id)"
87
+ ).execute().get("files", [])
88
+
89
+ if existing:
90
+ drive_service.files().delete(fileId=existing[0]["id"]).execute()
91
+ print(f"🗑️ Deleted existing '{remote_name}' in Drive folder {folder_id}")
92
+
93
+ file_metadata = {"name": remote_name, "parents": [folder_id]}
94
+ media = MediaFileUpload(local_path, resumable=True)
95
+ file = drive_service.files().create(
96
+ body=file_metadata,
97
+ media_body=media,
98
+ fields="id"
99
+ ).execute()
100
+
101
+ print(f"✅ Uploaded '{remote_name}' to Google Drive folder ID: {folder_id}")
102
+ return file["id"]
103
+
104
+ except Exception as e:
105
+ print(f"❌ Error during upload: {e}")
106
+ return None
107
+
108
+
109
+ def download_file_from_drive(remote_name, folder_id, local_path):
110
+ results = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute()
111
+ files = results.get("files", [])
112
+ if not files:
113
+ return False
114
+ file_id = files[0]["id"]
115
+ request = drive_service.files().get_media(fileId=file_id)
116
+ fh = io.FileIO(local_path, 'wb')
117
+ downloader = MediaIoBaseDownload(fh, request)
118
+ done = False
119
+ while not done:
120
+ _, done = downloader.next_chunk()
121
+ return True
122
+ def download_drive_file_content(file_id):
123
+ request = drive_service.files().get_media(fileId=file_id)
124
+ fh = io.BytesIO()
125
+ downloader = MediaIoBaseDownload(fh, request)
126
+ done = False
127
+ while not done:
128
+ _, done = downloader.next_chunk()
129
+ fh.seek(0)
130
+ return fh.read().decode("utf-8")
131
+
132
+ # def run_with_timeout(func, args=(), kwargs={}, timeout=20):
133
+ # """
134
+ # Runs `func` with timeout in seconds. Kills if it exceeds.
135
+ # Returns: (success, result or None)
136
+ # """
137
+ # def wrapper(q, *args, **kwargs):
138
+ # try:
139
+ # q.put(func(*args, **kwargs))
140
+ # except Exception as e:
141
+ # q.put(e)
142
+
143
+ # q = multiprocessing.Queue()
144
+ # p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs)
145
+ # p.start()
146
+ # p.join(timeout)
147
+
148
+ # if p.is_alive():
149
+ # p.terminate()
150
+ # p.join()
151
+ # print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.")
152
+ # return False, None
153
+ # else:
154
+ # result = q.get()
155
+ # if isinstance(result, Exception):
156
+ # raise result
157
+ # return True, result
158
+ def run_with_timeout(func, args=(), kwargs={}, timeout=30):
159
+ import concurrent.futures
160
+ with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
161
+ future = executor.submit(func, *args, **kwargs)
162
+ try:
163
+ return True, future.result(timeout=timeout)
164
+ except concurrent.futures.TimeoutError:
165
+ print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.")
166
+ return False, None
167
+
168
+ def time_it(func, *args, **kwargs):
169
+ """
170
+ Measure how long a function takes to run and return its result + time.
171
+ """
172
+ start = time.time()
173
+ result = func(*args, **kwargs)
174
+ end = time.time()
175
+ elapsed = end - start
176
+ print(f"⏱️ '{func.__name__}' took {elapsed:.3f} seconds")
177
+ return result, elapsed
178
+ # --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) ---
179
+ def track_gemini_cost():
180
+ # Prices are per 1,000 tokens
181
+ PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
182
+ PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
183
+ PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
184
+ return True
185
+
186
+ def unique_preserve_order(seq):
187
+ seen = set()
188
+ return [x for x in seq if not (x in seen or seen.add(x))]
189
+ # Main execution
190
+ def pipeline_with_gemini(accessions):
191
+ # output: country, sample_type, ethnic, location, money_cost, time_cost, explain
192
+ # there can be one accession number in the accessions
193
+ # Prices are per 1,000 tokens
194
+ PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
195
+ PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
196
+ PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
197
+ if not accessions:
198
+ print("no input")
199
+ return None
200
+ else:
201
+ accs_output = {}
202
+ genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
203
+ for acc in accessions:
204
+ print("start gemini: ", acc)
205
+ start = time.time()
206
+ total_cost_title = 0
207
+ jsonSM, links, article_text = {},[], ""
208
+ acc_score = { "isolate": "",
209
+ "country":{},
210
+ "sample_type":{},
211
+ #"specific_location":{},
212
+ #"ethnicity":{},
213
+ "query_cost":total_cost_title,
214
+ "time_cost":None,
215
+ "source":links}
216
+ meta = mtdna_classifier.fetch_ncbi_metadata(acc)
217
+ country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
218
+ acc_score["isolate"] = iso
219
+ print("meta: ",meta)
220
+ meta_expand = smart_fallback.fetch_ncbi(acc)
221
+ print("meta expand: ", meta_expand)
222
+ # set up step: create the folder to save document
223
+ chunk, all_output = "",""
224
+ if pudID:
225
+ id = str(pudID)
226
+ saveTitle = title
227
+ else:
228
+ try:
229
+ author_name = meta_expand["authors"].split(',')[0] # Use last name only
230
+ except:
231
+ author_name = meta_expand["authors"]
232
+ saveTitle = title + "_" + col_date + "_" + author_name
233
+ id = "DirectSubmission"
234
+ # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
235
+ # if not folder_path.exists():
236
+ # cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
237
+ # result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
238
+ # print("data/"+str(id) +" created.")
239
+ # else:
240
+ # print("data/"+str(id) +" already exists.")
241
+ # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
242
+ # parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
243
+ # data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
244
+ # sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
245
+ data_folder_id = GDRIVE_DATA_FOLDER_NAME # Use the shared folder directly
246
+ sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
247
+ print("sample folder id: ", sample_folder_id)
248
+
249
+ # Define document names
250
+ if len(saveTitle) > 50:
251
+ saveName = saveTitle[:50]
252
+ saveName = saveName.replace(" ", "_")
253
+ chunk_filename = f"{saveName}_merged_document.docx"
254
+ all_filename = f"{saveName}_all_merged_document.docx"
255
+ else:
256
+ saveName = saveTitle.replace(" ", "_")
257
+ chunk_filename = f"{saveName}_merged_document.docx"
258
+ all_filename = f"{saveName}_all_merged_document.docx"
259
+ print(chunk_filename, all_filename)
260
+ # Define local temp paths for reading/writing
261
+ # import tempfile
262
+ # tmp_dir = tempfile.mkdtemp()
263
+ LOCAL_TEMP_DIR = "/mnt/data/generated_docs"
264
+ os.makedirs(LOCAL_TEMP_DIR, exist_ok=True)
265
+ file_chunk_path = os.path.join(LOCAL_TEMP_DIR, chunk_filename)
266
+ file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename)
267
+ # file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename)
268
+ # file_all_path = os.path.join(tempfile.gettempdir(), all_filename)
269
+ print(file_chunk_path)
270
+ chunk_id = find_drive_file(chunk_filename, sample_folder_id)
271
+ all_id = find_drive_file(all_filename, sample_folder_id)
272
+
273
+ if chunk_id and all_id:
274
+ print("✅ Files already exist in Google Drive. Downloading them...")
275
+ chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
276
+ all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
277
+ print("chunk_id and all_id: ")
278
+ print(chunk_id, all_id)
279
+ file = drive_service.files().get(fileId=chunk_id, fields="id, name, parents, webViewLink").execute()  # inspect the cached chunk file that was just located
280
+ print("📄 Name:", file["name"])
281
+ print("📁 Parent folder ID:", file["parents"][0])
282
+ print("🔗 View link:", file["webViewLink"])
283
+
284
+
285
+ # Read and parse these into `chunk` and `all_output`
286
+ else:
287
+ # 🔥 Remove any stale local copies
288
+ if os.path.exists(file_chunk_path):
289
+ os.remove(file_chunk_path)
290
+ print(f"🗑️ Removed stale: {file_chunk_path}")
291
+ if os.path.exists(file_all_path):
292
+ os.remove(file_all_path)
293
+ print(f"🗑️ Removed stale: {file_all_path}")
294
+ # 🔥 Remove the local file first if it exists
295
+ # if os.path.exists(file_chunk_path):
296
+ # os.remove(file_chunk_path)
297
+ # print("remove chunk path")
298
+ # if os.path.exists(file_all_path):
299
+ # os.remove(file_all_path)
300
+ # print("remove all path")
301
+ # Try to download if already exists on Drive
302
+ chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
303
+ all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
304
+ print("chunk exist: ", chunk_exists)
305
+ # first way: ncbi method
306
+ print("country.lower: ",country.lower())
307
+ if country.lower() != "unknown":
308
+ stand_country = standardize_location.smart_country_lookup(country.lower())
309
+ print("stand_country: ", stand_country)
310
+ if stand_country.lower() != "not found":
311
+ acc_score["country"][stand_country.lower()] = ["ncbi"]
312
+ else: acc_score["country"][country.lower()] = ["ncbi"]
313
+ # if spe_loc.lower() != "unknown":
314
+ # acc_score["specific_location"][spe_loc.lower()] = ["ncbi"]
315
+ # if ethnic.lower() != "unknown":
316
+ # acc_score["ethnicity"][ethnic.lower()] = ["ncbi"]
317
+ if sample_type.lower() != "unknown":
318
+ acc_score["sample_type"][sample_type.lower()] = ["ncbi"]
319
+ # second way: LLM model
320
+ # Preprocess the input token
321
+ print(acc_score)
322
+ accession, isolate = None, None
323
+ if acc != "unknown": accession = acc
324
+ if iso != "unknown": isolate = iso
325
+ # check doi first
326
+ if doi != "unknown":
327
+ link = 'https://doi.org/' + doi
328
+ # get the file to create listOfFile for each id
329
+ print("link of doi: ", link)
330
+ html = extractHTML.HTML("",link)
331
+ jsonSM = html.getSupMaterial()
332
+ article_text = html.getListSection()
333
+ if article_text:
334
+ if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
335
+ links.append(link)
336
+ if jsonSM:
337
+ links += sum((jsonSM[key] for key in jsonSM),[])
338
+ # no doi then google custom search api
339
+ if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
340
+ # might find the article
341
+ print("no article text, start tem link")
342
+ #tem_links = mtdna_classifier.search_google_custom(title, 2)
343
+ tem_links = smart_fallback.smart_google_search(meta_expand)
344
+ print("tem links: ", tem_links)
345
+ tem_link_acc = smart_fallback.google_accession_search(acc)
346
+ tem_links += tem_link_acc
347
+ tem_links = unique_preserve_order(tem_links)
348
+ print("tem link before filtering: ", tem_links)
349
+ # filter the quality link
350
+ print("saveLinkFolder as sample folder id: ", sample_folder_id)
351
+ links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc)
352
+ print("this is links: ",links)
353
+ links = unique_preserve_order(links)
354
+ acc_score["source"] = links
355
+ # chunk_path = "/"+saveTitle+"_merged_document.docx"
356
+ # all_path = "/"+saveTitle+"_all_merged_document.docx"
357
+ # # if chunk and all output not exist yet
358
+ # file_chunk_path = saveLinkFolder + chunk_path
359
+ # file_all_path = saveLinkFolder + all_path
360
+ # if os.path.exists(file_chunk_path):
361
+ # print("File chunk exists!")
362
+ # if not chunk:
363
+ # text, table, document_title = model.read_docx_text(file_chunk_path)
364
+ # chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
365
+ # if os.path.exists(file_all_path):
366
+ # print("File all output exists!")
367
+ # if not all_output:
368
+ # text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
369
+ # all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
370
+ if chunk_exists:
371
+ print("File chunk exists!")
372
+ if not chunk:
373
+ print("start to get chunk")
374
+ text, table, document_title = model.read_docx_text(file_chunk_path)
375
+ chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
376
+ if all_exists:
377
+ print("File all output exists!")
378
+ if not all_output:
379
+ text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
380
+ all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
381
+ if not chunk and not all_output:
382
+ print("not chunk and all output")
383
+ # else: check if we can reuse these chunk and all output of existed accession to find another
384
+ if links:
385
+ for link in links:
386
+ print(link)
387
+ # if len(all_output) > 1000*1000:
388
+ # all_output = data_preprocess.normalize_for_overlap(all_output)
389
+ # print("after normalizing all output: ", len(all_output))
390
+ if len(data_preprocess.normalize_for_overlap(all_output)) > 600000:
391
+ print("break here")
392
+ break
393
+ if iso != "unknown": query_kw = iso
394
+ else: query_kw = acc
395
+ #text_link, tables_link, final_input_link = data_preprocess.preprocess_document(link,saveLinkFolder, isolate=query_kw)
396
+ success_process, output_process = run_with_timeout(data_preprocess.preprocess_document,args=(link,sample_folder_id),kwargs={"isolate":query_kw,"accession":acc},timeout=180)
397
+ if success_process:
398
+ text_link, tables_link, final_input_link = output_process[0], output_process[1], output_process[2]
399
+ print("yes succeed for process document")
400
+ else: text_link, tables_link, final_input_link = "", "", ""
401
+ context = data_preprocess.extract_context(final_input_link, query_kw)
402
+ if context != "Sample ID not found.":
403
+ if len(data_preprocess.normalize_for_overlap(chunk)) < 1000*1000:
404
+ success_chunk, the_output_chunk = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(chunk, context))
405
+ if success_chunk:
406
+ chunk = the_output_chunk#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
407
+ print("yes succeed for chunk")
408
+ else:
409
+ chunk += context
410
+ print("len context: ", len(context))
411
+ print("basic fall back")
412
+ print("len chunk after: ", len(chunk))
413
+ if len(final_input_link) > 1000*1000:
414
+ if context != "Sample ID not found.":
415
+ final_input_link = context
416
+ else:
417
+ final_input_link = data_preprocess.normalize_for_overlap(final_input_link)
418
+ if len(final_input_link) > 1000 *1000:
419
+ final_input_link = final_input_link[:100000]
420
+ if len(data_preprocess.normalize_for_overlap(all_output)) < 1000*1000:
421
+ success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link))
422
+ if success:
423
+ all_output = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
424
+ print("yes succeed")
425
+ else:
426
+ all_output += final_input_link
427
+ print("len final input: ", len(final_input_link))
428
+ print("basic fall back")
429
+ print("len all output after: ", len(all_output))
430
+ #country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate)
431
+
432
+ else:
433
+ chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
434
+ all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
435
+ if not chunk: chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
436
+ if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
437
+ if len(all_output) > 1*1024*1024:
438
+ all_output = data_preprocess.normalize_for_overlap(all_output)
439
+ if len(all_output) > 1*1024*1024:
440
+ all_output = all_output[:1*1024*1024]
441
+ print("chunk len: ", len(chunk))
442
+ print("all output len: ", len(all_output))
443
+ data_preprocess.save_text_to_docx(chunk, file_chunk_path)
444
+ data_preprocess.save_text_to_docx(all_output, file_all_path)
445
+ # Later when saving new files
446
+ # data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id)
447
+ # data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
448
+
449
+ # Upload to Drive
450
+ result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
451
+ result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
452
+ print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload)
453
+ print(f"🔗 Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view")
454
+ print("here 1")
455
+
456
+ # else:
457
+ # final_input = ""
458
+ # if all_output:
459
+ # final_input = all_output
460
+ # else:
461
+ # if chunk: final_input = chunk
462
+ # #data_preprocess.merge_texts_skipping_overlap(final_input, all_output)
463
+ # if final_input:
464
+ # keywords = []
465
+ # if iso != "unknown": keywords.append(iso)
466
+ # if acc != "unknown": keywords.append(acc)
467
+ # for keyword in keywords:
468
+ # chunkBFS = data_preprocess.get_contextual_sentences_BFS(final_input, keyword)
469
+ # countryDFS, chunkDFS = data_preprocess.get_contextual_sentences_DFS(final_input, keyword)
470
+ # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkDFS)
471
+ # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS)
472
+
473
+ # Define paths for cached RAG assets
474
+ # faiss_index_path = saveLinkFolder+"/faiss_index.bin"
475
+ # document_chunks_path = saveLinkFolder+"/document_chunks.json"
476
+ # structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
477
+ print("here 2")
478
+ faiss_filename = "faiss_index.bin"
479
+ chunks_filename = "document_chunks.json"
480
+ lookup_filename = "structured_lookup.json"
481
+ print("name of faiss: ", faiss_filename)
482
+
483
+ faiss_index_path = os.path.join(LOCAL_TEMP_DIR, faiss_filename)
484
+ document_chunks_path = os.path.join(LOCAL_TEMP_DIR, chunks_filename)
485
+ structured_lookup_path = os.path.join(LOCAL_TEMP_DIR, lookup_filename)
486
+ print("name if faiss path: ", faiss_index_path)
487
+ # 🔥 Remove the local file first if it exists
488
+ faiss_id = find_drive_file(faiss_filename, sample_folder_id)
489
+ document_id = find_drive_file(chunks_filename, sample_folder_id)
490
+ structure_id = find_drive_file(lookup_filename, sample_folder_id)
491
+ if faiss_id and document_id and structure_id:
492
+ print("✅ 3 Files already exist in Google Drive. Downloading them...")
493
+ download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
494
+ download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
495
+ download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
496
+ # Read and parse these into `chunk` and `all_output`
497
+ else:
498
+ if os.path.exists(faiss_index_path):
499
+ os.remove(faiss_index_path)
500
+ if os.path.exists(document_chunks_path):
501
+ os.remove(document_chunks_path)
502
+ if os.path.exists(structured_lookup_path):
503
+ os.remove(structured_lookup_path)
504
+ download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
505
+ download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
506
+ download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
507
+
508
+ print("move to load rag")
509
+ master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
510
+ faiss_index_path, document_chunks_path, structured_lookup_path
511
+ )
512
+
513
+ global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest')
514
+ if not all_output:
515
+ if chunk: all_output = chunk
516
+ else: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
517
+ if faiss_index is None:
518
+ print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...")
519
+ total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(
520
+ all_output
521
+ ).total_tokens
522
+
523
+ initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
524
+ total_cost_title += initial_embedding_cost
525
+ print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}")
526
+
527
+
528
+ master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data(
529
+ file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path
530
+ )
531
+ else:
532
+ print("\nRAG assets loaded from file. No re-embedding of entire document will occur.")
533
+ plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path)
534
+ master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all)
535
+
536
+ primary_word = iso
537
+ alternative_word = acc
538
+ print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
539
+ if features.lower() not in all_output.lower():
540
+ all_output += ". NCBI Features: " + features
541
+ # country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info(
542
+ # primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
543
+ # model.call_llm_api, chunk=chunk, all_output=all_output)
544
+ print("this is chunk for the model")
545
+ print(chunk)
546
+ print("this is all output for the model")
547
+ print(all_output)
548
+ country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
549
+ primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
550
+ model.call_llm_api, chunk=chunk, all_output=all_output)
551
+ print("country using ai: ", country)
552
+ print("sample type using ai: ", sample_type)
553
+ if len(country) == 0: country = "unknown"
554
+ if len(sample_type) == 0: sample_type = "unknown"
555
+ if country_explanation: country_explanation = "-"+country_explanation
556
+ else: country_explanation = ""
557
+ if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
558
+ else: sample_type_explanation = ""
559
+ if method_used == "unknown": method_used = ""
560
+ if country.lower() != "unknown":
561
+ stand_country = standardize_location.smart_country_lookup(country.lower())
562
+ if stand_country.lower() != "not found":
563
+ if stand_country.lower() in acc_score["country"]:
564
+ if country_explanation:
565
+ acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
566
+ else:
567
+ acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
568
+ else:
569
+ if country.lower() in acc_score["country"]:
570
+ if country_explanation:
571
+ if len(method_used + country_explanation) > 0:
572
+ acc_score["country"][country.lower()].append(method_used + country_explanation)
573
+ else:
574
+ if len(method_used + country_explanation) > 0:
575
+ acc_score["country"][country.lower()] = [method_used + country_explanation]
576
+ # if spe_loc.lower() != "unknown":
577
+ # if spe_loc.lower() in acc_score["specific_location"]:
578
+ # acc_score["specific_location"][spe_loc.lower()].append(method_used)
579
+ # else:
580
+ # acc_score["specific_location"][spe_loc.lower()] = [method_used]
581
+ # if ethnic.lower() != "unknown":
582
+ # if ethnic.lower() in acc_score["ethnicity"]:
583
+ # acc_score["ethnicity"][ethnic.lower()].append(method_used)
584
+ # else:
585
+ # acc_score["ethnicity"][ethnic.lower()] = [method_used]
586
+ if sample_type.lower() != "unknown":
587
+ if sample_type.lower() in acc_score["sample_type"]:
588
+ if len(method_used + sample_type_explanation) > 0:
589
+ acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
590
+ else:
591
+ if len(method_used + sample_type_explanation)> 0:
592
+ acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
593
+ # last resort: combine all information to give all output otherwise unknown
594
+ if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0:
595
+ text = ""
596
+ for key in meta_expand:
597
+ text += str(key) + ": " + meta_expand[key] + "\n"
598
+ if len(data_preprocess.normalize_for_overlap(all_output)) > 0:
599
+ text += data_preprocess.normalize_for_overlap(all_output)
600
+ if len(data_preprocess.normalize_for_overlap(chunk)) > 0:
601
+ text += data_preprocess.normalize_for_overlap(chunk)
602
+ text += ". NCBI Features: " + features
603
+ print("this is text for the last resort model")
604
+ print(text)
605
+ country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
606
+ primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
607
+ model.call_llm_api, chunk=text, all_output=text)
608
+ print("this is last resort results: ")
609
+ print("country: ", country)
610
+ print("sample type: ", sample_type)
611
+ if len(country) == 0: country = "unknown"
612
+ if len(sample_type) == 0: sample_type = "unknown"
613
+ if country_explanation: country_explanation = "-"+country_explanation
614
+ else: country_explanation = ""
615
+ if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
616
+ else: sample_type_explanation = ""
617
+ if method_used == "unknown": method_used = ""
618
+ if country.lower() != "unknown":
619
+ stand_country = standardize_location.smart_country_lookup(country.lower())
620
+ if stand_country.lower() != "not found":
621
+ if stand_country.lower() in acc_score["country"]:
622
+ if country_explanation:
623
+ acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
624
+ else:
625
+ acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
626
+ else:
627
+ if country.lower() in acc_score["country"]:
628
+ if country_explanation:
629
+ if len(method_used + country_explanation) > 0:
630
+ acc_score["country"][country.lower()].append(method_used + country_explanation)
631
+ else:
632
+ if len(method_used + country_explanation) > 0:
633
+ acc_score["country"][country.lower()] = [method_used + country_explanation]
634
+ if sample_type.lower() != "unknown":
635
+ if sample_type.lower() in acc_score["sample_type"]:
636
+ if len(method_used + sample_type_explanation) > 0:
637
+ acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
638
+ else:
639
+ if len(method_used + sample_type_explanation)> 0:
640
+ acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
641
+ end = time.time()
642
+ total_cost_title += total_query_cost
643
+ acc_score["query_cost"] = f"{total_cost_title:.6f}"
644
+ elapsed = end - start
645
+ acc_score["time_cost"] = f"{elapsed:.3f} seconds"
646
+ accs_output[acc] = acc_score
647
+ print(accs_output[acc])
648
+
649
  return accs_output
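As a usage note, here is a minimal sketch of the Drive caching round-trip the pipeline performs with the helpers defined above (find_drive_file, download_file_from_drive, upload_file_to_drive). The file name, folder id, and the local write step are placeholders standing in for the real document-merging logic; running it requires the same Drive credentials set up earlier in this file.

import os

LOCAL_TEMP_DIR = "/mnt/data/generated_docs"   # same scratch dir the pipeline uses
filename = "example_cache.txt"                # placeholder file name
folder_id = "DRIVE_FOLDER_ID"                 # placeholder Drive folder id
local_path = os.path.join(LOCAL_TEMP_DIR, filename)
os.makedirs(LOCAL_TEMP_DIR, exist_ok=True)

# 1) Reuse the cached copy on Drive if it exists.
if find_drive_file(filename, folder_id):
    download_file_from_drive(filename, folder_id, local_path)
else:
    # 2) Otherwise build the artifact locally (stand-in for the real merge step)...
    with open(local_path, "w") as fh:
        fh.write("placeholder content")
    # 3) ...and push it back so the next run can skip the rebuild.
    upload_file_to_drive(local_path, filename, folder_id)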
requirements.txt CHANGED
@@ -1,44 +1,44 @@
1
- biopython==1.85
2
- bs4==0.0.2
3
- gensim==4.3.3
4
- gradio
5
- gspread==6.2.0
6
- gspread-dataframe==4.0.0
7
- huggingface-hub==0.30.2
8
- nltk==3.9.1
9
- oauth2client==4.1.3
10
- openai==1.76.2
11
- openpyxl==3.1.5
12
- pandas==2.2.2
13
- pdfreader==0.1.15
14
- PyMuPDF==1.25.5
15
- pytest==8.3.5
16
- requests==2.32.3
17
- scikit-learn==1.6.1
18
- scipy==1.13.1
19
- spacy==3.8.5
20
- spacy-lookups-data==1.0.5
21
- spire-doc==13.4.6
22
- Spire.Xls==14.12.0
23
- statsmodels==0.14.4
24
- tabula-py==2.10.0
25
- thefuzz==0.22.1
26
- torch
27
- transformers==4.51.3
28
- wordsegment==1.3.1
29
- xlrd==2.0.1
30
- sentence-transformers
31
- lxml
32
- streamlit
33
- requests
34
- google-generativeai
35
- PyPDF2
36
- beautifulsoup4
37
- # For Claude
38
- anthropic
39
- faiss-cpu
40
- python-docx
41
- pycountry
42
- # For Deepseek (If direct DeepseekLLM client library is available, use it.
43
- # Otherwise, 'requests' covers it for simple API calls, but a dedicated client is better for full features)
44
  # deepseek-llm # Uncomment this if Deepseek provides a dedicated pip package for their LLM
 
1
+ biopython==1.85
2
+ bs4==0.0.2
3
+ gensim==4.3.3
4
+ gradio
5
+ gspread==6.2.0
6
+ gspread-dataframe==4.0.0
7
+ huggingface-hub==0.30.2
8
+ nltk==3.9.1
9
+ oauth2client==4.1.3
10
+ openai==1.76.2
11
+ openpyxl==3.1.5
12
+ pandas==2.2.2
13
+ pdfreader==0.1.15
14
+ PyMuPDF==1.25.5
15
+ pytest==8.3.5
16
+ requests==2.32.3
17
+ scikit-learn==1.6.1
18
+ scipy==1.13.1
19
+ spacy==3.8.5
20
+ spacy-lookups-data==1.0.5
21
+ spire-doc==13.4.6
22
+ Spire.Xls==14.12.0
23
+ statsmodels==0.14.4
24
+ tabula-py==2.10.0
25
+ thefuzz==0.22.1
26
+ torch
27
+ transformers==4.51.3
28
+ wordsegment==1.3.1
29
+ xlrd==2.0.1
30
+ sentence-transformers
31
+ lxml
32
+ streamlit
33
+ requests
34
+ google-generativeai
35
+ PyPDF2
36
+ beautifulsoup4
37
+ # For Claude
38
+ anthropic
39
+ faiss-cpu
40
+ python-docx
41
+ pycountry
42
+ # For Deepseek (If direct DeepseekLLM client library is available, use it.
43
+ # Otherwise, 'requests' covers it for simple API calls, but a dedicated client is better for full features)
44
  # deepseek-llm # Uncomment this if Deepseek provides a dedicated pip package for their LLM
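For orientation before the smart_fallback.py changes below, here is a minimal sketch of the Bio.Entrez GenBank fetch that fetch_ncbi is built around. The e-mail is a placeholder (NCBI requires a real contact address), the accession in the usage comment is only an example, and error handling is omitted.

from Bio import Entrez

Entrez.email = "[email protected]"  # placeholder; replace with a real contact address

def fetch_genbank_record(accession):
    # Same call pattern as fetch_ncbi: fetch the GenBank record as parsed XML.
    handle = Entrez.efetch(db="nucleotide", id=str(accession), rettype="gb", retmode="xml")
    record = Entrez.read(handle)
    handle.close()
    return record[0] if record else None

# Usage (accession is an example value):
# gb_seq = fetch_genbank_record("OQ731952")
# print(gb_seq.get("GBSeq_definition", "unknown"))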
smart_fallback.py CHANGED
@@ -1,156 +1,205 @@
1
- from Bio import Entrez, Medline
2
- import model
3
- import mtdna_classifier
4
- # Setup
5
- def fetch_ncbi(accession_number):
6
- Entrez.email = "[email protected]" # Required by NCBI, REPLACE WITH YOUR EMAIL
7
- handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
8
- record = Entrez.read(handle)
9
- handle.close()
10
- outputs = {"authors":"unknown",
11
- "institution":"unknown",
12
- "isolate":"unknown",
13
- "definition":"unknown",
14
- "title":"unknown",
15
- "seq_comment":"unknown",
16
- "collection_date":"unknown" } #'GBSeq_update-date': '25-OCT-2023', 'GBSeq_create-date'
17
- gb_seq = None
18
- try:
19
- # Validate record structure: It should be a list with at least one element (a dict)
20
- if isinstance(record, list) and len(record) > 0:
21
- if isinstance(record[0], dict):
22
- gb_seq = record[0]
23
- else:
24
- print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
25
- # extract collection date
26
- if "GBSeq_create-date" in gb_seq and outputs["collection_date"]=="unknown":
27
- outputs["collection_date"] = gb_seq["GBSeq_create-date"]
28
- else:
29
- if "GBSeq_update-date" in gb_seq and outputs["collection_date"]=="unknown":
30
- outputs["collection_date"] = gb_seq["GBSeq_update-date"]
31
- # extract definition
32
- if "GBSeq_definition" in gb_seq and outputs["definition"]=="unknown":
33
- outputs["definition"] = gb_seq["GBSeq_definition"]
34
- # extract related-reference things
35
- if "GBSeq_references" in gb_seq:
36
- for ref in gb_seq["GBSeq_references"]:
37
- # extract authors
38
- if "GBReference_authors" in ref and outputs["authors"]=="unknown":
39
- outputs["authors"] = "and ".join(ref["GBReference_authors"])
40
- # extract title
41
- if "GBReference_title" in ref and outputs["title"]=="unknown":
42
- outputs["title"] = ref["GBReference_title"]
43
- # extract submitted journal
44
- if 'GBReference_journal' in ref and outputs["institution"]=="unknown":
45
- outputs["institution"] = ref['GBReference_journal']
46
- # extract seq_comment
47
- if 'GBSeq_comment'in gb_seq and outputs["seq_comment"]=="unknown":
48
- outputs["seq_comment"] = gb_seq["GBSeq_comment"]
49
- # extract isolate
50
- if "GBSeq_feature-table" in gb_seq:
51
- if 'GBFeature_quals' in gb_seq["GBSeq_feature-table"][0]:
52
- for ref in gb_seq["GBSeq_feature-table"][0]["GBFeature_quals"]:
53
- if ref['GBQualifier_name'] == "isolate" and outputs["isolate"]=="unknown":
54
- outputs["isolate"] = ref["GBQualifier_value"]
55
- else:
56
- print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
57
-
58
- # If gb_seq is still None, return defaults
59
- if gb_seq is None:
60
- return {"authors":"unknown",
61
- "institution":"unknown",
62
- "isolate":"unknown",
63
- "definition":"unknown",
64
- "title":"unknown",
65
- "seq_comment":"unknown",
66
- "collection_date":"unknown" }
67
- return outputs
68
- except:
69
- print("error in fetching ncbi data")
70
- return {"authors":"unknown",
71
- "institution":"unknown",
72
- "isolate":"unknown",
73
- "definition":"unknown",
74
- "title":"unknown",
75
- "seq_comment":"unknown",
76
- "collection_date":"unknown" }
77
- # Method 1: Smarter Google
78
- def smart_google_queries(metadata: dict):
79
- queries = []
80
-
81
- # Extract useful fields
82
- isolate = metadata.get("isolate")
83
- author = metadata.get("authors")
84
- institution = metadata.get("institution")
85
- title = metadata.get("title")
86
- print(title)
87
- combined = []
88
- # Construct queries
89
- if isolate:
90
- queries.append(f'"{isolate}" mitochondrial DNA')
91
- queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')
92
-
93
- if author:
94
- try:
95
- author_name = author.split(',')[0] # Use last name only
96
- except:
97
- author_name = author
98
- queries.append(f'"{author_name}" mitochondrial DNA')
99
- queries.append(f'"{author_name}" mtDNA site:researchgate.net')
100
-
101
- if institution:
102
- try:
103
- short_inst = institution.split(',')[0] # Take first part of institution
104
- except:
105
- short_inst = institution
106
- queries.append(f'"{short_inst}" mtDNA sequence')
107
- queries.append(f'"{short_inst}" isolate site:nature.com')
108
- queries.append(title)
109
- return queries
110
-
111
- def filter_links_by_metadata(search_results):
112
- TRUSTED_DOMAINS = [
113
- "ncbi.nlm.nih.gov",
114
- "pubmed.ncbi.nlm.nih.gov",
115
- "pmc.ncbi.nlm.nih.gov",
116
- "biorxiv.org",
117
- "researchgate.net",
118
- "nature.com",
119
- "sciencedirect.com"
120
- ]
121
- def is_trusted_link(link):
122
- for domain in TRUSTED_DOMAINS:
123
- if domain in link:
124
- return True
125
- return False
126
- def is_relevant_title_snippet(link):
127
- keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
128
- title_snippet = link.lower()
129
- for keyword in keywords:
130
- if keyword in title_snippet:
131
- return True
132
- return False
133
-
134
- filtered = []
135
- if len(search_results) > 0:
136
- for link in search_results:
137
- if is_trusted_link(link) and link not in filtered:
138
- filtered.append(link)
139
- if is_relevant_title_snippet(link) and link not in filtered:
140
- filtered.append(link)
141
- return filtered
142
-
143
- def smart_google_search(metadata):
144
- queries = smart_google_queries(metadata)
145
- links = []
146
- for q in queries:
147
- #print("\n🔍 Query:", q)
148
- results = mtdna_classifier.search_google_custom(q,2)
149
- for link in results:
150
- #print(f"- {link}")
151
- if link not in links:
152
- links.append(link)
153
- filter_links = filter_links_by_metadata(links)
154
- return filter_links
155
- # Method 2: Prompt LLM better or better ai search api with all
156
  # the total information from even ncbi and all search
 
1
+ from Bio import Entrez, Medline
2
+ #import model
3
+ import mtdna_classifier
4
+ from NER.html import extractHTML
5
+ import data_preprocess
6
+ # Setup
7
+ def fetch_ncbi(accession_number):
8
+ try:
9
+ Entrez.email = "[email protected]" # Required by NCBI, REPLACE WITH YOUR EMAIL
10
+ handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
11
+ record = Entrez.read(handle)
12
+ handle.close()
13
+ outputs = {"authors":"unknown",
14
+ "institution":"unknown",
15
+ "isolate":"unknown",
16
+ "definition":"unknown",
17
+ "title":"unknown",
18
+ "seq_comment":"unknown",
19
+ "collection_date":"unknown" } #'GBSeq_update-date': '25-OCT-2023', 'GBSeq_create-date'
20
+ gb_seq = None
21
+ # Validate record structure: It should be a list with at least one element (a dict)
22
+ if isinstance(record, list) and len(record) > 0:
23
+ if isinstance(record[0], dict):
24
+ gb_seq = record[0]
25
+ else:
26
+ print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
27
+ # extract collection date
28
+ if "GBSeq_create-date" in gb_seq and outputs["collection_date"]=="unknown":
29
+ outputs["collection_date"] = gb_seq["GBSeq_create-date"]
30
+ else:
31
+ if "GBSeq_update-date" in gb_seq and outputs["collection_date"]=="unknown":
32
+ outputs["collection_date"] = gb_seq["GBSeq_update-date"]
33
+ # extract definition
34
+ if "GBSeq_definition" in gb_seq and outputs["definition"]=="unknown":
35
+ outputs["definition"] = gb_seq["GBSeq_definition"]
36
+ # extract related-reference things
37
+ if "GBSeq_references" in gb_seq:
38
+ for ref in gb_seq["GBSeq_references"]:
39
+ # extract authors
40
+ if "GBReference_authors" in ref and outputs["authors"]=="unknown":
41
+ outputs["authors"] = "and ".join(ref["GBReference_authors"])
42
+ # extract title
43
+ if "GBReference_title" in ref and outputs["title"]=="unknown":
44
+ outputs["title"] = ref["GBReference_title"]
45
+ # extract submitted journal
46
+ if 'GBReference_journal' in ref and outputs["institution"]=="unknown":
47
+ outputs["institution"] = ref['GBReference_journal']
48
+ # extract seq_comment
49
+ if 'GBSeq_comment'in gb_seq and outputs["seq_comment"]=="unknown":
50
+ outputs["seq_comment"] = gb_seq["GBSeq_comment"]
51
+ # extract isolate
52
+ if "GBSeq_feature-table" in gb_seq:
53
+ if 'GBFeature_quals' in gb_seq["GBSeq_feature-table"][0]:
54
+ for ref in gb_seq["GBSeq_feature-table"][0]["GBFeature_quals"]:
55
+ if ref['GBQualifier_name'] == "isolate" and outputs["isolate"]=="unknown":
56
+ outputs["isolate"] = ref["GBQualifier_value"]
57
+ else:
58
+ print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
59
+
60
+ # If gb_seq is still None, return defaults
61
+ if gb_seq is None:
62
+ return {"authors":"unknown",
63
+ "institution":"unknown",
64
+ "isolate":"unknown",
65
+ "definition":"unknown",
66
+ "title":"unknown",
67
+ "seq_comment":"unknown",
68
+ "collection_date":"unknown" }
69
+ return outputs
70
+ except:
71
+ print("error in fetching ncbi data")
72
+ return {"authors":"unknown",
73
+ "institution":"unknown",
74
+ "isolate":"unknown",
75
+ "definition":"unknown",
76
+ "title":"unknown",
77
+ "seq_comment":"unknown",
78
+ "collection_date":"unknown" }
79
+ # Fallback if NCBI crashes or cannot find the accession on NCBI
80
+ def google_accession_search(accession_id):
81
+ """
82
+ Search for metadata by accession ID using Google Custom Search.
83
+ Falls back to known biological databases and archives.
84
+ """
85
+ queries = [
86
+ f"{accession_id}",
87
+ f"{accession_id} site:ncbi.nlm.nih.gov",
88
+ f"{accession_id} site:pubmed.ncbi.nlm.nih.gov",
89
+ f"{accession_id} site:europepmc.org",
90
+ f"{accession_id} site:researchgate.net",
91
+ f"{accession_id} mtDNA",
92
+ f"{accession_id} mitochondrial DNA"
93
+ ]
94
+
95
+ links = []
96
+ for query in queries:
97
+ search_results = mtdna_classifier.search_google_custom(query, 2)
98
+ for link in search_results:
99
+ if link not in links:
100
+ links.append(link)
101
+ return links
102
+
103
+ # Method 1: Smarter Google
104
+ def smart_google_queries(metadata: dict):
105
+ queries = []
106
+
107
+ # Extract useful fields
108
+ isolate = metadata.get("isolate")
109
+ author = metadata.get("authors")
110
+ institution = metadata.get("institution")
111
+ title = metadata.get("title")
112
+ combined = []
113
+ # Construct queries
114
+ if isolate and isolate!="unknown":
115
+ queries.append(f'"{isolate}" mitochondrial DNA')
116
+ queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')
117
+
118
+ if author and author!="unknown":
119
+ try:
120
+ author_name = author.split(',')[0] # Use last name only
121
+ except:
122
+ author_name = author
123
+ queries.append(f'"{author_name}" mitochondrial DNA')
124
+ queries.append(f'"{author_name}" mtDNA site:researchgate.net')
125
+
126
+ if institution and institution!="unknown":
127
+ try:
128
+ short_inst = institution.split(',')[0] # Take first part of institution
129
+ except:
130
+ short_inst = institution
131
+ queries.append(f'"{short_inst}" mtDNA sequence')
132
+ queries.append(f'"{short_inst}" isolate site:nature.com')
133
+ if title and title!='unknown':
134
+ if title!="Direct Submission":
135
+ queries.append(title)
136
+ return queries
137
+
138
+ def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
139
+ TRUSTED_DOMAINS = [
140
+ "ncbi.nlm.nih.gov",
141
+ "pubmed.ncbi.nlm.nih.gov",
142
+ "pmc.ncbi.nlm.nih.gov",
143
+ "biorxiv.org",
144
+ "researchgate.net",
145
+ "nature.com",
146
+ "sciencedirect.com"
147
+ ]
148
+ def is_trusted_link(link):
149
+ for domain in TRUSTED_DOMAINS:
150
+ if domain in link:
151
+ return True
152
+ return False
153
+ def is_relevant_title_snippet(link, saveLinkFolder, accession=None):
154
+ output = []
155
+ keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
156
+ if accession:
157
+ keywords = [accession] + keywords
158
+ title_snippet = link.lower()
159
+ print("save link folder inside this filter function: ", saveLinkFolder)
160
+ article_text = data_preprocess.extract_text(link,saveLinkFolder)
161
+ print("article text")
162
+ print(article_text)
163
+ if link.split(".")[-1].lower():
164
+ if link.split(".")[-1].lower() != "pdf" and link.split(".")[-1].lower() not in "docx" and link.split(".")[-1].lower() not in "xlxs":
165
+ html = extractHTML.HTML("",link)
166
+ jsonSM = html.getSupMaterial()
167
+ if jsonSM: output += sum((jsonSM[key] for key in jsonSM),[])
168
+ for keyword in keywords:
169
+ if keyword.lower() in article_text.lower():
170
+ if link not in output:
171
+ output.append(link)
172
+ print("link and keyword: ", link, keyword)
173
+ return output
174
+ if keyword.lower() in title_snippet.lower():
175
+ if link not in output:
176
+ output.append(link)
177
+ print("link and keyword: ", link, keyword)
178
+ return output
179
+ return output
180
+
181
+ filtered = []
182
+ if len(search_results) > 0:
183
+ for link in search_results:
184
+ if is_trusted_link(link):
185
+ if link not in filtered:
186
+ filtered.append(link)
187
+ else:
188
+ output_link = is_relevant_title_snippet(link,saveLinkFolder, accession)
189
+ filtered += output_link
190
+ return filtered
191
+
192
+ def smart_google_search(metadata):
193
+ queries = smart_google_queries(metadata)
194
+ links = []
195
+ for q in queries:
196
+ #print("\n🔍 Query:", q)
197
+ results = mtdna_classifier.search_google_custom(q,2)
198
+ for link in results:
199
+ #print(f"- {link}")
200
+ if link not in links:
201
+ links.append(link)
202
+ #filter_links = filter_links_by_metadata(links)
203
+ return links
204
+ # Method 2: Prompt LLM better or better ai search api with all
205
  # the total information from even ncbi and all search
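For reference, the new fallback flow is: fetch accession metadata from NCBI, build Google queries from that metadata, and fall back to a raw accession search when NCBI returns nothing useful. A usage sketch, assuming the accession id is purely illustrative and mtdna_classifier.search_google_custom is configured with working Google Custom Search credentials:

import smart_fallback

accession = "KU131308"  # hypothetical accession id, for illustration only
meta = smart_fallback.fetch_ncbi(accession)        # NCBI metadata; fields default to "unknown"
links = smart_fallback.smart_google_search(meta)   # metadata-driven Google queries
if not links:
    # NCBI metadata was all "unknown" or the queries found nothing: search the raw accession id
    links = smart_fallback.google_accession_search(accession)
print(meta["isolate"], meta["collection_date"], len(links))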
standardize_location.py CHANGED
@@ -1,83 +1,83 @@
1
- import requests
2
- import re
3
- import os
4
- # Normalize input
5
- def normalize_key(text):
6
- return re.sub(r"[^a-z0-9]", "", text.strip().lower())
7
-
8
- # Search for city/place (normal flow)
9
- def get_country_from_geonames(city_name):
10
- url = os.environ["URL_SEARCHJSON"]
11
- username = os.environ["USERNAME_GEO"]
12
- print("geoname: ", cityname)
13
- params = {
14
- "q": city_name,
15
- "maxRows": 1,
16
- "username": username
17
- }
18
- try:
19
- r = requests.get(url, params=params, timeout=5)
20
- data = r.json()
21
- if data.get("geonames"):
22
- return data["geonames"][0]["countryName"]
23
- except Exception as e:
24
- print("GeoNames searchJSON error:", e)
25
- return None
26
-
27
- # Search for country info using alpha-2/3 codes or name
28
- def get_country_from_countryinfo(input_code):
29
- url = os.environ["URL_COUNTRYJSON"]
30
- username = os.environ["USERNAME_GEO"]
31
- print("countryINFO: ", input_code)
32
- params = {
33
- "username": username
34
- }
35
- try:
36
- r = requests.get(url, params=params, timeout=5)
37
- data = r.json()
38
- if data.get("geonames"):
39
- input_code = input_code.strip().upper()
40
- for country in data["geonames"]:
41
- # Match against country name, country code (alpha-2), iso alpha-3
42
- if input_code in [
43
- country.get("countryName", "").upper(),
44
- country.get("countryCode", "").upper(),
45
- country.get("isoAlpha3", "").upper()
46
- ]:
47
- return country["countryName"]
48
- except Exception as e:
49
- print("GeoNames countryInfoJSON error:", e)
50
- return None
51
-
52
- # Combined smart lookup
53
- def smart_country_lookup(user_input):
54
- raw_input = user_input.strip()
55
- normalized = re.sub(r"[^a-zA-Z0-9]", "", user_input).upper() # normalize for codes (no strip spaces!)
56
- print(raw_input, normalized)
57
- # Special case: if user writes "UK: London" → split and take main country part
58
- if ":" in raw_input:
59
- raw_input = raw_input.split(":")[0].strip() # only take "UK"
60
- # First try as country code (if 2-3 letters or common abbreviation)
61
- if len(normalized) <= 3:
62
- if normalized.upper() in ["UK","U.K","U.K."]:
63
- country = get_country_from_geonames(normalized.upper())
64
- print("get_country_from_geonames(normalized.upper()) ", country)
65
- if country:
66
- return country
67
- else:
68
- country = get_country_from_countryinfo(raw_input)
69
- print("get_country_from_countryinfo(raw_input) ", country)
70
- if country:
71
- return country
72
- print(raw_input)
73
- country = get_country_from_countryinfo(raw_input) # try full names
74
- print("get_country_from_countryinfo(raw_input) ", country)
75
- if country:
76
- return country
77
- # Otherwise, treat as city/place
78
- country = get_country_from_geonames(raw_input)
79
- print("get_country_from_geonames(raw_input) ", country)
80
- if country:
81
- return country
82
-
83
  return "Not found"
 
1
+ import requests
2
+ import re
3
+ import os
4
+ # Normalize input
5
+ def normalize_key(text):
6
+ return re.sub(r"[^a-z0-9]", "", text.strip().lower())
7
+
8
+ # Search for city/place (normal flow)
9
+ def get_country_from_geonames(city_name):
10
+ url = os.environ["URL_SEARCHJSON"]
11
+ username = os.environ["USERNAME_GEO"]
12
+ print("geoname: ", cityname)
13
+ params = {
14
+ "q": city_name,
15
+ "maxRows": 1,
16
+ "username": username
17
+ }
18
+ try:
19
+ r = requests.get(url, params=params, timeout=5)
20
+ data = r.json()
21
+ if data.get("geonames"):
22
+ return data["geonames"][0]["countryName"]
23
+ except Exception as e:
24
+ print("GeoNames searchJSON error:", e)
25
+ return None
26
+
27
+ # Search for country info using alpha-2/3 codes or name
28
+ def get_country_from_countryinfo(input_code):
29
+ url = os.environ["URL_COUNTRYJSON"]
30
+ username = os.environ["USERNAME_GEO"]
31
+ print("countryINFO: ", input_code)
32
+ params = {
33
+ "username": username
34
+ }
35
+ try:
36
+ r = requests.get(url, params=params, timeout=5)
37
+ data = r.json()
38
+ if data.get("geonames"):
39
+ input_code = input_code.strip().upper()
40
+ for country in data["geonames"]:
41
+ # Match against country name, country code (alpha-2), iso alpha-3
42
+ if input_code in [
43
+ country.get("countryName", "").upper(),
44
+ country.get("countryCode", "").upper(),
45
+ country.get("isoAlpha3", "").upper()
46
+ ]:
47
+ return country["countryName"]
48
+ except Exception as e:
49
+ print("GeoNames countryInfoJSON error:", e)
50
+ return None
51
+
52
+ # Combined smart lookup
53
+ def smart_country_lookup(user_input):
54
+ raw_input = user_input.strip()
55
+ normalized = re.sub(r"[^a-zA-Z0-9]", "", user_input).upper() # normalize for codes (no strip spaces!)
56
+ print(raw_input, normalized)
57
+ # Special case: if user writes "UK: London" → split and take main country part
58
+ if ":" in raw_input:
59
+ raw_input = raw_input.split(":")[0].strip() # only take "UK"
60
+ # First try as country code (if 2-3 letters or common abbreviation)
61
+ if len(normalized) <= 3:
62
+ if normalized.upper() in ["UK","U.K","U.K."]:
63
+ country = get_country_from_geonames(normalized.upper())
64
+ print("get_country_from_geonames(normalized.upper()) ", country)
65
+ if country:
66
+ return country
67
+ else:
68
+ country = get_country_from_countryinfo(raw_input)
69
+ print("get_country_from_countryinfo(raw_input) ", country)
70
+ if country:
71
+ return country
72
+ print(raw_input)
73
+ country = get_country_from_countryinfo(raw_input) # try full names
74
+ print("get_country_from_countryinfo(raw_input) ", country)
75
+ if country:
76
+ return country
77
+ # Otherwise, treat as city/place
78
+ country = get_country_from_geonames(raw_input)
79
+ print("get_country_from_geonames(raw_input) ", country)
80
+ if country:
81
+ return country
82
+
83
  return "Not found"
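A usage sketch for the lookup above. The module reads its GeoNames endpoints and username from environment variables, so they must be set before calling it; the URLs below are the public GeoNames searchJSON / countryInfoJSON services and the username is a placeholder, both assumptions rather than values from this repo:

import os
import standardize_location

# Assumed public GeoNames endpoints; the repo keeps the real values in secrets.
os.environ.setdefault("URL_SEARCHJSON", "http://api.geonames.org/searchJSON")
os.environ.setdefault("URL_COUNTRYJSON", "http://api.geonames.org/countryInfoJSON")
os.environ.setdefault("USERNAME_GEO", "your_geonames_username")  # placeholder account name

print(standardize_location.smart_country_lookup("UK: London"))  # expected: "United Kingdom"
print(standardize_location.smart_country_lookup("Hanoi"))       # city resolved via searchJSON
print(standardize_location.smart_country_lookup("VNM"))         # ISO alpha-3 code via countryInfoJSON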