VyLala commited on
Commit
de2ea69
·
verified ·
1 Parent(s): a2b74df

Update smart_fallback.py

Browse files
Files changed (1) hide show
  1. smart_fallback.py +16 -1
smart_fallback.py CHANGED
@@ -147,7 +147,7 @@ def smart_google_queries(metadata: dict):
147
 
148
  return queries
149
 
150
- def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
151
  TRUSTED_DOMAINS = [
152
  "ncbi.nlm.nih.gov",
153
  "pubmed.ncbi.nlm.nih.gov",
@@ -157,6 +157,9 @@ def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
157
  "nature.com",
158
  "sciencedirect.com"
159
  ]
 
 
 
160
  def is_trusted_link(link):
161
  for domain in TRUSTED_DOMAINS:
162
  if domain in link:
@@ -170,6 +173,9 @@ def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
170
  title_snippet = link.lower()
171
  print("save link folder inside this filter function: ", saveLinkFolder)
172
  success_process, output_process = pipeline.run_with_timeout(data_preprocess.extract_text,args=(link,saveLinkFolder),timeout=60)
 
 
 
173
  if success_process:
174
  article_text = output_process
175
  print("yes succeed for getting article text")
@@ -179,10 +185,16 @@ def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
179
  #article_text = data_preprocess.extract_text(link,saveLinkFolder)
180
  print("article text")
181
  #print(article_text)
 
 
 
182
  try:
183
  ext = link.split(".")[-1].lower()
184
  if ext not in ["pdf", "docx", "xlsx"]:
185
  html = extractHTML.HTML("", link)
 
 
 
186
  jsonSM = html.getSupMaterial()
187
  if jsonSM:
188
  output += sum((jsonSM[key] for key in jsonSM), [])
@@ -210,6 +222,9 @@ def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
210
  # filtered.append(link)
211
  # else:
212
  print(link)
 
 
 
213
  if link:
214
  output_link = is_relevant_title_snippet(link,saveLinkFolder, accession)
215
  print("output link: ")
 
147
 
148
  return queries
149
 
150
+ def filter_links_by_metadata(search_results, saveLinkFolder, accession=None, stop_flag=None):
151
  TRUSTED_DOMAINS = [
152
  "ncbi.nlm.nih.gov",
153
  "pubmed.ncbi.nlm.nih.gov",
 
157
  "nature.com",
158
  "sciencedirect.com"
159
  ]
160
+ if stop_flag is not None and stop_flag.value:
161
+ print(f"πŸ›‘ Stop detected {accession}, aborting early...")
162
+ return []
163
  def is_trusted_link(link):
164
  for domain in TRUSTED_DOMAINS:
165
  if domain in link:
 
173
  title_snippet = link.lower()
174
  print("save link folder inside this filter function: ", saveLinkFolder)
175
  success_process, output_process = pipeline.run_with_timeout(data_preprocess.extract_text,args=(link,saveLinkFolder),timeout=60)
176
+ if stop_flag is not None and stop_flag.value:
177
+ print(f"πŸ›‘ Stop detected {accession}, aborting early...")
178
+ return []
179
  if success_process:
180
  article_text = output_process
181
  print("yes succeed for getting article text")
 
185
  #article_text = data_preprocess.extract_text(link,saveLinkFolder)
186
  print("article text")
187
  #print(article_text)
188
+ if stop_flag is not None and stop_flag.value:
189
+ print(f"πŸ›‘ Stop detected {accession}, aborting early...")
190
+ return []
191
  try:
192
  ext = link.split(".")[-1].lower()
193
  if ext not in ["pdf", "docx", "xlsx"]:
194
  html = extractHTML.HTML("", link)
195
+ if stop_flag is not None and stop_flag.value:
196
+ print(f"πŸ›‘ Stop detected {accession}, aborting early...")
197
+ return []
198
  jsonSM = html.getSupMaterial()
199
  if jsonSM:
200
  output += sum((jsonSM[key] for key in jsonSM), [])
 
222
  # filtered.append(link)
223
  # else:
224
  print(link)
225
+ if stop_flag is not None and stop_flag.value:
226
+ print(f"πŸ›‘ Stop detected {accession}, aborting early...")
227
+ return []
228
  if link:
229
  output_link = is_relevant_title_snippet(link,saveLinkFolder, accession)
230
  print("output link: ")