Spaces:
Build error
Build error
from distutils.command.config import config | |
import requests | |
from time import sleep | |
import trafilatura | |
from trafilatura.meta import reset_caches | |
from trafilatura.settings import DEFAULT_CONFIG | |
import spacy | |
import os | |
import importlib
import subprocess
import sys

# Load the small English spaCy pipeline used for sentence splitting.
# Try to load it first so that the (slow, network-bound) download only
# happens when the model is actually missing, instead of on every import.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # spaCy raises OSError when the model package cannot be found.
    # Use sys.executable so the model is installed into the interpreter that
    # is running this module rather than whatever "python" is first on PATH.
    # The exit status is deliberately not checked: if the download failed,
    # the second spacy.load() below raises the same OSError to the caller.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=False,
    )
    # Make sure the freshly installed package is visible to the import system.
    importlib.invalidate_caches()
    nlp = spacy.load('en_core_web_sm')

# NOTE(review): if DEFAULT_CONFIG is a configparser.ConfigParser, this
# attribute assignment does not change the MAX_FILE_SIZE option that is read
# through the parser API -- confirm whether
# DEFAULT_CONFIG.set("DEFAULT", "MAX_FILE_SIZE", "50000") was intended.
# Left unchanged here because making the limit effective would alter which
# pages are accepted.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
def get_page(url):
    """Download the page at *url*, retrying a few times on failure.

    Args:
        url: Address of the page to fetch.

    Returns:
        Whatever ``trafilatura.fetch_url`` returned for the first successful
        attempt, or ``None`` if every attempt failed.
    """
    max_attempts = 3
    page = None
    for attempt in range(max_attempts):
        try:
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
        except Exception:
            # Best-effort fetch: treat any download error as a failed attempt,
            # but let KeyboardInterrupt / SystemExit propagate.
            page = None

        # Explicit check instead of ``assert`` so retries still happen when
        # Python runs with -O (which strips assert statements).
        if page is not None:
            print("Fetched "+url, file=sys.stderr)
            break

        # Back off before the next attempt; no point waiting after the last one.
        if attempt < max_attempts - 1:
            sleep(3)
    return page
def url2lines(url):
    """Fetch *url* and return the extracted page text as a list of lines.

    Args:
        url: Address of the page to fetch.

    Returns:
        The lines produced by ``html2lines`` for the downloaded page, or an
        empty list when the page could not be fetched.
    """
    fetched = get_page(url)
    return [] if fetched is None else html2lines(fetched)
def line_correction(lines, max_size=100):
    """Drop very short lines and re-chunk overly long ones by sentence.

    Args:
        lines: Iterable of text lines.
        max_size: Lines longer than this many characters are split into
            sentences and re-packed into chunks; shorter lines are kept as-is.

    Returns:
        A new list of lines. Lines shorter than 4 characters are discarded.
        Each chunk built from a long line is emitted as soon as its length
        exceeds ``max_size``; any remainder is emitted at the end of the line.
    """
    corrected = []
    for text in lines:
        length = len(text)
        if length < 4:
            continue
        if length <= max_size:
            corrected.append(text)
            continue

        # We split lines into sentences, but for performance we take only
        # the first 5k characters per line.
        parsed = nlp(text[:5000])
        buffer = ""
        for sentence in parsed.sents:
            piece = str(sentence).strip()
            buffer = f"{buffer} {piece}" if buffer else piece
            if len(buffer) > max_size:
                corrected.append(buffer)
                buffer = ""
        # Flush whatever is left over after the last sentence.
        if buffer:
            corrected.append(buffer)
    return corrected
def html2lines(page):
    """Extract the main text of a downloaded page and split it into lines.

    Args:
        page: Page content as returned by the fetch step, or ``None``.

    Returns:
        The extracted text split on newlines, or an empty list when *page*
        is ``None``, blank, or yields no extractable text.
    """
    # Test for None before touching the value: calling .strip() on None
    # would raise AttributeError instead of returning an empty result.
    if page is None or not page.strip():
        return []

    text = trafilatura.extract(page, config=DEFAULT_CONFIG)
    # Called after every extraction; presumably releases trafilatura's
    # internal caches so memory does not grow across many pages.
    reset_caches()

    if text is None:
        return []

    # We just spit out the entire page, so need to reformat later.
    return text.split("\n")