# HuggingFaceWeeklyPaper / fetch_paper.py
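"""Fetch daily papers from the Hugging Face daily_papers API.

Provides an in-memory response cache with a one-hour TTL, optional proxy
support via the HF_HTTP_PROXY / HF_HTTPS_PROXY environment variables,
simple retries, and concurrent fetching across a date range.
"""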
import datetime
import hashlib
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List

import requests
from rich import print

from ai.classify_paper import classify_papers  # used by the commented-out classification step below
from date import Date
from parser import Article, parse_article
API_URL = "https://huggingface.co/api/daily_papers"

# In-memory response cache keyed by the MD5 hash of the request URL;
# cache_expiry records when each entry was fetched, enforcing a one-hour TTL.
cache: Dict[str, list] = {}
cache_expiry: Dict[str, datetime.datetime] = {}
def make_request(url: str):
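    """GET `url` and return the parsed JSON body.

    Responses are cached in memory for one hour, and failed requests are
    retried up to three times; an empty list is returned if all attempts fail.
    """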
# Create a hash of the URL to use as the cache key
url_hash = hashlib.md5(url.encode()).hexdigest()
current_time = datetime.datetime.now()
# Check if the response is already cached and not expired
    if url_hash in cache and (current_time - cache_expiry[url_hash]).total_seconds() < 3600:
print(f"Cache hit for URL: {url}")
return cache[url_hash]
http_proxy = os.getenv("HF_HTTP_PROXY")
https_proxy = os.getenv("HF_HTTPS_PROXY")
proxies = {
"http": http_proxy,
"https": https_proxy
} if http_proxy or https_proxy else None
attempts = 0
while attempts < 3:
try:
            # A timeout keeps a hung connection from stalling a retry attempt forever.
            response = requests.get(url, proxies=proxies, timeout=30)
response.raise_for_status()
data = response.json()
# Cache the response and set the expiry time
cache[url_hash] = data
cache_expiry[url_hash] = current_time
return data
except requests.RequestException as e:
attempts += 1
print(f"Attempt {attempts} failed: {e}")
if attempts == 3:
return []
def fetch_papers():
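    """Fetch today's daily papers and parse them into Article objects."""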
data = make_request(API_URL)
return [parse_article(item) for item in data]
def fetch_papers_with_date(date: Date):
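    """Fetch the daily papers for a specific date via the `?date=` query parameter."""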
    formatted_date = str(date)
    data = make_request(f"{API_URL}?date={formatted_date}")
return [parse_article(item) for item in data]
def fetch_papers_with_daterange(start_date: Date, end_date: Date):
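    """Fetch papers for every date in [start_date, end_date] concurrently and
    deduplicate them by paper id.
    """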
articles: List[Article] = []
current_date = start_date
dates = []
while current_date <= end_date:
dates.append(current_date)
current_date += 1
def fetch_for_date(date):
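        # Today's papers come from the bare endpoint; past dates use ?date=.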
if date == Date():
print("Fetching papers for today")
return fetch_papers()
else:
print(f"Fetching papers for {date}")
return fetch_papers_with_date(date)
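    # Fan out one request per date across a small thread pool.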
with ThreadPoolExecutor(max_workers=8) as executor:
future_to_date = {executor.submit(fetch_for_date, date): date for date in dates}
for future in as_completed(future_to_date):
date = future_to_date[future]
try:
articles.extend(future.result())
except Exception as e:
print(f"Error fetching articles for date {date}: {e}")
# articles = [article for article in articles if (start_date <= Date(article.publishedAt.isoformat().split('T')[0]) <= end_date)]
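    # Deduplicate by paper id, keeping the first occurrence of each paper.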
unique_articles: Dict[str, Article] = {}
for article in articles:
if article.paper.id not in unique_articles:
unique_articles[article.paper.id] = article
print(f"Unique articles: {len(unique_articles)}")
    # Flatten each article into the dict shape expected by classify_papers.
    preprocessed_articles: List[Dict[str, str]] = [
        {
            "title": article.title,
            "abstract": article.paper.summary,
            "id": article.paper.id,
        }
        for article in unique_articles.values()
    ]
# classified_articles = classify_papers(preprocessed_articles)
    # Iterate over classified_articles and write each category back into unique_articles:
# for article in classified_articles:
# unique_articles[article["id"]].paper.label = article["category"]
return list(unique_articles.values())
if __name__ == "__main__":
start_date = Date(2025, 1, 21)
end_date = Date(2025, 2, 1)
articles = fetch_papers_with_daterange(start_date=start_date, end_date=end_date)
# print(articles)
print(f"Total articles: {len(articles)}")