from parser import parse_article, Article
from ai.classify_paper import classify_papers
import os
import requests
import datetime
import hashlib
from rich import print
from date import Date
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict

API_URL = "https://huggingface.co/api/daily_papers"

cache = {}
cache_expiry = {}


def make_request(url: str):
    # Create a hash of the URL to use as the cache key
    url_hash = hashlib.md5(url.encode()).hexdigest()
    current_time = datetime.datetime.now()

    # Check if the response is already cached and not expired (1-hour TTL)
    if url_hash in cache and (current_time - cache_expiry[url_hash]).total_seconds() < 3600:
        print(f"Cache hit for URL: {url}")
        return cache[url_hash]

    http_proxy = os.getenv("HF_HTTP_PROXY")
    https_proxy = os.getenv("HF_HTTPS_PROXY")
    proxies = {
        "http": http_proxy,
        "https": https_proxy
    } if http_proxy or https_proxy else None

    # Retry up to 3 times before giving up
    attempts = 0
    while attempts < 3:
        try:
            response = requests.get(url, proxies=proxies)
            response.raise_for_status()
            data = response.json()
            # Cache the response and set the expiry time
            cache[url_hash] = data
            cache_expiry[url_hash] = current_time
            return data
        except requests.RequestException as e:
            attempts += 1
            print(f"Attempt {attempts} failed: {e}")
            if attempts == 3:
                return []


def fetch_papers():
    data = make_request(API_URL)
    return [parse_article(item) for item in data]


def fetch_papers_with_date(date: Date):
    formatted_date = str(date)
    data = make_request(API_URL + "?date=" + formatted_date)
    return [parse_article(item) for item in data]


def fetch_papers_with_daterange(start_date: Date, end_date: Date):
    articles: List[Article] = []

    # Build the list of dates to fetch (inclusive of both endpoints)
    current_date = start_date
    dates = []
    while current_date <= end_date:
        dates.append(current_date)
        current_date += 1

    def fetch_for_date(date):
        if date == Date():
            print("Fetching papers for today")
            return fetch_papers()
        else:
            print(f"Fetching papers for {date}")
            return fetch_papers_with_date(date)

    # Fetch each date concurrently
    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_date = {executor.submit(fetch_for_date, date): date for date in dates}
        for future in as_completed(future_to_date):
            date = future_to_date[future]
            try:
                articles.extend(future.result())
            except Exception as e:
                print(f"Error fetching articles for date {date}: {e}")

    # articles = [article for article in articles if (start_date <= Date(article.publishedAt.isoformat().split('T')[0]) <= end_date)]

    # Deduplicate articles by paper id
    unique_articles: Dict[str, Article] = {}
    for article in articles:
        if article.paper.id not in unique_articles:
            unique_articles[article.paper.id] = article
    print(f"Unique articles: {len(unique_articles)}")

    # Reduce each article to the fields needed for classification
    preprocessed_articles = [
        {
            "title": article.title,
            "abstract": article.paper.summary,
            "id": article.paper.id,
        }
        for article in unique_articles.values()
    ]

    # classified_articles = classify_papers(preprocessed_articles)
    # Iterate over classified_articles and write each category back into unique_articles
    # for article in classified_articles:
    #     unique_articles[article["id"]].paper.label = article["category"]

    return list(unique_articles.values())


if __name__ == "__main__":
    start_date = Date(2025, 1, 21)
    end_date = Date(2025, 2, 1)
    articles = fetch_papers_with_daterange(start_date=start_date, end_date=end_date)
    # print(articles)
    print(f"Total articles: {len(articles)}")