# HuggingFaceWeeklyPaper / fetch_paper.py
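"""Fetch daily papers from the Hugging Face daily_papers API.

Provides an in-memory response cache with a one-hour TTL, optional proxy
support via the HF_HTTP_PROXY / HF_HTTPS_PROXY environment variables,
simple retries, and concurrent fetching across a date range.
"""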
import datetime
import hashlib
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List

import requests
from rich import print

from ai.classify_paper import classify_papers  # used by the commented-out classification step below
from date import Date
from parser import Article, parse_article
API_URL = "https://huggingface.co/api/daily_papers"

# In-memory response cache keyed by the MD5 hash of the request URL;
# cache_expiry records when each entry was fetched, enforcing a one-hour TTL.
cache: Dict[str, list] = {}
cache_expiry: Dict[str, datetime.datetime] = {}
def make_request(url: str):
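    """GET `url` and return the parsed JSON body.

    Responses are cached in memory for one hour, and failed requests are
    retried up to three times; an empty list is returned if all attempts fail.
    """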
# Create a hash of the URL to use as the cache key
url_hash = hashlib.md5(url.encode()).hexdigest()
current_time = datetime.datetime.now()
# Check if the response is already cached and not expired
    if url_hash in cache and (current_time - cache_expiry[url_hash]).total_seconds() < 3600:
print(f"Cache hit for URL: {url}")
return cache[url_hash]
http_proxy = os.getenv("HF_HTTP_PROXY")
https_proxy = os.getenv("HF_HTTPS_PROXY")
proxies = {
"http": http_proxy,
"https": https_proxy
} if http_proxy or https_proxy else None
attempts = 0
while attempts < 3:
try:
            # A timeout keeps a hung connection from stalling a retry attempt forever.
            response = requests.get(url, proxies=proxies, timeout=30)
response.raise_for_status()
data = response.json()
# Cache the response and set the expiry time
cache[url_hash] = data
cache_expiry[url_hash] = current_time
return data
except requests.RequestException as e:
attempts += 1
print(f"Attempt {attempts} failed: {e}")
if attempts == 3:
return []
def fetch_papers():
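    """Fetch today's daily papers and parse them into Article objects."""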
data = make_request(API_URL)
return [parse_article(item) for item in data]
def fetch_papers_with_date(date: Date):
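    """Fetch the daily papers for a specific date via the `?date=` query parameter."""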
    formatted_date = str(date)
    data = make_request(f"{API_URL}?date={formatted_date}")
return [parse_article(item) for item in data]
def fetch_papers_with_daterange(start_date: Date, end_date: Date):
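    """Fetch papers for every date in [start_date, end_date] concurrently and
    deduplicate them by paper id.
    """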
articles: List[Article] = []
current_date = start_date
dates = []
while current_date <= end_date:
dates.append(current_date)
current_date += 1
def fetch_for_date(date):
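        # Today's papers come from the bare endpoint; past dates use ?date=.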
if date == Date():
print("Fetching papers for today")
return fetch_papers()
else:
print(f"Fetching papers for {date}")
return fetch_papers_with_date(date)
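    # Fan out one request per date across a small thread pool.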
with ThreadPoolExecutor(max_workers=8) as executor:
future_to_date = {executor.submit(fetch_for_date, date): date for date in dates}
for future in as_completed(future_to_date):
date = future_to_date[future]
try:
articles.extend(future.result())
except Exception as e:
print(f"Error fetching articles for date {date}: {e}")
# articles = [article for article in articles if (start_date <= Date(article.publishedAt.isoformat().split('T')[0]) <= end_date)]
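    # Deduplicate by paper id, keeping the first occurrence of each paper.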
unique_articles: Dict[str, Article] = {}
for article in articles:
if article.paper.id not in unique_articles:
unique_articles[article.paper.id] = article
print(f"Unique articles: {len(unique_articles)}")
    # Flatten each article into the dict shape expected by classify_papers.
    preprocessed_articles: List[Dict[str, str]] = [
        {
            "title": article.title,
            "abstract": article.paper.summary,
            "id": article.paper.id,
        }
        for article in unique_articles.values()
    ]
# classified_articles = classify_papers(preprocessed_articles)
    # Iterate over classified_articles and write each category back into unique_articles:
# for article in classified_articles:
# unique_articles[article["id"]].paper.label = article["category"]
return list(unique_articles.values())
if __name__ == "__main__":
start_date = Date(2025, 1, 21)
end_date = Date(2025, 2, 1)
articles = fetch_papers_with_daterange(start_date=start_date, end_date=end_date)
# print(articles)
print(f"Total articles: {len(articles)}")