# podcast-jobs / papers.py
# (commit 8dbb6cc, "add paper link" — Hugging Face file-viewer metadata converted to a comment)
import os
import requests
import tempfile
from datetime import datetime, timezone, timedelta
import base64
from tqdm.auto import tqdm
import pymupdf
DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
class PaperManager:
    """Fetch Hugging Face daily papers and extract the text of the top-voted one."""

    def __init__(self):
        # Initialize state up front so attribute access is safe even if
        # get_top_content()/fetch_papers() have not populated anything yet.
        self.raw_papers = []  # raw JSON entries from the daily_papers API
        self.papers = []      # filtered subset (top paper) used for extraction

    def fetch_papers(self, date=None):
        """
        Fetch papers from the API with optional date filtering.

        Args:
            date (str, optional): Date string in 'YYYY-MM-DD' format.
                Defaults to today's date (UTC).

        Returns:
            bool: True if papers were successfully fetched, False otherwise.
        """
        try:
            # Use today's date if none provided. Use an explicit UTC clock
            # rather than the naive local one so the date matches the API's day.
            if date is None:
                date = datetime.now(timezone.utc).strftime('%Y-%m-%d')

            # Construct the URL with the date parameter
            url = f"{DAILY_PAPERS_API_URL}?date={date}&limit=100"
            print(f"Fetching papers from: {url}")

            # timeout prevents an unresponsive server from hanging the job forever
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()

            if not data:
                print("No data received from API.")
                return False

            self.raw_papers = data  # Store raw data
            print(f"Found {len(data)} papers for date {date}")
            return True

        except requests.RequestException as e:
            print(f"Error fetching papers: {e}")
            return False
        except Exception as e:
            print(f"Unexpected error: {e}")
            return False

    def get_top_content(self):
        """
        Get the most upvoted paper from today's submissions.

        Returns:
            dict: Maps paper title -> {"id": arxiv_id, "content": extracted text}.
                Empty dict if fetching failed.
        """
        # Fetch papers from today
        if not self.fetch_papers():
            return {}

        # Sort by upvotes and keep only the single most-upvoted paper
        if self.raw_papers:
            sorted_papers = sorted(
                self.raw_papers,
                key=lambda x: x.get('paper', {}).get('upvotes', 0),
                reverse=True,
            )
            self.papers = sorted_papers[:1]
        else:
            print("No papers found for today.")
            self.papers = []

        # Download and extract text for each selected paper
        contents = {}
        print(f"Processing {len(self.papers)} papers:")
        for paper in tqdm(self.papers):
            paper_id = paper["paper"]['id']
            content = self.get_paper_text(paper_id)
            contents[paper["paper"]['title']] = {"id": paper_id, "content": content}
        return contents

    def get_paper_text(self, paper_id):
        """
        Download the arXiv PDF for *paper_id* and return its full plain text.

        Args:
            paper_id (str): arXiv identifier as provided by the daily_papers API.

        Returns:
            str: Concatenated text of all pages.

        Raises:
            Exception: If the PDF download does not return HTTP 200.
        """
        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(url, timeout=60)
        if response.status_code != 200:
            raise Exception(f"Failed to download PDF: {response.status_code}")

        # Open the PDF directly from memory instead of writing a shared
        # "temp.pdf" into the working directory (race-prone, never cleaned up).
        with pymupdf.open(stream=response.content, filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)