Spaces:
Runtime error
Runtime error
| import os | |
| import requests | |
| import tempfile | |
| from datetime import datetime, timezone, timedelta | |
| import base64 | |
| from tqdm.auto import tqdm | |
| import pymupdf | |
# Endpoint that lists the Hugging Face "daily papers" for a given date.
DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"


class PaperManager:
    """Fetch Hugging Face daily papers and extract the text of the top-voted ones.

    Attributes:
        raw_papers: Entries returned by the last successful ``fetch_papers`` call.
        papers: Entries selected by the last ``get_top_content`` call.
    """

    # Seconds to wait on a single HTTP request. Without a timeout, ``requests``
    # blocks indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        # Initialise state up front so the attributes exist even before the
        # first successful fetch.
        self.raw_papers = []
        self.papers = []

    def fetch_papers(self, date=None):
        """Fetch the daily papers for one date and store them in ``raw_papers``.

        Args:
            date (str, optional): Date string in 'YYYY-MM-DD' format. Defaults
                to the current date according to the host's local clock.

        Returns:
            bool: True if a non-empty list of papers was fetched and stored,
            False otherwise. Errors are printed, never raised.
        """
        # Use today's date if none provided
        if date is None:
            date = datetime.now().strftime('%Y-%m-%d')

        url = f"{DAILY_PAPERS_API_URL}?date={date}&limit=100"
        print(f"Fetching papers from: {url}")

        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()
        except requests.RequestException as e:
            print(f"Error fetching papers: {e}")
            return False
        except Exception as e:
            # e.g. a body that is not valid JSON on older ``requests`` versions
            print(f"Unexpected error: {e}")
            return False

        if not data:
            print("No data received from API.")
            return False
        if not isinstance(data, list):
            # Anything other than a list of entries (for instance an error
            # object) cannot be ranked later, so treat it as a failed fetch.
            print(f"Unexpected response format from API: {type(data).__name__}")
            return False

        self.raw_papers = data  # Store raw data
        print(f"Found {len(data)} papers for date {date}")
        return True

    @staticmethod
    def _top_by_upvotes(raw_papers, top_n=1):
        """Return the ``top_n`` entries of ``raw_papers`` with the most upvotes.

        Entries whose ``paper`` or ``upvotes`` field is missing or null count
        as zero upvotes. Ties keep their original relative order.
        """
        if top_n <= 0 or not raw_papers:
            return []

        def upvotes(entry):
            return (entry.get('paper') or {}).get('upvotes') or 0

        return sorted(raw_papers, key=upvotes, reverse=True)[:top_n]

    def get_top_content(self, date=None, top_n=1):
        """Download the text of the most upvoted paper(s) for a date.

        Args:
            date (str, optional): Date string in 'YYYY-MM-DD' format, passed
                through to ``fetch_papers``. Defaults to today's date.
            top_n (int, optional): How many of the most upvoted papers to
                process. Defaults to 1.

        Returns:
            dict: Maps each paper title to ``{"id": <paper id>, "content":
            <extracted text>}``. Empty if the fetch failed.

        Raises:
            RuntimeError: If a PDF download returns a non-200 status.
            requests.RequestException: If a PDF download fails at the
                transport level (including timeouts).
        """
        if not self.fetch_papers(date):
            return {}

        self.papers = self._top_by_upvotes(self.raw_papers, top_n)
        if not self.papers:
            print("No papers found.")

        contents = {}
        print(f"Processing {len(self.papers)} papers:")
        for paper in tqdm(self.papers):
            info = paper["paper"]
            paper_id = info['id']
            contents[info['title']] = {
                "id": paper_id,
                "content": self.get_paper_text(paper_id),
            }
        return contents

    def get_paper_text(self, paper_id):
        """Download a paper's PDF from arXiv and return its plain text.

        Args:
            paper_id (str): Identifier used to build the arXiv PDF URL.

        Returns:
            str: Text of every page, concatenated in page order.

        Raises:
            RuntimeError: If the download returns a non-200 status.
            requests.RequestException: On transport errors or timeout.
        """
        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            raise RuntimeError(f"Failed to download PDF: {response.status_code}")

        # Parse straight from memory: no shared "temp.pdf" left behind in the
        # working directory and no clash between concurrent calls.
        with pymupdf.open(stream=response.content, filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)