fdaudens HF Staff commited on
Commit
e325224
·
1 Parent(s): 2ae8913

simplified date logic

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. papers.py +60 -103
  3. run_job.py +1 -1
app.py CHANGED
@@ -31,7 +31,7 @@ from kokoro import KModel, KPipeline
31
  from papers import PaperManager
32
 
33
  paper_manager = PaperManager()
34
- top_papers = paper_manager.get_top_content(hours=24)
35
 
36
  PODCAST_SUBJECT = list(top_papers.values())[0]
37
 
 
31
  from papers import PaperManager
32
 
33
  paper_manager = PaperManager()
34
+ top_papers = paper_manager.get_top_content()
35
 
36
  PODCAST_SUBJECT = list(top_papers.values())[0]
37
 
papers.py CHANGED
@@ -1,7 +1,7 @@
1
  import os
2
  import requests
3
  import tempfile
4
- from datetime import datetime, timezone
5
  import base64
6
  from tqdm.auto import tqdm
7
  import pymupdf
@@ -9,33 +9,26 @@ import pymupdf
9
  DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
10
 
11
  class PaperManager:
12
- def __init__(self, papers_per_page=30):
13
- self.papers = []
14
- self.raw_papers = [] # To store fetched data
15
-
16
- # def calculate_rising_score(self, paper):
17
- # """
18
- # Calculate the rising score of a paper.
19
- # This emphasizes recent upvotes and the rate of upvote accumulation.
20
- # """
21
- # upvotes = paper.get('paper', {}).get('upvotes', 0)
22
- # published_at_str = paper.get('submittedOnDailyAt', datetime.now(timezone.utc).isoformat())
23
- # try:
24
- # published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
25
- # except ValueError:
26
- # published_time = datetime.now(timezone.utc)
27
-
28
- # time_diff = datetime.now(timezone.utc) - published_time
29
- # time_diff_hours = time_diff.total_seconds() / 3600 # Convert time difference to hours
30
-
31
- # # Rising score favors papers that are gaining upvotes quickly
32
- # # Adjusted to have a linear decay over time
33
- # score = upvotes / (time_diff_hours + 1)
34
- # return score
35
-
36
- def fetch_papers(self):
37
  try:
38
- response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100")
 
 
 
 
 
 
 
 
39
  response.raise_for_status()
40
  data = response.json()
41
 
@@ -44,7 +37,7 @@ class PaperManager:
44
  return False
45
 
46
  self.raw_papers = data # Store raw data
47
-
48
  return True
49
 
50
  except requests.RequestException as e:
@@ -54,39 +47,39 @@ class PaperManager:
54
  print(f"Unexpected error: {e}")
55
  return False
56
 
57
- def filter_top_papers(self, threshold_general=2.0, threshold_agent=0.7):
58
- self.papers = []
59
- for paper in self.raw_papers:
60
- paper_score = self.calculate_rising_score(paper)
61
- # if paper_score >= threshold_general or ('agent' in paper['title'].lower() and paper_score >= threshold_agent):
62
- self.papers.append(paper)
63
-
64
- self.papers = sorted(
65
- self.papers,
66
- key=lambda x: self.calculate_rising_score(x),
67
- reverse=True
68
- )[:2]
69
- return self.papers
70
-
71
- # def get_paper_content(self, paper_id):
72
- # pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
73
- # print("Processing paper:", pdf_url)
74
- # client = httpx.Client(follow_redirects=True)
75
- # response = client.get(pdf_url)
76
-
77
- # # First verification - check if we got a valid PDF response
78
- # if response.status_code != 200:
79
- # raise Exception(f"Failed to fetch PDF: {response.status_code}")
80
 
81
- # if not response.headers.get('content-type', '').startswith('application/pdf'):
82
- # raise Exception(f"Unexpected content type: {response.headers.get('content-type')}")
 
 
 
 
83
 
84
- # # Second verification - check the first few bytes of the content
85
- # if not response.content.startswith(b'%PDF'):
86
- # raise Exception("Content doesn't appear to be a valid PDF")
87
-
88
- # pdf_data = base64.standard_b64encode(response.content).decode("utf-8")
89
- # return {"pdf": pdf_data, "url": pdf_url}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  def get_paper_text(self, paper_id):
92
  url = f"https://arxiv.org/pdf/{paper_id}.pdf"
@@ -115,47 +108,11 @@ class PaperManager:
115
  # contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
116
  # return contents
117
 
118
- def get_top_content(self, hours=24):
119
- """
120
- Get the most upvoted paper submitted within the specified hours.
121
- """
122
- self.fetch_papers()
123
- current_time = datetime.now(timezone.utc)
124
-
125
- # Filter papers by submission time (last 24 hours by default)
126
- recent_papers = []
127
- for paper in self.raw_papers:
128
- submitted_at_str = paper.get('submittedOnDailyAt', current_time.isoformat())
129
- try:
130
- submitted_time = datetime.fromisoformat(submitted_at_str.replace('Z', '+00:00'))
131
- time_diff = current_time - submitted_time
132
- # Only include papers submitted within specified hours
133
- if time_diff.total_seconds() / 3600 <= hours:
134
- recent_papers.append(paper)
135
- except ValueError:
136
- # Skip papers with invalid timestamp
137
- continue
138
-
139
- # If papers found in the time window, sort by upvotes
140
- if recent_papers:
141
- # Sort by raw upvote count (highest first)
142
- sorted_papers = sorted(
143
- recent_papers,
144
- key=lambda x: x.get('paper', {}).get('upvotes', 0),
145
- reverse=True
146
- )
147
-
148
- # Take only the top paper
149
- self.papers = [sorted_papers[0]] if sorted_papers else []
150
- else:
151
- print(f"No papers submitted in the last {hours} hours.")
152
- self.papers = []
153
-
154
- # Get content
155
- contents = {}
156
- print(f"Processing {len(self.papers)} papers:")
157
- for paper in tqdm(self.papers):
158
- paper_id = paper["paper"]['id']
159
- contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
160
-
161
- return contents
 
1
  import os
2
  import requests
3
  import tempfile
4
+ from datetime import datetime, timezone, timedelta
5
  import base64
6
  from tqdm.auto import tqdm
7
  import pymupdf
 
9
  DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
10
 
11
  class PaperManager:
12
+ def fetch_papers(self, date=None):
13
+ """
14
+ Fetch papers from the API with optional date filtering.
15
+
16
+ Args:
17
+ date (str, optional): Date string in 'YYYY-MM-DD' format. Defaults to today's date.
18
+
19
+ Returns:
20
+ bool: True if papers were successfully fetched, False otherwise.
21
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  try:
23
+ # Use today's date if none provided
24
+ if date is None:
25
+ date = datetime.now().strftime('%Y-%m-%d')
26
+
27
+ # Construct the URL with the date parameter
28
+ url = f"{DAILY_PAPERS_API_URL}?date={date}&limit=100"
29
+
30
+ print(f"Fetching papers from: {url}")
31
+ response = requests.get(url)
32
  response.raise_for_status()
33
  data = response.json()
34
 
 
37
  return False
38
 
39
  self.raw_papers = data # Store raw data
40
+ print(f"Found {len(data)} papers for date {date}")
41
  return True
42
 
43
  except requests.RequestException as e:
 
47
  print(f"Unexpected error: {e}")
48
  return False
49
 
50
+ def get_top_content(self):
51
+ """
52
+ Get the most upvoted paper from today's submissions.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
+ Returns:
55
+ dict: Dictionary mapping paper titles to their contents.
56
+ """
57
+ # Fetch papers from today
58
+ if not self.fetch_papers():
59
+ return {}
60
 
61
+ # Sort by upvotes
62
+ if self.raw_papers:
63
+ sorted_papers = sorted(
64
+ self.raw_papers,
65
+ key=lambda x: x.get('paper', {}).get('upvotes', 0),
66
+ reverse=True
67
+ )
68
+
69
+ # Take only the top paper
70
+ self.papers = [sorted_papers[0]] if sorted_papers else []
71
+ else:
72
+ print("No papers found for today.")
73
+ self.papers = []
74
+
75
+ # Get content
76
+ contents = {}
77
+ print(f"Processing {len(self.papers)} papers:")
78
+ for paper in tqdm(self.papers):
79
+ paper_id = paper["paper"]['id']
80
+ contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
81
+
82
+ return contents
83
 
84
  def get_paper_text(self, paper_id):
85
  url = f"https://arxiv.org/pdf/{paper_id}.pdf"
 
108
  # contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
109
  # return contents
110
 
111
+
112
# Example usage: fetch today's top paper and preview its content.
if __name__ == "__main__":
    manager = PaperManager()
    for title, content in manager.get_top_content().items():
        print(f"Title: {title}")
        print(f"Content: {content[:100]}...")  # Print first 100 characters of content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
run_job.py CHANGED
@@ -54,7 +54,7 @@ def main():
54
 
55
  # 1. Get the most popular paper's content
56
  paper_manager = PaperManager()
57
- top_papers = paper_manager.get_top_content(hours=24)
58
  # Get the first (most popular) paper's text
59
  subject = list(top_papers.values())[0]
60
 
 
54
 
55
  # 1. Get the most popular paper's content
56
  paper_manager = PaperManager()
57
+ top_papers = paper_manager.get_top_content()
58
  # Get the first (most popular) paper's text
59
  subject = list(top_papers.values())[0]
60