fdaudens HF Staff commited on
Commit
e325224
·
1 Parent(s): 2ae8913

simplified date logic

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. papers.py +60 -103
  3. run_job.py +1 -1
app.py CHANGED
@@ -31,7 +31,7 @@ from kokoro import KModel, KPipeline
31
  from papers import PaperManager
32
 
33
  paper_manager = PaperManager()
34
- top_papers = paper_manager.get_top_content(hours=24)
35
 
36
  PODCAST_SUBJECT = list(top_papers.values())[0]
37
 
 
31
  from papers import PaperManager
32
 
33
  paper_manager = PaperManager()
34
+ top_papers = paper_manager.get_top_content()
35
 
36
  PODCAST_SUBJECT = list(top_papers.values())[0]
37
 
papers.py CHANGED
@@ -1,7 +1,7 @@
1
  import os
2
  import requests
3
  import tempfile
4
- from datetime import datetime, timezone
5
  import base64
6
  from tqdm.auto import tqdm
7
  import pymupdf
@@ -9,33 +9,26 @@ import pymupdf
9
  DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
10
 
11
  class PaperManager:
12
- def __init__(self, papers_per_page=30):
13
- self.papers = []
14
- self.raw_papers = [] # To store fetched data
15
-
16
- # def calculate_rising_score(self, paper):
17
- # """
18
- # Calculate the rising score of a paper.
19
- # This emphasizes recent upvotes and the rate of upvote accumulation.
20
- # """
21
- # upvotes = paper.get('paper', {}).get('upvotes', 0)
22
- # published_at_str = paper.get('submittedOnDailyAt', datetime.now(timezone.utc).isoformat())
23
- # try:
24
- # published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
25
- # except ValueError:
26
- # published_time = datetime.now(timezone.utc)
27
-
28
- # time_diff = datetime.now(timezone.utc) - published_time
29
- # time_diff_hours = time_diff.total_seconds() / 3600 # Convert time difference to hours
30
-
31
- # # Rising score favors papers that are gaining upvotes quickly
32
- # # Adjusted to have a linear decay over time
33
- # score = upvotes / (time_diff_hours + 1)
34
- # return score
35
-
36
- def fetch_papers(self):
37
  try:
38
- response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100")
 
 
 
 
 
 
 
 
39
  response.raise_for_status()
40
  data = response.json()
41
 
@@ -44,7 +37,7 @@ class PaperManager:
44
  return False
45
 
46
  self.raw_papers = data # Store raw data
47
-
48
  return True
49
 
50
  except requests.RequestException as e:
@@ -54,39 +47,39 @@ class PaperManager:
54
  print(f"Unexpected error: {e}")
55
  return False
56
 
57
- def filter_top_papers(self, threshold_general=2.0, threshold_agent=0.7):
58
- self.papers = []
59
- for paper in self.raw_papers:
60
- paper_score = self.calculate_rising_score(paper)
61
- # if paper_score >= threshold_general or ('agent' in paper['title'].lower() and paper_score >= threshold_agent):
62
- self.papers.append(paper)
63
-
64
- self.papers = sorted(
65
- self.papers,
66
- key=lambda x: self.calculate_rising_score(x),
67
- reverse=True
68
- )[:2]
69
- return self.papers
70
-
71
- # def get_paper_content(self, paper_id):
72
- # pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
73
- # print("Processing paper:", pdf_url)
74
- # client = httpx.Client(follow_redirects=True)
75
- # response = client.get(pdf_url)
76
-
77
- # # First verification - check if we got a valid PDF response
78
- # if response.status_code != 200:
79
- # raise Exception(f"Failed to fetch PDF: {response.status_code}")
80
 
81
- # if not response.headers.get('content-type', '').startswith('application/pdf'):
82
- # raise Exception(f"Unexpected content type: {response.headers.get('content-type')}")
 
 
 
 
83
 
84
- # # Second verification - check the first few bytes of the content
85
- # if not response.content.startswith(b'%PDF'):
86
- # raise Exception("Content doesn't appear to be a valid PDF")
87
-
88
- # pdf_data = base64.standard_b64encode(response.content).decode("utf-8")
89
- # return {"pdf": pdf_data, "url": pdf_url}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  def get_paper_text(self, paper_id):
92
  url = f"https://arxiv.org/pdf/{paper_id}.pdf"
@@ -115,47 +108,11 @@ class PaperManager:
115
  # contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
116
  # return contents
117
 
118
- def get_top_content(self, hours=24):
119
- """
120
- Get the most upvoted paper submitted within the specified hours.
121
- """
122
- self.fetch_papers()
123
- current_time = datetime.now(timezone.utc)
124
-
125
- # Filter papers by submission time (last 24 hours by default)
126
- recent_papers = []
127
- for paper in self.raw_papers:
128
- submitted_at_str = paper.get('submittedOnDailyAt', current_time.isoformat())
129
- try:
130
- submitted_time = datetime.fromisoformat(submitted_at_str.replace('Z', '+00:00'))
131
- time_diff = current_time - submitted_time
132
- # Only include papers submitted within specified hours
133
- if time_diff.total_seconds() / 3600 <= hours:
134
- recent_papers.append(paper)
135
- except ValueError:
136
- # Skip papers with invalid timestamp
137
- continue
138
-
139
- # If papers found in the time window, sort by upvotes
140
- if recent_papers:
141
- # Sort by raw upvote count (highest first)
142
- sorted_papers = sorted(
143
- recent_papers,
144
- key=lambda x: x.get('paper', {}).get('upvotes', 0),
145
- reverse=True
146
- )
147
-
148
- # Take only the top paper
149
- self.papers = [sorted_papers[0]] if sorted_papers else []
150
- else:
151
- print(f"No papers submitted in the last {hours} hours.")
152
- self.papers = []
153
-
154
- # Get content
155
- contents = {}
156
- print(f"Processing {len(self.papers)} papers:")
157
- for paper in tqdm(self.papers):
158
- paper_id = paper["paper"]['id']
159
- contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
160
-
161
- return contents
 
1
  import os
2
  import requests
3
  import tempfile
4
+ from datetime import datetime, timezone, timedelta
5
  import base64
6
  from tqdm.auto import tqdm
7
  import pymupdf
 
9
  DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
10
 
11
  class PaperManager:
12
+ def fetch_papers(self, date=None):
13
+ """
14
+ Fetch papers from the API with optional date filtering.
15
+
16
+ Args:
17
+ date (str, optional): Date string in 'YYYY-MM-DD' format. Defaults to today's date.
18
+
19
+ Returns:
20
+ bool: True if papers were successfully fetched, False otherwise.
21
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  try:
23
+ # Use today's date if none provided
24
+ if date is None:
25
+ date = datetime.now().strftime('%Y-%m-%d')
26
+
27
+ # Construct the URL with the date parameter
28
+ url = f"{DAILY_PAPERS_API_URL}?date={date}&limit=100"
29
+
30
+ print(f"Fetching papers from: {url}")
31
+ response = requests.get(url)
32
  response.raise_for_status()
33
  data = response.json()
34
 
 
37
  return False
38
 
39
  self.raw_papers = data # Store raw data
40
+ print(f"Found {len(data)} papers for date {date}")
41
  return True
42
 
43
  except requests.RequestException as e:
 
47
  print(f"Unexpected error: {e}")
48
  return False
49
 
50
+ def get_top_content(self):
51
+ """
52
+ Get the most upvoted paper from today's submissions.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
+ Returns:
55
+ dict: Dictionary mapping paper titles to their contents.
56
+ """
57
+ # Fetch papers from today
58
+ if not self.fetch_papers():
59
+ return {}
60
 
61
+ # Sort by upvotes
62
+ if self.raw_papers:
63
+ sorted_papers = sorted(
64
+ self.raw_papers,
65
+ key=lambda x: x.get('paper', {}).get('upvotes', 0),
66
+ reverse=True
67
+ )
68
+
69
+ # Take only the top paper
70
+ self.papers = [sorted_papers[0]] if sorted_papers else []
71
+ else:
72
+ print("No papers found for today.")
73
+ self.papers = []
74
+
75
+ # Get content
76
+ contents = {}
77
+ print(f"Processing {len(self.papers)} papers:")
78
+ for paper in tqdm(self.papers):
79
+ paper_id = paper["paper"]['id']
80
+ contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
81
+
82
+ return contents
83
 
84
  def get_paper_text(self, paper_id):
85
  url = f"https://arxiv.org/pdf/{paper_id}.pdf"
 
108
  # contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
109
  # return contents
110
 
111
+
112
# Example usage: fetch today's top paper and preview its content.
if __name__ == "__main__":
    manager = PaperManager()
    for title, content in manager.get_top_content().items():
        print(f"Title: {title}")
        print(f"Content: {content[:100]}...")  # Print first 100 characters of content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
run_job.py CHANGED
@@ -54,7 +54,7 @@ def main():
54
 
55
  # 1. Get the most popular paper's content
56
  paper_manager = PaperManager()
57
- top_papers = paper_manager.get_top_content(hours=24)
58
  # Get the first (most popular) paper's text
59
  subject = list(top_papers.values())[0]
60
 
 
54
 
55
  # 1. Get the most popular paper's content
56
  paper_manager = PaperManager()
57
+ top_papers = paper_manager.get_top_content()
58
  # Get the first (most popular) paper's text
59
  subject = list(top_papers.values())[0]
60