fdaudens (HF Staff) committed
Commit b358061 · 1 parent: a669f07

rewind papers py

Files changed (1): papers.py (+112 -24)
papers.py CHANGED
@@ -1,28 +1,116 @@
 
  import requests

  DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"

- def get_most_upvoted_paper():
-     try:
-         response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100")
-         response.raise_for_status()
-         data = response.json()
-         if not data:
-             print("No data received from API.")
-             return None
-         # Find the paper with the most upvotes
-         most_upvoted = max(data, key=lambda p: p.get('paper', {}).get('upvotes', 0))
-         return most_upvoted
-     except requests.RequestException as e:
-         print(f"Error fetching papers: {e}")
-         return None
-     except Exception as e:
-         print(f"Unexpected error: {e}")
-         return None
-
- if __name__ == "__main__":
-     paper = get_most_upvoted_paper()
-     if paper:
-         print(f"Most upvoted paper: {paper['paper']['title']} (Upvotes: {paper['paper']['upvotes']})")
-     else:
-         print("No paper found.")
+ import os
  import requests
+ import tempfile
+ from datetime import datetime, timezone
+ import base64
+ from tqdm.auto import tqdm
+ import pymupdf

  DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"

+ class PaperManager:
+     def __init__(self, papers_per_page=30):
+         self.papers = []
+         self.raw_papers = []  # To store fetched data
+
+     def calculate_rising_score(self, paper):
+         """
+         Calculate the rising score of a paper.
+         This emphasizes recent upvotes and the rate of upvote accumulation.
+         """
+         upvotes = paper.get('paper', {}).get('upvotes', 0)
+         published_at_str = paper.get('publishedAt', datetime.now(timezone.utc).isoformat())
+         try:
+             published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
+         except ValueError:
+             published_time = datetime.now(timezone.utc)
+
+         time_diff = datetime.now(timezone.utc) - published_time
+         time_diff_hours = time_diff.total_seconds() / 3600  # Convert time difference to hours
+
+         # Rising score favors papers that are gaining upvotes quickly
+         # Adjusted to have a linear decay over time
+         score = upvotes / (time_diff_hours + 1)
+         return score
+
+     def fetch_papers(self):
+         try:
+             response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100")
+             response.raise_for_status()
+             data = response.json()
+
+             if not data:
+                 print("No data received from API.")
+                 return False
+
+             self.raw_papers = data  # Store raw data
+
+             return True
+
+         except requests.RequestException as e:
+             print(f"Error fetching papers: {e}")
+             return False
+         except Exception as e:
+             print(f"Unexpected error: {e}")
+             return False
+
+     def filter_top_papers(self, threshold_general=2.0, threshold_agent=0.7):
+         self.papers = []
+         for paper in self.raw_papers:
+             paper_score = self.calculate_rising_score(paper)
+             # if paper_score >= threshold_general or ('agent' in paper['title'].lower() and paper_score >= threshold_agent):
+             self.papers.append(paper)
+
+         self.papers = sorted(
+             self.papers,
+             key=lambda x: self.calculate_rising_score(x) * (3 if 'agent' in x['title'].lower() else 1),
+             reverse=True
+         )[:2]
+         return self.papers
+
+     # def get_paper_content(self, paper_id):
+     #     pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
+     #     print("Processing paper:", pdf_url)
+     #     client = httpx.Client(follow_redirects=True)
+     #     response = client.get(pdf_url)
+
+     #     # First verification - check if we got a valid PDF response
+     #     if response.status_code != 200:
+     #         raise Exception(f"Failed to fetch PDF: {response.status_code}")
+
+     #     if not response.headers.get('content-type', '').startswith('application/pdf'):
+     #         raise Exception(f"Unexpected content type: {response.headers.get('content-type')}")
+
+     #     # Second verification - check the first few bytes of the content
+     #     if not response.content.startswith(b'%PDF'):
+     #         raise Exception("Content doesn't appear to be a valid PDF")
+
+     #     pdf_data = base64.standard_b64encode(response.content).decode("utf-8")
+     #     return {"pdf": pdf_data, "url": pdf_url}
+
+     def get_paper_text(self, paper_id):
+         url = f"https://arxiv.org/pdf/{paper_id}.pdf"
+         response = requests.get(url)
+
+         if response.status_code != 200:
+             raise Exception(f"Failed to download PDF: {response.status_code}")
+
+         with open("temp.pdf", "wb") as f:
+             f.write(response.content)
+
+         with pymupdf.open("temp.pdf") as doc:
+             text = ""
+             for page in doc:
+                 text += page.get_text()
+         return text
+
+     def get_top_content(self):
+         self.fetch_papers()
+         self.filter_top_papers()
+         contents = {}
+         print(f"Processing {len(self.papers)} papers:")
+         for paper in tqdm(self.papers):
+             paper_id = paper["paper"]['id']
+             contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
+         return contents
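The rising score in the new class is upvotes divided by hours since publication plus one: a fresh paper keeps close to its full upvote count, and the score decays as the paper ages. A quick worked example with made-up numbers:

# rising_score = upvotes / (hours_since_publication + 1)
score_recent = 40 / (7 + 1)    # 40 upvotes, 7 hours old  -> 5.0
score_older  = 40 / (39 + 1)   # 40 upvotes, 39 hours old -> 1.0
# filter_top_papers then triples the score for titles containing 'agent'
# and keeps only the top two papers.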
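And a usage sketch of the class; the `__main__` driver below is illustrative, not part of the commit:

if __name__ == "__main__":
    manager = PaperManager()              # hypothetical driver code
    contents = manager.get_top_content()  # fetch, rank, download PDFs, extract text
    for title, text in contents.items():
        print(f"{title}: {len(text)} characters extracted")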