fdaudens HF Staff committed on
Commit
a668eca
·
1 Parent(s): 547fef1

save audio file

Browse files
Files changed (3) hide show
  1. papers.py +24 -112
  2. podcasts/requirements.txt +8 -0
  3. run_job.py +25 -1
papers.py CHANGED
@@ -1,116 +1,28 @@
1
- import os
2
  import requests
3
- import tempfile
4
- from datetime import datetime, timezone
5
- import base64
6
- from tqdm.auto import tqdm
7
- import pymupdf
8
 
9
  DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
10
 
11
- class PaperManager:
12
- def __init__(self, papers_per_page=30):
13
- self.papers = []
14
- self.raw_papers = [] # To store fetched data
15
-
16
- def calculate_rising_score(self, paper):
17
- """
18
- Calculate the rising score of a paper.
19
- This emphasizes recent upvotes and the rate of upvote accumulation.
20
- """
21
- upvotes = paper.get('paper', {}).get('upvotes', 0)
22
- published_at_str = paper.get('publishedAt', datetime.now(timezone.utc).isoformat())
23
- try:
24
- published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
25
- except ValueError:
26
- published_time = datetime.now(timezone.utc)
27
-
28
- time_diff = datetime.now(timezone.utc) - published_time
29
- time_diff_hours = time_diff.total_seconds() / 3600 # Convert time difference to hours
30
-
31
- # Rising score favors papers that are gaining upvotes quickly
32
- # Adjusted to have a linear decay over time
33
- score = upvotes / (time_diff_hours + 1)
34
- return score
35
-
36
- def fetch_papers(self):
37
- try:
38
- response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100")
39
- response.raise_for_status()
40
- data = response.json()
41
-
42
- if not data:
43
- print("No data received from API.")
44
- return False
45
-
46
- self.raw_papers = data # Store raw data
47
-
48
- return True
49
-
50
- except requests.RequestException as e:
51
- print(f"Error fetching papers: {e}")
52
- return False
53
- except Exception as e:
54
- print(f"Unexpected error: {e}")
55
- return False
56
-
57
- def filter_top_papers(self, threshold_general=2.0, threshold_agent=0.7):
58
- self.papers = []
59
- for paper in self.raw_papers:
60
- paper_score = self.calculate_rising_score(paper)
61
- # if paper_score >= threshold_general or ('agent' in paper['title'].lower() and paper_score >= threshold_agent):
62
- self.papers.append(paper)
63
-
64
- self.papers = sorted(
65
- self.papers,
66
- key=lambda x: self.calculate_rising_score(x) * (3 if 'agent' in x['title'].lower() else 1),
67
- reverse=True
68
- )[:2]
69
- return self.papers
70
-
71
- # def get_paper_content(self, paper_id):
72
- # pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
73
- # print("Processing paper:", pdf_url)
74
- # client = httpx.Client(follow_redirects=True)
75
- # response = client.get(pdf_url)
76
-
77
- # # First verification - check if we got a valid PDF response
78
- # if response.status_code != 200:
79
- # raise Exception(f"Failed to fetch PDF: {response.status_code}")
80
-
81
- # if not response.headers.get('content-type', '').startswith('application/pdf'):
82
- # raise Exception(f"Unexpected content type: {response.headers.get('content-type')}")
83
-
84
- # # Second verification - check the first few bytes of the content
85
- # if not response.content.startswith(b'%PDF'):
86
- # raise Exception("Content doesn't appear to be a valid PDF")
87
-
88
- # pdf_data = base64.standard_b64encode(response.content).decode("utf-8")
89
- # return {"pdf": pdf_data, "url": pdf_url}
90
-
91
- def get_paper_text(self, paper_id):
92
- url = f"https://arxiv.org/pdf/{paper_id}.pdf"
93
- response = requests.get(url)
94
-
95
- if response.status_code != 200:
96
- raise Exception(f"Failed to download PDF: {response.status_code}")
97
-
98
- with open("temp.pdf", "wb") as f:
99
- f.write(response.content)
100
-
101
- with pymupdf.open("temp.pdf") as doc:
102
- text = ""
103
- for page in doc:
104
- text += page.get_text()
105
- return text
106
-
107
-
108
- def get_top_content(self):
109
- self.fetch_papers()
110
- self.filter_top_papers()
111
- contents = {}
112
- print(f"Processing {len(self.papers)} papers:")
113
- for paper in tqdm(self.papers):
114
- paper_id = paper["paper"]['id']
115
- contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
116
- return contents
 
 
1
  import requests
 
 
 
 
 
2
 
3
  DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
4
 
5
+ def get_most_upvoted_paper():
6
+ try:
7
+ response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100")
8
+ response.raise_for_status()
9
+ data = response.json()
10
+ if not data:
11
+ print("No data received from API.")
12
+ return None
13
+ # Find the paper with the most upvotes
14
+ most_upvoted = max(data, key=lambda p: p.get('paper', {}).get('upvotes', 0))
15
+ return most_upvoted
16
+ except requests.RequestException as e:
17
+ print(f"Error fetching papers: {e}")
18
+ return None
19
+ except Exception as e:
20
+ print(f"Unexpected error: {e}")
21
+ return None
22
+
23
+ if __name__ == "__main__":
24
+ paper = get_most_upvoted_paper()
25
+ if paper:
26
+ print(f"Most upvoted paper: {paper['paper']['title']} (Upvotes: {paper['paper']['upvotes']})")
27
+ else:
28
+ print("No paper found.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
podcasts/requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ kokoro
2
+ huggingface_hub
3
+ transformers
4
+ PyMuPDF
5
+ soundfile
6
+ numpy
7
+ requests
8
+ json
run_job.py CHANGED
@@ -3,9 +3,11 @@ from app import generate_podcast_script, kmodel, kpipeline, MALE_VOICE, FEMALE_V
3
  import soundfile as sf
4
  import numpy as np
5
  import argparse
6
- from huggingface_hub import HfApi
7
  import requests
8
  import json
 
 
9
 
10
  # topics = [folder for folder in os.listdir("podcasts") if os.path.isdir(os.path.join("podcasts", folder))]
11
  podcasts = {}
@@ -103,6 +105,28 @@ def main():
103
  # 4. Save as WAV file
104
  sf.write("podcast.wav", full_audio, sr)
105
  print("Podcast audio saved as podcast.wav")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  else:
107
  print("No audio generated.")
108
 
 
3
  import soundfile as sf
4
  import numpy as np
5
  import argparse
6
+ from huggingface_hub import HfApi, HfFolder
7
  import requests
8
  import json
9
+ from datetime import datetime
10
+ import os
11
 
12
  # topics = [folder for folder in os.listdir("podcasts") if os.path.isdir(os.path.join("podcasts", folder))]
13
  podcasts = {}
 
105
  # 4. Save as WAV file
106
  sf.write("podcast.wav", full_audio, sr)
107
  print("Podcast audio saved as podcast.wav")
108
+
109
+ # --- Upload to Hugging Face repo ---
110
+ hf_token = os.environ.get("HF_TOKEN")
111
+ if hf_token is None:
112
+ print("No Hugging Face token found in environment. Skipping upload.")
113
+ else:
114
+ api = HfApi(token=hf_token)
115
+ username = api.whoami()["name"]
116
+ repo_id = f"{username}/news-podcasts" # or your desired repo
117
+
118
+ # Create a folder by date
119
+ today = datetime.now().strftime("%Y-%m-%d")
120
+ remote_path = f"podcasts/{today}/podcast.wav"
121
+
122
+ print(f"Uploading podcast.wav to {repo_id} at {remote_path} ...")
123
+ api.upload_file(
124
+ path_or_fileobj="podcast.wav",
125
+ path_in_repo=remote_path,
126
+ repo_id=repo_id,
127
+ token=hf_token
128
+ )
129
+ print(f"Uploaded podcast.wav to {repo_id}/{remote_path}")
130
  else:
131
  print("No audio generated.")
132