Spaces:

fdaudens
/

podcast-jobs-rss-test

Sleeping

App Files Files Community

fdaudens HF Staff committed on May 13

Commit

a668eca

1 Parent(s): 547fef1

save audio file

Browse files

Files changed (3) hide show

papers.py +24 -112
podcasts/requirements.txt +8 -0
run_job.py +25 -1

papers.py CHANGED Viewed

@@ -1,116 +1,28 @@
-import os
 import requests
-import tempfile
-from datetime import datetime, timezone
-import base64
-from tqdm.auto import tqdm
-import pymupdf
 DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
-class PaperManager:
-    def __init__(self, papers_per_page=30):
-        self.papers = []
-        self.raw_papers = []  # To store fetched data
-    def calculate_rising_score(self, paper):
-        """
-        Calculate the rising score of a paper.
-        This emphasizes recent upvotes and the rate of upvote accumulation.
-        """
-        upvotes = paper.get('paper', {}).get('upvotes', 0)
-        published_at_str = paper.get('publishedAt', datetime.now(timezone.utc).isoformat())
-        try:
-            published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
-        except ValueError:
-            published_time = datetime.now(timezone.utc)
-        time_diff = datetime.now(timezone.utc) - published_time
-        time_diff_hours = time_diff.total_seconds() / 3600  # Convert time difference to hours
-        # Rising score favors papers that are gaining upvotes quickly
-        # Adjusted to have a linear decay over time
-        score = upvotes / (time_diff_hours + 1)
-        return score
-    def fetch_papers(self):
-        try:
-            response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100")
-            response.raise_for_status()
-            data = response.json()
-            if not data:
-                print("No data received from API.")
-                return False
-            self.raw_papers = data  # Store raw data
-            return True
-        except requests.RequestException as e:
-            print(f"Error fetching papers: {e}")
-            return False
-        except Exception as e:
-            print(f"Unexpected error: {e}")
-            return False
-    def filter_top_papers(self, threshold_general=2.0, threshold_agent=0.7):
-        self.papers = []
-        for paper in self.raw_papers:
-            paper_score = self.calculate_rising_score(paper)
-            # if paper_score >= threshold_general or ('agent' in paper['title'].lower() and paper_score >= threshold_agent):
-            self.papers.append(paper)
-        self.papers = sorted(
-            self.papers,
-            key=lambda x: self.calculate_rising_score(x) * (3 if 'agent' in x['title'].lower() else 1),
-            reverse=True
-        )[:2]
-        return self.papers
-    # def get_paper_content(self, paper_id):
-    #     pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
-    #     print("Processing paper:", pdf_url)
-    #     client = httpx.Client(follow_redirects=True)
-    #     response = client.get(pdf_url)
-    #     # First verification - check if we got a valid PDF response
-    #     if response.status_code != 200:
-    #         raise Exception(f"Failed to fetch PDF: {response.status_code}")
-    #     if not response.headers.get('content-type', '').startswith('application/pdf'):
-    #         raise Exception(f"Unexpected content type: {response.headers.get('content-type')}")
-    #     # Second verification - check the first few bytes of the content
-    #     if not response.content.startswith(b'%PDF'):
-    #         raise Exception("Content doesn't appear to be a valid PDF")
-    #     pdf_data = base64.standard_b64encode(response.content).decode("utf-8")
-    #     return {"pdf": pdf_data, "url": pdf_url}
-    def get_paper_text(self, paper_id):
-        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
-        response = requests.get(url)
-        if response.status_code != 200:
-            raise Exception(f"Failed to download PDF: {response.status_code}")
-        with open("temp.pdf", "wb") as f:
-            f.write(response.content)
-        with pymupdf.open("temp.pdf") as doc:
-            text = ""
-            for page in doc:
-                text += page.get_text()
-        return text
-    def get_top_content(self):
-        self.fetch_papers()
-        self.filter_top_papers()
-        contents =  {}
-        print(f"Processing {len(self.papers)} papers:")
-        for paper in tqdm(self.papers):
-            paper_id = paper["paper"]['id']
-            contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
-        return contents

 import requests
 DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
+def get_most_upvoted_paper():
+    """Return the daily-papers entry with the highest upvote count.
+
+    Requests up to 100 entries from ``DAILY_PAPERS_API_URL`` and selects the
+    one whose ``paper.upvotes`` value is largest. Entries that lack the
+    ``paper`` or ``upvotes`` field are ranked as having 0 upvotes.
+
+    Returns:
+        The selected entry exactly as decoded from the API response, or
+        ``None`` when the response is empty or any error occurs. Errors are
+        printed rather than raised.
+    """
+    try:
+        # requests applies no timeout by default; without one, a stalled
+        # connection would block this job forever. A timeout surfaces as a
+        # RequestException and is reported by the handler below.
+        response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100", timeout=30)
+        response.raise_for_status()
+        data = response.json()
+        if not data:
+            print("No data received from API.")
+            return None
+        # Find the paper with the most upvotes
+        most_upvoted = max(data, key=lambda p: p.get('paper', {}).get('upvotes', 0))
+        return most_upvoted
+    except requests.RequestException as e:
+        print(f"Error fetching papers: {e}")
+        return None
+    except Exception as e:
+        # Best-effort boundary: malformed payloads (unexpected JSON shape,
+        # non-comparable upvote values) are reported and treated as "no paper".
+        print(f"Unexpected error: {e}")
+        return None
+if __name__ == "__main__":
+    # Manual smoke test: fetch the top paper and print a one-line summary.
+    paper = get_most_upvoted_paper()
+    if paper:
+        # Read fields with the same defaults the selection logic uses, so an
+        # entry missing 'paper' or 'upvotes' is reported instead of raising
+        # KeyError.
+        info = paper.get('paper', {})
+        print(f"Most upvoted paper: {info.get('title', 'Unknown title')} (Upvotes: {info.get('upvotes', 0)})")
+    else:
+        print("No paper found.")

podcasts/requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+kokoro
+huggingface_hub
+transformers
+PyMuPDF
+soundfile
+numpy
+requests
+# json is part of the Python standard library; it must not be listed as a pip requirement

run_job.py CHANGED Viewed

@@ -3,9 +3,11 @@ from app import generate_podcast_script, kmodel, kpipeline, MALE_VOICE, FEMALE_V
 import soundfile as sf
 import numpy as np
 import argparse
-from huggingface_hub import HfApi
 import requests
 import json
 # topics = [folder for folder in os.listdir("podcasts") if os.path.isdir(os.path.join("podcasts", folder))]
 podcasts = {}
@@ -103,6 +105,28 @@ def main():
         # 4. Save as WAV file
         sf.write("podcast.wav", full_audio, sr)
         print("Podcast audio saved as podcast.wav")
     else:
         print("No audio generated.")

 import soundfile as sf
 import numpy as np
 import argparse
+from huggingface_hub import HfApi, HfFolder
 import requests
 import json
+from datetime import datetime
+import os
 # topics = [folder for folder in os.listdir("podcasts") if os.path.isdir(os.path.join("podcasts", folder))]
 podcasts = {}
         # 4. Save as WAV file
         sf.write("podcast.wav", full_audio, sr)
         print("Podcast audio saved as podcast.wav")
+        # --- Upload to Hugging Face repo ---
+        hf_token = os.environ.get("HF_TOKEN")
+        if hf_token is None:
+            print("No Hugging Face token found in environment. Skipping upload.")
+        else:
+            api = HfApi(token=hf_token)
+            username = api.whoami()["name"]
+            repo_id = f"{username}/news-podcasts"  # or your desired repo
+            # Create a folder by date
+            today = datetime.now().strftime("%Y-%m-%d")
+            remote_path = f"podcasts/{today}/podcast.wav"
+            print(f"Uploading podcast.wav to {repo_id} at {remote_path} ...")
+            api.upload_file(
+                path_or_fileobj="podcast.wav",
+                path_in_repo=remote_path,
+                repo_id=repo_id,
+                token=hf_token
+            )
+            print(f"Uploaded podcast.wav to {repo_id}/{remote_path}")
     else:
         print("No audio generated.")