Spaces:

fdaudens
/

podcast-jobs-rss-test

Sleeping

App Files Files Community

fdaudens HF Staff committed on May 13

Commit

547fef1

1 Parent(s): fe64cdc

first push

Browse files

Files changed (7) hide show

.DS_Store +0 -0
README.md +10 -6
app.py +144 -0
papers.py +116 -0
prompts.py +56 -0
requirements.txt +8 -0
run_job.py +110 -1

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

README.md CHANGED Viewed

@@ -1,10 +1,14 @@
 ---
-title: Podcast Jobs
-emoji: 😻
-colorFrom: blue
-colorTo: green
-sdk: docker
-pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Open NotebookLM
+emoji: 🎙️
+colorFrom: yellow
+colorTo: red
+sdk: gradio
+sdk_version: 5.26.0
+app_file: app.py
+pinned: true
+license: apache-2.0
+short_description: Generate a podcast to discuss the topic of your choice!
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import queue
+import threading
+import spaces
+import os
+import io
+import soundfile as sf
+import gradio as gr
+import numpy as np
+import time
+import pymupdf
+import requests
+from pathlib import Path
+import torch
+from huggingface_hub import InferenceClient
+from kokoro import KModel, KPipeline
+# -----------------------------------------------------------------------------
+# Get default podcast materials, from Daily papers and one download
+# -----------------------------------------------------------------------------
+# NOTE: everything in this section runs at import time. get_top_content() performs
+# HTTP requests (daily papers API + arXiv PDF downloads), so importing this module
+# requires network access.
+from papers import PaperManager
+paper_manager = PaperManager()
+# Mapping of paper title -> extracted PDF text for the top-ranked papers.
+top_papers = paper_manager.get_top_content()
+# Text of the first (highest-ranked) paper.
+# NOTE(review): raises IndexError when top_papers is empty (e.g. the fetch failed).
+PODCAST_SUBJECT = list(top_papers.values())[0]
+# -----------------------------------------------------------------------------
+# LLM that writes the script (unchanged)
+# -----------------------------------------------------------------------------
+from prompts import SYSTEM_PROMPT
+# Previous client configuration, kept for reference:
+# client = InferenceClient(
+#     "meta-llama/Llama-3.3-70B-Instruct",
+#     provider="cerebras",
+#     token=os.getenv("HF_TOKEN"),
+# )
+# Active client; the token is read from the HF_TOKEN environment variable
+# (os.getenv returns None when it is unset).
+client = InferenceClient(
+    "Qwen/Qwen3-32B",
+    provider="hf-inference",
+    token=os.getenv("HF_TOKEN"),
+)
+def generate_podcast_script(subject: str, steering_question: str | None = None) -> str:
+    """Ask the LLM for a script of a podcast given by two hosts.
+
+    Args:
+        subject: Source text to discuss; only its first 10000 characters are sent.
+        steering_question: Optional focus for the hosts. ``None`` or an empty
+            string means no extra instruction is added.
+
+    Returns:
+        The model output starting at the first ``[JANE]`` tag, i.e. with anything
+        the model wrote before the dialogue removed.
+
+    Raises:
+        ValueError: If the model output contains no ``[JANE]`` tag.
+    """
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": f"""Here is the topic: it's the top trending paper on Hugging Face daily papers today. You will need to analyze it by bringing profound insights.
+{subject[:10000]}"""},
+    ]
+    if steering_question:
+        messages.append({"role": "user", "content": f"You could focus on this question: {steering_question}"})
+    response = client.chat_completion(
+        messages,
+        max_tokens=8156,
+    )
+    # The content may be None; normalise so the tag search below cannot fail on it.
+    full_text = response.choices[0].message.content or ""
+    dialogue_start_index = full_text.find("[JANE]")
+    # Explicit check instead of `assert`: assertions are stripped under `python -O`,
+    # in which case find() == -1 would silently return only the last character.
+    if dialogue_start_index == -1:
+        raise ValueError("LLM response contains no '[JANE]' tag; cannot locate the podcast dialogue.")
+    return full_text[dialogue_start_index:]
+# -----------------------------------------------------------------------------
+# Kokoro TTS
+# -----------------------------------------------------------------------------
+CUDA_AVAILABLE = torch.cuda.is_available()
+# Run the TTS model on the GPU when one is visible, otherwise on the CPU.
+_tts_device = "cuda" if CUDA_AVAILABLE else "cpu"
+kmodel = KModel(repo_id='hexgrad/Kokoro-82M')
+kmodel = kmodel.to(_tts_device).eval()
+# lang_code "a" selects the English voices.
+kpipeline = KPipeline(lang_code="a")
+MALE_VOICE = "am_adam"
+FEMALE_VOICE = "af_heart"
+# Load both voices up front so the first request does not pay for it.
+kpipeline.load_voice(MALE_VOICE)
+kpipeline.load_voice(FEMALE_VOICE)
+@spaces.GPU
+def generate_podcast(topic: str):
+    """Generate the podcast for the default paper and stream it chunk by chunk.
+
+    The script is written by generate_podcast_script() from PODCAST_SUBJECT, with
+    ``topic`` passed as the optional steering question. Each non-empty script
+    line is voiced according to its leading tag: ``[MIKE]`` uses MALE_VOICE,
+    ``[JANE]`` uses FEMALE_VOICE, and untagged lines fall back to FEMALE_VOICE.
+
+    Args:
+        topic: Aspect of the paper the hosts should focus on (may be empty).
+
+    Yields:
+        ``(sample_rate, audio)`` tuples, one per chunk produced by the pipeline,
+        where ``sample_rate`` is 24000 and ``audio`` is a NumPy array.
+    """
+    podcast_script = generate_podcast_script(PODCAST_SUBJECT, topic)
+    script_lines = [row for row in podcast_script.strip().splitlines() if row.strip()]
+    # Speaker tag -> (voice name, loaded voice pack).
+    speakers = {
+        "[MIKE]": (MALE_VOICE, kpipeline.load_voice(MALE_VOICE)),
+        "[JANE]": (FEMALE_VOICE, kpipeline.load_voice(FEMALE_VOICE)),
+    }
+    speed = 1.
+    sr = 24000
+    for line in script_lines:
+        # Untagged lines are read by the female voice, text unchanged.
+        voice, voice_pack = speakers["[JANE]"]
+        utterance = line
+        for tag, speaker in speakers.items():
+            if line.startswith(tag):
+                voice, voice_pack = speaker
+                utterance = line[len(tag):].strip()
+                break
+        for _, ps, _ in kpipeline(utterance, voice, speed):
+            t0 = time.time()
+            # The voice pack is indexed by phoneme-sequence length.
+            ref_s = voice_pack[len(ps) - 1]
+            audio_numpy = kmodel(ps, ref_s, speed).numpy()
+            # Stop the clock BEFORE yielding: a generator is suspended at `yield`,
+            # so timing after it would also count the consumer's time.
+            elapsed = time.time() - t0
+            print(f"PROCESSED '{utterance}' in {int(elapsed)} seconds. {audio_numpy.shape}")
+            yield (sr, audio_numpy)
+# NOTE: EXAMPLES is not passed to the gr.Interface below, and its rows
+# (url, pdf path, question) do not match the single-textbox input.
+EXAMPLES = [
+    ["https://huggingface.co/blog/inference-providers-cohere", None, "How does using this compare with other inference solutions?"],
+    [None, str(Path("examples/Essay_Palantir.pdf")), "Make sure to keep some critic spirit in the analysis!"],
+]
+# Gradio UI: one optional steering question in, streamed WAV audio out.
+# The description names the model actually configured on `client` (Qwen/Qwen3-32B
+# through the hf-inference provider) and uses an absolute link to it.
+demo = gr.Interface(
+    title="Daily Paper Podcast 🎙️",
+    description=f"""Generates a podcast discussion between two hosts about today's top trending paper on Hugging Face: '**{next(iter(top_papers))}**'
+Based on [Kokoro TTS](https://huggingface.co/hexgrad/Kokoro-82M) and [Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) via the hf-inference provider.""",
+    fn=generate_podcast,
+    inputs=[
+        gr.Textbox(
+            label="🤔 Do you have a specific aspect of the paper you'd like the hosts to focus on?",
+            placeholder="You can leave this blank for a general discussion.",
+        ),
+    ],
+    outputs=[
+        gr.Audio(
+            label="Listen to your podcast! 🔊",
+            format="wav",
+            streaming=True,
+        ),
+    ],
+    theme=gr.themes.Soft(),
+    submit_btn="Generate podcast 🎙️",
+)
+if __name__ == "__main__":
+    demo.launch()

papers.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import os
+import requests
+import tempfile
+from datetime import datetime, timezone
+import base64
+from tqdm.auto import tqdm
+import pymupdf
+DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
+class PaperManager:
+    """Fetch Hugging Face daily papers, rank them and extract their PDF text."""
+    # Seconds to wait for any HTTP response before giving up.
+    REQUEST_TIMEOUT = 30
+    def __init__(self, papers_per_page=30):
+        # papers_per_page is accepted for backward compatibility; nothing in this
+        # class reads it.
+        self.papers = []  # Ranked selection produced by filter_top_papers()
+        self.raw_papers = []  # To store fetched data
+    def calculate_rising_score(self, paper):
+        """
+        Calculate the rising score of a paper: upvotes / (age in hours + 1).
+
+        ``paper`` is one item of the daily papers API; ``paper['paper']['upvotes']``
+        (default 0) and ``paper['publishedAt']`` (ISO 8601, default: now) are read.
+        A timestamp without timezone is interpreted as UTC, an unparsable one is
+        treated as "now", and a timestamp in the future counts as age 0 so the
+        denominator is always >= 1.
+        """
+        now = datetime.now(timezone.utc)
+        upvotes = paper.get('paper', {}).get('upvotes', 0)
+        published_at_str = paper.get('publishedAt') or now.isoformat()
+        try:
+            published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
+        except (AttributeError, ValueError):
+            published_time = now
+        # Subtracting a naive datetime from an aware one raises TypeError.
+        if published_time.tzinfo is None:
+            published_time = published_time.replace(tzinfo=timezone.utc)
+        time_diff_hours = (now - published_time).total_seconds() / 3600
+        # Clamp future timestamps: avoids a zero/negative denominator.
+        time_diff_hours = max(time_diff_hours, 0.0)
+        # Rising score favors papers that are gaining upvotes quickly
+        return upvotes / (time_diff_hours + 1)
+    def fetch_papers(self):
+        """
+        Download up to 100 daily papers into ``self.raw_papers``.
+
+        Returns True on success, False (after printing the reason) when the
+        request fails or the API returns no data.
+        """
+        try:
+            response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100", timeout=self.REQUEST_TIMEOUT)
+            response.raise_for_status()
+            data = response.json()
+            if not data:
+                print("No data received from API.")
+                return False
+            self.raw_papers = data  # Store raw data
+            return True
+        except requests.RequestException as e:
+            print(f"Error fetching papers: {e}")
+            return False
+        except Exception as e:
+            # Deliberate best-effort boundary: report and signal failure.
+            print(f"Unexpected error: {e}")
+            return False
+    def filter_top_papers(self, threshold_general=2.0, threshold_agent=0.7, top_n=2):
+        """
+        Rank ``self.raw_papers`` and keep the ``top_n`` best in ``self.papers``.
+
+        The ranking key is the rising score, tripled when the title contains
+        "agent" (case-insensitive). ``threshold_general`` and ``threshold_agent``
+        are currently unused: no score threshold is applied.
+        """
+        def boosted_score(paper):
+            # Prefer the top-level title; fall back to the nested one if absent.
+            title = paper.get('title') or paper.get('paper', {}).get('title') or ''
+            boost = 3 if 'agent' in title.lower() else 1
+            return self.calculate_rising_score(paper) * boost
+        self.papers = sorted(self.raw_papers, key=boosted_score, reverse=True)[:top_n]
+        return self.papers
+    def get_paper_text(self, paper_id):
+        """
+        Download the arXiv PDF for ``paper_id`` and return its concatenated page text.
+
+        Raises Exception when the download does not return HTTP 200.
+        """
+        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
+        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
+        if response.status_code != 200:
+            raise Exception(f"Failed to download PDF: {response.status_code}")
+        # Parse from memory: no fixed "temp.pdf" left in the working directory.
+        with pymupdf.open(stream=response.content, filetype="pdf") as doc:
+            return "".join(page.get_text() for page in doc)
+    def get_top_content(self):
+        """
+        Fetch, rank and download the top papers.
+
+        Returns a dict mapping paper title to extracted text, in ranking order;
+        empty when the daily papers could not be fetched.
+        """
+        if not self.fetch_papers():
+            # fetch_papers() already printed the reason; there is nothing to rank.
+            return {}
+        self.filter_top_papers()
+        contents = {}
+        print(f"Processing {len(self.papers)} papers:")
+        for paper in tqdm(self.papers):
+            paper_id = paper["paper"]['id']
+            contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
+        return contents

prompts.py ADDED Viewed

	@@ -0,0 +1,56 @@

+# System prompt taken from the great space by Gabriel Chua: https://huggingface.co/spaces/gabrielchua/open-notebooklm/blob/main/prompts.py
+# The "[JANE]" / "[MIKE]" line prefixes requested at the end of the prompt are load-bearing:
+# app.py and run_job.py pick the voice for each script line with str.startswith on these tags,
+# and app.py cuts the model output at the first "[JANE]" occurrence to find the dialogue.
+SYSTEM_PROMPT = """
+You are a world-class podcast producer tasked with transforming the provided input text into an engaging and informative podcast script. The input may be unstructured or messy, sourced from PDFs or web pages. Your goal is to extract the most interesting and insightful content for a compelling podcast discussion.
+# Steps to Follow:
+### 1. Analyze the Input:
+Carefully examine the text, identifying key topics, points, and interesting facts or anecdotes that could drive an engaging podcast conversation. Disregard irrelevant information or formatting issues.
+DO this under the <analysis> part
+### 2. Brainstorm Ideas:
+In the <scratchpad> part, creatively brainstorm ways to present the key points engagingly. Consider:
+- Analogies, storytelling techniques, or hypothetical scenarios to make content relatable
+- Ways to make complex topics accessible to a general audience
+- Thought-provoking questions to explore during the podcast
+- Creative approaches to fill any gaps in the information
+### 3. Craft the Dialogue:
+Develop a natural, conversational flow between the two hosts named Jane and Mike. Incorporate:
+- The best ideas from your brainstorming session
+- Clear explanations of complex topics
+- An engaging and lively tone to captivate listeners. Learning should be fun!
+- A balance of information and entertainment
+Rules for the dialogue:
+- The female host (Jane) always initiates the conversation and interviews the guest
+- Include thoughtful questions from the host to guide the discussion
+- Incorporate natural speech patterns, including occasional verbal fillers (e.g., "um," "well," "you know")
+- Allow for natural interruptions and back-and-forth between host and guest
+- Ensure the guest's responses are substantiated by the input text, avoiding unsupported claims
+- Maintain a PG-rated conversation appropriate for all audiences
+- The host concludes the conversation
+**Summarize Key Insights:**
+Naturally weave a summary of key points into the closing part of the dialogue. This should feel like a casual conversation rather than a formal recap, reinforcing the main takeaways before signing off.
+**Maintain Authenticity:**
+Throughout the script, strive for authenticity in the conversation. Include:
+- Moments of genuine curiosity or surprise from the host
+- Instances where one of the hosts might briefly struggle to articulate a complex idea
+- Light-hearted moments or humor when appropriate
+**Consider Pacing and Structure:
+Ensure the dialogue has a natural ebb and flow:
+- Start with a strong hook to grab the listener's attention
+- Gradually build complexity as the conversation progresses
+- Include brief "breather" moments for listeners to absorb complex information
+- End on a high note, perhaps with a thought-provoking question or a call-to-action for listeners
+TONE: The tone of the podcast should be casual.
+DURATION: Aim for a moderate length, about 3-5 minutes.
+IMPORTANT RULE: Each line of dialogue should go in a new line [JANE] or [MIKE], as follows:
+[JANE] Hello Mike, how are you?
+[MIKE] Nice to see you again, Jane. I'm very good. Today's topic is fascinating, because...
+Remember: Each turn from a host should be on the same line.
+"""

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+kokoro
+huggingface_hub
+transformers
+PyMuPDF
+soundfile
+numpy
+requests
+# json is part of the Python standard library; it is not a pip dependency and must not be listed here

run_job.py CHANGED Viewed

	@@ -1 +1,110 @@
-print("Hello, world!")

+from papers import PaperManager
+from app import generate_podcast_script, kmodel, kpipeline, MALE_VOICE, FEMALE_VOICE
+import soundfile as sf
+import numpy as np
+import argparse
+from huggingface_hub import HfApi
+import requests
+import json
+# topics = [folder for folder in os.listdir("podcasts") if os.path.isdir(os.path.join("podcasts", folder))]
+podcasts = {}
+# for topic in topics:
+#     topic_path = os.path.join("podcasts", topic)
+#     podcasts[topic] = sorted([f.replace(".md", "") for f in os.listdir(topic_path) if f.endswith(".md")], reverse=True)
+def submit_job(
+    repo_id: str,
+    inference_provider: str,
+    hf_token: str
+):
+    """Submit a job that runs ``python run_job.py`` in the podcast Space.
+
+    Args:
+        repo_id: Repository id forwarded to the job as ``--repo-id``. When empty
+            or ``None`` it defaults to ``"<username>/news-podcasts"``, where the
+            username is resolved from ``hf_token``.
+        inference_provider: Value forwarded to the job as ``--provider``.
+        hf_token: Hugging Face token used to authenticate the request; it is also
+            exported into the job environment.
+
+    Returns:
+        The raw response body of the job-creation request, as text.
+    """
+    # Configuration variables
+    username = HfApi(token=hf_token).whoami()["name"]  # Your HuggingFace username
+    space_id = "fdaudens/podcast-jobs"  # Your space ID
+    # Honour the caller's repo_id; fall back to the username-based one only when
+    # none was given (it used to be overwritten unconditionally).
+    repo_id = repo_id or f"{username}/news-podcasts"
+    flavor = "cpu-basic"  # Machine type
+    # Create the API request
+    url = f"https://huggingface.co/api/jobs/{username}"
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {hf_token}"
+    }
+    payload = {
+        "spaceId": space_id,
+        "command": ["python", "run_job.py"],
+        "arguments": [
+            "--provider", inference_provider,
+            "--repo-id", repo_id
+        ],
+        "environment": {
+            "HF_API_KEY": hf_token,
+            # app.py builds its InferenceClient from os.getenv("HF_TOKEN"), so the
+            # token must be exported under that name as well.
+            "HF_TOKEN": hf_token
+        },
+        "flavor": flavor
+    }
+    # Launch the job; the timeout keeps a stalled connection from hanging forever.
+    response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=30)
+    return response.text
+def main():
+    """Generate a podcast about the top daily paper and save it as ``podcast.wav``.
+
+    Steps: fetch the top papers, ask the LLM for a script about the first one,
+    synthesize every script line with the voice matching its ``[MIKE]`` /
+    ``[JANE]`` tag (untagged lines use the female voice), then concatenate the
+    audio and write it at 24000 Hz.
+
+    Raises:
+        SystemExit: If no paper content could be retrieved.
+    """
+    parser = argparse.ArgumentParser(description="Podcast job runner")
+    parser.add_argument("--provider", type=str, default="hf-inference")
+    parser.add_argument("--repo-id", type=str, default="fdaudens/news-podcasts")
+    parser.add_argument("--flavor", type=str, default="t4-medium")
+    args = parser.parse_args()
+    # NOTE: the parsed arguments are only echoed here; nothing below reads them.
+    print(f"Arguments: provider={args.provider}, repo_id={args.repo_id}, flavor={args.flavor}")
+    # 1. Get the most popular paper's content
+    paper_manager = PaperManager()
+    top_papers = paper_manager.get_top_content()
+    if not top_papers:
+        # Fail with a clear message and a non-zero exit code, not an IndexError.
+        raise SystemExit("No paper content could be retrieved; cannot generate a podcast.")
+    # Get the first (most popular) paper's text
+    subject = next(iter(top_papers.values()))
+    # 2. Generate the podcast script
+    podcast_script = generate_podcast_script(subject)
+    # 3. Synthesize the podcast audio
+    script_lines = [row for row in podcast_script.strip().splitlines() if row.strip()]
+    sr = 24000
+    speed = 1.0
+    # Speaker tag -> (voice name, loaded voice pack).
+    speakers = {
+        "[MIKE]": (MALE_VOICE, kpipeline.load_voice(MALE_VOICE)),
+        "[JANE]": (FEMALE_VOICE, kpipeline.load_voice(FEMALE_VOICE)),
+    }
+    audio_segments = []
+    for line in script_lines:
+        # Untagged lines are read by the female voice, text unchanged.
+        voice, voice_pack = speakers["[JANE]"]
+        utterance = line
+        for tag, speaker in speakers.items():
+            if line.startswith(tag):
+                voice, voice_pack = speaker
+                utterance = line[len(tag):].strip()
+                break
+        for _, ps, _ in kpipeline(utterance, voice, speed):
+            # The voice pack is indexed by phoneme-sequence length.
+            ref_s = voice_pack[len(ps) - 1]
+            audio_segments.append(kmodel(ps, ref_s, speed).numpy())
+    if not audio_segments:
+        print("No audio generated.")
+        return
+    # Concatenate all audio segments
+    full_audio = np.concatenate(audio_segments)
+    # 4. Save as WAV file
+    sf.write("podcast.wav", full_audio, sr)
+    print("Podcast audio saved as podcast.wav")
+if __name__ == "__main__":
+    main()