Spaces:

ngxson
/

kokoro-podcast-generator

Running

App Files Community

ngxson HF Staff committed on Feb 17

Commit

0de6d2d

1 Parent(s): e511c8c

change to static

Browse files

Files changed (11) hide show

README.md +4 -3
app.py +0 -190
front/package-lock.json +19 -0
front/package.json +2 -1
front/src/App.tsx +3 -0
front/src/components/AuthCard.tsx +17 -0
front/src/components/PodcastGenerator.tsx +1 -1
front/src/utils/prompts.ts +1 -0
front/src/utils/utils.ts +14 -7
packages.txt +0 -2
requirements.txt +0 -1

README.md CHANGED Viewed

@@ -3,10 +3,11 @@ title: Kokoro Podcast Generator
 emoji: 🦀
 colorFrom: indigo
 colorTo: pink
-sdk: gradio
-sdk_version: 5.16.0
-app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 emoji: 🦀
 colorFrom: indigo
 colorTo: pink
+sdk: static
 pinned: false
+hf_oauth: true
+hf_oauth_scopes:
+  - inference-api
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py DELETED Viewed

@@ -1,190 +0,0 @@
-import spaces
-from kokoro import KModel, KPipeline
-import gradio as gr
-import os
-import random
-import torch
-from urllib.parse import quote
-print(os.system("""
-cd front;
-npm ci;
-npm run build;
-cd ..;
-"""))
-CHAR_LIMIT = 5000 # test
-SPACE_ID = os.environ.get('SPACE_ID')
-LLM_ENDPOINT = os.environ.get('LLM_ENDPOINT', 'null')
-CUDA_AVAILABLE = torch.cuda.is_available()
-models = {gpu: KModel().to('cuda' if gpu else 'cpu').eval() for gpu in [False] + ([True] if CUDA_AVAILABLE else [])}
-pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'ab'}
-pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
-pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
-gr.set_static_paths(paths=["./front/dist"])
-@spaces.GPU(duration=30)
-def forward_gpu(ps, ref_s, speed):
-    return models[True](ps, ref_s, speed)
-def generate_first(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
-    text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
-    pipeline = pipelines[voice[0]]
-    pack = pipeline.load_voice(voice)
-    use_gpu = use_gpu and CUDA_AVAILABLE
-    for _, ps, _ in pipeline(text, voice, speed):
-        ref_s = pack[len(ps)-1]
-        try:
-            if use_gpu:
-                audio = forward_gpu(ps, ref_s, speed)
-            else:
-                audio = models[False](ps, ref_s, speed)
-        except gr.exceptions.Error as e:
-            if use_gpu:
-                gr.Warning(str(e))
-                gr.Info('Retrying with CPU. To avoid this error, change Hardware to CPU.')
-                audio = models[False](ps, ref_s, speed)
-            else:
-                raise gr.Error(e)
-        return (24000, audio.numpy()), ps
-    return None, ''
-# Arena API
-def predict(text, voice='af_heart', speed=1):
-    return generate_first(text, voice, speed, use_gpu=False)[0]
-def tokenize_first(text, voice='af_heart'):
-    pipeline = pipelines[voice[0]]
-    for _, ps, _ in pipeline(text, voice):
-        return ps
-    return ''
-def generate_all(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
-    text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
-    pipeline = pipelines[voice[0]]
-    pack = pipeline.load_voice(voice)
-    use_gpu = use_gpu and CUDA_AVAILABLE
-    first = True
-    for _, ps, _ in pipeline(text, voice, speed):
-        ref_s = pack[len(ps)-1]
-        try:
-            if use_gpu:
-                audio = forward_gpu(ps, ref_s, speed)
-            else:
-                audio = models[False](ps, ref_s, speed)
-        except gr.exceptions.Error as e:
-            if use_gpu:
-                gr.Warning(str(e))
-                gr.Info('Switching to CPU')
-                audio = models[False](ps, ref_s, speed)
-            else:
-                raise gr.Error(e)
-        yield 24000, audio.numpy()
-        if first:
-            first = False
-            yield 24000, torch.zeros(1).numpy()
-CHOICES = {
-'🇺🇸 🚺 Heart ❤️': 'af_heart',
-'🇺🇸 🚺 Bella 🔥': 'af_bella',
-'🇺🇸 🚺 Nicole 🎧': 'af_nicole',
-'🇺🇸 🚺 Aoede': 'af_aoede',
-'🇺🇸 🚺 Kore': 'af_kore',
-'🇺🇸 🚺 Sarah': 'af_sarah',
-'🇺🇸 🚺 Nova': 'af_nova',
-'🇺🇸 🚺 Sky': 'af_sky',
-'🇺🇸 🚺 Alloy': 'af_alloy',
-'🇺🇸 🚺 Jessica': 'af_jessica',
-'🇺🇸 🚺 River': 'af_river',
-'🇺🇸 🚹 Michael': 'am_michael',
-'🇺🇸 🚹 Fenrir': 'am_fenrir',
-'🇺🇸 🚹 Puck': 'am_puck',
-'🇺🇸 🚹 Echo': 'am_echo',
-'🇺🇸 🚹 Eric': 'am_eric',
-'🇺🇸 🚹 Liam': 'am_liam',
-'🇺🇸 🚹 Onyx': 'am_onyx',
-'🇺🇸 🚹 Santa': 'am_santa',
-'🇺🇸 🚹 Adam': 'am_adam',
-'🇬🇧 🚺 Emma': 'bf_emma',
-'🇬🇧 🚺 Isabella': 'bf_isabella',
-'🇬🇧 🚺 Alice': 'bf_alice',
-'🇬🇧 🚺 Lily': 'bf_lily',
-'🇬🇧 🚹 George': 'bm_george',
-'🇬🇧 🚹 Fable': 'bm_fable',
-'🇬🇧 🚹 Lewis': 'bm_lewis',
-'🇬🇧 🚹 Daniel': 'bm_daniel',
-}
-for v in CHOICES.values():
-    pipelines[v[0]].load_voice(v)
-TOKEN_NOTE = '''
-💡 Customize pronunciation with Markdown link syntax and /slashes/ like `[Kokoro](/kˈOkəɹO/)`
-💬 To adjust intonation, try punctuation `;:,.!?—…"()“”` or stress `ˈ` and `ˌ`
-⬇️ Lower stress `[1 level](-1)` or `[2 levels](-2)`
-⬆️ Raise stress 1 level `[or](+2)` 2 levels (only works on less stressed, usually short words)
-'''
-with gr.Blocks() as generate_tab:
-    out_audio = gr.Audio(label='Output Audio', interactive=False, streaming=False, autoplay=True)
-    generate_btn = gr.Button('Generate', variant='primary')
-    with gr.Accordion('Output Tokens', open=True):
-        out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 context length.')
-        tokenize_btn = gr.Button('Tokenize', variant='secondary')
-        gr.Markdown(TOKEN_NOTE)
-        predict_btn = gr.Button('Predict', variant='secondary', visible=False)
-STREAM_NOTE = ['⚠️ There is an unknown Gradio bug that might yield no audio the first time you click `Stream`.']
-if CHAR_LIMIT is not None:
-    STREAM_NOTE.append(f'✂️ Each stream is capped at {CHAR_LIMIT} characters.')
-    STREAM_NOTE.append('🚀 Want more characters? You can [use Kokoro directly](https://huggingface.co/hexgrad/Kokoro-82M#usage) or duplicate this space:')
-STREAM_NOTE = '\n\n'.join(STREAM_NOTE)
-with gr.Blocks() as stream_tab:
-    out_stream = gr.Audio(label='Output Audio Stream', interactive=False, streaming=True, autoplay=True)
-    with gr.Row():
-        stream_btn = gr.Button('Stream', variant='primary')
-        stop_btn = gr.Button('Stop', variant='stop')
-    with gr.Accordion('Note', open=True):
-        gr.Markdown(STREAM_NOTE)
-        gr.DuplicateButton()
-API_NAME = 'tts'
-head = f'''
-<script>
-    document.addEventListener('DOMContentLoaded', () => {{
-        console.log('DOM content loaded');
-        if (!localStorage.getItem('debug') && !window.location.href.match(/debug=1/)) {{
-            console.log('Attaching frontend app');
-            const frontendApp = document.createElement('iframe');
-            frontendApp.src = '/gradio_api/file=./front/dist/index.html?SPACE_ID={quote(SPACE_ID)}&LLM_ENDPOINT={quote(LLM_ENDPOINT)}';
-            frontendApp.style = 'position: fixed; top: 0; left: 0; width: 100%; height: 100%; border: none; z-index: 999999;';
-            document.body.appendChild(frontendApp);
-        }}
-    }});
-</script>
-'''
-with gr.Blocks(head=head) as app:
-    with gr.Row():
-        with gr.Column():
-            text = gr.Textbox(label='Input Text', info=f"Up to ~500 characters per Generate, or {'∞' if CHAR_LIMIT is None else CHAR_LIMIT} characters per Stream")
-            voice = gr.Dropdown(list(CHOICES.items()), value='af_heart', label='Voice', info='Quality and availability vary by language')
-            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='Speed')
-        with gr.Column():
-            gr.TabbedInterface([generate_tab, stream_tab], ['Generate', 'Stream'])
-    generate_btn.click(fn=generate_first, inputs=[text, voice, speed], outputs=[out_audio, out_ps], api_name=API_NAME)
-    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps], api_name=API_NAME)
-    stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed], outputs=[out_stream], api_name=API_NAME)
-    stop_btn.click(fn=None, cancels=stream_event)
-    predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio], api_name=API_NAME)
-if __name__ == '__main__':
-    app.queue(api_open=True).launch(show_api=True, ssr_mode=True)

front/package-lock.json CHANGED Viewed

@@ -9,6 +9,7 @@
       "version": "0.0.0",
       "dependencies": {
         "@gradio/client": "^1.12.0",
         "@sec-ant/readable-stream": "^0.6.0",
         "autoprefixer": "^10.4.20",
         "base64-arraybuffer": "^1.0.2",
@@ -954,6 +955,24 @@
         "node": ">=18.0.0"
       }
     },
     "node_modules/@humanfs/core": {
       "version": "0.19.1",
       "resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz",

       "version": "0.0.0",
       "dependencies": {
         "@gradio/client": "^1.12.0",
+        "@huggingface/hub": "^1.0.1",
         "@sec-ant/readable-stream": "^0.6.0",
         "autoprefixer": "^10.4.20",
         "base64-arraybuffer": "^1.0.2",
         "node": ">=18.0.0"
       }
     },
+    "node_modules/@huggingface/hub": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/@huggingface/hub/-/hub-1.0.1.tgz",
+      "integrity": "sha512-wogGVETaNUV/wYBkny0uQD48L0rK9cttVtbaA1Rw/pGCuSYoZ8YlvTV6zymsGJfXaxQU8zup0aOR2XLIf6HVfg==",
+      "license": "MIT",
+      "dependencies": {
+        "@huggingface/tasks": "^0.15.9"
+      },
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@huggingface/tasks": {
+      "version": "0.15.9",
+      "resolved": "https://registry.npmjs.org/@huggingface/tasks/-/tasks-0.15.9.tgz",
+      "integrity": "sha512-cbnZcpMHKdhURWIplVP4obHxAZcxjyRm0zI7peTPksZN4CtIOMmJC4ZqGEymo0lk+0VNkXD7ULwFJ3JjT/VpkQ==",
+      "license": "MIT"
+    },
     "node_modules/@humanfs/core": {
       "version": "0.19.1",
       "resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz",

front/package.json CHANGED Viewed

@@ -5,13 +5,14 @@
   "type": "module",
   "scripts": {
     "dev": "vite",
-    "build": "tsc -b && vite build",
     "lint": "eslint .",
     "format": "npm run lint && prettier --write .",
     "preview": "vite preview"
   },
   "dependencies": {
     "@gradio/client": "^1.12.0",
     "@sec-ant/readable-stream": "^0.6.0",
     "autoprefixer": "^10.4.20",
     "base64-arraybuffer": "^1.0.2",

   "type": "module",
   "scripts": {
     "dev": "vite",
+    "build": "tsc -b && vite build && cp ./dist/index.html ../index.html",
     "lint": "eslint .",
     "format": "npm run lint && prettier --write .",
     "preview": "vite preview"
   },
   "dependencies": {
     "@gradio/client": "^1.12.0",
+    "@huggingface/hub": "^1.0.1",
     "@sec-ant/readable-stream": "^0.6.0",
     "autoprefixer": "^10.4.20",
     "base64-arraybuffer": "^1.0.2",

front/src/App.tsx CHANGED Viewed

@@ -2,6 +2,7 @@ import { OpenInNewTab } from './utils/common';
 import { PodcastGenerator } from './components/PodcastGenerator';
 import { useState } from 'react';
 import { ScriptMaker } from './components/ScriptMaker';
 function App() {
   const [genratedScript, setGeneratedScript] = useState<string>('');
@@ -20,6 +21,8 @@ function App() {
           </p>
         </div>
         <ScriptMaker
           setScript={setGeneratedScript}
           setBusy={setBusy}

 import { PodcastGenerator } from './components/PodcastGenerator';
 import { useState } from 'react';
 import { ScriptMaker } from './components/ScriptMaker';
+import { AuthCard } from './components/AuthCard';
 function App() {
   const [genratedScript, setGeneratedScript] = useState<string>('');
           </p>
         </div>
+        <AuthCard />
         <ScriptMaker
           setScript={setGeneratedScript}
           setBusy={setBusy}

front/src/components/AuthCard.tsx ADDED Viewed

	@@ -0,0 +1,17 @@

+import { oauthLoginUrl, oauthHandleRedirectIfPresent } from "@huggingface/hub";
+const login = async () => {
+  const url = await oauthLoginUrl();
+  window.location.href = url;
+}
+export const AuthCard = () => {
+  return <div className="card bg-base-100 w-full shadow-xl">
+      <div className="card-body">
+        <h2 className="card-title">Step 0: Sign in to use Inference Providers</h2>
+        <div>
+          <button className="btn btn-primary" onClick={login}>🤗 Sign in with Hugging Face</button>
+        </div>
+    </div>
+  </div>
+}

front/src/components/PodcastGenerator.tsx CHANGED Viewed

@@ -132,7 +132,7 @@ export const PodcastGenerator = ({
         if (i === 0) {
           outputWav = step.audioBuffer;
           const openingSound = await loadWavAndDecode(openingSoundSrc);
-          outputWav = joinAudio(openingSound, outputWav!, -1);
         } else {
           const lastStep = steps[i - 1];
           outputWav = joinAudio(

         if (i === 0) {
           outputWav = step.audioBuffer;
           const openingSound = await loadWavAndDecode(openingSoundSrc);
+          outputWav = joinAudio(openingSound, outputWav!, -2);
         } else {
           const lastStep = steps[i - 1];
           outputWav = joinAudio(

front/src/utils/prompts.ts CHANGED Viewed

@@ -16,6 +16,7 @@ Some rules:
 - First turns should be the introduction for the theme and speakers.
 - The script will be passed to TTS engine, make sure to write plain pronunciation, for example the www. must pronounced like "www dot". Do NOT add anything strange, do NOT add facial expression in the text.
 - Only use base ASCII, do NOT use ALL CAPS, strings are wrapped inside "..."
 There is an example (it is truncated):

 - First turns should be the introduction for the theme and speakers.
 - The script will be passed to TTS engine, make sure to write plain pronunciation, for example the www. must pronounced like "www dot". Do NOT add anything strange, do NOT add facial expression in the text.
 - Only use base ASCII, do NOT use ALL CAPS, strings are wrapped inside "..."
+- In the first turn, you must introduce the subject and speakers. Make up a story about the speakers, how they know each other, and why they are talking about the subject.
 There is an example (it is truncated):

front/src/utils/utils.ts CHANGED Viewed

@@ -112,7 +112,8 @@ export const trimSilence = (audioBuffer: AudioBuffer): AudioBuffer => {
 export const joinAudio = (
   audio1: AudioBuffer,
   audio2: AudioBuffer,
-  gapSeconds: number
 ): AudioBuffer => {
   const sampleRate = audio1.sampleRate;
   const numChannels = audio1.numberOfChannels;
@@ -175,12 +176,18 @@ export const joinAudio = (
       offset += nonOverlapLength;
       // Blend overlapping region.
-      for (let i = 0; i < effectiveOverlap; i++) {
-        // Linear crossfade:
-        const fadeOut = 1 - i / effectiveOverlap;
-        const fadeIn = i / effectiveOverlap;
-        outputData[offset + i] =
-          data1[nonOverlapLength + i] * fadeOut + data2[i] * fadeIn;
       }
       offset += effectiveOverlap;

 export const joinAudio = (
   audio1: AudioBuffer,
   audio2: AudioBuffer,
+  gapSeconds: number,
+  overlap: 'none' | 'cross-fade' = 'none'
 ): AudioBuffer => {
   const sampleRate = audio1.sampleRate;
   const numChannels = audio1.numberOfChannels;
       offset += nonOverlapLength;
       // Blend overlapping region.
+      if (overlap === 'cross-fade') {
+        for (let i = 0; i < effectiveOverlap; i++) {
+          // Linear crossfade:
+          const fadeOut = 1 - i / effectiveOverlap;
+          const fadeIn = i / effectiveOverlap;
+          outputData[offset + i] =
+            data1[nonOverlapLength + i] * fadeOut + data2[i] * fadeIn;
+        }
+      } else {
+        for (let i = 0; i < effectiveOverlap; i++) {
+          outputData[offset + i] = data1[nonOverlapLength + i] + data2[i];
+        }
       }
       offset += effectiveOverlap;

packages.txt DELETED Viewed

	@@ -1,2 +0,0 @@
1	- espeak-ng
2	- nodejs

requirements.txt DELETED Viewed

	@@ -1 +0,0 @@
1	- kokoro>=0.7.16