ngxson HF staff committed on
Commit
0de6d2d
·
1 Parent(s): e511c8c

change to static

Browse files
README.md CHANGED
@@ -3,10 +3,11 @@ title: Kokoro Podcast Generator
3
  emoji: 🦀
4
  colorFrom: indigo
5
  colorTo: pink
6
- sdk: gradio
7
- sdk_version: 5.16.0
8
- app_file: app.py
9
  pinned: false
 
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
3
  emoji: 🦀
4
  colorFrom: indigo
5
  colorTo: pink
6
+ sdk: static
 
 
7
  pinned: false
8
+ hf_oauth: true
9
+ hf_oauth_scopes:
10
+ - inference-api
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py DELETED
@@ -1,190 +0,0 @@
1
- import spaces
2
- from kokoro import KModel, KPipeline
3
- import gradio as gr
4
- import os
5
- import random
6
- import torch
7
- from urllib.parse import quote
8
-
9
- print(os.system("""
10
- cd front;
11
- npm ci;
12
- npm run build;
13
- cd ..;
14
- """))
15
-
16
- CHAR_LIMIT = 5000 # test
17
-
18
- SPACE_ID = os.environ.get('SPACE_ID')
19
- LLM_ENDPOINT = os.environ.get('LLM_ENDPOINT', 'null')
20
-
21
- CUDA_AVAILABLE = torch.cuda.is_available()
22
- models = {gpu: KModel().to('cuda' if gpu else 'cpu').eval() for gpu in [False] + ([True] if CUDA_AVAILABLE else [])}
23
- pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'ab'}
24
- pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
25
- pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
26
-
27
- gr.set_static_paths(paths=["./front/dist"])
28
-
29
- @spaces.GPU(duration=30)
30
- def forward_gpu(ps, ref_s, speed):
31
- return models[True](ps, ref_s, speed)
32
-
33
- def generate_first(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
34
- text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
35
- pipeline = pipelines[voice[0]]
36
- pack = pipeline.load_voice(voice)
37
- use_gpu = use_gpu and CUDA_AVAILABLE
38
- for _, ps, _ in pipeline(text, voice, speed):
39
- ref_s = pack[len(ps)-1]
40
- try:
41
- if use_gpu:
42
- audio = forward_gpu(ps, ref_s, speed)
43
- else:
44
- audio = models[False](ps, ref_s, speed)
45
- except gr.exceptions.Error as e:
46
- if use_gpu:
47
- gr.Warning(str(e))
48
- gr.Info('Retrying with CPU. To avoid this error, change Hardware to CPU.')
49
- audio = models[False](ps, ref_s, speed)
50
- else:
51
- raise gr.Error(e)
52
- return (24000, audio.numpy()), ps
53
- return None, ''
54
-
55
- # Arena API
56
- def predict(text, voice='af_heart', speed=1):
57
- return generate_first(text, voice, speed, use_gpu=False)[0]
58
-
59
- def tokenize_first(text, voice='af_heart'):
60
- pipeline = pipelines[voice[0]]
61
- for _, ps, _ in pipeline(text, voice):
62
- return ps
63
- return ''
64
-
65
- def generate_all(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
66
- text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
67
- pipeline = pipelines[voice[0]]
68
- pack = pipeline.load_voice(voice)
69
- use_gpu = use_gpu and CUDA_AVAILABLE
70
- first = True
71
- for _, ps, _ in pipeline(text, voice, speed):
72
- ref_s = pack[len(ps)-1]
73
- try:
74
- if use_gpu:
75
- audio = forward_gpu(ps, ref_s, speed)
76
- else:
77
- audio = models[False](ps, ref_s, speed)
78
- except gr.exceptions.Error as e:
79
- if use_gpu:
80
- gr.Warning(str(e))
81
- gr.Info('Switching to CPU')
82
- audio = models[False](ps, ref_s, speed)
83
- else:
84
- raise gr.Error(e)
85
- yield 24000, audio.numpy()
86
- if first:
87
- first = False
88
- yield 24000, torch.zeros(1).numpy()
89
-
90
- CHOICES = {
91
- '🇺🇸 🚺 Heart ❤️': 'af_heart',
92
- '🇺🇸 🚺 Bella 🔥': 'af_bella',
93
- '🇺🇸 🚺 Nicole 🎧': 'af_nicole',
94
- '🇺🇸 🚺 Aoede': 'af_aoede',
95
- '🇺🇸 🚺 Kore': 'af_kore',
96
- '🇺🇸 🚺 Sarah': 'af_sarah',
97
- '🇺🇸 🚺 Nova': 'af_nova',
98
- '🇺🇸 🚺 Sky': 'af_sky',
99
- '🇺🇸 🚺 Alloy': 'af_alloy',
100
- '🇺🇸 🚺 Jessica': 'af_jessica',
101
- '🇺🇸 🚺 River': 'af_river',
102
- '🇺🇸 🚹 Michael': 'am_michael',
103
- '🇺🇸 🚹 Fenrir': 'am_fenrir',
104
- '🇺🇸 🚹 Puck': 'am_puck',
105
- '🇺🇸 🚹 Echo': 'am_echo',
106
- '🇺🇸 🚹 Eric': 'am_eric',
107
- '🇺🇸 🚹 Liam': 'am_liam',
108
- '🇺🇸 🚹 Onyx': 'am_onyx',
109
- '🇺🇸 🚹 Santa': 'am_santa',
110
- '🇺🇸 🚹 Adam': 'am_adam',
111
- '🇬🇧 🚺 Emma': 'bf_emma',
112
- '🇬🇧 🚺 Isabella': 'bf_isabella',
113
- '🇬🇧 🚺 Alice': 'bf_alice',
114
- '🇬🇧 🚺 Lily': 'bf_lily',
115
- '🇬🇧 🚹 George': 'bm_george',
116
- '🇬🇧 🚹 Fable': 'bm_fable',
117
- '🇬🇧 🚹 Lewis': 'bm_lewis',
118
- '🇬🇧 🚹 Daniel': 'bm_daniel',
119
- }
120
- for v in CHOICES.values():
121
- pipelines[v[0]].load_voice(v)
122
-
123
- TOKEN_NOTE = '''
124
- 💡 Customize pronunciation with Markdown link syntax and /slashes/ like `[Kokoro](/kˈOkəɹO/)`
125
-
126
- 💬 To adjust intonation, try punctuation `;:,.!?—…"()“”` or stress `ˈ` and `ˌ`
127
-
128
- ⬇️ Lower stress `[1 level](-1)` or `[2 levels](-2)`
129
-
130
- ⬆️ Raise stress 1 level `[or](+2)` 2 levels (only works on less stressed, usually short words)
131
- '''
132
-
133
- with gr.Blocks() as generate_tab:
134
- out_audio = gr.Audio(label='Output Audio', interactive=False, streaming=False, autoplay=True)
135
- generate_btn = gr.Button('Generate', variant='primary')
136
- with gr.Accordion('Output Tokens', open=True):
137
- out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 context length.')
138
- tokenize_btn = gr.Button('Tokenize', variant='secondary')
139
- gr.Markdown(TOKEN_NOTE)
140
- predict_btn = gr.Button('Predict', variant='secondary', visible=False)
141
-
142
- STREAM_NOTE = ['⚠️ There is an unknown Gradio bug that might yield no audio the first time you click `Stream`.']
143
- if CHAR_LIMIT is not None:
144
- STREAM_NOTE.append(f'✂️ Each stream is capped at {CHAR_LIMIT} characters.')
145
- STREAM_NOTE.append('🚀 Want more characters? You can [use Kokoro directly](https://huggingface.co/hexgrad/Kokoro-82M#usage) or duplicate this space:')
146
- STREAM_NOTE = '\n\n'.join(STREAM_NOTE)
147
-
148
- with gr.Blocks() as stream_tab:
149
- out_stream = gr.Audio(label='Output Audio Stream', interactive=False, streaming=True, autoplay=True)
150
- with gr.Row():
151
- stream_btn = gr.Button('Stream', variant='primary')
152
- stop_btn = gr.Button('Stop', variant='stop')
153
- with gr.Accordion('Note', open=True):
154
- gr.Markdown(STREAM_NOTE)
155
- gr.DuplicateButton()
156
-
157
- API_NAME = 'tts'
158
-
159
-
160
- head = f'''
161
- <script>
162
- document.addEventListener('DOMContentLoaded', () => {{
163
- console.log('DOM content loaded');
164
- if (!localStorage.getItem('debug') && !window.location.href.match(/debug=1/)) {{
165
- console.log('Attaching frontend app');
166
- const frontendApp = document.createElement('iframe');
167
- frontendApp.src = '/gradio_api/file=./front/dist/index.html?SPACE_ID={quote(SPACE_ID)}&LLM_ENDPOINT={quote(LLM_ENDPOINT)}';
168
- frontendApp.style = 'position: fixed; top: 0; left: 0; width: 100%; height: 100%; border: none; z-index: 999999;';
169
- document.body.appendChild(frontendApp);
170
- }}
171
- }});
172
- </script>
173
- '''
174
-
175
- with gr.Blocks(head=head) as app:
176
- with gr.Row():
177
- with gr.Column():
178
- text = gr.Textbox(label='Input Text', info=f"Up to ~500 characters per Generate, or {'∞' if CHAR_LIMIT is None else CHAR_LIMIT} characters per Stream")
179
- voice = gr.Dropdown(list(CHOICES.items()), value='af_heart', label='Voice', info='Quality and availability vary by language')
180
- speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='Speed')
181
- with gr.Column():
182
- gr.TabbedInterface([generate_tab, stream_tab], ['Generate', 'Stream'])
183
- generate_btn.click(fn=generate_first, inputs=[text, voice, speed], outputs=[out_audio, out_ps], api_name=API_NAME)
184
- tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps], api_name=API_NAME)
185
- stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed], outputs=[out_stream], api_name=API_NAME)
186
- stop_btn.click(fn=None, cancels=stream_event)
187
- predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio], api_name=API_NAME)
188
-
189
- if __name__ == '__main__':
190
- app.queue(api_open=True).launch(show_api=True, ssr_mode=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
front/package-lock.json CHANGED
@@ -9,6 +9,7 @@
9
  "version": "0.0.0",
10
  "dependencies": {
11
  "@gradio/client": "^1.12.0",
 
12
  "@sec-ant/readable-stream": "^0.6.0",
13
  "autoprefixer": "^10.4.20",
14
  "base64-arraybuffer": "^1.0.2",
@@ -954,6 +955,24 @@
954
  "node": ">=18.0.0"
955
  }
956
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
957
  "node_modules/@humanfs/core": {
958
  "version": "0.19.1",
959
  "resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz",
 
9
  "version": "0.0.0",
10
  "dependencies": {
11
  "@gradio/client": "^1.12.0",
12
+ "@huggingface/hub": "^1.0.1",
13
  "@sec-ant/readable-stream": "^0.6.0",
14
  "autoprefixer": "^10.4.20",
15
  "base64-arraybuffer": "^1.0.2",
 
955
  "node": ">=18.0.0"
956
  }
957
  },
958
+ "node_modules/@huggingface/hub": {
959
+ "version": "1.0.1",
960
+ "resolved": "https://registry.npmjs.org/@huggingface/hub/-/hub-1.0.1.tgz",
961
+ "integrity": "sha512-wogGVETaNUV/wYBkny0uQD48L0rK9cttVtbaA1Rw/pGCuSYoZ8YlvTV6zymsGJfXaxQU8zup0aOR2XLIf6HVfg==",
962
+ "license": "MIT",
963
+ "dependencies": {
964
+ "@huggingface/tasks": "^0.15.9"
965
+ },
966
+ "engines": {
967
+ "node": ">=18"
968
+ }
969
+ },
970
+ "node_modules/@huggingface/tasks": {
971
+ "version": "0.15.9",
972
+ "resolved": "https://registry.npmjs.org/@huggingface/tasks/-/tasks-0.15.9.tgz",
973
+ "integrity": "sha512-cbnZcpMHKdhURWIplVP4obHxAZcxjyRm0zI7peTPksZN4CtIOMmJC4ZqGEymo0lk+0VNkXD7ULwFJ3JjT/VpkQ==",
974
+ "license": "MIT"
975
+ },
976
  "node_modules/@humanfs/core": {
977
  "version": "0.19.1",
978
  "resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz",
front/package.json CHANGED
@@ -5,13 +5,14 @@
5
  "type": "module",
6
  "scripts": {
7
  "dev": "vite",
8
- "build": "tsc -b && vite build",
9
  "lint": "eslint .",
10
  "format": "npm run lint && prettier --write .",
11
  "preview": "vite preview"
12
  },
13
  "dependencies": {
14
  "@gradio/client": "^1.12.0",
 
15
  "@sec-ant/readable-stream": "^0.6.0",
16
  "autoprefixer": "^10.4.20",
17
  "base64-arraybuffer": "^1.0.2",
 
5
  "type": "module",
6
  "scripts": {
7
  "dev": "vite",
8
+ "build": "tsc -b && vite build && cp ./dist/index.html ../index.html",
9
  "lint": "eslint .",
10
  "format": "npm run lint && prettier --write .",
11
  "preview": "vite preview"
12
  },
13
  "dependencies": {
14
  "@gradio/client": "^1.12.0",
15
+ "@huggingface/hub": "^1.0.1",
16
  "@sec-ant/readable-stream": "^0.6.0",
17
  "autoprefixer": "^10.4.20",
18
  "base64-arraybuffer": "^1.0.2",
front/src/App.tsx CHANGED
@@ -2,6 +2,7 @@ import { OpenInNewTab } from './utils/common';
2
  import { PodcastGenerator } from './components/PodcastGenerator';
3
  import { useState } from 'react';
4
  import { ScriptMaker } from './components/ScriptMaker';
 
5
 
6
  function App() {
7
  const [genratedScript, setGeneratedScript] = useState<string>('');
@@ -20,6 +21,8 @@ function App() {
20
  </p>
21
  </div>
22
 
 
 
23
  <ScriptMaker
24
  setScript={setGeneratedScript}
25
  setBusy={setBusy}
 
2
  import { PodcastGenerator } from './components/PodcastGenerator';
3
  import { useState } from 'react';
4
  import { ScriptMaker } from './components/ScriptMaker';
5
+ import { AuthCard } from './components/AuthCard';
6
 
7
  function App() {
8
  const [genratedScript, setGeneratedScript] = useState<string>('');
 
21
  </p>
22
  </div>
23
 
24
+ <AuthCard />
25
+
26
  <ScriptMaker
27
  setScript={setGeneratedScript}
28
  setBusy={setBusy}
front/src/components/AuthCard.tsx ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { oauthLoginUrl, oauthHandleRedirectIfPresent } from "@huggingface/hub";
2
+
3
+ const login = async () => {
4
+ const url = await oauthLoginUrl();
5
+ window.location.href = url;
6
+ }
7
+
8
+ export const AuthCard = () => {
9
+ return <div className="card bg-base-100 w-full shadow-xl">
10
+ <div className="card-body">
11
+ <h2 className="card-title">Step 0: Sign in to use Inference Providers</h2>
12
+ <div>
13
+ <button className="btn btn-primary" onClick={login}>🤗 Sign in with Hugging Face</button>
14
+ </div>
15
+ </div>
16
+ </div>
17
+ }
front/src/components/PodcastGenerator.tsx CHANGED
@@ -132,7 +132,7 @@ export const PodcastGenerator = ({
132
  if (i === 0) {
133
  outputWav = step.audioBuffer;
134
  const openingSound = await loadWavAndDecode(openingSoundSrc);
135
- outputWav = joinAudio(openingSound, outputWav!, -1);
136
  } else {
137
  const lastStep = steps[i - 1];
138
  outputWav = joinAudio(
 
132
  if (i === 0) {
133
  outputWav = step.audioBuffer;
134
  const openingSound = await loadWavAndDecode(openingSoundSrc);
135
+ outputWav = joinAudio(openingSound, outputWav!, -2);
136
  } else {
137
  const lastStep = steps[i - 1];
138
  outputWav = joinAudio(
front/src/utils/prompts.ts CHANGED
@@ -16,6 +16,7 @@ Some rules:
16
  - First turns should be the introduction for the theme and speakers.
17
  - The script will be passed to TTS engine, make sure to write plain pronunciation, for example the www. must pronounced like "www dot". Do NOT add anything strange, do NOT add facial expression in the text.
18
  - Only use base ASCII, do NOT use ALL CAPS, strings are wrapped inside "..."
 
19
 
20
  There is an example (it is truncated):
21
 
 
16
  - First turns should be the introduction for the theme and speakers.
17
  - The script will be passed to TTS engine, make sure to write plain pronunciation, for example the www. must pronounced like "www dot". Do NOT add anything strange, do NOT add facial expression in the text.
18
  - Only use base ASCII, do NOT use ALL CAPS, strings are wrapped inside "..."
19
+ - In the first turn, you must introduce the subject and speakers. Make up a story about the speakers, how they know each other, and why they are talking about the subject.
20
 
21
  There is an example (it is truncated):
22
 
front/src/utils/utils.ts CHANGED
@@ -112,7 +112,8 @@ export const trimSilence = (audioBuffer: AudioBuffer): AudioBuffer => {
112
  export const joinAudio = (
113
  audio1: AudioBuffer,
114
  audio2: AudioBuffer,
115
- gapSeconds: number
 
116
  ): AudioBuffer => {
117
  const sampleRate = audio1.sampleRate;
118
  const numChannels = audio1.numberOfChannels;
@@ -175,12 +176,18 @@ export const joinAudio = (
175
  offset += nonOverlapLength;
176
 
177
  // Blend overlapping region.
178
- for (let i = 0; i < effectiveOverlap; i++) {
179
- // Linear crossfade:
180
- const fadeOut = 1 - i / effectiveOverlap;
181
- const fadeIn = i / effectiveOverlap;
182
- outputData[offset + i] =
183
- data1[nonOverlapLength + i] * fadeOut + data2[i] * fadeIn;
 
 
 
 
 
 
184
  }
185
  offset += effectiveOverlap;
186
 
 
112
  export const joinAudio = (
113
  audio1: AudioBuffer,
114
  audio2: AudioBuffer,
115
+ gapSeconds: number,
116
+ overlap: 'none' | 'cross-fade' = 'none'
117
  ): AudioBuffer => {
118
  const sampleRate = audio1.sampleRate;
119
  const numChannels = audio1.numberOfChannels;
 
176
  offset += nonOverlapLength;
177
 
178
  // Blend overlapping region.
179
+ if (overlap === 'cross-fade') {
180
+ for (let i = 0; i < effectiveOverlap; i++) {
181
+ // Linear crossfade:
182
+ const fadeOut = 1 - i / effectiveOverlap;
183
+ const fadeIn = i / effectiveOverlap;
184
+ outputData[offset + i] =
185
+ data1[nonOverlapLength + i] * fadeOut + data2[i] * fadeIn;
186
+ }
187
+ } else {
188
+ for (let i = 0; i < effectiveOverlap; i++) {
189
+ outputData[offset + i] = data1[nonOverlapLength + i] + data2[i];
190
+ }
191
  }
192
  offset += effectiveOverlap;
193
 
packages.txt DELETED
@@ -1,2 +0,0 @@
1
- espeak-ng
2
- nodejs
 
 
 
requirements.txt DELETED
@@ -1 +0,0 @@
1
- kokoro>=0.7.16