jonathanagustin committed
Commit 61c0f31 · 1 Parent(s): 52c89af

feat(space): integrate microsoft/VibeVoice-1.5B with in-Python download


- Use the official `vibevoice` package with `from_pretrained()` (no separate `hf download` step)
- Add a minimal Gradio UI with live streaming via `AudioStreamer`
- Support 1–4 voice samples; normalize unlabeled script lines to `Speaker i` (see the sketch below)
- Fall back to SDPA if flash_attention_2 is unavailable
- Keep requirements lightweight (unpinned) for Spaces
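
Unlabeled lines are assigned round-robin by line index over the uploaded voice samples; lines that already start with `Speaker N:` are kept as-is. A standalone sketch of that rule, mirroring the loop in `app.py` (the sample script here is made up):

```python
# Sketch of the Speaker-label normalization used in app.py.
def normalize(script: str, num_voices: int) -> str:
    lines = []
    for i, raw in enumerate(ln for ln in script.splitlines() if ln.strip()):
        if raw.lower().startswith("speaker") and ":" in raw:
            lines.append(raw)  # already labeled; keep as-is
        else:
            # Unlabeled lines rotate through the available voices by line index.
            lines.append(f"Speaker {i % max(1, num_voices)}: {raw}")
    return "\n".join(lines)

print(normalize("Hello there.\nSpeaker 1: Hi!\nHow are you?", num_voices=2))
# Speaker 0: Hello there.
# Speaker 1: Hi!
# Speaker 0: How are you?
```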

Files changed (2)
  1. app.py +286 -4
  2. requirements.txt +7 -0
app.py CHANGED
@@ -1,7 +1,289 @@
+import os
+import time
+import threading
+from pathlib import Path
+from typing import Iterator
+
 import gradio as gr
-
-def greet(name):
-    return "Hello " + name + "!!"
-
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()
+import numpy as np
+import soundfile as sf
+import librosa
+import torch
+from transformers import set_seed
+
+from vibevoice.modular.modeling_vibevoice_inference import (
+    VibeVoiceForConditionalGenerationInference,
+)
+from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
+from vibevoice.modular.streamer import AudioStreamer
+
+MODEL_ID = "microsoft/VibeVoice-1.5B"
+
+def convert_to_16bit(data: np.ndarray) -> np.ndarray:
+    if torch.is_tensor(data):
+        data = data.detach().cpu().numpy()
+    data = np.array(data, dtype=np.float32, copy=False)
+    amax = np.max(np.abs(data)) if data.size else 1.0
+    if amax > 1.0:
+        data = data / amax
+    return (data * 32767.0).astype(np.int16)
+
+def read_audio(path: str, target_sr: int = 24000) -> np.ndarray:
+    wav, sr = sf.read(path)
+    if wav.ndim > 1:
+        wav = wav.mean(axis=1)
+    if sr != target_sr:
+        wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
+    return wav.astype(np.float32)
+
+class VibeMiniDemo:
+    def __init__(self, model_path: str, device: str = "cuda", inference_steps: int = 10):
+        self.model_path = model_path
+        self.device = device
+        self.inference_steps = inference_steps
+        self._stop = False
+        self._streamer = None
+        self._load()
+
+    def _load(self):
+        print(f"🔄 Loading VibeVoice from {self.model_path} ...")
+        # Processor pulls tokenizer/config from HF automatically if model_path is a repo id
+        self.processor = VibeVoiceProcessor.from_pretrained(self.model_path)
+
+        # Try flash-attn2 first; fall back to SDPA if the env doesn't have it
+        try:
+            self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
+                self.model_path,
+                torch_dtype=torch.bfloat16,
+                device_map="cuda" if torch.cuda.is_available() else "cpu",
+                attn_implementation="flash_attention_2",
+            )
+        except Exception as e:
+            print(f"⚠️ flash_attention_2 unavailable ({type(e).__name__}: {e}); falling back to SDPA")
+            self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
+                self.model_path,
+                torch_dtype=torch.bfloat16,
+                device_map="cuda" if torch.cuda.is_available() else "cpu",
+                attn_implementation="sdpa",
+            )
+        self.model.eval()
+
+        # Configure diffusion steps (matches upstream demo defaults)
+        self.model.model.noise_scheduler = self.model.model.noise_scheduler.from_config(
+            self.model.model.noise_scheduler.config,
+            algorithm_type="sde-dpmsolver++",
+            beta_schedule="squaredcos_cap_v2",
+        )
+        self.model.set_ddpm_inference_steps(num_steps=self.inference_steps)
+        print("✅ Model ready")
+
+    def stop(self):
+        self._stop = True
+        if self._streamer is not None:
+            try:
+                self._streamer.end()
+            except Exception as e:
+                print(f"stop error: {e}")
+
+    def generate_stream(
+        self,
+        script: str,
+        voice_files: list[str],
+        cfg_scale: float = 1.3,
+    ) -> Iterator[tuple]:
+        if not script.strip():
+            yield None, None, "❌ Please provide a script.", gr.update(visible=False)
+            return
+
+        # Load voice samples (1..4)
+        voice_samples = [read_audio(p) for p in voice_files if p]
+        if not voice_samples:
+            yield None, None, "❌ Provide at least one voice sample (WAV/MP3/etc).", gr.update(visible=False)
+            return
+
+        # Normalize speaker labels if user didn't prefix lines
+        lines = []
+        for i, raw in enumerate([ln for ln in script.splitlines() if ln.strip()]):
+            if raw.lower().startswith("speaker") and ":" in raw:
+                lines.append(raw)
+            else:
+                lines.append(f"Speaker {i % max(1, len(voice_samples))}: {raw}")
+        formatted = "\n".join(lines)
+
+        # Pack inputs
+        inputs = self.processor(
+            text=[formatted],
+            voice_samples=[voice_samples],
+            padding=True,
+            return_tensors="pt",
+            return_attention_mask=True,
+        )
+
+        self._stop = False
+        streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None)
+        self._streamer = streamer
+
+        # Kick off generation on a worker thread
+        def _worker():
+            try:
+                self.model.generate(
+                    **inputs,
+                    max_new_tokens=None,
+                    cfg_scale=cfg_scale,
+                    tokenizer=self.processor.tokenizer,
+                    generation_config={"do_sample": False},
+                    audio_streamer=streamer,
+                    stop_check_fn=lambda: self._stop,
+                    verbose=False,
+                    refresh_negative=True,
+                )
+            except Exception as e:
+                print(f"gen error: {e}")
+                streamer.end()
+
+        t = threading.Thread(target=_worker, daemon=True)
+        t.start()
+
+        # Stream chunks out
+        sr = 24000
+        all_chunks, pending = [], []
+        last_yield = time.time()
+        min_chunk = sr * 30  # ~30s per push feels smooth for Spaces audio
+        min_interval = 15.0  # or every 15s if chunks are small
+
+        stream0 = streamer.get_stream(0)
+        got_any = False
+        yielded_any = False
+        chunk_idx = 0
+        log_prefix = f"🎙️ VibeVoice streaming (CFG={cfg_scale})\n"
+
+        for chunk in stream0:
+            if self._stop:
+                streamer.end()
+                break
+            got_any = True
+            chunk_idx += 1
+
+            if torch.is_tensor(chunk):
+                if chunk.dtype == torch.bfloat16:
+                    chunk = chunk.float()
+                audio_np = chunk.cpu().numpy().astype(np.float32)
+            else:
+                audio_np = np.asarray(chunk, dtype=np.float32)
+
+            if audio_np.ndim > 1:
+                audio_np = audio_np.squeeze(-1)
+
+            pcm16 = convert_to_16bit(audio_np)
+            all_chunks.append(pcm16)
+            pending.append(pcm16)
+
+            need_push = False
+            if not yielded_any and sum(len(c) for c in pending) >= min_chunk:
+                need_push = True
+                yielded_any = True
+            elif yielded_any and (
+                sum(len(c) for c in pending) >= min_chunk
+                or (time.time() - last_yield) >= min_interval
+            ):
+                need_push = True
+
+            if need_push and pending:
+                new_audio = np.concatenate(pending)
+                total_sec = sum(len(c) for c in all_chunks) / sr
+                msg = log_prefix + f"🎵 {total_sec:.1f}s generated (chunk {chunk_idx})"
+                yield (sr, new_audio), None, msg, gr.update(visible=True)
+                pending, last_yield = [], time.time()
+
+        # Flush any remainder
+        if pending:
+            final = np.concatenate(pending)
+            total_sec = sum(len(c) for c in all_chunks) / sr
+            yield (sr, final), None, log_prefix + f"🎵 final chunk: {total_sec:.1f}s", gr.update(visible=True)
+            yielded_any = True
+
+        # Join worker quickly; then deliver full take
+        t.join(timeout=5.0)
+        self._streamer = None
+
+        if not got_any:
+            yield None, None, "❌ No audio chunks received from the model.", gr.update(visible=False)
+            return
+
+        if all_chunks:
+            complete = np.concatenate(all_chunks)
+            final_sec = len(complete) / sr
+            msg = f"✅ Done. Total: {final_sec:.1f}s"
+            yield None, (sr, complete), msg, gr.update(visible=False)
+
+def build_ui(demo: VibeMiniDemo):
+    with gr.Blocks(title="VibeVoice – Minimal") as app:
+        gr.Markdown("## 🎙️ VibeVoice — Minimal Space\nProvide a script and 1–4 short voice samples.")
+
+        with gr.Row():
+            with gr.Column():
+                script = gr.Textbox(
+                    label="Script",
+                    value="Speaker 0: Welcome to VibeVoice!\nSpeaker 0: This is a minimal Space demo.",
+                    lines=8,
+                )
+                cfg = gr.Slider(1.0, 2.0, step=0.05, value=1.3, label="CFG Scale")
+                voices = gr.Files(
+                    label="Voice samples (WAV/MP3/FLAC/OGG/M4A/AAC) — 1 to 4 files",
+                    file_count="multiple",
+                    type="filepath",
+                )
+                with gr.Row():
+                    go = gr.Button("🚀 Generate")
+                    stop = gr.Button("🛑 Stop", variant="stop")
+
+            with gr.Column():
+                live = gr.Audio(label="Live Stream", streaming=True, autoplay=True)
+                full = gr.Audio(label="Complete Take (downloadable)")
+                log = gr.Textbox(label="Log", interactive=False)
+                badge = gr.HTML(visible=False, value="""
+                <div style="background:#dcfce7;border:1px solid #86efac;padding:8px;border-radius:8px;text-align:center">
+                  <strong>LIVE STREAMING</strong>
+                </div>
+                """)
+
+        def on_go(script, cfg, voices):
+            paths = [f.name if hasattr(f, "name") else f for f in (voices or [])][:4]
+            # Clear outputs first
+            yield None, gr.update(value=None), "⏳ Starting…", gr.update(visible=True)
+
+            # Stream generation
+            for s_chunk, full_take, msg, badge_vis in demo.generate_stream(
+                script=script,
+                voice_files=paths,
+                cfg_scale=cfg,
+            ):
+                if full_take is not None:
+                    # final: hide live, show full
+                    yield None, full_take, msg, gr.update(visible=False)
+                else:
+                    # live streaming
+                    yield s_chunk, gr.update(), msg, badge_vis
+
+        go.click(
+            on_go,
+            inputs=[script, cfg, voices],
+            outputs=[live, full, log, badge],
+        )
+
+        def on_stop():
+            demo.stop()
+            return "🛑 Stopped.", gr.update(visible=False)
+
+        stop.click(on_stop, outputs=[log, badge])
+
+    return app
+
+def main():
+    set_seed(42)
+    demo = VibeMiniDemo(model_path=MODEL_ID, device="cuda" if torch.cuda.is_available() else "cpu")
+    app = build_ui(demo)
+    app.queue(max_size=20, default_concurrency_limit=1).launch(server_name="0.0.0.0", show_api=False)
+
+if __name__ == "__main__":
+    main()
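
For a quick check outside the UI, a headless sketch along these lines should exercise the generator (it assumes `app.py` is importable from the working directory and that a reference clip exists at the hypothetical path `voices/sample.wav`; CPU generation works but is very slow):

```python
# Headless smoke test for the streaming generator; path and device are assumptions.
from app import MODEL_ID, VibeMiniDemo

demo = VibeMiniDemo(model_path=MODEL_ID, device="cpu")
for live_chunk, full_take, msg, _badge in demo.generate_stream(
    script="Speaker 0: Hello from VibeVoice.",
    voice_files=["voices/sample.wav"],  # hypothetical reference clip
    cfg_scale=1.3,
):
    print(msg)  # full_take is populated only on the final yield
```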
requirements.txt ADDED
@@ -0,0 +1,7 @@
+git+https://github.com/microsoft/VibeVoice@main
+gradio
+librosa
+numpy
+soundfile
+torch
+transformers
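
The list is left unpinned so Spaces resolves current wheels at build time. If a rebuild ever breaks, a pinned variant can be dropped in; the versions below are illustrative assumptions, not tested against this Space:

```
git+https://github.com/microsoft/VibeVoice@main
gradio==4.44.0
librosa==0.10.2
numpy==1.26.4
soundfile==0.12.1
torch==2.3.1
transformers==4.44.2
```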