yasserrmd committed
Commit 9a86201 · verified · 1 Parent(s): 91388f4

Update app.py

Files changed (1): app.py (+16 −27)
app.py CHANGED
@@ -73,9 +73,8 @@ class VibeVoiceDemo:
                          speaker_1: str = None, speaker_2: str = None,
                          speaker_3: str = None, speaker_4: str = None,
                          cfg_scale: float = 1.3):
-        """Final audio generation only (no streaming, runs fully on GPU)."""
+        """Final audio generation only (no streaming)."""
         self.is_generating = True
-        self.stop_generation = False

         if not script.strip():
             raise gr.Error("Please provide a script.")
@@ -83,18 +82,17 @@ class VibeVoiceDemo:
         if num_speakers < 1 or num_speakers > 4:
             raise gr.Error("Number of speakers must be 1–4.")

-        # Collect selected speakers
         selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
         for i, sp in enumerate(selected):
             if not sp or sp not in self.available_voices:
                 raise gr.Error(f"Invalid speaker {i+1} selection.")

-        # Load voices into memory
+        # load voices
        voice_samples = [self.read_audio(self.available_voices[sp]) for sp in selected]
         if any(len(v) == 0 for v in voice_samples):
             raise gr.Error("Failed to load one or more voice samples.")

-        # Format script
+        # format script
         lines = script.strip().split("\n")
         formatted = []
         for i, line in enumerate(lines):
@@ -108,7 +106,7 @@ class VibeVoiceDemo:
             formatted.append(f"Speaker {sp_id}: {line}")
         formatted_script = "\n".join(formatted)

-        # Prepare processor inputs
+        # processor input
         inputs = self.processor(
             text=[formatted_script],
             voice_samples=[voice_samples],
@@ -118,48 +116,39 @@ class VibeVoiceDemo:
         )

         start = time.time()
-        sample_rate = 24000
-        audio_streamer = AudioStreamer(batch_size=1)
-
-        # Run generation fully on GPU
-        self.model.generate(
+        outputs = self.model.generate(
             **inputs,
             max_new_tokens=None,
             cfg_scale=cfg_scale,
             tokenizer=self.processor.tokenizer,
             generation_config={'do_sample': False},
-            audio_streamer=audio_streamer,
             verbose=False,
         )

-        # Collect all audio chunks
-        all_chunks = []
-        for audio_chunk in audio_streamer.get_stream(0):
-            if torch.is_tensor(audio_chunk):
-                audio_chunk = audio_chunk.float().cpu().numpy()
-            if audio_chunk.ndim > 1:
-                audio_chunk = audio_chunk.squeeze()
-            all_chunks.append(audio_chunk)
-
-        if not all_chunks:
+        # --- FIX: pull from speech_outputs ---
+        if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
+            audio = outputs.speech_outputs[0].cpu().numpy()
+        else:
             self.is_generating = False
             raise gr.Error("❌ No audio was generated by the model.")

-        complete_audio = np.concatenate(all_chunks)
-        audio16 = convert_to_16_bit_wav(complete_audio)
+        if audio.ndim > 1:
+            audio = audio.squeeze()
+
+        sample_rate = 24000

         # Save automatically to disk
         os.makedirs("outputs", exist_ok=True)
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         file_path = os.path.join("outputs", f"podcast_{timestamp}.wav")
-        sf.write(file_path, complete_audio, sample_rate)
+        sf.write(file_path, audio, sample_rate)
         print(f"💾 Saved podcast to {file_path}")

-        total_dur = len(complete_audio) / sample_rate
+        total_dur = len(audio) / sample_rate
         log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio\nSaved to {file_path}"

         self.is_generating = False
-        return (sample_rate, complete_audio), log
+        return (sample_rate, audio), log
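For reference, the non-streaming path after this commit reduces to the sketch below. The generate() keyword arguments, the outputs.speech_outputs attribute, and the 24 kHz sample rate are taken directly from the diff; the helper name synthesize_to_wav, its signature, and the idea of passing a pre-built inputs dict from the processor are illustrative assumptions, not code from this Space.

# Minimal sketch of the updated non-streaming flow, assuming `model` and
# `processor` are the VibeVoice objects app.py already constructs and
# `inputs` is the dict returned by the processor call shown in the diff.
# Helper name and signature are illustrative only.
import os
from datetime import datetime

import soundfile as sf


def synthesize_to_wav(model, processor, inputs,
                      cfg_scale: float = 1.3,
                      sample_rate: int = 24000,
                      out_dir: str = "outputs"):
    outputs = model.generate(
        **inputs,
        max_new_tokens=None,
        cfg_scale=cfg_scale,
        tokenizer=processor.tokenizer,
        generation_config={"do_sample": False},
        verbose=False,
    )
    # The commit reads the waveform from speech_outputs instead of an
    # AudioStreamer; an empty result is treated as a generation failure.
    if not outputs.speech_outputs or outputs.speech_outputs[0] is None:
        raise RuntimeError("No audio was generated by the model.")

    audio = outputs.speech_outputs[0].cpu().numpy()
    if audio.ndim > 1:
        audio = audio.squeeze()

    # Save to disk and return the (rate, waveform) pair used by the Gradio UI.
    os.makedirs(out_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_path = os.path.join(out_dir, f"podcast_{timestamp}.wav")
    sf.write(file_path, audio, sample_rate)
    return (sample_rate, audio), file_path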