nithinraok committed
Commit 75c1233 · 1 parent: ea560f2

Add space with mp3 via LFS

Files changed (5)
  1. .gitattributes +1 -0
  2. app.py +427 -0
  3. data/example-yt_saTD1u8PorI.mp3 +3 -0
  4. packages.txt +2 -0
  5. requirements.txt +2 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
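(The added rule is what git lfs track "*.mp3" appends to .gitattributes, so mp3 files are committed as LFS pointers rather than raw blobs.)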
app.py ADDED
@@ -0,0 +1,427 @@
+ from nemo.collections.asr.models import ASRModel
+ import torch
+ import gradio as gr
+ import spaces
+ import gc
+ import shutil
+ from pathlib import Path
+ from pydub import AudioSegment
+ import numpy as np
+ import os
+ import gradio.themes as gr_themes
+ import csv
+ import datetime
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v3"
+
+ model = ASRModel.from_pretrained(model_name=MODEL_NAME)
+ model.eval()
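+ # Note: the checkpoint is loaded once at startup and kept in eval mode; the
+ # transcription handler below moves it to the GPU per request and back to
+ # the CPU afterwards.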
+
+
+ def start_session(request: gr.Request):
+     session_hash = request.session_hash
+     session_dir = Path(f'/tmp/{session_hash}')
+     session_dir.mkdir(parents=True, exist_ok=True)
+
+     print(f"Session with hash {session_hash} started.")
+     return session_dir.as_posix()
+
+ def end_session(request: gr.Request):
+     session_hash = request.session_hash
+     session_dir = Path(f'/tmp/{session_hash}')
+
+     if session_dir.exists():
+         shutil.rmtree(session_dir)
+
+     print(f"Session with hash {session_hash} ended.")
+
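+ # get_audio_segment returns a (sample_rate, numpy samples) tuple, the
+ # in-memory format gr.Audio accepts as a value, or None if the clip
+ # cannot be produced.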
+ def get_audio_segment(audio_path, start_second, end_second):
+     if not audio_path or not Path(audio_path).exists():
+         print(f"Warning: Audio path '{audio_path}' not found or invalid for clipping.")
+         return None
+     try:
+         start_ms = int(start_second * 1000)
+         end_ms = int(end_second * 1000)
+
+         start_ms = max(0, start_ms)
+         if end_ms <= start_ms:
+             print(f"Warning: End time ({end_second}s) is not after start time ({start_second}s). Adjusting end time.")
+             end_ms = start_ms + 100
+
+         audio = AudioSegment.from_file(audio_path)
+         clipped_audio = audio[start_ms:end_ms]
+
+         samples = np.array(clipped_audio.get_array_of_samples())
+         if clipped_audio.channels == 2:
+             samples = samples.reshape((-1, 2)).mean(axis=1).astype(samples.dtype)
+
+         frame_rate = clipped_audio.frame_rate
+         if frame_rate <= 0:
+             print(f"Warning: Invalid frame rate ({frame_rate}) detected for clipped audio.")
+             frame_rate = audio.frame_rate
+
+         if samples.size == 0:
+             print(f"Warning: Clipped audio resulted in empty samples array ({start_second}s to {end_second}s).")
+             return None
+
+         return (frame_rate, samples)
+     except FileNotFoundError:
+         print(f"Error: Audio file not found at path: {audio_path}")
+         return None
+     except Exception as e:
+         print(f"Error clipping audio {audio_path} from {start_second}s to {end_second}s: {e}")
+         return None
+
+ def format_srt_time(seconds: float) -> str:
+     """Converts seconds to SRT time format HH:MM:SS,mmm using datetime.timedelta."""
+     sanitized_total_seconds = max(0.0, seconds)
+     delta = datetime.timedelta(seconds=sanitized_total_seconds)
+     total_int_seconds = int(delta.total_seconds())
+
+     hours = total_int_seconds // 3600
+     remainder_seconds_after_hours = total_int_seconds % 3600
+     minutes = remainder_seconds_after_hours // 60
+     seconds_part = remainder_seconds_after_hours % 60
+     milliseconds = delta.microseconds // 1000
+
+     return f"{hours:02d}:{minutes:02d}:{seconds_part:02d},{milliseconds:03d}"
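+ # For example, format_srt_time(3661.5) returns "01:01:01,500".
+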
+ def generate_srt_content(segment_timestamps: list) -> str:
+     """Generates an SRT-formatted string from segment timestamps."""
+     srt_content = []
+     for i, ts in enumerate(segment_timestamps):
+         start_time = format_srt_time(ts['start'])
+         end_time = format_srt_time(ts['end'])
+         text = ts['segment']
+         srt_content.append(str(i + 1))
+         srt_content.append(f"{start_time} --> {end_time}")
+         srt_content.append(text)
+         srt_content.append("")
+     return "\n".join(srt_content)
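+ # Each segment becomes one numbered SRT cue:
+ #   1
+ #   00:00:01,000 --> 00:00:02,500
+ #   <segment text>
+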
+ @spaces.GPU
+ def get_transcripts_and_raw_times(audio_path, session_dir):
+     if not audio_path:
+         gr.Error("No audio file path provided for transcription.", duration=None)
+         # Return an update to hide the buttons
+         return [], [], None, gr.DownloadButton(label="Download Transcript (CSV)", visible=False), gr.DownloadButton(label="Download Transcript (SRT)", visible=False)
+
+     vis_data = [["N/A", "N/A", "Processing failed"]]
+     raw_times_data = [[0.0, 0.0]]
+     processed_audio_path = None
+     csv_file_path = None
+     srt_file_path = None
+     original_path_name = Path(audio_path).name
+     audio_name = Path(audio_path).stem
+
+     # Initialize button states
+     csv_button_update = gr.DownloadButton(label="Download Transcript (CSV)", visible=False)
+     srt_button_update = gr.DownloadButton(label="Download Transcript (SRT)", visible=False)
+
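+     # The input is first normalized to what the model expects: a 16 kHz
+     # sample rate and a single (mono) channel.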
+     try:
+         try:
+             gr.Info(f"Loading audio: {original_path_name}", duration=2)
+             audio = AudioSegment.from_file(audio_path)
+             duration_sec = audio.duration_seconds
+         except Exception as load_e:
+             gr.Error(f"Failed to load audio file {original_path_name}: {load_e}", duration=None)
+             return [["Error", "Error", "Load failed"]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update
+
+         resampled = False
+         mono = False
+
+         target_sr = 16000
+         if audio.frame_rate != target_sr:
+             try:
+                 audio = audio.set_frame_rate(target_sr)
+                 resampled = True
+             except Exception as resample_e:
+                 gr.Error(f"Failed to resample audio: {resample_e}", duration=None)
+                 return [["Error", "Error", "Resample failed"]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update
+
+         if audio.channels == 2:
+             try:
+                 audio = audio.set_channels(1)
+                 mono = True
+             except Exception as mono_e:
+                 gr.Error(f"Failed to convert audio to mono: {mono_e}", duration=None)
+                 return [["Error", "Error", "Mono conversion failed"]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update
+         elif audio.channels > 2:
+             gr.Error(f"Audio has {audio.channels} channels. Only mono (1) or stereo (2) supported.", duration=None)
+             return [["Error", "Error", f"{audio.channels}-channel audio not supported"]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update
+
+         if resampled or mono:
+             try:
+                 processed_audio_path = Path(session_dir, f"{audio_name}_resampled.wav")
+                 audio.export(processed_audio_path, format="wav")
+                 transcribe_path = processed_audio_path.as_posix()
+                 info_path_name = f"{original_path_name} (processed)"
+             except Exception as export_e:
+                 gr.Error(f"Failed to export processed audio: {export_e}", duration=None)
+                 if processed_audio_path and os.path.exists(processed_audio_path):
+                     os.remove(processed_audio_path)
+                 return [["Error", "Error", "Export failed"]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update
+         else:
+             transcribe_path = audio_path
+             info_path_name = original_path_name
+
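+         # Local attention limits each frame's context to a fixed window
+         # (256 frames per side below), so memory grows roughly linearly with
+         # audio length instead of quadratically as with full attention.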
+         # Flag to track if long audio settings were applied
+         long_audio_settings_applied = False
+         try:
+             model.to(device)
+             model.to(torch.float32)
+             gr.Info(f"Transcribing {info_path_name} on {device}...", duration=2)
+
+             # Check duration and apply specific settings for long audio
+             if duration_sec > 480:  # 8 minutes
+                 try:
+                     gr.Info("Audio longer than 8 minutes. Applying optimized settings for long transcription.", duration=3)
+                     print("Applying long audio settings: Local Attention and Chunking.")
+                     model.change_attention_model("rel_pos_local_attn", [256, 256])
+                     model.change_subsampling_conv_chunking_factor(1)  # 1 = auto select
+                     long_audio_settings_applied = True
+                 except Exception as setting_e:
+                     gr.Warning(f"Could not apply long audio settings: {setting_e}", duration=5)
+                     print(f"Warning: Failed to apply long audio settings: {setting_e}")
+                     # Proceed without long audio settings if applying them failed
+
+             model.to(torch.bfloat16)
+             output = model.transcribe([transcribe_path], timestamps=True)
+
+             if not output or not isinstance(output, list) or not output[0] or not hasattr(output[0], 'timestamp') or not output[0].timestamp or 'segment' not in output[0].timestamp:
+                 gr.Error("Transcription failed or produced unexpected output format.", duration=None)
+                 # Return an update to hide the buttons
+                 return [["Error", "Error", "Transcription Format Issue"]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update
+
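+             # Each entry in output[0].timestamp['segment'] is a dict with
+             # 'start'/'end' times in seconds and the 'segment' text.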
+             segment_timestamps = output[0].timestamp['segment']
+             csv_headers = ["Start (s)", "End (s)", "Segment"]
+             vis_data = [[f"{ts['start']:.2f}", f"{ts['end']:.2f}", ts['segment']] for ts in segment_timestamps]
+             raw_times_data = [[ts['start'], ts['end']] for ts in segment_timestamps]
+
+             # CSV file generation
+             try:
+                 csv_file_path = Path(session_dir, f"transcription_{audio_name}.csv")
+                 with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
+                     writer = csv.writer(csv_file)
+                     writer.writerow(csv_headers)
+                     writer.writerows(vis_data)
+                 print(f"CSV transcript saved to temporary file: {csv_file_path}")
+                 csv_button_update = gr.DownloadButton(value=csv_file_path, visible=True, label="Download Transcript (CSV)")
+             except Exception as csv_e:
+                 gr.Error(f"Failed to create transcript CSV file: {csv_e}", duration=None)
+                 print(f"Error writing CSV: {csv_e}")
+
+             if segment_timestamps:
+                 try:
+                     srt_content = generate_srt_content(segment_timestamps)
+                     srt_file_path = Path(session_dir, f"transcription_{audio_name}.srt")
+                     with open(srt_file_path, 'w', encoding='utf-8') as f:
+                         f.write(srt_content)
+                     print(f"SRT transcript saved to temporary file: {srt_file_path}")
+                     srt_button_update = gr.DownloadButton(value=srt_file_path, visible=True, label="Download Transcript (SRT)")
+                 except Exception as srt_e:
+                     gr.Warning(f"Failed to create transcript SRT file: {srt_e}", duration=5)
+                     print(f"Error writing SRT: {srt_e}")
+
+             gr.Info("Transcription complete.", duration=2)
+             return vis_data, raw_times_data, audio_path, csv_button_update, srt_button_update
+
+         except torch.cuda.OutOfMemoryError as e:
+             error_msg = 'CUDA out of memory. Please try a shorter audio or reduce GPU load.'
+             print(f"CUDA OutOfMemoryError: {e}")
+             gr.Error(error_msg, duration=None)
+             return [["OOM", "OOM", error_msg]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update
+
+         except FileNotFoundError:
+             error_msg = f"Audio file for transcription not found: {Path(transcribe_path).name}."
+             print(f"Error: Transcribe audio file not found at path: {transcribe_path}")
+             gr.Error(error_msg, duration=None)
+             return [["Error", "Error", "File not found for transcription"]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update
+
+         except Exception as e:
+             error_msg = f"Transcription failed: {e}"
+             print(f"Error during transcription processing: {e}")
+             gr.Error(error_msg, duration=None)
+             vis_data = [["Error", "Error", error_msg]]
+             raw_times_data = [[0.0, 0.0]]
+             return vis_data, raw_times_data, audio_path, csv_button_update, srt_button_update
+         finally:
+             # --- Model Cleanup ---
+             try:
+                 # Revert settings if they were applied for long audio
+                 if long_audio_settings_applied:
+                     try:
+                         print("Reverting long audio settings.")
+                         model.change_attention_model("rel_pos")
+                         model.change_subsampling_conv_chunking_factor(-1)
+                         long_audio_settings_applied = False  # Reset flag
+                     except Exception as revert_e:
+                         print(f"Warning: Failed to revert long audio settings: {revert_e}")
+                         gr.Warning(f"Issue reverting model settings after long transcription: {revert_e}", duration=5)
+
+                 # Original cleanup
+                 if 'model' in locals() and hasattr(model, 'cpu'):
+                     if device == 'cuda':
+                         model.cpu()
+                 gc.collect()
+                 if device == 'cuda':
+                     torch.cuda.empty_cache()
+             except Exception as cleanup_e:
+                 print(f"Error during model cleanup: {cleanup_e}")
+                 gr.Warning(f"Issue during model cleanup: {cleanup_e}", duration=5)
+             # --- End Model Cleanup ---
+
+     finally:
+         if processed_audio_path and os.path.exists(processed_audio_path):
+             try:
+                 os.remove(processed_audio_path)
+                 print(f"Temporary audio file {processed_audio_path} removed.")
+             except Exception as e:
+                 print(f"Error removing temporary audio file {processed_audio_path}: {e}")
+
+ def play_segment(evt: gr.SelectData, raw_ts_list, current_audio_path):
+     if not isinstance(raw_ts_list, list):
+         print(f"Warning: raw_ts_list is not a list ({type(raw_ts_list)}). Cannot play segment.")
+         return gr.Audio(value=None, label="Selected Segment")
+
+     if not current_audio_path:
+         print("No audio path available to play segment from.")
+         return gr.Audio(value=None, label="Selected Segment")
+
+     selected_index = evt.index[0]  # evt.index is [row, column]; only the row matters here
+
+     if selected_index < 0 or selected_index >= len(raw_ts_list):
+         print(f"Invalid index {selected_index} selected for list of length {len(raw_ts_list)}.")
+         return gr.Audio(value=None, label="Selected Segment")
+
+     if not isinstance(raw_ts_list[selected_index], (list, tuple)) or len(raw_ts_list[selected_index]) != 2:
+         print(f"Warning: Data at index {selected_index} is not in the expected format [start, end].")
+         return gr.Audio(value=None, label="Selected Segment")
+
+     start_time_s, end_time_s = raw_ts_list[selected_index]
+
+     print(f"Attempting to play segment: {current_audio_path} from {start_time_s:.2f}s to {end_time_s:.2f}s")
+
+     segment_data = get_audio_segment(current_audio_path, start_time_s, end_time_s)
+
+     if segment_data:
+         print("Segment data retrieved successfully.")
+         return gr.Audio(value=segment_data, autoplay=True, label=f"Segment: {start_time_s:.2f}s - {end_time_s:.2f}s", interactive=False)
+     else:
+         print("Failed to get audio segment data.")
+         return gr.Audio(value=None, label="Selected Segment")
+
+ article = (
+     "<p style='font-size: 1.1em;'>"
+     "This demo showcases <code><a href='https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3'>parakeet-tdt-0.6b-v3</a></code>, a 600-million-parameter <strong>multilingual</strong> model designed for high-quality speech recognition with automatic language detection."
+     "</p>"
+     "<p><strong style='color: red; font-size: 1.2em;'>Key Features:</strong></p>"
+     "<ul style='font-size: 1.1em;'>"
+     "  <li>Automatic punctuation and capitalization</li>"
+     "  <li>Accurate word-level timestamps (click on a segment in the table below to play it!)</li>"
+     "  <li>Multilingual transcription across 25 European languages with automatic language detection</li>"
+     "  <li>Long audio transcription: up to 24 minutes with full attention (A100 80GB) or up to 3 hours with local attention</li>"
+     "</ul>"
+     "<p style='font-size: 1.1em;'>"
+     "<strong>Supported Languages:</strong> bg, hr, cs, da, nl, en, et, fi, fr, de, el, hu, it, lv, lt, mt, pl, pt, ro, sk, sl, es, sv, ru, uk"
+     "</p>"
+     "<p style='font-size: 1.1em;'>"
+     "This model is <strong>available for commercial and non-commercial use</strong> (CC BY 4.0)."
+     "</p>"
+     "<p style='text-align: center;'>"
+     "<a href='https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3' target='_blank'>🎙️ Learn more about the Model</a> | "
+     "<a href='https://arxiv.org/abs/2305.05084' target='_blank'>📄 Fast Conformer paper</a> | "
+     "<a href='https://arxiv.org/abs/2304.06795' target='_blank'>📚 TDT paper</a> | "
+     "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>🧑‍💻 NeMo Repository</a>"
+     "</p>"
+ )
+
+ examples = [
+     ["data/example-yt_saTD1u8PorI.mp3"],
+ ]
+
+ # Define an NVIDIA-inspired theme
+ nvidia_theme = gr_themes.Default(
+     primary_hue=gr_themes.Color(
+         c50="#E6F1D9",   # Lightest green
+         c100="#CEE3B3",
+         c200="#B5D58C",
+         c300="#9CC766",
+         c400="#84B940",
+         c500="#76B900",  # NVIDIA Green
+         c600="#68A600",
+         c700="#5A9200",
+         c800="#4C7E00",
+         c900="#3E6A00",  # Darkest green
+         c950="#2F5600"
+     ),
+     neutral_hue="gray",  # Use gray for neutral elements
+     font=[gr_themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
+ ).set()
+
+ # Apply the custom theme
+ with gr.Blocks(theme=nvidia_theme) as demo:
+     model_display_name = MODEL_NAME.split('/')[-1] if '/' in MODEL_NAME else MODEL_NAME
+     gr.Markdown(f"<h1 style='text-align: center; margin: 0 auto;'>Speech Transcription with {model_display_name}</h1>")
+     gr.HTML(article)
+
+     current_audio_path_state = gr.State(None)
+     raw_timestamps_list_state = gr.State([])
+
+     session_dir = gr.State()
+     demo.load(start_session, outputs=[session_dir])
+
+     with gr.Tabs():
+         with gr.TabItem("Audio File"):
+             file_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
+             gr.Examples(examples=examples, inputs=[file_input], label="Example Audio Files (Click to Load)")
+             file_transcribe_btn = gr.Button("Transcribe Uploaded File", variant="primary")
+
+         with gr.TabItem("Microphone"):
+             mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
+             mic_transcribe_btn = gr.Button("Transcribe Microphone Input", variant="primary")
+
+     gr.Markdown("---")
+     gr.Markdown("<p><strong style='color: #FF0000; font-size: 1.2em;'>Transcription Results (Click row to play segment)</strong></p>")
+
+     # Define the DownloadButtons *before* the DataFrame
+     with gr.Row():
+         download_btn_csv = gr.DownloadButton(label="Download Transcript (CSV)", visible=False)
+         download_btn_srt = gr.DownloadButton(label="Download Transcript (SRT)", visible=False)
+
+     vis_timestamps_df = gr.DataFrame(
+         headers=["Start (s)", "End (s)", "Segment"],
+         datatype=["number", "number", "str"],
+         wrap=True,
+         label="Transcription Segments"
+     )
+
+     # selected_segment_player was defined after download_btn previously; keep it after the DataFrame for layout
+     selected_segment_player = gr.Audio(label="Selected Segment", interactive=False)
+
+     mic_transcribe_btn.click(
+         fn=get_transcripts_and_raw_times,
+         inputs=[mic_input, session_dir],
+         outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn_csv, download_btn_srt],
+         api_name="transcribe_mic"
+     )
+
+     file_transcribe_btn.click(
+         fn=get_transcripts_and_raw_times,
+         inputs=[file_input, session_dir],
+         outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn_csv, download_btn_srt],
+         api_name="transcribe_file"
+     )
+
+     vis_timestamps_df.select(
+         fn=play_segment,
+         inputs=[raw_timestamps_list_state, current_audio_path_state],
+         outputs=[selected_segment_player],
+     )
+
+     demo.unload(end_session)
+
+ if __name__ == "__main__":
+     print("Launching Gradio Demo...")
+     demo.queue()
+     demo.launch()
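For reference, the NeMo calls that app.py wraps reduce to a short standalone sketch; the audio filename here is hypothetical, and a 16 kHz mono file is assumed, as produced by the preprocessing above:

from nemo.collections.asr.models import ASRModel

model = ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v3")
# timestamps=True makes segment-level timings available on the result object
output = model.transcribe(["speech_16k_mono.wav"], timestamps=True)  # hypothetical path
for ts in output[0].timestamp['segment']:
    print(f"{ts['start']:.2f}s-{ts['end']:.2f}s: {ts['segment']}")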
data/example-yt_saTD1u8PorI.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3cb340c3b868eb3695cdb06683decbff217331c2459a69394be8d3ad3b53bdf0
+ size 2493472
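(With LFS, git history stores only this three-line pointer: spec version, content hash, and byte size. The ~2.4 MB mp3 itself lives in LFS storage.)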
packages.txt ADDED
@@ -0,0 +1,2 @@
+ ffmpeg
+ libsndfile1
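(These system packages back the audio pipeline: pydub shells out to ffmpeg to decode inputs such as mp3, and libsndfile1 is the native library behind soundfile-based WAV reading.)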
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ Cython
+ git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[asr]
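(Cython is listed ahead of NeMo, presumably because building nemo_toolkit from source requires it at install time; the toolkit itself is installed from the main branch with only the [asr] extra.)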