yama committed
Commit bef704a · 1 Parent(s): f742df6

Update app.py

Files changed (1)
  1. app.py +175 -130
app.py CHANGED
@@ -29,6 +29,7 @@ import psutil
 
 whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
 source_languages = {
+    "ja": "Japanese",
     "en": "English",
     # "zh": "Chinese",
     # "de": "German",
@@ -36,7 +37,6 @@ source_languages = {
     # "ru": "Russian",
     # "ko": "Korean",
     # "fr": "French",
-    "ja": "Japanese",
     # "pt": "Portuguese",
     # "tr": "Turkish",
     # "pl": "Polish",
@@ -150,78 +150,83 @@ embedding_model = PretrainedSpeakerEmbedding(
     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
 
 
-def transcribe(microphone, file_upload):
-    warn_output = ""
-    if (microphone is not None) and (file_upload is not None):
-        warn_output = (
-            "WARNING: You've uploaded an audio file and used the microphone. "
-            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
-        )
-
-    elif (microphone is None) and (file_upload is None):
-        return "ERROR: You have to either use the microphone or upload an audio file"
-
-    file = microphone if microphone is not None else file_upload
-
-    text = pipe(file)["text"]
-
-    return warn_output + text
-
-
-def _return_yt_html_embed(yt_url):
-    video_id = yt_url.split("?v=")[-1]
-    HTML_str = (
-        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
-        " </center>"
-    )
-    return HTML_str
-
-
-def yt_transcribe(yt_url):
-    # yt = YouTube(yt_url)
-    # html_embed_str = _return_yt_html_embed(yt_url)
-    # stream = yt.streams.filter(only_audio=True)[0]
-    # stream.download(filename="audio.mp3")
-
-    ydl_opts = {
-        'format': 'bestvideo*+bestaudio/best',
-        'postprocessors': [{
-            'key': 'FFmpegExtractAudio',
-            'preferredcodec': 'mp3',
-            'preferredquality': '192',
-        }],
-        'outtmpl': 'audio.%(ext)s',
-    }
-
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        ydl.download([yt_url])
-
-    text = pipe("audio.mp3")["text"]
-    return html_embed_str, text
-
-
+# Transcribe audio data
+# def transcribe(microphone, file_upload):
+#     warn_output = ""
+#     if (microphone is not None) and (file_upload is not None):
+#         warn_output = (
+#             "WARNING: You've uploaded an audio file and used the microphone. "
+#             "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+#         )
+#
+#     elif (microphone is None) and (file_upload is None):
+#         return "ERROR: You have to either use the microphone or upload an audio file"
+#
+#     file = microphone if microphone is not None else file_upload
+#
+#     text = pipe(file)["text"]
+#
+#     return warn_output + text
+
+
+# Generate the HTML code that displays the embedded YouTube player
+# def _return_yt_html_embed(yt_url):
+#     video_id = yt_url.split("?v=")[-1]
+#     HTML_str = (
+#         f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
+#         " </center>"
+#     )
+#     return HTML_str
+
+
+# Download the audio from a YouTube video and transcribe it
+# def yt_transcribe(yt_url):
+#     # yt = YouTube(yt_url)
+#     # html_embed_str = _return_yt_html_embed(yt_url)
+#     # stream = yt.streams.filter(only_audio=True)[0]
+#     # stream.download(filename="audio.mp3")
+#
+#     ydl_opts = {
+#         'format': 'bestvideo*+bestaudio/best',
+#         'postprocessors': [{
+#             'key': 'FFmpegExtractAudio',
+#             'preferredcodec': 'mp3',
+#             'preferredquality': '192',
+#         }],
+#         'outtmpl': 'audio.%(ext)s',
+#     }
+#
+#     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+#         ydl.download([yt_url])
+#
+#     text = pipe("audio.mp3")["text"]
+#     return html_embed_str, text
+
+
+# Convert seconds to a time representation
 def convert_time(secs):
     return datetime.timedelta(seconds=round(secs))
 
 
-def get_youtube(video_url):
-    # yt = YouTube(video_url)
-    # abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
-
-    ydl_opts = {
-        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
-    }
-
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        info = ydl.extract_info(video_url, download=False)
-        abs_video_path = ydl.prepare_filename(info)
-        ydl.process_info(info)
-
-    print("Success download video")
-    print(abs_video_path)
-    return abs_video_path
-
-
+# Download the YouTube video
+# def get_youtube(video_url):
+#     # yt = YouTube(video_url)
+#     # abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
+#
+#     ydl_opts = {
+#         'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
+#     }
+#
+#     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+#         info = ydl.extract_info(video_url, download=False)
+#         abs_video_path = ydl.prepare_filename(info)
+#         ydl.process_info(info)
+#
+#     print("Success download video")
+#     print(abs_video_path)
+#     return abs_video_path
+
+# Convert speech to text
 def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
     """
     # Transcribe youtube link using OpenAI Whisper
@@ -374,67 +379,107 @@ demo = gr.Blocks(title=title)
 demo.encrypt = False
 
 with demo:
-    # gr.Markdown('''
-    # <div>
-    # <h1 style='text-align: center'>Whisper speaker diarization</h1>
-    # This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
-    # and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers
-    # </div>
-    # ''')
-    #
-    # with gr.Row():
-    #     gr.Markdown('''
-    #     ### Transcribe youtube link using OpenAI Whisper
-    #     ##### 1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
-    #     ##### 2. Generating speaker embeddings for each segments.
-    #     ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
-    #     ''')
-
-    with gr.Row():
-        gr.Markdown('''
-        ### You can test by following examples:
-        ''')
-        examples = gr.Examples(examples=
-            ["https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
-             "https://www.youtube.com/watch?v=-UX0X45sYe4",
-             "https://www.youtube.com/watch?v=7minSgqi-Gw"],
-            label="Examples", inputs=[youtube_url_in])
-
-    with gr.Row():
-        with gr.Column():
-            youtube_url_in.render()
-            download_youtube_btn = gr.Button("Download Youtube video")
-            download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
-            print(video_in)
-
-    with gr.Row():
-        with gr.Column():
-            video_in.render()
+    with gr.Tab("Whisper speaker diarization"):
+        # gr.Markdown('''
+        # <div>
+        # <h1 style='text-align: center'>Whisper speaker diarization</h1>
+        # This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
+        # and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers
+        # </div>
+        # ''')
+        #
+        # with gr.Row():
+        #     gr.Markdown('''
+        #     ### Transcribe youtube link using OpenAI Whisper
+        #     ##### 1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
+        #     ##### 2. Generating speaker embeddings for each segments.
+        #     ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
+        #     ''')
+        #
+        # with gr.Row():
+        #     gr.Markdown('''
+        #     ### You can test by following examples:
+        #     ''')
+        #     examples = gr.Examples(examples=
+        #         ["https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
+        #          "https://www.youtube.com/watch?v=-UX0X45sYe4",
+        #          "https://www.youtube.com/watch?v=7minSgqi-Gw"],
+        #         label="Examples", inputs=[youtube_url_in])
+        #
+        # with gr.Row():
+        #     with gr.Column():
+        #         youtube_url_in.render()
+        #         download_youtube_btn = gr.Button("Download Youtube video")
+        #         download_youtube_btn.click(get_youtube, [youtube_url_in], [
+        #             video_in])
+        #         print(video_in)
+
+        with gr.Row():
             with gr.Column():
-            gr.Markdown('''
-            ##### Here you can start the transcription process.
-            ##### Please select the source language for transcription.
-            ##### You can select a range of assumed numbers of speakers.
-            ''')
-            selected_source_lang.render()
-            selected_whisper_model.render()
-            number_speakers.render()
-            transcribe_btn = gr.Button("Transcribe audio and diarization")
-            transcribe_btn.click(speech_to_text,
-                                 [video_in, selected_source_lang, selected_whisper_model, number_speakers],
-                                 [transcription_df, system_info, download_transcript]
-                                 )
-
-    with gr.Row():
-        gr.Markdown('''
-        ##### Here you will get transcription output
-        ##### ''')
-
-    with gr.Row():
-        with gr.Column():
-            download_transcript.render()
-            transcription_df.render()
-            # system_info.render()
-
+                video_in.render()
+            with gr.Column():
+                gr.Markdown('''
+                ##### Here you can start the transcription process.
+                ##### Please select the source language for transcription.
+                ##### You can select a range of assumed numbers of speakers.
+                ''')
+                selected_source_lang.render()
+                selected_whisper_model.render()
+                number_speakers.render()
+                transcribe_btn = gr.Button("Transcribe audio and diarization")
+                transcribe_btn.click(speech_to_text,
+                                     [video_in, selected_source_lang, selected_whisper_model, number_speakers],
+                                     [transcription_df, system_info, download_transcript]
+                                     )
+
+        with gr.Row():
+            gr.Markdown('''
+            ##### Here you will get transcription output
+            ##### ''')
+
+        with gr.Row():
+            with gr.Column():
+                download_transcript.render()
+                transcription_df.render()
+                # system_info.render()
+                # gr.Markdown(
+                #     '''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></center>''')
+
+    # with gr.Tab("Whisper Transcribe Japanese Audio"):
+    #     gr.Markdown(f'''
+    #     <div>
+    #     <h1 style='text-align: center'>Whisper Transcribe Japanese Audio</h1>
+    #     </div>
+    #     Transcribe long-form microphone or audio inputs with the click of a button! The fine-tuned
+    #     checkpoint <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
+    #     ''')
+    #     microphone = gr.inputs.Audio(source="microphone", type="filepath", optional=True)
+    #     upload = gr.inputs.Audio(source="upload", type="filepath", optional=True)
+    #     transcribe_btn = gr.Button("Transcribe Audio")
+    #     text_output = gr.Textbox()
+    #     with gr.Row():
+    #         gr.Markdown('''
+    #         ### You can test by following examples:
+    #         ''')
+    #         examples = gr.Examples(examples=
+    #             ["sample1.wav",
+    #              "sample2.wav",
+    #             ],
+    #             label="Examples", inputs=[upload])
+    #     transcribe_btn.click(transcribe, [microphone, upload], outputs=text_output)
+    #
+    # with gr.Tab("Whisper Transcribe Japanese YouTube"):
+    #     gr.Markdown(f'''
+    #     <div>
+    #     <h1 style='text-align: center'>Whisper Transcribe Japanese YouTube</h1>
+    #     </div>
+    #     Transcribe long-form YouTube videos with the click of a button! The fine-tuned checkpoint:
+    #     <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
+    #     ''')
+    #     youtube_link = gr.Textbox(label="Youtube url", lines=1, interactive=True)
+    #     yt_transcribe_btn = gr.Button("Transcribe YouTube")
+    #     text_output2 = gr.Textbox()
+    #     html_output = gr.Markdown()
+    #     yt_transcribe_btn.click(yt_transcribe, [youtube_link], outputs=[html_output, text_output2])
 
 demo.launch(debug=True)
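
For readers unfamiliar with the UI change in the last hunk: the commit's main layout edit is wrapping the existing rows inside a single gr.Tab(...) block so that further tabs (the commented-out "Whisper Transcribe Japanese Audio" / "Whisper Transcribe Japanese YouTube" sections) can be enabled later without touching the diarization tab. Below is a minimal, self-contained sketch of that tabbed gr.Blocks pattern, assuming a Gradio 3.x-style API; speech_to_text_stub and the component names here are illustrative placeholders, not the Space's actual objects.

import gradio as gr

# Hypothetical stand-in for the Space's speech_to_text pipeline; it only
# echoes its inputs so the tab/row/column layout can be exercised without models.
def speech_to_text_stub(video_path, source_lang, num_speakers):
    return f"Would transcribe {video_path} (lang={source_lang}, speakers={num_speakers})"

with gr.Blocks(title="Whisper speaker diarization (layout sketch)") as demo:
    # Each gr.Tab groups its own components; additional tabs can be added
    # alongside this one later, which is what the commit prepares for.
    with gr.Tab("Whisper speaker diarization"):
        with gr.Row():
            with gr.Column():
                video_in = gr.Video(label="Video file")
            with gr.Column():
                source_lang = gr.Dropdown(choices=["ja", "en"], value="ja", label="Source language")
                num_speakers = gr.Number(value=2, precision=0, label="Number of speakers")
                transcribe_btn = gr.Button("Transcribe audio and diarization")
        with gr.Row():
            transcript_out = gr.Textbox(label="Transcript")
        transcribe_btn.click(speech_to_text_stub,
                             [video_in, source_lang, num_speakers],
                             [transcript_out])

if __name__ == "__main__":
    demo.launch(debug=True)

Running the sketch reproduces the same nesting as the diff's new layout (Blocks > Tab > Row > Column), with a dummy callback standing in for the real Whisper + diarization pipeline.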