yama committed
Commit fe81c29 · 1 Parent(s): 2c631bc

Update app.py

Files changed (1):
  app.py +154 -95

app.py CHANGED
@@ -27,16 +27,107 @@ import contextlib
 from transformers import pipeline
 import psutil
 
-import openai
-import os
-import tempfile
-from pydub import AudioSegment
-
-
 whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
 source_languages = {
     "en": "English",
+    # "zh": "Chinese",
+    # "de": "German",
+    # "es": "Spanish",
+    # "ru": "Russian",
+    # "ko": "Korean",
+    # "fr": "French",
     "ja": "Japanese",
+    # "pt": "Portuguese",
+    # "tr": "Turkish",
+    # "pl": "Polish",
+    # "ca": "Catalan",
+    # "nl": "Dutch",
+    # "ar": "Arabic",
+    # "sv": "Swedish",
+    # "it": "Italian",
+    # "id": "Indonesian",
+    # "hi": "Hindi",
+    # "fi": "Finnish",
+    # "vi": "Vietnamese",
+    # "he": "Hebrew",
+    # "uk": "Ukrainian",
+    # "el": "Greek",
+    # "ms": "Malay",
+    # "cs": "Czech",
+    # "ro": "Romanian",
+    # "da": "Danish",
+    # "hu": "Hungarian",
+    # "ta": "Tamil",
+    # "no": "Norwegian",
+    # "th": "Thai",
+    # "ur": "Urdu",
+    # "hr": "Croatian",
+    # "bg": "Bulgarian",
+    # "lt": "Lithuanian",
+    # "la": "Latin",
+    # "mi": "Maori",
+    # "ml": "Malayalam",
+    # "cy": "Welsh",
+    # "sk": "Slovak",
+    # "te": "Telugu",
+    # "fa": "Persian",
+    # "lv": "Latvian",
+    # "bn": "Bengali",
+    # "sr": "Serbian",
+    # "az": "Azerbaijani",
+    # "sl": "Slovenian",
+    # "kn": "Kannada",
+    # "et": "Estonian",
+    # "mk": "Macedonian",
+    # "br": "Breton",
+    # "eu": "Basque",
+    # "is": "Icelandic",
+    # "hy": "Armenian",
+    # "ne": "Nepali",
+    # "mn": "Mongolian",
+    # "bs": "Bosnian",
+    # "kk": "Kazakh",
+    # "sq": "Albanian",
+    # "sw": "Swahili",
+    # "gl": "Galician",
+    # "mr": "Marathi",
+    # "pa": "Punjabi",
+    # "si": "Sinhala",
+    # "km": "Khmer",
+    # "sn": "Shona",
+    # "yo": "Yoruba",
+    # "so": "Somali",
+    # "af": "Afrikaans",
+    # "oc": "Occitan",
+    # "ka": "Georgian",
+    # "be": "Belarusian",
+    # "tg": "Tajik",
+    # "sd": "Sindhi",
+    # "gu": "Gujarati",
+    # "am": "Amharic",
+    # "yi": "Yiddish",
+    # "lo": "Lao",
+    # "uz": "Uzbek",
+    # "fo": "Faroese",
+    # "ht": "Haitian creole",
+    # "ps": "Pashto",
+    # "tk": "Turkmen",
+    # "nn": "Nynorsk",
+    # "mt": "Maltese",
+    # "sa": "Sanskrit",
+    # "lb": "Luxembourgish",
+    # "my": "Myanmar",
+    # "bo": "Tibetan",
+    # "tl": "Tagalog",
+    # "mg": "Malagasy",
+    # "as": "Assamese",
+    # "tt": "Tatar",
+    # "haw": "Hawaiian",
+    # "ln": "Lingala",
+    # "ha": "Hausa",
+    # "ba": "Bashkir",
+    # "jw": "Javanese",
+    # "su": "Sundanese",
 }
 
 source_language_list = [key[0] for key in source_languages.items()]
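With every other entry commented out, only English and Japanese remain selectable in this revision. A minimal sketch, in plain Python and using the same names as app.py, of what the language list evaluates to after this hunk:

# Effective state of the dict after this commit: only the two active entries.
source_languages = {
    "en": "English",
    "ja": "Japanese",
}

# key[0] of each (key, value) pair from .items() is just the key,
# so this is equivalent to list(source_languages.keys()).
source_language_list = [key[0] for key in source_languages.items()]
print(source_language_list)  # ['en', 'ja']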
@@ -260,23 +351,6 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
         raise RuntimeError("Error Running inference with local model", e)
 
 
-# def create_meeting_summary(openai_key, prompt):
-#     openai.api_key = openai_key
-#
-#     # 文字起こししたテキストを取得
-#     system_template = prompt
-#
-#     completion = openai.ChatCompletion.create(
-#         model="gpt-3.5-turbo",
-#         messages=[
-#             {"role": "system", "content": system_template},
-#             {"role": "user", "content": transcript_text}
-#         ]
-#     )
-#     summary = completion.choices[0].message.content
-#     return summary
-
-
 # ---- Gradio Layout -----
 # Inspiration from https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles
 video_in = gr.Video(label="Video file", mirror_webcam=False)
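The block removed above was a commented-out meeting-summary helper built on the legacy openai 0.x ChatCompletion API. For reference, a self-contained sketch of what it did; note that transcript_text was never defined in the commented code, so taking it as a parameter here is an assumption:

import openai  # legacy 0.x SDK, matching the "import openai" removed in the first hunk

def create_meeting_summary(openai_key, prompt, transcript_text):
    # Sketch only: summarize a transcript (passed in as transcript_text) with gpt-3.5-turbo.
    openai.api_key = openai_key
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": transcript_text},
        ],
    )
    return completion.choices[0].message.content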
@@ -300,82 +374,67 @@ demo = gr.Blocks(title=title)
 demo.encrypt = False
 
 with demo:
-    with gr.Tab("Whisper speaker diarization"):
+    # gr.Markdown('''
+    # <div>
+    # <h1 style='text-align: center'>Whisper speaker diarization</h1>
+    # This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
+    # and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers
+    # </div>
+    # ''')
+    #
+    # with gr.Row():
+    #     gr.Markdown('''
+    #     ### Transcribe youtube link using OpenAI Whisper
+    #     ##### 1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
+    #     ##### 2. Generating speaker embeddings for each segments.
+    #     ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
+    # ''')
+
+    with gr.Row():
         gr.Markdown('''
-        <div>
-        <h1 style='text-align: center'>Whisper speaker diarization</h1>
-        This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
-        and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers
-        </div>
-        ''')
-
-        with gr.Row():
-            gr.Markdown('''
-            ### Transcribe youtube link using OpenAI Whisper
-            ##### 1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
-            ##### 2. Generating speaker embeddings for each segments.
-            ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
+        ### You can test by following examples:
         ''')
-
-        with gr.Row():
-            gr.Markdown('''
-            ### You can test by following examples:
-            ''')
         examples = gr.Examples(examples=
-            ["https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
-            "https://www.youtube.com/watch?v=-UX0X45sYe4",
-            "https://www.youtube.com/watch?v=7minSgqi-Gw"],
-            label="Examples", inputs=[youtube_url_in])
-
-        with gr.Row():
-            with gr.Column():
-                youtube_url_in.render()
-                download_youtube_btn = gr.Button("Download Youtube video")
-                download_youtube_btn.click(get_youtube, [youtube_url_in], [
-                    video_in])
-                print(video_in)
-
-        with gr.Row():
+            ["https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
+            "https://www.youtube.com/watch?v=-UX0X45sYe4",
+            "https://www.youtube.com/watch?v=7minSgqi-Gw"],
+            label="Examples", inputs=[youtube_url_in])
+
+    with gr.Row():
+        with gr.Column():
+            youtube_url_in.render()
+            download_youtube_btn = gr.Button("Download Youtube video")
+            download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
+            print(video_in)
+
+    with gr.Row():
+        with gr.Column():
+            video_in.render()
         with gr.Column():
-            video_in.render()
-        with gr.Column():
-            gr.Markdown('''
-            ##### Here you can start the transcription process.
-            ##### Please select the source language for transcription.
-            ##### You can select a range of assumed numbers of speakers.
-            ''')
-            selected_source_lang.render()
-            selected_whisper_model.render()
-            number_speakers.render()
-            transcribe_btn = gr.Button("Transcribe audio and diarization")
-            transcribe_btn.click(speech_to_text,
-                [video_in, selected_source_lang, selected_whisper_model, number_speakers],
-                [transcription_df, system_info, download_transcript]
-            )
-
-        with gr.Row():
-            gr.Markdown('''
-            ##### Here you will get transcription output
-            ##### ''')
-
-        with gr.Row():
-            with gr.Column():
-                download_transcript.render()
-                transcription_df.render()
-                # system_info.render()
-                # gr.Markdown(
-                #     '''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></center>''')
-
-        # with gr.Row():
-        #     with gr.Column():
-        #         gr.Textbox(lines=1, label="openai_key", type="password")
-        #         gr.TextArea(label="prompt", value="""会議の文字起こしが渡されます。
-        #
-        # この会議のサマリーをMarkdown形式で作成してください。サマリーは、以下のような形式で書いてください。
-        # - 会議の目的
-        # - 会議の内容
-        # - 会議の結果""")
-        #         gr.Textbox(label="transcription_summary")
+            gr.Markdown('''
+            ##### Here you can start the transcription process.
+            ##### Please select the source language for transcription.
+            ##### You can select a range of assumed numbers of speakers.
+            ''')
+            selected_source_lang.render()
+            selected_whisper_model.render()
+            number_speakers.render()
+            transcribe_btn = gr.Button("Transcribe audio and diarization")
+            transcribe_btn.click(speech_to_text,
+                [video_in, selected_source_lang, selected_whisper_model, number_speakers],
+                [transcription_df, system_info, download_transcript]
+            )
+
+    with gr.Row():
+        gr.Markdown('''
+        ##### Here you will get transcription output
+        ##### ''')
+
+    with gr.Row():
+        with gr.Column():
+            download_transcript.render()
+            transcription_df.render()
+            # system_info.render()
 
 
 demo.launch(debug=True)
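The restructured layout drops the gr.Tab wrapper and the intro markdown but keeps the same event wiring: Button.click(fn, inputs, outputs) passes the listed components to speech_to_text and writes its results back to the output components. A minimal, self-contained sketch of that pattern, using a stub function and toy component names rather than the ones in app.py:

import gradio as gr

def transcribe_stub(video_path, language):
    # Stand-in for the real speech_to_text pipeline.
    return f"Would transcribe {video_path} as {language}"

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            video_in = gr.Video(label="Video file")
        with gr.Column():
            lang = gr.Dropdown(["en", "ja"], value="ja", label="Source language")
            btn = gr.Button("Transcribe")
    with gr.Row():
        out = gr.Textbox(label="Transcript")
    # click(fn, inputs, outputs): the same wiring pattern as transcribe_btn in app.py.
    btn.click(transcribe_stub, [video_in, lang], [out])

demo.launch(debug=True)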
 