yama committed on
Commit cf83e63 · 1 Parent(s): 1de0e6b

Update app.py

Files changed (1)
  1. app.py +301 -274
app.py CHANGED
@@ -1,142 +1,133 @@
  # import whisper
  from faster_whisper import WhisperModel
  import datetime
- # import subprocess
  import gradio as gr
- # from pathlib import Path
  import pandas as pd
- # import re
  import time
  import os
  import numpy as np
  from sklearn.cluster import AgglomerativeClustering
  from sklearn.metrics import silhouette_score

- # from pytube import YouTube
- # import yt_dlp
  import torch
  import pyannote.audio
  from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
  from pyannote.audio import Audio
  from pyannote.core import Segment

- # from gpuinfo import GPUInfo

  import wave
  import contextlib
  from transformers import pipeline
- # import psutil
-
- # import gradio as gr
- import openai
- import os
- # from io import BytesIO
- import tempfile
- from pydub import AudioSegment
- # import shutil
-

  whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
  source_languages = {
  "en": "English",
- # "zh": "Chinese",
- # "de": "German",
- # "es": "Spanish",
- # "ru": "Russian",
- # "ko": "Korean",
- # "fr": "French",
  "ja": "Japanese",
- # "pt": "Portuguese",
- # "tr": "Turkish",
- # "pl": "Polish",
- # "ca": "Catalan",
- # "nl": "Dutch",
- # "ar": "Arabic",
- # "sv": "Swedish",
- # "it": "Italian",
- # "id": "Indonesian",
- # "hi": "Hindi",
- # "fi": "Finnish",
- # "vi": "Vietnamese",
- # "he": "Hebrew",
- # "uk": "Ukrainian",
- # "el": "Greek",
- # "ms": "Malay",
- # "cs": "Czech",
- # "ro": "Romanian",
- # "da": "Danish",
- # "hu": "Hungarian",
- # "ta": "Tamil",
- # "no": "Norwegian",
- # "th": "Thai",
- # "ur": "Urdu",
- # "hr": "Croatian",
- # "bg": "Bulgarian",
- # "lt": "Lithuanian",
- # "la": "Latin",
- # "mi": "Maori",
- # "ml": "Malayalam",
- # "cy": "Welsh",
- # "sk": "Slovak",
- # "te": "Telugu",
- # "fa": "Persian",
- # "lv": "Latvian",
- # "bn": "Bengali",
- # "sr": "Serbian",
- # "az": "Azerbaijani",
- # "sl": "Slovenian",
- # "kn": "Kannada",
- # "et": "Estonian",
- # "mk": "Macedonian",
- # "br": "Breton",
- # "eu": "Basque",
- # "is": "Icelandic",
- # "hy": "Armenian",
- # "ne": "Nepali",
- # "mn": "Mongolian",
- # "bs": "Bosnian",
- # "kk": "Kazakh",
- # "sq": "Albanian",
- # "sw": "Swahili",
- # "gl": "Galician",
- # "mr": "Marathi",
- # "pa": "Punjabi",
- # "si": "Sinhala",
- # "km": "Khmer",
- # "sn": "Shona",
- # "yo": "Yoruba",
- # "so": "Somali",
- # "af": "Afrikaans",
- # "oc": "Occitan",
- # "ka": "Georgian",
- # "be": "Belarusian",
- # "tg": "Tajik",
- # "sd": "Sindhi",
- # "gu": "Gujarati",
- # "am": "Amharic",
- # "yi": "Yiddish",
- # "lo": "Lao",
- # "uz": "Uzbek",
- # "fo": "Faroese",
- # "ht": "Haitian creole",
- # "ps": "Pashto",
- # "tk": "Turkmen",
- # "nn": "Nynorsk",
- # "mt": "Maltese",
- # "sa": "Sanskrit",
- # "lb": "Luxembourgish",
- # "my": "Myanmar",
- # "bo": "Tibetan",
- # "tl": "Tagalog",
- # "mg": "Malagasy",
- # "as": "Assamese",
- # "tt": "Tatar",
- # "haw": "Hawaiian",
- # "ln": "Lingala",
- # "ha": "Hausa",
- # "ba": "Bashkir",
- # "jw": "Javanese",
- # "su": "Sundanese",
  }

  source_language_list = [key[0] for key in source_languages.items()]
@@ -145,7 +136,6 @@ MODEL_NAME = "vumichien/whisper-medium-jp"
  lang = "ja"

  device = 0 if torch.cuda.is_available() else "cpu"
- # device = "cpu"
  pipe = pipeline(
  task="automatic-speech-recognition",
  model=MODEL_NAME,
@@ -157,80 +147,79 @@ pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(lan

  embedding_model = PretrainedSpeakerEmbedding(
  "speechbrain/spkrec-ecapa-voxceleb",
- device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
  )


- # def transcribe(microphone, file_upload):
- # warn_output = ""
- # if (microphone is not None) and (file_upload is not None):
- # warn_output = (
- # "WARNING: You've uploaded an audio file and used the microphone. "
- # "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
- # )
- #
- # elif (microphone is None) and (file_upload is None):
- # return "ERROR: You have to either use the microphone or upload an audio file"
- #
- # file = microphone if microphone is not None else file_upload
- #
- # text = pipe(file)["text"]
- #
- # return warn_output + text
-
-
- # def _return_yt_html_embed(yt_url):
- # video_id = yt_url.split("?v=")[-1]
- # HTML_str = (
- # f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
- # " </center>"
- # )
- # return HTML_str
-
-
- # def yt_transcribe(yt_url):
- # # yt = YouTube(yt_url)
- # # html_embed_str = _return_yt_html_embed(yt_url)
- # # stream = yt.streams.filter(only_audio=True)[0]
- # # stream.download(filename="audio.mp3")
- #
- # ydl_opts = {
- # 'format': 'bestvideo*+bestaudio/best',
- # 'postprocessors': [{
- # 'key': 'FFmpegExtractAudio',
- # 'preferredcodec': 'mp3',
- # 'preferredquality': '192',
- # }],
- # 'outtmpl': 'audio.%(ext)s',
- # }
- #
- # with yt_dlp.YoutubeDL(ydl_opts) as ydl:
- # ydl.download([yt_url])
- #
- # text = pipe("audio.mp3")["text"]
- # return html_embed_str, text


  def convert_time(secs):
  return datetime.timedelta(seconds=round(secs))


- # def get_youtube(video_url):
- # # yt = YouTube(video_url)
- # # abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
- #
- # ydl_opts = {
- # 'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
- # }
- #
- # with yt_dlp.YoutubeDL(ydl_opts) as ydl:
- # info = ydl.extract_info(video_url, download=False)
- # abs_video_path = ydl.prepare_filename(info)
- # ydl.process_info(info)
- #
- # print("Success download video")
- # print(abs_video_path)
- # return abs_video_path


  def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
@@ -344,110 +333,148 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe

  time_end = time.time()
  time_diff = time_end - time_start
- # memory = psutil.virtual_memory()
- # gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
- # gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
- # gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
- # system_info = f"""
- # *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
- # *Processing time: {time_diff:.5} seconds.*
- # *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
- # """
  save_path = "output/transcript_result.csv"
  df_results = pd.DataFrame(objects)
  df_results.to_csv(save_path)
- # return df_results, system_info, save_path
- return df_results, save_path

  except Exception as e:
  raise RuntimeError("Error Running inference with local model", e)


- def create_meeting_summary(openai_key, prompt, uploaded_audio, max_transcribe_seconds):
- openai.api_key = openai_key
-
- # 音声ファイルを開く
- audio = AudioSegment.from_file(uploaded_audio)
-
- # 文字起こしする音声データの上限を設定する
- if len(audio) > int(max_transcribe_seconds) * 1000:
- audio = audio[:int(max_transcribe_seconds) * 1000]
-
- # ファイルサイズを削減するために音声ファイルを圧縮する
- compressed_audio = audio.set_frame_rate(16000).set_channels(1)
-
- # 圧縮した音声ファイルをmp3形式で一時ファイルに保存する
- with tempfile.NamedTemporaryFile(delete=True, suffix=".mp3") as tmp:
- compressed_audio.export(tmp.name, format="mp3")
-
- transcript = openai.Audio.transcribe("whisper-1", open(tmp.name, "rb"), response_format="verbose_json")
- transcript_text = ""
- for segment in transcript.segments:
- transcript_text += f"{segment['text']}\n"
-
- system_template = prompt
-
- completion = openai.ChatCompletion.create(
- model="gpt-3.5-turbo",
- messages=[
- {"role": "system", "content": system_template},
- {"role": "user", "content": transcript_text}
- ]
- )
- summary = completion.choices[0].message.content
- return summary, transcript_text
-
-
  # ---- Gradio Layout -----
  video_in = gr.Video(label="Video file", mirror_webcam=False)
- # youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
  df_init = pd.DataFrame(columns=['Start', 'End', 'Speaker', 'Text'])
- # memory = psutil.virtual_memory()
- selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="ja", label="Spoken language in video", interactive=True)
- selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)
- number_speakers = gr.Number(precision=0, value=0, label="Input number of speakers for better results. If value=0, model will automatic find the best number of speakers", interactive=True)
- # system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
  download_transcript = gr.File(label="Download transcript")
- transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
- # title = "Whisper speaker diarization"
- # demo = gr.Blocks(title=title)
- # demo.encrypt = False
-
-
- inputs = [
- gr.Textbox(lines=1, label="openai_key", type="password"),
- gr.TextArea(label="summary prompt", value="""会議の文字起こしが渡されます。
-
- この会議のサマリーをMarkdown形式で作成してください。サマリーは、以下のような形式で書いてください。
- - 会議の目的
- - 会議の内容
- - 会議の結果
- """),
- # gr.Audio(type="filepath", label="音声ファイルをアップロード"),
- video_in.render(),
- gr.Textbox(lines=1, label="maximum transcription time (seconds)", type="text"),
- selected_source_lang.render(),
- selected_whisper_model.render(),
- number_speakers.render(),
- ]
-
- outputs = [
- gr.Textbox(label="会議サマリー"),
- gr.Textbox(label="文字起こし")
- ]
-
- app = gr.Interface(
- fn=create_meeting_summary,
- inputs=inputs,
- outputs=outputs,
- title="会議サマリー生成アプリ",
- description="音声ファイルをアップロードして、会議のサマリーをMarkdown形式で作成します。"
- )
-
- transcribe_btn = gr.Button("Transcribe audio and diarization")
- transcribe_btn.click(speech_to_text,
- [video_in, selected_source_lang, selected_whisper_model, number_speakers],
- [transcription_df, download_transcript]
- )
-
- app.launch(debug=True)

  # import whisper
  from faster_whisper import WhisperModel
  import datetime
+ import subprocess
  import gradio as gr
+ from pathlib import Path
  import pandas as pd
+ import re
  import time
  import os
  import numpy as np
  from sklearn.cluster import AgglomerativeClustering
  from sklearn.metrics import silhouette_score

+ from pytube import YouTube
+ import yt_dlp
  import torch
  import pyannote.audio
  from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
  from pyannote.audio import Audio
  from pyannote.core import Segment

+ from gpuinfo import GPUInfo

  import wave
  import contextlib
  from transformers import pipeline
+ import psutil

  whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
  source_languages = {
  "en": "English",
+ "zh": "Chinese",
+ "de": "German",
+ "es": "Spanish",
+ "ru": "Russian",
+ "ko": "Korean",
+ "fr": "French",
  "ja": "Japanese",
+ "pt": "Portuguese",
+ "tr": "Turkish",
+ "pl": "Polish",
+ "ca": "Catalan",
+ "nl": "Dutch",
+ "ar": "Arabic",
+ "sv": "Swedish",
+ "it": "Italian",
+ "id": "Indonesian",
+ "hi": "Hindi",
+ "fi": "Finnish",
+ "vi": "Vietnamese",
+ "he": "Hebrew",
+ "uk": "Ukrainian",
+ "el": "Greek",
+ "ms": "Malay",
+ "cs": "Czech",
+ "ro": "Romanian",
+ "da": "Danish",
+ "hu": "Hungarian",
+ "ta": "Tamil",
+ "no": "Norwegian",
+ "th": "Thai",
+ "ur": "Urdu",
+ "hr": "Croatian",
+ "bg": "Bulgarian",
+ "lt": "Lithuanian",
+ "la": "Latin",
+ "mi": "Maori",
+ "ml": "Malayalam",
+ "cy": "Welsh",
+ "sk": "Slovak",
+ "te": "Telugu",
+ "fa": "Persian",
+ "lv": "Latvian",
+ "bn": "Bengali",
+ "sr": "Serbian",
+ "az": "Azerbaijani",
+ "sl": "Slovenian",
+ "kn": "Kannada",
+ "et": "Estonian",
+ "mk": "Macedonian",
+ "br": "Breton",
+ "eu": "Basque",
+ "is": "Icelandic",
+ "hy": "Armenian",
+ "ne": "Nepali",
+ "mn": "Mongolian",
+ "bs": "Bosnian",
+ "kk": "Kazakh",
+ "sq": "Albanian",
+ "sw": "Swahili",
+ "gl": "Galician",
+ "mr": "Marathi",
+ "pa": "Punjabi",
+ "si": "Sinhala",
+ "km": "Khmer",
+ "sn": "Shona",
+ "yo": "Yoruba",
+ "so": "Somali",
+ "af": "Afrikaans",
+ "oc": "Occitan",
+ "ka": "Georgian",
+ "be": "Belarusian",
+ "tg": "Tajik",
+ "sd": "Sindhi",
+ "gu": "Gujarati",
+ "am": "Amharic",
+ "yi": "Yiddish",
+ "lo": "Lao",
+ "uz": "Uzbek",
+ "fo": "Faroese",
+ "ht": "Haitian creole",
+ "ps": "Pashto",
+ "tk": "Turkmen",
+ "nn": "Nynorsk",
+ "mt": "Maltese",
+ "sa": "Sanskrit",
+ "lb": "Luxembourgish",
+ "my": "Myanmar",
+ "bo": "Tibetan",
+ "tl": "Tagalog",
+ "mg": "Malagasy",
+ "as": "Assamese",
+ "tt": "Tatar",
+ "haw": "Hawaiian",
+ "ln": "Lingala",
+ "ha": "Hausa",
+ "ba": "Bashkir",
+ "jw": "Javanese",
+ "su": "Sundanese",
  }

  source_language_list = [key[0] for key in source_languages.items()]

  lang = "ja"

  device = 0 if torch.cuda.is_available() else "cpu"

  pipe = pipeline(
  task="automatic-speech-recognition",
  model=MODEL_NAME,


  embedding_model = PretrainedSpeakerEmbedding(
  "speechbrain/spkrec-ecapa-voxceleb",
+ device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+
+
+ def transcribe(microphone, file_upload):
+ warn_output = ""
+ if (microphone is not None) and (file_upload is not None):
+ warn_output = (
+ "WARNING: You've uploaded an audio file and used the microphone. "
+ "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+ )
+
+ elif (microphone is None) and (file_upload is None):
+ return "ERROR: You have to either use the microphone or upload an audio file"
+
+ file = microphone if microphone is not None else file_upload
+
+ text = pipe(file)["text"]
+
+ return warn_output + text
+
+
+ def _return_yt_html_embed(yt_url):
+ video_id = yt_url.split("?v=")[-1]
+ HTML_str = (
+ f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
+ " </center>"
  )
+ return HTML_str
+

+ def yt_transcribe(yt_url):
+ # yt = YouTube(yt_url)
+ # html_embed_str = _return_yt_html_embed(yt_url)
+ # stream = yt.streams.filter(only_audio=True)[0]
+ # stream.download(filename="audio.mp3")

+ ydl_opts = {
+ 'format': 'bestvideo*+bestaudio/best',
+ 'postprocessors': [{
+ 'key': 'FFmpegExtractAudio',
+ 'preferredcodec': 'mp3',
+ 'preferredquality': '192',
+ }],
+ 'outtmpl': 'audio.%(ext)s',
+ }
+
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+ ydl.download([yt_url])
+
+ text = pipe("audio.mp3")["text"]
+ return html_embed_str, text


  def convert_time(secs):
  return datetime.timedelta(seconds=round(secs))


+ def get_youtube(video_url):
+ # yt = YouTube(video_url)
+ # abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
+
+ ydl_opts = {
+ 'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
+ }
+
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+ info = ydl.extract_info(video_url, download=False)
+ abs_video_path = ydl.prepare_filename(info)
+ ydl.process_info(info)
+
+ print("Success download video")
+ print(abs_video_path)
+ return abs_video_path


  def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):


  time_end = time.time()
  time_diff = time_end - time_start
+ memory = psutil.virtual_memory()
+ gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
+ gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
+ gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
+ system_info = f"""
+ *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
+ *Processing time: {time_diff:.5} seconds.*
+ *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
+ """
  save_path = "output/transcript_result.csv"
  df_results = pd.DataFrame(objects)
  df_results.to_csv(save_path)
+ return df_results, system_info, save_path

  except Exception as e:
  raise RuntimeError("Error Running inference with local model", e)


  # ---- Gradio Layout -----
+ # Inspiration from https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles
  video_in = gr.Video(label="Video file", mirror_webcam=False)
+ youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
  df_init = pd.DataFrame(columns=['Start', 'End', 'Speaker', 'Text'])
+ memory = psutil.virtual_memory()
+ selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="en",
+ label="Spoken language in video", interactive=True)
+ selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model",
+ interactive=True)
+ number_speakers = gr.Number(precision=0, value=0,
+ label="Input number of speakers for better results. If value=0, model will automatic find the best number of speakers",
+ interactive=True)
+ system_info = gr.Markdown(
+ f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
  download_transcript = gr.File(label="Download transcript")
+ transcription_df = gr.DataFrame(value=df_init, label="Transcription dataframe", row_count=(0, "dynamic"), max_rows=10,
+ wrap=True, overflow_row_behaviour='paginate')
+ title = "Whisper speaker diarization"
+ demo = gr.Blocks(title=title)
+ demo.encrypt = False
+
+ with demo:
+ with gr.Tab("Whisper speaker diarization"):
+ gr.Markdown('''
+ <div>
+ <h1 style='text-align: center'>Whisper speaker diarization</h1>
+ This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
+ and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers
+ </div>
+ ''')
+
+ with gr.Row():
+ gr.Markdown('''
+ ### Transcribe youtube link using OpenAI Whisper
+ ##### 1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
+ ##### 2. Generating speaker embeddings for each segments.
+ ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
+ ''')
+
+ with gr.Row():
+ gr.Markdown('''
+ ### You can test by following examples:
+ ''')
+ examples = gr.Examples(examples=
+ ["https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
+ "https://www.youtube.com/watch?v=-UX0X45sYe4",
+ "https://www.youtube.com/watch?v=7minSgqi-Gw"],
+ label="Examples", inputs=[youtube_url_in])
+
+ with gr.Row():
+ with gr.Column():
+ youtube_url_in.render()
+ download_youtube_btn = gr.Button("Download Youtube video")
+ download_youtube_btn.click(get_youtube, [youtube_url_in], [
+ video_in])
+ print(video_in)
+
+ with gr.Row():
+ with gr.Column():
+ video_in.render()
+ with gr.Column():
+ gr.Markdown('''
+ ##### Here you can start the transcription process.
+ ##### Please select the source language for transcription.
+ ##### You can select a range of assumed numbers of speakers.
+ ''')
+ selected_source_lang.render()
+ selected_whisper_model.render()
+ number_speakers.render()
+ transcribe_btn = gr.Button("Transcribe audio and diarization")
+ transcribe_btn.click(speech_to_text,
+ [video_in, selected_source_lang, selected_whisper_model, number_speakers],
+ [transcription_df, system_info, download_transcript]
+ )
+
+ with gr.Row():
+ gr.Markdown('''
+ ##### Here you will get transcription output
+ ##### ''')
+
+ with gr.Row():
+ with gr.Column():
+ download_transcript.render()
+ transcription_df.render()
+ system_info.render()
+ gr.Markdown(
+ '''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></center>''')
+
+ with gr.Tab("Whisper Transcribe Japanese Audio"):
+ gr.Markdown(f'''
+ <div>
+ <h1 style='text-align: center'>Whisper Transcribe Japanese Audio</h1>
+ </div>
+ Transcribe long-form microphone or audio inputs with the click of a button! The fine-tuned
+ checkpoint <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
+ ''')
+ microphone = gr.inputs.Audio(source="microphone", type="filepath", optional=True)
+ upload = gr.inputs.Audio(source="upload", type="filepath", optional=True)
+ transcribe_btn = gr.Button("Transcribe Audio")
+ text_output = gr.Textbox()
+ with gr.Row():
+ gr.Markdown('''
+ ### You can test by following examples:
+ ''')
+ examples = gr.Examples(examples=
+ ["sample1.wav",
+ "sample2.wav",
+ ],
+ label="Examples", inputs=[upload])
+ transcribe_btn.click(transcribe, [microphone, upload], outputs=text_output)
+
+ with gr.Tab("Whisper Transcribe Japanese YouTube"):
+ gr.Markdown(f'''
+ <div>
+ <h1 style='text-align: center'>Whisper Transcribe Japanese YouTube</h1>
+ </div>
+ Transcribe long-form YouTube videos with the click of a button! The fine-tuned checkpoint:
+ <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
+ ''')
+ youtube_link = gr.Textbox(label="Youtube url", lines=1, interactive=True)
+ yt_transcribe_btn = gr.Button("Transcribe YouTube")
+ text_output2 = gr.Textbox()
+ html_output = gr.Markdown()
+ yt_transcribe_btn.click(yt_transcribe, [youtube_link], outputs=[html_output, text_output2])
+
+ demo.launch(debug=True)