yama committed
Commit 1de0e6b · 1 Parent(s): bef704a

Update app.py

Files changed (1)
  1. app.py +116 -148
app.py CHANGED
@@ -1,35 +1,43 @@
# import whisper
from faster_whisper import WhisperModel
import datetime
- import subprocess
+ # import subprocess
import gradio as gr
- from pathlib import Path
+ # from pathlib import Path
import pandas as pd
- import re
+ # import re
import time
import os
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

- from pytube import YouTube
- import yt_dlp
+ # from pytube import YouTube
+ # import yt_dlp
import torch
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.audio import Audio
from pyannote.core import Segment

- from gpuinfo import GPUInfo
+ # from gpuinfo import GPUInfo

import wave
import contextlib
from transformers import pipeline
- import psutil
+ # import psutil
+
+ # import gradio as gr
+ import openai
+ import os
+ # from io import BytesIO
+ import tempfile
+ from pydub import AudioSegment
+ # import shutil
+

whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
source_languages = {
- "ja": "Japanese",
"en": "English",
# "zh": "Chinese",
# "de": "German",
@@ -37,6 +45,7 @@ source_languages = {
# "ru": "Russian",
# "ko": "Korean",
# "fr": "French",
+ "ja": "Japanese",
# "pt": "Portuguese",
# "tr": "Turkish",
# "pl": "Polish",
@@ -136,6 +145,7 @@ MODEL_NAME = "vumichien/whisper-medium-jp"
lang = "ja"

device = 0 if torch.cuda.is_available() else "cpu"
+ # device = "cpu"
pipe = pipeline(
task="automatic-speech-recognition",
model=MODEL_NAME,
@@ -147,10 +157,10 @@ pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(lan

embedding_model = PretrainedSpeakerEmbedding(
"speechbrain/spkrec-ecapa-voxceleb",
- device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+ device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
+ )


- # Transcribe the audio data
# def transcribe(microphone, file_upload):
# warn_output = ""
# if (microphone is not None) and (file_upload is not None):
@@ -169,7 +179,6 @@ embedding_model = PretrainedSpeakerEmbedding(
# return warn_output + text


- # Generate the HTML code that displays the embedded YouTube player
# def _return_yt_html_embed(yt_url):
# video_id = yt_url.split("?v=")[-1]
# HTML_str = (
@@ -179,7 +188,6 @@ embedding_model = PretrainedSpeakerEmbedding(
# return HTML_str


- # Download the audio from a YouTube video and transcribe it
# def yt_transcribe(yt_url):
# # yt = YouTube(yt_url)
# # html_embed_str = _return_yt_html_embed(yt_url)
@@ -203,12 +211,10 @@ embedding_model = PretrainedSpeakerEmbedding(
# return html_embed_str, text


- # Convert seconds to a time representation
def convert_time(secs):
return datetime.timedelta(seconds=round(secs))


- # Download the YouTube video
# def get_youtube(video_url):
# # yt = YouTube(video_url)
# # abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
@@ -226,7 +232,7 @@ def convert_time(secs):
# print(abs_video_path)
# return abs_video_path

- # Convert speech to text
+
def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
"""
# Transcribe youtube link using OpenAI Whisper
@@ -338,148 +344,110 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe

time_end = time.time()
time_diff = time_end - time_start
- memory = psutil.virtual_memory()
- gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
- gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
- gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
- system_info = f"""
- *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
- *Processing time: {time_diff:.5} seconds.*
- *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
- """
+ # memory = psutil.virtual_memory()
+ # gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
+ # gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
+ # gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
+ # system_info = f"""
+ # *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
+ # *Processing time: {time_diff:.5} seconds.*
+ # *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
+ # """
save_path = "output/transcript_result.csv"
df_results = pd.DataFrame(objects)
df_results.to_csv(save_path)
- return df_results, system_info, save_path
+ # return df_results, system_info, save_path
+ return df_results, save_path

except Exception as e:
raise RuntimeError("Error Running inference with local model", e)


+ def create_meeting_summary(openai_key, prompt, uploaded_audio, max_transcribe_seconds):
+ openai.api_key = openai_key
+
+ # Open the audio file
+ audio = AudioSegment.from_file(uploaded_audio)
+
+ # Set an upper limit on the amount of audio to transcribe
+ if len(audio) > int(max_transcribe_seconds) * 1000:
+ audio = audio[:int(max_transcribe_seconds) * 1000]
+
+ # Compress the audio to reduce the file size
+ compressed_audio = audio.set_frame_rate(16000).set_channels(1)
+
+ # Save the compressed audio to a temporary file in mp3 format
+ with tempfile.NamedTemporaryFile(delete=True, suffix=".mp3") as tmp:
+ compressed_audio.export(tmp.name, format="mp3")
+
+ transcript = openai.Audio.transcribe("whisper-1", open(tmp.name, "rb"), response_format="verbose_json")
+ transcript_text = ""
+ for segment in transcript.segments:
+ transcript_text += f"{segment['text']}\n"
+
+ system_template = prompt
+
+ completion = openai.ChatCompletion.create(
+ model="gpt-3.5-turbo",
+ messages=[
+ {"role": "system", "content": system_template},
+ {"role": "user", "content": transcript_text}
+ ]
+ )
+ summary = completion.choices[0].message.content
+ return summary, transcript_text
+
+
# ---- Gradio Layout -----
- # Inspiration from https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles
video_in = gr.Video(label="Video file", mirror_webcam=False)
- youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
+ # youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
df_init = pd.DataFrame(columns=['Start', 'End', 'Speaker', 'Text'])
- memory = psutil.virtual_memory()
- selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="ja",
- label="Spoken language in video", interactive=True)
- selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model",
- interactive=True)
- number_speakers = gr.Number(precision=0, value=0,
- label="Input number of speakers for better results. If value=0, model will automatic find the best number of speakers",
- interactive=True)
- system_info = gr.Markdown(
- f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
+ # memory = psutil.virtual_memory()
+ selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="ja", label="Spoken language in video", interactive=True)
+ selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)
+ number_speakers = gr.Number(precision=0, value=0, label="Input number of speakers for better results. If value=0, model will automatic find the best number of speakers", interactive=True)
+ # system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
download_transcript = gr.File(label="Download transcript")
- transcription_df = gr.DataFrame(value=df_init, label="Transcription dataframe", row_count=(0, "dynamic"), max_rows=10,
- wrap=True, overflow_row_behaviour='paginate')
- title = "Whisper speaker diarization"
- demo = gr.Blocks(title=title)
- demo.encrypt = False
-
- with demo:
- with gr.Tab("Whisper speaker diarization"):
- # gr.Markdown('''
- # <div>
- # <h1 style='text-align: center'>Whisper speaker diarization</h1>
- # This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
- # and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers
- # </div>
- # ''')
- #
- # with gr.Row():
- # gr.Markdown('''
- # ### Transcribe youtube link using OpenAI Whisper
- # ##### 1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
- # ##### 2. Generating speaker embeddings for each segments.
- # ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
- # ''')
- #
- # with gr.Row():
- # gr.Markdown('''
- # ### You can test by following examples:
- # ''')
- # examples = gr.Examples(examples=
- # ["https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
- # "https://www.youtube.com/watch?v=-UX0X45sYe4",
- # "https://www.youtube.com/watch?v=7minSgqi-Gw"],
- # label="Examples", inputs=[youtube_url_in])
- #
- # with gr.Row():
- # with gr.Column():
- # youtube_url_in.render()
- # download_youtube_btn = gr.Button("Download Youtube video")
- # download_youtube_btn.click(get_youtube, [youtube_url_in], [
- # video_in])
- # print(video_in)
-
- with gr.Row():
- with gr.Column():
- video_in.render()
- with gr.Column():
- gr.Markdown('''
- ##### Here you can start the transcription process.
- ##### Please select the source language for transcription.
- ##### You can select a range of assumed numbers of speakers.
- ''')
- selected_source_lang.render()
- selected_whisper_model.render()
- number_speakers.render()
- transcribe_btn = gr.Button("Transcribe audio and diarization")
- transcribe_btn.click(speech_to_text,
- [video_in, selected_source_lang, selected_whisper_model, number_speakers],
- [transcription_df, system_info, download_transcript]
- )
-
- with gr.Row():
- gr.Markdown('''
- ##### Here you will get transcription output
- ##### ''')
-
- with gr.Row():
- with gr.Column():
- download_transcript.render()
- transcription_df.render()
- # system_info.render()
- # gr.Markdown(
- # '''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></center>''')
-
- # with gr.Tab("Whisper Transcribe Japanese Audio"):
- # gr.Markdown(f'''
- # <div>
- # <h1 style='text-align: center'>Whisper Transcribe Japanese Audio</h1>
- # </div>
- # Transcribe long-form microphone or audio inputs with the click of a button! The fine-tuned
- # checkpoint <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
- # ''')
- # microphone = gr.inputs.Audio(source="microphone", type="filepath", optional=True)
- # upload = gr.inputs.Audio(source="upload", type="filepath", optional=True)
- # transcribe_btn = gr.Button("Transcribe Audio")
- # text_output = gr.Textbox()
- # with gr.Row():
- # gr.Markdown('''
- # ### You can test by following examples:
- # ''')
- # examples = gr.Examples(examples=
- # ["sample1.wav",
- # "sample2.wav",
- # ],
- # label="Examples", inputs=[upload])
- # transcribe_btn.click(transcribe, [microphone, upload], outputs=text_output)
- #
- # with gr.Tab("Whisper Transcribe Japanese YouTube"):
- # gr.Markdown(f'''
- # <div>
- # <h1 style='text-align: center'>Whisper Transcribe Japanese YouTube</h1>
- # </div>
- # Transcribe long-form YouTube videos with the click of a button! The fine-tuned checkpoint:
- # <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
- # ''')
- # youtube_link = gr.Textbox(label="Youtube url", lines=1, interactive=True)
- # yt_transcribe_btn = gr.Button("Transcribe YouTube")
- # text_output2 = gr.Textbox()
- # html_output = gr.Markdown()
- # yt_transcribe_btn.click(yt_transcribe, [youtube_link], outputs=[html_output, text_output2])
-
- demo.launch(debug=True)
+ transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
+ # title = "Whisper speaker diarization"
+ # demo = gr.Blocks(title=title)
+ # demo.encrypt = False
+
+
+ inputs = [
+ gr.Textbox(lines=1, label="openai_key", type="password"),
+ gr.TextArea(label="summary prompt", value="""会議の文字起こしが渡されます。
+
+ この会議のサマリーをMarkdown形式で作成してください。サマリーは、以下のような形式で書いてください。
+ - 会議の目的
+ - 会議の内容
+ - 会議の結果
+ """),
+ # gr.Audio(type="filepath", label="音声ファイルをアップロード"),
+ video_in.render(),
+ gr.Textbox(lines=1, label="maximum transcription time (seconds)", type="text"),
+ selected_source_lang.render(),
+ selected_whisper_model.render(),
+ number_speakers.render(),
+ ]
+
+ outputs = [
+ gr.Textbox(label="会議サマリー"),
+ gr.Textbox(label="文字起こし")
+ ]
+
+ app = gr.Interface(
+ fn=create_meeting_summary,
+ inputs=inputs,
+ outputs=outputs,
+ title="会議サマリー生成アプリ",
+ description="音声ファイルをアップロードして、会議のサマリーをMarkdown形式で作成します。"
+ )
+
+ transcribe_btn = gr.Button("Transcribe audio and diarization")
+ transcribe_btn.click(speech_to_text,
+ [video_in, selected_source_lang, selected_whisper_model, number_speakers],
+ [transcription_df, download_transcript]
+ )
+
+ app.launch(debug=True)
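For reference, a minimal sketch of calling the newly added create_meeting_summary function directly, outside the Gradio UI. This is an illustration only: it assumes the pre-1.0 openai-python client and the pydub/ffmpeg setup imported above, an OPENAI_API_KEY environment variable, and a hypothetical local file meeting.mp3; the file name, prompt text, and duration value below are stand-ins and do not come from the commit itself.

# Minimal usage sketch for create_meeting_summary (assumptions noted above).
import os

openai_key = os.environ["OPENAI_API_KEY"]  # assumed to be set in the environment
summary_prompt = "You will be given a meeting transcript. Summarize it in Markdown."  # stand-in for the Japanese default prompt
audio_path = "meeting.mp3"                 # hypothetical sample recording
max_seconds = "300"                        # passed as text, matching the Gradio Textbox input

# Returns (summary, transcript_text), as defined in the function added by this commit.
summary, transcript_text = create_meeting_summary(openai_key, summary_prompt, audio_path, max_seconds)
print(summary)
print(transcript_text)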