yama committed
Commit bef704a · 1 Parent(s): f742df6

Update app.py

Files changed (1)
  1. app.py +175 -130
app.py CHANGED
@@ -29,6 +29,7 @@ import psutil
 
 whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
 source_languages = {
+    "ja": "Japanese",
     "en": "English",
     # "zh": "Chinese",
     # "de": "German",
@@ -36,7 +37,6 @@ source_languages = {
     # "ru": "Russian",
     # "ko": "Korean",
     # "fr": "French",
-    "ja": "Japanese",
     # "pt": "Portuguese",
     # "tr": "Turkish",
     # "pl": "Polish",
@@ -150,78 +150,83 @@ embedding_model = PretrainedSpeakerEmbedding(
     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
 
 
-def transcribe(microphone, file_upload):
-    warn_output = ""
-    if (microphone is not None) and (file_upload is not None):
-        warn_output = (
-            "WARNING: You've uploaded an audio file and used the microphone. "
-            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
-        )
-
-    elif (microphone is None) and (file_upload is None):
-        return "ERROR: You have to either use the microphone or upload an audio file"
-
-    file = microphone if microphone is not None else file_upload
-
-    text = pipe(file)["text"]
-
-    return warn_output + text
-
-
-def _return_yt_html_embed(yt_url):
-    video_id = yt_url.split("?v=")[-1]
-    HTML_str = (
-        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
-        " </center>"
-    )
-    return HTML_str
-
-
-def yt_transcribe(yt_url):
-    # yt = YouTube(yt_url)
-    # html_embed_str = _return_yt_html_embed(yt_url)
-    # stream = yt.streams.filter(only_audio=True)[0]
-    # stream.download(filename="audio.mp3")
-
-    ydl_opts = {
-        'format': 'bestvideo*+bestaudio/best',
-        'postprocessors': [{
-            'key': 'FFmpegExtractAudio',
-            'preferredcodec': 'mp3',
-            'preferredquality': '192',
-        }],
-        'outtmpl': 'audio.%(ext)s',
-    }
-
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        ydl.download([yt_url])
-
-    text = pipe("audio.mp3")["text"]
-    return html_embed_str, text
-
-
+# Transcribe audio data
+# def transcribe(microphone, file_upload):
+#     warn_output = ""
+#     if (microphone is not None) and (file_upload is not None):
+#         warn_output = (
+#             "WARNING: You've uploaded an audio file and used the microphone. "
+#             "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+#         )
+#
+#     elif (microphone is None) and (file_upload is None):
+#         return "ERROR: You have to either use the microphone or upload an audio file"
+#
+#     file = microphone if microphone is not None else file_upload
+#
+#     text = pipe(file)["text"]
+#
+#     return warn_output + text
+
+
+# Generate the HTML code that displays the embedded YouTube player
+# def _return_yt_html_embed(yt_url):
+#     video_id = yt_url.split("?v=")[-1]
+#     HTML_str = (
+#         f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
+#         " </center>"
+#     )
+#     return HTML_str
+
+
+# Download the audio from a YouTube video and transcribe it
+# def yt_transcribe(yt_url):
+#     # yt = YouTube(yt_url)
+#     # html_embed_str = _return_yt_html_embed(yt_url)
+#     # stream = yt.streams.filter(only_audio=True)[0]
+#     # stream.download(filename="audio.mp3")
+#
+#     ydl_opts = {
+#         'format': 'bestvideo*+bestaudio/best',
+#         'postprocessors': [{
+#             'key': 'FFmpegExtractAudio',
+#             'preferredcodec': 'mp3',
+#             'preferredquality': '192',
+#         }],
+#         'outtmpl': 'audio.%(ext)s',
+#     }
+#
+#     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+#         ydl.download([yt_url])
+#
+#     text = pipe("audio.mp3")["text"]
+#     return html_embed_str, text
+
+
+# Convert seconds to a time representation
 def convert_time(secs):
     return datetime.timedelta(seconds=round(secs))
 
 
-def get_youtube(video_url):
-    # yt = YouTube(video_url)
-    # abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
-
-    ydl_opts = {
-        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
-    }
-
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        info = ydl.extract_info(video_url, download=False)
-        abs_video_path = ydl.prepare_filename(info)
-        ydl.process_info(info)
-
-    print("Success download video")
-    print(abs_video_path)
-    return abs_video_path
-
-
+# Download the YouTube video
+# def get_youtube(video_url):
+#     # yt = YouTube(video_url)
+#     # abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
+#
+#     ydl_opts = {
+#         'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
+#     }
+#
+#     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+#         info = ydl.extract_info(video_url, download=False)
+#         abs_video_path = ydl.prepare_filename(info)
+#         ydl.process_info(info)
+#
+#     print("Success download video")
+#     print(abs_video_path)
+#     return abs_video_path
+
+# Convert speech to text
 def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
     """
     # Transcribe youtube link using OpenAI Whisper
@@ -374,67 +379,107 @@ demo = gr.Blocks(title=title)
 demo.encrypt = False
 
 with demo:
-    # gr.Markdown('''
-    # <div>
-    # <h1 style='text-align: center'>Whisper speaker diarization</h1>
-    # This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
-    # and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers
-    # </div>
-    # ''')
-    #
-    # with gr.Row():
-    #     gr.Markdown('''
-    #     ### Transcribe youtube link using OpenAI Whisper
-    #     ##### 1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
-    #     ##### 2. Generating speaker embeddings for each segments.
-    #     ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
-    #     ''')
-
-    with gr.Row():
-        gr.Markdown('''
-        ### You can test by following examples:
-        ''')
-        examples = gr.Examples(examples=
-            ["https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
-             "https://www.youtube.com/watch?v=-UX0X45sYe4",
-             "https://www.youtube.com/watch?v=7minSgqi-Gw"],
-            label="Examples", inputs=[youtube_url_in])
-
-    with gr.Row():
-        with gr.Column():
-            youtube_url_in.render()
-            download_youtube_btn = gr.Button("Download Youtube video")
-            download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
-            print(video_in)
-
-    with gr.Row():
-        with gr.Column():
-            video_in.render()
+    with gr.Tab("Whisper speaker diarization"):
+        # gr.Markdown('''
+        # <div>
+        # <h1 style='text-align: center'>Whisper speaker diarization</h1>
+        # This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
+        # and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers
+        # </div>
+        # ''')
+        #
+        # with gr.Row():
+        #     gr.Markdown('''
+        #     ### Transcribe youtube link using OpenAI Whisper
+        #     ##### 1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
+        #     ##### 2. Generating speaker embeddings for each segments.
+        #     ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
+        #     ''')
+        #
+        # with gr.Row():
+        #     gr.Markdown('''
+        #     ### You can test by following examples:
+        #     ''')
+        #     examples = gr.Examples(examples=
+        #         ["https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
+        #          "https://www.youtube.com/watch?v=-UX0X45sYe4",
+        #          "https://www.youtube.com/watch?v=7minSgqi-Gw"],
+        #         label="Examples", inputs=[youtube_url_in])
+        #
+        # with gr.Row():
+        #     with gr.Column():
+        #         youtube_url_in.render()
+        #         download_youtube_btn = gr.Button("Download Youtube video")
+        #         download_youtube_btn.click(get_youtube, [youtube_url_in], [
+        #             video_in])
+        #         print(video_in)
+
+        with gr.Row():
             with gr.Column():
-            gr.Markdown('''
-            ##### Here you can start the transcription process.
-            ##### Please select the source language for transcription.
-            ##### You can select a range of assumed numbers of speakers.
-            ''')
-            selected_source_lang.render()
-            selected_whisper_model.render()
-            number_speakers.render()
-            transcribe_btn = gr.Button("Transcribe audio and diarization")
-            transcribe_btn.click(speech_to_text,
-                                 [video_in, selected_source_lang, selected_whisper_model, number_speakers],
-                                 [transcription_df, system_info, download_transcript]
-                                 )
-
-    with gr.Row():
-        gr.Markdown('''
-        ##### Here you will get transcription output
-        ##### ''')
-
-    with gr.Row():
-        with gr.Column():
-            download_transcript.render()
-            transcription_df.render()
-            # system_info.render()
-
+                video_in.render()
+            with gr.Column():
+                gr.Markdown('''
+                ##### Here you can start the transcription process.
+                ##### Please select the source language for transcription.
+                ##### You can select a range of assumed numbers of speakers.
+                ''')
+                selected_source_lang.render()
+                selected_whisper_model.render()
+                number_speakers.render()
+                transcribe_btn = gr.Button("Transcribe audio and diarization")
+                transcribe_btn.click(speech_to_text,
+                                     [video_in, selected_source_lang, selected_whisper_model, number_speakers],
+                                     [transcription_df, system_info, download_transcript]
+                                     )
+
+        with gr.Row():
+            gr.Markdown('''
+            ##### Here you will get transcription output
+            ##### ''')
+
+        with gr.Row():
+            with gr.Column():
+                download_transcript.render()
+                transcription_df.render()
+                # system_info.render()
+                # gr.Markdown(
+                #     '''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></center>''')
+
+    # with gr.Tab("Whisper Transcribe Japanese Audio"):
+    #     gr.Markdown(f'''
+    #     <div>
+    #     <h1 style='text-align: center'>Whisper Transcribe Japanese Audio</h1>
+    #     </div>
+    #     Transcribe long-form microphone or audio inputs with the click of a button! The fine-tuned
+    #     checkpoint <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
+    #     ''')
+    #     microphone = gr.inputs.Audio(source="microphone", type="filepath", optional=True)
+    #     upload = gr.inputs.Audio(source="upload", type="filepath", optional=True)
+    #     transcribe_btn = gr.Button("Transcribe Audio")
+    #     text_output = gr.Textbox()
+    #     with gr.Row():
+    #         gr.Markdown('''
+    #         ### You can test by following examples:
+    #         ''')
+    #         examples = gr.Examples(examples=
+    #             ["sample1.wav",
+    #              "sample2.wav",
+    #             ],
+    #             label="Examples", inputs=[upload])
+    #     transcribe_btn.click(transcribe, [microphone, upload], outputs=text_output)
+    #
+    # with gr.Tab("Whisper Transcribe Japanese YouTube"):
+    #     gr.Markdown(f'''
+    #     <div>
+    #     <h1 style='text-align: center'>Whisper Transcribe Japanese YouTube</h1>
+    #     </div>
+    #     Transcribe long-form YouTube videos with the click of a button! The fine-tuned checkpoint:
+    #     <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
+    #     ''')
+    #     youtube_link = gr.Textbox(label="Youtube url", lines=1, interactive=True)
+    #     yt_transcribe_btn = gr.Button("Transcribe YouTube")
+    #     text_output2 = gr.Textbox()
+    #     html_output = gr.Markdown()
+    #     yt_transcribe_btn.click(yt_transcribe, [youtube_link], outputs=[html_output, text_output2])
 
 demo.launch(debug=True)
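
For readers unfamiliar with the UI change in the last hunk: the commit's main layout edit is wrapping the existing rows inside a single gr.Tab(...) block so that further tabs (the commented-out "Whisper Transcribe Japanese Audio" / "Whisper Transcribe Japanese YouTube" sections) can be enabled later without touching the diarization tab. Below is a minimal, self-contained sketch of that tabbed gr.Blocks pattern, assuming a Gradio 3.x-style API; speech_to_text_stub and the component names here are illustrative placeholders, not the Space's actual objects.

import gradio as gr

# Hypothetical stand-in for the Space's speech_to_text pipeline; it only
# echoes its inputs so the tab/row/column layout can be exercised without models.
def speech_to_text_stub(video_path, source_lang, num_speakers):
    return f"Would transcribe {video_path} (lang={source_lang}, speakers={num_speakers})"

with gr.Blocks(title="Whisper speaker diarization (layout sketch)") as demo:
    # Each gr.Tab groups its own components; additional tabs can be added
    # alongside this one later, which is what the commit prepares for.
    with gr.Tab("Whisper speaker diarization"):
        with gr.Row():
            with gr.Column():
                video_in = gr.Video(label="Video file")
            with gr.Column():
                source_lang = gr.Dropdown(choices=["ja", "en"], value="ja", label="Source language")
                num_speakers = gr.Number(value=2, precision=0, label="Number of speakers")
                transcribe_btn = gr.Button("Transcribe audio and diarization")
        with gr.Row():
            transcript_out = gr.Textbox(label="Transcript")
        transcribe_btn.click(speech_to_text_stub,
                             [video_in, source_lang, num_speakers],
                             [transcript_out])

if __name__ == "__main__":
    demo.launch(debug=True)

Running the sketch reproduces the same nesting as the diff's new layout (Blocks > Tab > Row > Column), with a dummy callback standing in for the real Whisper + diarization pipeline.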