yama committed
Commit 1de0e6b · 1 Parent(s): bef704a

Update app.py

Files changed (1)
  1. app.py +116 -148
app.py CHANGED
@@ -1,35 +1,43 @@
# import whisper
from faster_whisper import WhisperModel
import datetime
- import subprocess
+ # import subprocess
import gradio as gr
- from pathlib import Path
+ # from pathlib import Path
import pandas as pd
- import re
+ # import re
import time
import os
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

- from pytube import YouTube
- import yt_dlp
+ # from pytube import YouTube
+ # import yt_dlp
import torch
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.audio import Audio
from pyannote.core import Segment

- from gpuinfo import GPUInfo
+ # from gpuinfo import GPUInfo

import wave
import contextlib
from transformers import pipeline
- import psutil
+ # import psutil
+
+ # import gradio as gr
+ import openai
+ import os
+ # from io import BytesIO
+ import tempfile
+ from pydub import AudioSegment
+ # import shutil
+

whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
source_languages = {
- "ja": "Japanese",
"en": "English",
# "zh": "Chinese",
# "de": "German",
@@ -37,6 +45,7 @@ source_languages = {
# "ru": "Russian",
# "ko": "Korean",
# "fr": "French",
+ "ja": "Japanese",
# "pt": "Portuguese",
# "tr": "Turkish",
# "pl": "Polish",
@@ -136,6 +145,7 @@ MODEL_NAME = "vumichien/whisper-medium-jp"
lang = "ja"

device = 0 if torch.cuda.is_available() else "cpu"
+ # device = "cpu"
pipe = pipeline(
task="automatic-speech-recognition",
model=MODEL_NAME,
@@ -147,10 +157,10 @@ pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(lan

embedding_model = PretrainedSpeakerEmbedding(
"speechbrain/spkrec-ecapa-voxceleb",
- device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+ device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
+ )


- # Transcribe the audio data
# def transcribe(microphone, file_upload):
# warn_output = ""
# if (microphone is not None) and (file_upload is not None):
@@ -169,7 +179,6 @@ embedding_model = PretrainedSpeakerEmbedding(
# return warn_output + text


- # Generate the HTML code that displays the embedded YouTube player
# def _return_yt_html_embed(yt_url):
# video_id = yt_url.split("?v=")[-1]
# HTML_str = (
@@ -179,7 +188,6 @@ embedding_model = PretrainedSpeakerEmbedding(
# return HTML_str


- # Download the audio from a YouTube video and transcribe it
# def yt_transcribe(yt_url):
# # yt = YouTube(yt_url)
# # html_embed_str = _return_yt_html_embed(yt_url)
@@ -203,12 +211,10 @@ embedding_model = PretrainedSpeakerEmbedding(
# return html_embed_str, text


- # Convert seconds to a time representation
def convert_time(secs):
return datetime.timedelta(seconds=round(secs))


- # Download the YouTube video
# def get_youtube(video_url):
# # yt = YouTube(video_url)
# # abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
@@ -226,7 +232,7 @@ def convert_time(secs):
# print(abs_video_path)
# return abs_video_path

- # Convert speech to text
+
def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
"""
# Transcribe youtube link using OpenAI Whisper
@@ -338,148 +344,110 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe

time_end = time.time()
time_diff = time_end - time_start
- memory = psutil.virtual_memory()
- gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
- gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
- gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
- system_info = f"""
- *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
- *Processing time: {time_diff:.5} seconds.*
- *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
- """
+ # memory = psutil.virtual_memory()
+ # gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
+ # gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
+ # gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
+ # system_info = f"""
+ # *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
+ # *Processing time: {time_diff:.5} seconds.*
+ # *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
+ # """
save_path = "output/transcript_result.csv"
df_results = pd.DataFrame(objects)
df_results.to_csv(save_path)
- return df_results, system_info, save_path
+ # return df_results, system_info, save_path
+ return df_results, save_path

except Exception as e:
raise RuntimeError("Error Running inference with local model", e)


+ def create_meeting_summary(openai_key, prompt, uploaded_audio, max_transcribe_seconds):
+ openai.api_key = openai_key
+
+ # Open the audio file
+ audio = AudioSegment.from_file(uploaded_audio)
+
+ # Set an upper limit on the amount of audio to transcribe
+ if len(audio) > int(max_transcribe_seconds) * 1000:
+ audio = audio[:int(max_transcribe_seconds) * 1000]
+
+ # Compress the audio to reduce the file size
+ compressed_audio = audio.set_frame_rate(16000).set_channels(1)
+
+ # Save the compressed audio to a temporary file in mp3 format
+ with tempfile.NamedTemporaryFile(delete=True, suffix=".mp3") as tmp:
+ compressed_audio.export(tmp.name, format="mp3")
+
+ transcript = openai.Audio.transcribe("whisper-1", open(tmp.name, "rb"), response_format="verbose_json")
+ transcript_text = ""
+ for segment in transcript.segments:
+ transcript_text += f"{segment['text']}\n"
+
+ system_template = prompt
+
+ completion = openai.ChatCompletion.create(
+ model="gpt-3.5-turbo",
+ messages=[
+ {"role": "system", "content": system_template},
+ {"role": "user", "content": transcript_text}
+ ]
+ )
+ summary = completion.choices[0].message.content
+ return summary, transcript_text
+
+
# ---- Gradio Layout -----
- # Inspiration from https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles
video_in = gr.Video(label="Video file", mirror_webcam=False)
- youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
+ # youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
df_init = pd.DataFrame(columns=['Start', 'End', 'Speaker', 'Text'])
- memory = psutil.virtual_memory()
- selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="ja",
- label="Spoken language in video", interactive=True)
- selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model",
- interactive=True)
- number_speakers = gr.Number(precision=0, value=0,
- label="Input number of speakers for better results. If value=0, model will automatic find the best number of speakers",
- interactive=True)
- system_info = gr.Markdown(
- f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
+ # memory = psutil.virtual_memory()
+ selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="ja", label="Spoken language in video", interactive=True)
+ selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)
+ number_speakers = gr.Number(precision=0, value=0, label="Input number of speakers for better results. If value=0, model will automatic find the best number of speakers", interactive=True)
+ # system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
download_transcript = gr.File(label="Download transcript")
- transcription_df = gr.DataFrame(value=df_init, label="Transcription dataframe", row_count=(0, "dynamic"), max_rows=10,
- wrap=True, overflow_row_behaviour='paginate')
- title = "Whisper speaker diarization"
- demo = gr.Blocks(title=title)
- demo.encrypt = False
-
- with demo:
- with gr.Tab("Whisper speaker diarization"):
- # gr.Markdown('''
- # <div>
- # <h1 style='text-align: center'>Whisper speaker diarization</h1>
- # This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
- # and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers
- # </div>
- # ''')
- #
- # with gr.Row():
- # gr.Markdown('''
- # ### Transcribe youtube link using OpenAI Whisper
- # ##### 1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
- # ##### 2. Generating speaker embeddings for each segments.
- # ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
- # ''')
- #
- # with gr.Row():
- # gr.Markdown('''
- # ### You can test by following examples:
- # ''')
- # examples = gr.Examples(examples=
- # ["https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
- # "https://www.youtube.com/watch?v=-UX0X45sYe4",
- # "https://www.youtube.com/watch?v=7minSgqi-Gw"],
- # label="Examples", inputs=[youtube_url_in])
- #
- # with gr.Row():
- # with gr.Column():
- # youtube_url_in.render()
- # download_youtube_btn = gr.Button("Download Youtube video")
- # download_youtube_btn.click(get_youtube, [youtube_url_in], [
- # video_in])
- # print(video_in)
-
- with gr.Row():
- with gr.Column():
- video_in.render()
- with gr.Column():
- gr.Markdown('''
- ##### Here you can start the transcription process.
- ##### Please select the source language for transcription.
- ##### You can select a range of assumed numbers of speakers.
- ''')
- selected_source_lang.render()
- selected_whisper_model.render()
- number_speakers.render()
- transcribe_btn = gr.Button("Transcribe audio and diarization")
- transcribe_btn.click(speech_to_text,
- [video_in, selected_source_lang, selected_whisper_model, number_speakers],
- [transcription_df, system_info, download_transcript]
- )
-
- with gr.Row():
- gr.Markdown('''
- ##### Here you will get transcription output
- ##### ''')
-
- with gr.Row():
- with gr.Column():
- download_transcript.render()
- transcription_df.render()
- # system_info.render()
- # gr.Markdown(
- # '''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></center>''')
-
- # with gr.Tab("Whisper Transcribe Japanese Audio"):
- # gr.Markdown(f'''
- # <div>
- # <h1 style='text-align: center'>Whisper Transcribe Japanese Audio</h1>
- # </div>
- # Transcribe long-form microphone or audio inputs with the click of a button! The fine-tuned
- # checkpoint <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
- # ''')
- # microphone = gr.inputs.Audio(source="microphone", type="filepath", optional=True)
- # upload = gr.inputs.Audio(source="upload", type="filepath", optional=True)
- # transcribe_btn = gr.Button("Transcribe Audio")
- # text_output = gr.Textbox()
- # with gr.Row():
- # gr.Markdown('''
- # ### You can test by following examples:
- # ''')
- # examples = gr.Examples(examples=
- # ["sample1.wav",
- # "sample2.wav",
- # ],
- # label="Examples", inputs=[upload])
- # transcribe_btn.click(transcribe, [microphone, upload], outputs=text_output)
- #
- # with gr.Tab("Whisper Transcribe Japanese YouTube"):
- # gr.Markdown(f'''
- # <div>
- # <h1 style='text-align: center'>Whisper Transcribe Japanese YouTube</h1>
- # </div>
- # Transcribe long-form YouTube videos with the click of a button! The fine-tuned checkpoint:
- # <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
- # ''')
- # youtube_link = gr.Textbox(label="Youtube url", lines=1, interactive=True)
- # yt_transcribe_btn = gr.Button("Transcribe YouTube")
- # text_output2 = gr.Textbox()
- # html_output = gr.Markdown()
- # yt_transcribe_btn.click(yt_transcribe, [youtube_link], outputs=[html_output, text_output2])
-
- demo.launch(debug=True)
+ transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
+ # title = "Whisper speaker diarization"
+ # demo = gr.Blocks(title=title)
+ # demo.encrypt = False
+
+
+ inputs = [
+ gr.Textbox(lines=1, label="openai_key", type="password"),
+ gr.TextArea(label="summary prompt", value="""会議の文字起こしが渡されます。
+
+ この会議のサマリーをMarkdown形式で作成してください。サマリーは、以下のような形式で書いてください。
+ - 会議の目的
+ - 会議の内容
+ - 会議の結果
+ """),
+ # gr.Audio(type="filepath", label="音声ファイルをアップロード"),
+ video_in.render(),
+ gr.Textbox(lines=1, label="maximum transcription time (seconds)", type="text"),
+ selected_source_lang.render(),
+ selected_whisper_model.render(),
+ number_speakers.render(),
+ ]
+
+ outputs = [
+ gr.Textbox(label="会議サマリー"),
+ gr.Textbox(label="文字起こし")
+ ]
+
+ app = gr.Interface(
+ fn=create_meeting_summary,
+ inputs=inputs,
+ outputs=outputs,
+ title="会議サマリー生成アプリ",
+ description="音声ファイルをアップロードして、会議のサマリーをMarkdown形式で作成します。"
+ )
+
+ transcribe_btn = gr.Button("Transcribe audio and diarization")
+ transcribe_btn.click(speech_to_text,
+ [video_in, selected_source_lang, selected_whisper_model, number_speakers],
+ [transcription_df, download_transcript]
+ )
+
+ app.launch(debug=True)
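For reference, a minimal sketch of calling the newly added create_meeting_summary function directly, outside the Gradio UI. This is an illustration only: it assumes the pre-1.0 openai-python client and the pydub/ffmpeg setup imported above, an OPENAI_API_KEY environment variable, and a hypothetical local file meeting.mp3; the file name, prompt text, and duration value below are stand-ins and do not come from the commit itself.

# Minimal usage sketch for create_meeting_summary (assumptions noted above).
import os

openai_key = os.environ["OPENAI_API_KEY"]  # assumed to be set in the environment
summary_prompt = "You will be given a meeting transcript. Summarize it in Markdown."  # stand-in for the Japanese default prompt
audio_path = "meeting.mp3"                 # hypothetical sample recording
max_seconds = "300"                        # passed as text, matching the Gradio Textbox input

# Returns (summary, transcript_text), as defined in the function added by this commit.
summary, transcript_text = create_meeting_summary(openai_key, summary_prompt, audio_path, max_seconds)
print(summary)
print(transcript_text)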