Commit 4d7a16e · 1 parent: 6736ecf
final app for clas
app.py CHANGED
@@ -1398,7 +1398,7 @@ def download_from_url(url, model, associated_user=None):
     os.makedirs("unzips", exist_ok=True)
     zipfile = model + '.zip'
     zipfile_path = './zips/' + zipfile
-
+
     try:
         if "drive.google.com" in url or "drive.usercontent.google.com":
             subprocess.run(["gdown", url, "--fuzzy", "-O", zipfile_path])
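An aside for a future pass, since this hunk only touches whitespace: the `if` in the context lines carries a latent bug. The second operand, "drive.usercontent.google.com", is a bare non-empty string, so it is always truthy and the Google Drive branch runs for every URL. The check presumably intended is:

    if "drive.google.com" in url or "drive.usercontent.google.com" in url:
        subprocess.run(["gdown", url, "--fuzzy", "-O", zipfile_path])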
@@ -1483,7 +1483,8 @@ def transcribe_btn_click(audio_choice):
     transcript_fn = f"{temp_folder}/{filename}.txt"
     if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
         print("Audio and transcript already exist, skipping transcript")
-
+        transcript = open(transcript_fn, "r").read()
+        return transcript
 
     batch_size = 1 # Adjust based on your GPU memory availability
     compute_type = "float16"
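The two added lines turn the existing existence check into an early-return cache, so repeat runs skip WhisperX entirely. Note that `open(transcript_fn, "r").read()` leaves the file handle open until garbage collection; a context manager is the idiomatic equivalent. A minimal sketch of the same cache pattern, with the paths treated as hypothetical:

    import os

    def load_cached_transcript(audio_fn, transcript_fn):
        # Return the saved transcript if both files already exist, else None
        # to signal that transcription still has to run.
        if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
            print("Audio and transcript already exist, skipping transcript")
            with open(transcript_fn, "r") as f:
                return f.read()
        return None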
@@ -1523,24 +1524,24 @@ def transcribe_btn_click(audio_choice):
     return result
 
 
-def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
+def run(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
         temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text):
     global voicecraft_model, voicecraft_config, phn2num
 
-    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-    os.environ["USER"] = "USER"
-
     print("Transcribing the input audio")
-    transcribe_btn_click(input_audio_fn)
+    transcribed_text = transcribe_btn_click(input_audio_fn)
    print("Transcription complete")
 
+    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+    os.environ["USER"] = "USER"
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
     cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
     target_transcript = transcribed_text + target_transcript
     print(target_transcript)
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
+    print(f"Audio_fn num frames: {info.num_frames}, sample rate: {info.sample_rate}")
 
     print("audio dur s is", audio_dur, "cutoff_sec is", cut_off_sec)
     assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
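This hunk fixes a real defect: the old run() referenced input_audio_fn without accepting it as a parameter, and discarded the return value of transcribe_btn_click(), so transcribed_text always came from the UI textbox. The added print also exposes the duration arithmetic: torchaudio.info reads metadata without decoding the audio, and duration is frames divided by sample rate. A worked sketch of the check run() performs, with path and numbers illustrative:

    import torchaudio

    info = torchaudio.info("demo/temp/example.wav")         # hypothetical path
    audio_dur = info.num_frames / info.sample_rate          # e.g. 160000 / 16000 = 10.0 s
    cut_off_sec = 3.01                                      # per the forced-alignment comment
    assert cut_off_sec < audio_dur
    prompt_end_frame = int(cut_off_sec * info.sample_rate)  # 48160 frames of voice prompt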
@@ -1584,7 +1585,7 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
     return [seg_save_fn_concat, seg_save_fn_gen]
 
 def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
-              temperature, kvcache,
+              temperature, kvcache, target_transcript, silence_tokens,
               sid,
               f0_up_key,
               f0_file,
@@ -1601,19 +1602,20 @@ def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_mar
     global voicecraft_model, voicecraft_config, phn2num
 
     print("Transcribing the input audio")
-    transcribe_btn_click(input_audio_fn)
-    print("Transcription complete")
+    transcribed_text = transcribe_btn_click(input_audio_fn)
+    print("Transcription complete", transcribed_text)
 
     os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
     os.environ["CUDA_VISIBLE_DEVICES"] = "0"
     os.environ["USER"] = "USER"
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
-    cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
+    # cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
 
     target_transcript = transcribed_text + ' ' + target_transcript
     print(target_transcript)
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
+    cut_off_sec = audio_dur - 0.1
 
     assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
     prompt_end_frame = int(cut_off_sec * info.sample_rate)
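Two behavioral changes hide in this hunk: transcribed_text now comes from the fresh WhisperX pass rather than the UI textbox, and the UI-supplied cutoff_value is commented out in favor of audio_dur - 0.1. That default means everything except the last 100 ms of the input serves as the voice prompt, and the assert below can no longer fire. With illustrative numbers:

    # Effect of the new default in run_joint() (values illustrative, 16 kHz assumed):
    audio_dur = 5.0                                # from torchaudio.info
    cut_off_sec = audio_dur - 0.1                  # 4.9 s: all but the final 100 ms is prompt
    prompt_end_frame = int(cut_off_sec * 16000)    # 78400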
@@ -1711,7 +1713,7 @@ def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_mar
             times[0],
             times[1],
             times[2],
-        ), (tgt_sr, audio_opt)
+        ), seg_save_fn_gen, (tgt_sr, audio_opt)
     except:
         info = traceback.format_exc()
         print(info)
@@ -1803,22 +1805,25 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
         with gr.Column():
             # with gr.Row():
             #     dropbox = gr.File(label="Drag your audio file and click refresh.")
-
-
+            with gr.Row():
+                record_button=gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
             with gr.Row():
                 input_audio0 = gr.Dropdown(
                    label="2.Choose the audio file.",
                    value="./audios/Test_Audio.mp3",
                    choices=audio_files
-                   )
+                )
+            audio_display = gr.Audio(value=input_audio0.value, label="Selected Audio File", type="filepath")
             # dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
             # dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
             refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
-            transcribed_text = gr.Textbox(label="transcibed text + mfa",
-                                          value="The dogs sat at the door.",
-                                          info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
-
-
+            # transcribed_text = gr.Textbox(label="transcibed text + mfa",
+            #                               value="The dogs sat at the door.",
+            #                               info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
+            record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
+            record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
+            # update audio_display
+            input_audio0.change(fn=lambda x: x, inputs=[input_audio0], outputs=[audio_display])
 
             with gr.Row():
                 # with gr.Column():
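The recording flow added here follows a common Gradio 3.x pattern: the microphone Audio component writes a temp file, one change handler copies it into the app's audio folder and selects it, and a second handler rebuilds the dropdown choices. A self-contained sketch under those assumptions; save_to_wav and change_choices2 stand in for the app's own helpers, whose bodies this diff does not show, and AUDIO_DIR is a guess at the scanned folder:

    import os, shutil
    import gradio as gr

    AUDIO_DIR = "./audios"  # assumption: the folder the dropdown is populated from

    def save_to_wav(tmp_path):
        # Copy the microphone recording into the audio folder and select it.
        if tmp_path is None:               # recording cleared; leave the dropdown alone
            return gr.Dropdown.update()
        dest = os.path.join(AUDIO_DIR, os.path.basename(tmp_path))
        shutil.copy(tmp_path, dest)
        return dest

    def change_choices2():
        # Re-scan the folder so the new recording appears in the dropdown.
        files = [os.path.join(AUDIO_DIR, f) for f in os.listdir(AUDIO_DIR)]
        return gr.Dropdown.update(choices=files)

    with gr.Blocks() as demo:
        record_button = gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
        input_audio0 = gr.Dropdown(label="2.Choose the audio file.", choices=[])
        record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
        record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])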
@@ -1835,11 +1840,6 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
 
             with gr.Column():
                 target_transcript = gr.Textbox(label="target transcript")
-                output_audio_con = gr.Audio(label="Output Audio concatenated")
-                output_audio_gen = gr.Audio(label="Output Audio generated")
-                cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
-                run_btn = gr.Button(value="run")
-                run_btn_joint = gr.Button(value="run with RVC")
 
             # transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
             #                      outputs=[transcribed_text])
@@ -1847,8 +1847,16 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
 
 
             with gr.Column():
+
+                output_audio_gen = gr.Audio(
+                    label="Output Audio generated",
+                    type='filepath',
+                    interactive=False
+                )
+
+
                 vc_output2 = gr.Audio(
-                    label="
+                    label="Voice converted! (Click on the three dots to download the audio)",
                     type='filepath',
                     interactive=False,
                 )
@@ -2003,50 +2011,51 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
         vc_output1 = gr.Textbox("")
         f0_file = gr.File(label="f0 file", visible=False)
 
-        run_btn.click(fn=run,
-                      inputs=[
-                          input_audio0,
-                          seed,
-                          stop_repitition,
-                          sample_batch_size,
-                          left_margin,
-                          right_margin,
-                          codecaudio_sr,
-                          codec_sr,
-                          top_k,
-                          top_p,
-                          temperature,
-                          kvcache,
-                          cutoff_value,
-                          target_transcript,
-                          silence_tokens,
-                          transcribed_text],
-                      outputs=[
-                          output_audio_con,
-                          output_audio_gen])
+        # run_btn.click(fn=run,
+        #               inputs=[
+        #                   input_audio0,
+        #                   seed,
+        #                   stop_repitition,
+        #                   sample_batch_size,
+        #                   left_margin,
+        #                   right_margin,
+        #                   codecaudio_sr,
+        #                   codec_sr,
+        #                   top_k,
+        #                   top_p,
+        #                   temperature,
+        #                   kvcache,
+        #                   cutoff_value,
+        #                   target_transcript,
+        #                   silence_tokens,
+        #                   transcribed_text],
+        #               outputs=[
+        #                   output_audio_con,
+        #                   output_audio_gen
+        #               ])
 
-        but0.click(
-            vc_single,
-            [
-                spk_item,
-                input_audio0,
-                vc_transform0,
-                f0_file,
-                f0method0,
-                file_index1,
-                # file_index2,
-                # file_big_npy1,
-                index_rate1,
-                filter_radius0,
-                resample_sr0,
-                rms_mix_rate0,
-                protect0,
-                crepe_hop_length
-            ],
-            [vc_output1, vc_output2],
-        )
+        # but0.click(
+        #     vc_single,
+        #     [
+        #         spk_item,
+        #         input_audio0,
+        #         vc_transform0,
+        #         f0_file,
+        #         f0method0,
+        #         file_index1,
+        #         # file_index2,
+        #         # file_big_npy1,
+        #         index_rate1,
+        #         filter_radius0,
+        #         resample_sr0,
+        #         rms_mix_rate0,
+        #         protect0,
+        #         crepe_hop_length
+        #     ],
+        #     [vc_output1, vc_output2],
+        #     )
 
-        run_btn_joint.click(
+        but0.click(
             fn=run_joint,
             inputs=[
                 input_audio0,
@@ -2061,10 +2070,8 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
                 top_p,
                 temperature,
                 kvcache,
-                cutoff_value,
                 target_transcript,
                 silence_tokens,
-                transcribed_text,
                 spk_item,
                 vc_transform0,
                 f0_file,
@@ -2079,7 +2086,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
                 protect0,
                 crepe_hop_length
             ],
-            outputs=[vc_output1, vc_output2])
+            outputs=[vc_output1, output_audio_gen, vc_output2])
 
     with gr.Accordion("Batch Conversion",open=False, visible=False):
         with gr.Row():
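A consistency check that ties the hunks together: run_joint now returns three values on success (the info tuple, seg_save_fn_gen, and (tgt_sr, audio_opt)), matching the three components in outputs=[vc_output1, output_audio_gen, vc_output2]. Gradio maps a handler's return values onto outputs positionally, so the except branch, whose return this diff does not show, must keep the same arity. A sketch of the required shape, with return values illustrative:

    import traceback

    def run_joint_sketch():
        # Both paths must return as many values as there are output components.
        try:
            tgt_sr, audio_opt = 16000, None        # stand-ins for the real conversion result
            return "ok", "gen.wav", (tgt_sr, audio_opt)
        except Exception:
            info = traceback.format_exc()
            print(info)
            return info, None, None                # assumption: None clears the two Audio outputs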