3417543_models

Paused

App Files Files Community

ajayarora1235 committed on Apr 3, 2024

Commit

4be57d5

1 Parent(s): 396bb36

fix hubert last

Browse files

Files changed (4) hide show

.gitattributes +1 -1
app.py +67 -60
audios/happy demo.wav +0 -0
hubert_base_hf_statedict.pt +3 -0

.gitattributes CHANGED Viewed

@@ -2,4 +2,4 @@ ilariasuitewallpaper.jpg filter=lfs diff=lfs merge=lfs -text
 ilariaaisuite.png filter=lfs diff=lfs merge=lfs -text
 pretrained_models/giga330M.pth filter=lfs diff=lfs merge=lfs -text
 pretrained_models/encodec_4cb2048_giga.th filter=lfs diff=lfs merge=lfs -text
-hubert.pth filter=lfs diff=lfs merge=lfs -text

 ilariaaisuite.png filter=lfs diff=lfs merge=lfs -text
 pretrained_models/giga330M.pth filter=lfs diff=lfs merge=lfs -text
 pretrained_models/encodec_4cb2048_giga.th filter=lfs diff=lfs merge=lfs -text
+hubert_base_hf_statedict.pt filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -251,7 +251,7 @@ def load_hubert():
     configH= HubertConfig()
     configH.output_hidden_states = True
     hubert_model = HubertModel(configH)
-    hubert_model.load_state_dict(torch.load('hubert.pth'))
     # Prepare the model
     hubert_model = hubert_model.to(config.device)
     if config.is_half:
@@ -1779,83 +1779,50 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
                 but0 = gr.Button("Convert", variant="primary")
             with gr.Row():
                 with gr.Column():
-                    with gr.Row():
-                        dropbox = gr.File(label="Drag your audio file and click refresh.")
-                    with gr.Row():
-                        record_button=gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
                     with gr.Row():
                         input_audio0 = gr.Dropdown(
                             label="2.Choose the audio file.",
                             value="./audios/Test_Audio.mp3",
                             choices=audio_files
                             )
-                        dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
-                        dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
                         refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
-                        record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
-                        record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
                     with gr.Row():
-                        with gr.Column():
-                            input_audio = gr.Audio(label="Input Audio", type="filepath")
-                            # transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
-                            #                                 choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
-                            #                                 info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
-                            transcribed_text = gr.Textbox(label="transcibed text + mfa",
-                                                        info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
-                            transcribe_info_text = gr.TextArea(label="How to use",
-                                                            value="running everything for the first time will download necessary models (4GB for main encoder + model) \n load a voice and choose your whisper model, base works most of the time. \n transcription and mfa takes ~50s on a 3090 for a 7s audio clip, rerun this when uploading a new audio clip only\nchoose the END value of the cut off word \n")
-                            transcribe_btn = gr.Button(value="transcribe and create mfa")
-                            seed = gr.Number(label='seed', interactive=True, value=1)
-                            stop_repitition = gr.Radio(label="stop_repitition", interactive=True, choices=[1, 2, 3], value=3,
-                                                    info="if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1")
-                            sample_batch_size = gr.Radio(label="sample_batch_size", interactive=True, choices=[4, 3, 2], value=4,
-                                                        info="if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4")
-                            left_margin = gr.Number(label='left_margin', interactive=True, value=0.08, step=0.01,
-                                                    info=" not used for TTS, only for speech editing")
-                            right_margin = gr.Number(label='right_margin', interactive=True, value=0.08, step=0.01,
-                                                    info=" not used for TTS, only for speech editing")
-                            codecaudio_sr = gr.Number(label='codec_audio_sr', interactive=True, value=16000)
-                            codec_sr = gr.Number(label='codec', interactive=True, value=50)
-                            top_k = gr.Number(label='top_k', interactive=True, value=0)
-                            top_p = gr.Number(label='top_p', interactive=True, value=0.8)
-                            temperature = gr.Number(label='temperature', interactive=True, value=1)
-                            kvcache = gr.Number(label='kvcache', interactive=True, value=1,
-                                                info='set to 0 to use less VRAM, results may be worse and slower inference')
-                            silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]")
                         with gr.Column():
                             output_audio_con = gr.Audio(label="Output Audio concatenated")
                             output_audio_gen = gr.Audio(label="Output Audio generated")
                             cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
                             run_btn = gr.Button(value="run")
                             run_btn_joint = gr.Button(value="run with RVC")
-                            target_transcript = gr.Textbox(label="target transcript")
-                        transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
-                                             outputs=[transcribed_text])
-                        run_btn.click(fn=run,
-                                    inputs=[
-                                        seed,
-                                        stop_repitition,
-                                        sample_batch_size,
-                                        left_margin,
-                                        right_margin,
-                                        codecaudio_sr,
-                                        codec_sr,
-                                        top_k,
-                                        top_p,
-                                        temperature,
-                                        kvcache,
-                                        cutoff_value,
-                                        target_transcript,
-                                        silence_tokens,
-                                        transcribed_text],
-                                    outputs=[
-                                        output_audio_con,
-                                        output_audio_gen
-                                    ])
                 with gr.Column():
                     vc_output2 = gr.Audio(
@@ -1865,6 +1832,24 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
                     )
                 #with gr.Column():
                     with gr.Accordion("Index Settings", open=False):
                         #with gr.Row():
@@ -1995,6 +1980,28 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
             with gr.Row():
                 vc_output1 = gr.Textbox("")
                 f0_file = gr.File(label="f0 file", visible=False)
                 but0.click(
                     vc_single,

     configH= HubertConfig()
     configH.output_hidden_states = True
     hubert_model = HubertModel(configH)
+    hubert_model.load_state_dict(torch.load('hubert_base_hf_statedict.pt'))
     # Prepare the model
     hubert_model = hubert_model.to(config.device)
     if config.is_half:
                 but0 = gr.Button("Convert", variant="primary")
             with gr.Row():
                 with gr.Column():
+                    # with gr.Row():
+                    #     dropbox = gr.File(label="Drag your audio file and click refresh.")
+                    # with gr.Row():
+                    #     record_button=gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
                     with gr.Row():
                         input_audio0 = gr.Dropdown(
                             label="2.Choose the audio file.",
                             value="./audios/Test_Audio.mp3",
                             choices=audio_files
                             )
+                        # dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
+                        # dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
                         refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
+                        transcribed_text = gr.Textbox(label="transcibed text + mfa",
+                                                      value="The dogs sat at the door.",
+                                                      info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
+                        # record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
+                        # record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
                     with gr.Row():
+                        # with gr.Column():
+                            # input_audio = gr.Audio(label="Input Audio", type="filepath")
+                            # # transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
+                            # #                                 choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
+                            # #                                 info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
+                            # transcribed_text = gr.Textbox(label="transcibed text + mfa",
+                            #                             info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
+                            # transcribe_info_text = gr.TextArea(label="How to use",
+                            #                                 value="running everything for the first time will download necessary models (4GB for main encoder + model) \n load a voice and choose your whisper model, base works most of the time. \n transcription and mfa takes ~50s on a 3090 for a 7s audio clip, rerun this when uploading a new audio clip only\nchoose the END value of the cut off word \n")
+                            # transcribe_btn = gr.Button(value="transcribe and create mfa")
                         with gr.Column():
+                            target_transcript = gr.Textbox(label="target transcript")
                             output_audio_con = gr.Audio(label="Output Audio concatenated")
                             output_audio_gen = gr.Audio(label="Output Audio generated")
                             cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
                             run_btn = gr.Button(value="run")
                             run_btn_joint = gr.Button(value="run with RVC")
+                        # transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
+                        #                      outputs=[transcribed_text])
                 with gr.Column():
                     vc_output2 = gr.Audio(
                     )
                 #with gr.Column():
+                    with gr.Accordion("Advanced TTS Settings", open=False):
+                        seed = gr.Number(label='seed', interactive=True, value=1)
+                        stop_repitition = gr.Radio(label="stop_repitition", interactive=True, choices=[1, 2, 3], value=3,
+                                                info="if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1")
+                        sample_batch_size = gr.Radio(label="sample_batch_size", interactive=True, choices=[4, 3, 2], value=4,
+                                                    info="if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4")
+                        left_margin = gr.Number(label='left_margin', interactive=True, value=0.08, step=0.01,
+                                                info=" not used for TTS, only for speech editing")
+                        right_margin = gr.Number(label='right_margin', interactive=True, value=0.08, step=0.01,
+                                                info=" not used for TTS, only for speech editing")
+                        codecaudio_sr = gr.Number(label='codec_audio_sr', interactive=True, value=16000)
+                        codec_sr = gr.Number(label='codec', interactive=True, value=50)
+                        top_k = gr.Number(label='top_k', interactive=True, value=0)
+                        top_p = gr.Number(label='top_p', interactive=True, value=0.8)
+                        temperature = gr.Number(label='temperature', interactive=True, value=1)
+                        kvcache = gr.Number(label='kvcache', interactive=True, value=1,
+                                            info='set to 0 to use less VRAM, results may be worse and slower inference')
+                        silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]")
                     with gr.Accordion("Index Settings", open=False):
                         #with gr.Row():
             with gr.Row():
                 vc_output1 = gr.Textbox("")
                 f0_file = gr.File(label="f0 file", visible=False)
+                run_btn.click(fn=run,
+                            inputs=[
+                                seed,
+                                stop_repitition,
+                                sample_batch_size,
+                                left_margin,
+                                right_margin,
+                                codecaudio_sr,
+                                codec_sr,
+                                top_k,
+                                top_p,
+                                temperature,
+                                kvcache,
+                                cutoff_value,
+                                target_transcript,
+                                silence_tokens,
+                                transcribed_text],
+                            outputs=[
+                                output_audio_con,
+                                output_audio_gen
+                            ])
                 but0.click(
                     vc_single,

audios/happy demo.wav ADDED Viewed

Binary file (116 kB). View file

hubert_base_hf_statedict.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45005b220ba02f5491472e63bed8a4be2c8c22bf4ed27f983386f9279c5f506c
+size 377560144