ajayarora1235 committed
Commit 4be57d5 · 1 Parent(s): 396bb36

fix hubert last

Files changed (4)
  1. .gitattributes +1 -1
  2. app.py +67 -60
  3. audios/happy demo.wav +0 -0
  4. hubert_base_hf_statedict.pt +3 -0
.gitattributes CHANGED
@@ -2,4 +2,4 @@ ilariasuitewallpaper.jpg filter=lfs diff=lfs merge=lfs -text
  ilariaaisuite.png filter=lfs diff=lfs merge=lfs -text
  pretrained_models/giga330M.pth filter=lfs diff=lfs merge=lfs -text
  pretrained_models/encodec_4cb2048_giga.th filter=lfs diff=lfs merge=lfs -text
- hubert.pth filter=lfs diff=lfs merge=lfs -text
+ hubert_base_hf_statedict.pt filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -251,7 +251,7 @@ def load_hubert():
  configH= HubertConfig()
  configH.output_hidden_states = True
  hubert_model = HubertModel(configH)
- hubert_model.load_state_dict(torch.load('hubert.pth'))
+ hubert_model.load_state_dict(torch.load('hubert_base_hf_statedict.pt'))
  # Prepare the model
  hubert_model = hubert_model.to(config.device)
  if config.is_half:
@@ -1779,83 +1779,50 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
  but0 = gr.Button("Convert", variant="primary")
  with gr.Row():
  with gr.Column():
- with gr.Row():
- dropbox = gr.File(label="Drag your audio file and click refresh.")
- with gr.Row():
- record_button=gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
+ # with gr.Row():
+ # dropbox = gr.File(label="Drag your audio file and click refresh.")
+ # with gr.Row():
+ # record_button=gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
  with gr.Row():
  input_audio0 = gr.Dropdown(
  label="2.Choose the audio file.",
  value="./audios/Test_Audio.mp3",
  choices=audio_files
  )
- dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
- dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
+ # dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
+ # dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
  refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
- record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
- record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
+ transcribed_text = gr.Textbox(label="transcibed text + mfa",
+ value="The dogs sat at the door."
+ info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
+ # record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
+ # record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])

  with gr.Row():
- with gr.Column():
- input_audio = gr.Audio(label="Input Audio", type="filepath")
- # transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
- # choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
- # info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
- transcribed_text = gr.Textbox(label="transcibed text + mfa",
- info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
- transcribe_info_text = gr.TextArea(label="How to use",
- value="running everything for the first time will download necessary models (4GB for main encoder + model) \n load a voice and choose your whisper model, base works most of the time. \n transcription and mfa takes ~50s on a 3090 for a 7s audio clip, rerun this when uploading a new audio clip only\nchoose the END value of the cut off word \n")
- transcribe_btn = gr.Button(value="transcribe and create mfa")
- seed = gr.Number(label='seed', interactive=True, value=1)
- stop_repitition = gr.Radio(label="stop_repitition", interactive=True, choices=[1, 2, 3], value=3,
- info="if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1")
- sample_batch_size = gr.Radio(label="sample_batch_size", interactive=True, choices=[4, 3, 2], value=4,
- info="if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4")
- left_margin = gr.Number(label='left_margin', interactive=True, value=0.08, step=0.01,
- info=" not used for TTS, only for speech editing")
- right_margin = gr.Number(label='right_margin', interactive=True, value=0.08, step=0.01,
- info=" not used for TTS, only for speech editing")
- codecaudio_sr = gr.Number(label='codec_audio_sr', interactive=True, value=16000)
- codec_sr = gr.Number(label='codec', interactive=True, value=50)
- top_k = gr.Number(label='top_k', interactive=True, value=0)
- top_p = gr.Number(label='top_p', interactive=True, value=0.8)
- temperature = gr.Number(label='temperature', interactive=True, value=1)
- kvcache = gr.Number(label='kvcache', interactive=True, value=1,
- info='set to 0 to use less VRAM, results may be worse and slower inference')
- silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]")
+ # with gr.Column():
+ # input_audio = gr.Audio(label="Input Audio", type="filepath")
+ # # transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
+ # # choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
+ # # info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
+ # transcribed_text = gr.Textbox(label="transcibed text + mfa",
+ # info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
+ # transcribe_info_text = gr.TextArea(label="How to use",
+ # value="running everything for the first time will download necessary models (4GB for main encoder + model) \n load a voice and choose your whisper model, base works most of the time. \n transcription and mfa takes ~50s on a 3090 for a 7s audio clip, rerun this when uploading a new audio clip only\nchoose the END value of the cut off word \n")
+ # transcribe_btn = gr.Button(value="transcribe and create mfa")
+

  with gr.Column():
+ target_transcript = gr.Textbox(label="target transcript")
  output_audio_con = gr.Audio(label="Output Audio concatenated")
  output_audio_gen = gr.Audio(label="Output Audio generated")
  cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
  run_btn = gr.Button(value="run")
  run_btn_joint = gr.Button(value="run with RVC")
- target_transcript = gr.Textbox(label="target transcript")

- transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
- outputs=[transcribed_text])
+ # transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
+ # outputs=[transcribed_text])
+

- run_btn.click(fn=run,
- inputs=[
- seed,
- stop_repitition,
- sample_batch_size,
- left_margin,
- right_margin,
- codecaudio_sr,
- codec_sr,
- top_k,
- top_p,
- temperature,
- kvcache,
- cutoff_value,
- target_transcript,
- silence_tokens,
- transcribed_text],
- outputs=[
- output_audio_con,
- output_audio_gen
- ])

  with gr.Column():
  vc_output2 = gr.Audio(
@@ -1865,6 +1832,24 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
  )

  #with gr.Column():
+ with gr.Accordion("Advanced TTS Settings", open=False):
+ seed = gr.Number(label='seed', interactive=True, value=1)
+ stop_repitition = gr.Radio(label="stop_repitition", interactive=True, choices=[1, 2, 3], value=3,
+ info="if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1")
+ sample_batch_size = gr.Radio(label="sample_batch_size", interactive=True, choices=[4, 3, 2], value=4,
+ info="if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4")
+ left_margin = gr.Number(label='left_margin', interactive=True, value=0.08, step=0.01,
+ info=" not used for TTS, only for speech editing")
+ right_margin = gr.Number(label='right_margin', interactive=True, value=0.08, step=0.01,
+ info=" not used for TTS, only for speech editing")
+ codecaudio_sr = gr.Number(label='codec_audio_sr', interactive=True, value=16000)
+ codec_sr = gr.Number(label='codec', interactive=True, value=50)
+ top_k = gr.Number(label='top_k', interactive=True, value=0)
+ top_p = gr.Number(label='top_p', interactive=True, value=0.8)
+ temperature = gr.Number(label='temperature', interactive=True, value=1)
+ kvcache = gr.Number(label='kvcache', interactive=True, value=1,
+ info='set to 0 to use less VRAM, results may be worse and slower inference')
+ silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]")
  with gr.Accordion("Index Settings", open=False):
  #with gr.Row():

@@ -1995,6 +1980,28 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
  with gr.Row():
  vc_output1 = gr.Textbox("")
  f0_file = gr.File(label="f0 file", visible=False)
+
+ run_btn.click(fn=run,
+ inputs=[
+ seed,
+ stop_repitition,
+ sample_batch_size,
+ left_margin,
+ right_margin,
+ codecaudio_sr,
+ codec_sr,
+ top_k,
+ top_p,
+ temperature,
+ kvcache,
+ cutoff_value,
+ target_transcript,
+ silence_tokens,
+ transcribed_text],
+ outputs=[
+ output_audio_con,
+ output_audio_gen
+ ])

  but0.click(
  vc_single,
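
Note: the substantive change is at app.py line 254, which now loads the HuBERT encoder from the plain state dict added in this commit (hubert_base_hf_statedict.pt) instead of hubert.pth. A minimal sketch of that load path, assuming the transformers HubertConfig/HubertModel API already used in app.py; the device and is_half parameters below are illustrative stand-ins for the app's config object, not the real signature:

import torch
from transformers import HubertConfig, HubertModel

def load_hubert(device: str = "cuda", is_half: bool = False) -> HubertModel:
    # Base HuBERT config with hidden states exposed, mirroring app.py
    configH = HubertConfig()
    configH.output_hidden_states = True
    hubert_model = HubertModel(configH)
    # Load the plain state dict shipped with this commit
    hubert_model.load_state_dict(torch.load("hubert_base_hf_statedict.pt", map_location="cpu"))
    hubert_model = hubert_model.to(device)
    # Optional half precision, matching the is_half branch in app.py
    hubert_model = hubert_model.half() if is_half else hubert_model.float()
    return hubert_model.eval()  # eval() is an assumption; app.py uses the model for feature extraction
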
audios/happy demo.wav ADDED
Binary file (116 kB).
 
hubert_base_hf_statedict.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:45005b220ba02f5491472e63bed8a4be2c8c22bf4ed27f983386f9279c5f506c
+ size 377560144
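
The added hubert_base_hf_statedict.pt is a Git LFS pointer for a ~377 MB file, roughly what a base-sized HuBERT encoder in fp32 occupies (consistent with the .gitattributes change above). A hedged sketch of how such a state dict could be regenerated; the source checkpoint "facebook/hubert-base-ls960" is an assumption, since the commit does not say which weights were exported:

import torch
from transformers import HubertModel

# Assumed source checkpoint; substitute whichever HuBERT weights app.py actually expects
model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
torch.save(model.state_dict(), "hubert_base_hf_statedict.pt")
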