ajayarora1235
committed on
Commit
·
4be57d5
1
Parent(s):
396bb36
fix hubert last
Browse files- .gitattributes +1 -1
- app.py +67 -60
- audios/happy demo.wav +0 -0
- hubert_base_hf_statedict.pt +3 -0
.gitattributes
CHANGED
@@ -2,4 +2,4 @@ ilariasuitewallpaper.jpg filter=lfs diff=lfs merge=lfs -text
|
|
2 |
ilariaaisuite.png filter=lfs diff=lfs merge=lfs -text
|
3 |
pretrained_models/giga330M.pth filter=lfs diff=lfs merge=lfs -text
|
4 |
pretrained_models/encodec_4cb2048_giga.th filter=lfs diff=lfs merge=lfs -text
|
5 |
-
|
|
|
2 |
ilariaaisuite.png filter=lfs diff=lfs merge=lfs -text
|
3 |
pretrained_models/giga330M.pth filter=lfs diff=lfs merge=lfs -text
|
4 |
pretrained_models/encodec_4cb2048_giga.th filter=lfs diff=lfs merge=lfs -text
|
5 |
+
hubert_base_hf_statedict.pt filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -251,7 +251,7 @@ def load_hubert():
|
|
251 |
configH= HubertConfig()
|
252 |
configH.output_hidden_states = True
|
253 |
hubert_model = HubertModel(configH)
|
254 |
-
hubert_model.load_state_dict(torch.load('
|
255 |
# Prepare the model
|
256 |
hubert_model = hubert_model.to(config.device)
|
257 |
if config.is_half:
|
@@ -1779,83 +1779,50 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
|
|
1779 |
but0 = gr.Button("Convert", variant="primary")
|
1780 |
with gr.Row():
|
1781 |
with gr.Column():
|
1782 |
-
with gr.Row():
|
1783 |
-
|
1784 |
-
with gr.Row():
|
1785 |
-
|
1786 |
with gr.Row():
|
1787 |
input_audio0 = gr.Dropdown(
|
1788 |
label="2.Choose the audio file.",
|
1789 |
value="./audios/Test_Audio.mp3",
|
1790 |
choices=audio_files
|
1791 |
)
|
1792 |
-
dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
|
1793 |
-
dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
|
1794 |
refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
|
1795 |
-
|
1796 |
-
|
|
|
|
|
|
|
1797 |
|
1798 |
with gr.Row():
|
1799 |
-
with gr.Column():
|
1800 |
-
input_audio = gr.Audio(label="Input Audio", type="filepath")
|
1801 |
-
# transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
|
1802 |
-
# choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
|
1803 |
-
# info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
|
1804 |
-
transcribed_text = gr.Textbox(label="transcibed text + mfa",
|
1805 |
-
|
1806 |
-
transcribe_info_text = gr.TextArea(label="How to use",
|
1807 |
-
|
1808 |
-
transcribe_btn = gr.Button(value="transcribe and create mfa")
|
1809 |
-
|
1810 |
-
stop_repitition = gr.Radio(label="stop_repitition", interactive=True, choices=[1, 2, 3], value=3,
|
1811 |
-
info="if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1")
|
1812 |
-
sample_batch_size = gr.Radio(label="sample_batch_size", interactive=True, choices=[4, 3, 2], value=4,
|
1813 |
-
info="if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4")
|
1814 |
-
left_margin = gr.Number(label='left_margin', interactive=True, value=0.08, step=0.01,
|
1815 |
-
info=" not used for TTS, only for speech editing")
|
1816 |
-
right_margin = gr.Number(label='right_margin', interactive=True, value=0.08, step=0.01,
|
1817 |
-
info=" not used for TTS, only for speech editing")
|
1818 |
-
codecaudio_sr = gr.Number(label='codec_audio_sr', interactive=True, value=16000)
|
1819 |
-
codec_sr = gr.Number(label='codec', interactive=True, value=50)
|
1820 |
-
top_k = gr.Number(label='top_k', interactive=True, value=0)
|
1821 |
-
top_p = gr.Number(label='top_p', interactive=True, value=0.8)
|
1822 |
-
temperature = gr.Number(label='temperature', interactive=True, value=1)
|
1823 |
-
kvcache = gr.Number(label='kvcache', interactive=True, value=1,
|
1824 |
-
info='set to 0 to use less VRAM, results may be worse and slower inference')
|
1825 |
-
silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]")
|
1826 |
|
1827 |
with gr.Column():
|
|
|
1828 |
output_audio_con = gr.Audio(label="Output Audio concatenated")
|
1829 |
output_audio_gen = gr.Audio(label="Output Audio generated")
|
1830 |
cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
|
1831 |
run_btn = gr.Button(value="run")
|
1832 |
run_btn_joint = gr.Button(value="run with RVC")
|
1833 |
-
target_transcript = gr.Textbox(label="target transcript")
|
1834 |
|
1835 |
-
transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
|
1836 |
-
|
|
|
1837 |
|
1838 |
-
run_btn.click(fn=run,
|
1839 |
-
inputs=[
|
1840 |
-
seed,
|
1841 |
-
stop_repitition,
|
1842 |
-
sample_batch_size,
|
1843 |
-
left_margin,
|
1844 |
-
right_margin,
|
1845 |
-
codecaudio_sr,
|
1846 |
-
codec_sr,
|
1847 |
-
top_k,
|
1848 |
-
top_p,
|
1849 |
-
temperature,
|
1850 |
-
kvcache,
|
1851 |
-
cutoff_value,
|
1852 |
-
target_transcript,
|
1853 |
-
silence_tokens,
|
1854 |
-
transcribed_text],
|
1855 |
-
outputs=[
|
1856 |
-
output_audio_con,
|
1857 |
-
output_audio_gen
|
1858 |
-
])
|
1859 |
|
1860 |
with gr.Column():
|
1861 |
vc_output2 = gr.Audio(
|
@@ -1865,6 +1832,24 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
|
|
1865 |
)
|
1866 |
|
1867 |
#with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1868 |
with gr.Accordion("Index Settings", open=False):
|
1869 |
#with gr.Row():
|
1870 |
|
@@ -1995,6 +1980,28 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
|
|
1995 |
with gr.Row():
|
1996 |
vc_output1 = gr.Textbox("")
|
1997 |
f0_file = gr.File(label="f0 file", visible=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1998 |
|
1999 |
but0.click(
|
2000 |
vc_single,
|
|
|
251 |
configH= HubertConfig()
|
252 |
configH.output_hidden_states = True
|
253 |
hubert_model = HubertModel(configH)
|
254 |
+
hubert_model.load_state_dict(torch.load('hubert_base_hf_statedict.pt'))
|
255 |
# Prepare the model
|
256 |
hubert_model = hubert_model.to(config.device)
|
257 |
if config.is_half:
|
|
|
1779 |
but0 = gr.Button("Convert", variant="primary")
|
1780 |
with gr.Row():
|
1781 |
with gr.Column():
|
1782 |
+
# with gr.Row():
|
1783 |
+
# dropbox = gr.File(label="Drag your audio file and click refresh.")
|
1784 |
+
# with gr.Row():
|
1785 |
+
# record_button=gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
|
1786 |
with gr.Row():
|
1787 |
input_audio0 = gr.Dropdown(
|
1788 |
label="2.Choose the audio file.",
|
1789 |
value="./audios/Test_Audio.mp3",
|
1790 |
choices=audio_files
|
1791 |
)
|
1792 |
+
# dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
|
1793 |
+
# dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
|
1794 |
refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
|
1795 |
+
transcribed_text = gr.Textbox(label="transcibed text + mfa",
|
1796 |
+
value="The dogs sat at the door."
|
1797 |
+
info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
|
1798 |
+
# record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
|
1799 |
+
# record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
|
1800 |
|
1801 |
with gr.Row():
|
1802 |
+
# with gr.Column():
|
1803 |
+
# input_audio = gr.Audio(label="Input Audio", type="filepath")
|
1804 |
+
# # transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
|
1805 |
+
# # choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
|
1806 |
+
# # info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
|
1807 |
+
# transcribed_text = gr.Textbox(label="transcibed text + mfa",
|
1808 |
+
# info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
|
1809 |
+
# transcribe_info_text = gr.TextArea(label="How to use",
|
1810 |
+
# value="running everything for the first time will download necessary models (4GB for main encoder + model) \n load a voice and choose your whisper model, base works most of the time. \n transcription and mfa takes ~50s on a 3090 for a 7s audio clip, rerun this when uploading a new audio clip only\nchoose the END value of the cut off word \n")
|
1811 |
+
# transcribe_btn = gr.Button(value="transcribe and create mfa")
|
1812 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1813 |
|
1814 |
with gr.Column():
|
1815 |
+
target_transcript = gr.Textbox(label="target transcript")
|
1816 |
output_audio_con = gr.Audio(label="Output Audio concatenated")
|
1817 |
output_audio_gen = gr.Audio(label="Output Audio generated")
|
1818 |
cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
|
1819 |
run_btn = gr.Button(value="run")
|
1820 |
run_btn_joint = gr.Button(value="run with RVC")
|
|
|
1821 |
|
1822 |
+
# transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
|
1823 |
+
# outputs=[transcribed_text])
|
1824 |
+
|
1825 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1826 |
|
1827 |
with gr.Column():
|
1828 |
vc_output2 = gr.Audio(
|
|
|
1832 |
)
|
1833 |
|
1834 |
#with gr.Column():
|
1835 |
+
with gr.Accordion("Advanced TTS Settings", open=False):
|
1836 |
+
seed = gr.Number(label='seed', interactive=True, value=1)
|
1837 |
+
stop_repitition = gr.Radio(label="stop_repitition", interactive=True, choices=[1, 2, 3], value=3,
|
1838 |
+
info="if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1")
|
1839 |
+
sample_batch_size = gr.Radio(label="sample_batch_size", interactive=True, choices=[4, 3, 2], value=4,
|
1840 |
+
info="if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4")
|
1841 |
+
left_margin = gr.Number(label='left_margin', interactive=True, value=0.08, step=0.01,
|
1842 |
+
info=" not used for TTS, only for speech editing")
|
1843 |
+
right_margin = gr.Number(label='right_margin', interactive=True, value=0.08, step=0.01,
|
1844 |
+
info=" not used for TTS, only for speech editing")
|
1845 |
+
codecaudio_sr = gr.Number(label='codec_audio_sr', interactive=True, value=16000)
|
1846 |
+
codec_sr = gr.Number(label='codec', interactive=True, value=50)
|
1847 |
+
top_k = gr.Number(label='top_k', interactive=True, value=0)
|
1848 |
+
top_p = gr.Number(label='top_p', interactive=True, value=0.8)
|
1849 |
+
temperature = gr.Number(label='temperature', interactive=True, value=1)
|
1850 |
+
kvcache = gr.Number(label='kvcache', interactive=True, value=1,
|
1851 |
+
info='set to 0 to use less VRAM, results may be worse and slower inference')
|
1852 |
+
silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]")
|
1853 |
with gr.Accordion("Index Settings", open=False):
|
1854 |
#with gr.Row():
|
1855 |
|
|
|
1980 |
with gr.Row():
|
1981 |
vc_output1 = gr.Textbox("")
|
1982 |
f0_file = gr.File(label="f0 file", visible=False)
|
1983 |
+
|
1984 |
+
run_btn.click(fn=run,
|
1985 |
+
inputs=[
|
1986 |
+
seed,
|
1987 |
+
stop_repitition,
|
1988 |
+
sample_batch_size,
|
1989 |
+
left_margin,
|
1990 |
+
right_margin,
|
1991 |
+
codecaudio_sr,
|
1992 |
+
codec_sr,
|
1993 |
+
top_k,
|
1994 |
+
top_p,
|
1995 |
+
temperature,
|
1996 |
+
kvcache,
|
1997 |
+
cutoff_value,
|
1998 |
+
target_transcript,
|
1999 |
+
silence_tokens,
|
2000 |
+
transcribed_text],
|
2001 |
+
outputs=[
|
2002 |
+
output_audio_con,
|
2003 |
+
output_audio_gen
|
2004 |
+
])
|
2005 |
|
2006 |
but0.click(
|
2007 |
vc_single,
|
audios/happy demo.wav
ADDED
Binary file (116 kB). View file
|
|
hubert_base_hf_statedict.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:45005b220ba02f5491472e63bed8a4be2c8c22bf4ed27f983386f9279c5f506c
|
3 |
+
size 377560144
|