Commit 4d7a16e · 1 parent: 6736ecf
final app for clas
app.py CHANGED
@@ -1398,7 +1398,7 @@ def download_from_url(url, model, associated_user=None):
     os.makedirs("unzips", exist_ok=True)
     zipfile = model + '.zip'
     zipfile_path = './zips/' + zipfile
-
+
     try:
         if "drive.google.com" in url or "drive.usercontent.google.com":
             subprocess.run(["gdown", url, "--fuzzy", "-O", zipfile_path])
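An aside for a future pass, since this hunk only touches whitespace: the `if` in the context lines carries a latent bug. The second operand, "drive.usercontent.google.com", is a bare non-empty string, so it is always truthy and the Google Drive branch runs for every URL. The check presumably intended is:

    if "drive.google.com" in url or "drive.usercontent.google.com" in url:
        subprocess.run(["gdown", url, "--fuzzy", "-O", zipfile_path])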
@@ -1483,7 +1483,8 @@ def transcribe_btn_click(audio_choice):
     transcript_fn = f"{temp_folder}/{filename}.txt"
     if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
         print("Audio and transcript already exist, skipping transcript")
-
+        transcript = open(transcript_fn, "r").read()
+        return transcript
 
     batch_size = 1 # Adjust based on your GPU memory availability
     compute_type = "float16"
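The two added lines turn the existing existence check into an early-return cache, so repeat runs skip WhisperX entirely. Note that `open(transcript_fn, "r").read()` leaves the file handle open until garbage collection; a context manager is the idiomatic equivalent. A minimal sketch of the same cache pattern, with the paths treated as hypothetical:

    import os

    def load_cached_transcript(audio_fn, transcript_fn):
        # Return the saved transcript if both files already exist, else None
        # to signal that transcription still has to run.
        if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
            print("Audio and transcript already exist, skipping transcript")
            with open(transcript_fn, "r") as f:
                return f.read()
        return None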
@@ -1523,24 +1524,24 @@ def transcribe_btn_click(audio_choice):
     return result
 
 
-def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
+def run(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
         temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text):
     global voicecraft_model, voicecraft_config, phn2num
 
-    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-    os.environ["USER"] = "USER"
-
     print("Transcribing the input audio")
-    transcribe_btn_click(input_audio_fn)
+    transcribed_text = transcribe_btn_click(input_audio_fn)
    print("Transcription complete")
 
+    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+    os.environ["USER"] = "USER"
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
     cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
     target_transcript = transcribed_text + target_transcript
     print(target_transcript)
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
+    print(f"Audio_fn num frames: {info.num_frames}, sample rate: {info.sample_rate}")
 
     print("audio dur s is", audio_dur, "cutoff_sec is", cut_off_sec)
     assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
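This hunk fixes a real defect: the old run() referenced input_audio_fn without accepting it as a parameter, and discarded the return value of transcribe_btn_click(), so transcribed_text always came from the UI textbox. The added print also exposes the duration arithmetic: torchaudio.info reads metadata without decoding the audio, and duration is frames divided by sample rate. A worked sketch of the check run() performs, with path and numbers illustrative:

    import torchaudio

    info = torchaudio.info("demo/temp/example.wav")         # hypothetical path
    audio_dur = info.num_frames / info.sample_rate          # e.g. 160000 / 16000 = 10.0 s
    cut_off_sec = 3.01                                      # per the forced-alignment comment
    assert cut_off_sec < audio_dur
    prompt_end_frame = int(cut_off_sec * info.sample_rate)  # 48160 frames of voice prompt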
@@ -1584,7 +1585,7 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
     return [seg_save_fn_concat, seg_save_fn_gen]
 
 def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
-              temperature, kvcache,
+              temperature, kvcache, target_transcript, silence_tokens,
               sid,
               f0_up_key,
               f0_file,
@@ -1601,19 +1602,20 @@ def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_mar
     global voicecraft_model, voicecraft_config, phn2num
 
     print("Transcribing the input audio")
-    transcribe_btn_click(input_audio_fn)
-    print("Transcription complete")
+    transcribed_text = transcribe_btn_click(input_audio_fn)
+    print("Transcription complete", transcribed_text)
 
     os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
     os.environ["CUDA_VISIBLE_DEVICES"] = "0"
     os.environ["USER"] = "USER"
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
-    cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
+    # cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
 
     target_transcript = transcribed_text + ' ' + target_transcript
     print(target_transcript)
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
+    cut_off_sec = audio_dur - 0.1
 
     assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
     prompt_end_frame = int(cut_off_sec * info.sample_rate)
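Two behavioral changes hide in this hunk: transcribed_text now comes from the fresh WhisperX pass rather than the UI textbox, and the UI-supplied cutoff_value is commented out in favor of audio_dur - 0.1. That default means everything except the last 100 ms of the input serves as the voice prompt, and the assert below can no longer fire. With illustrative numbers:

    # Effect of the new default in run_joint() (values illustrative, 16 kHz assumed):
    audio_dur = 5.0                                # from torchaudio.info
    cut_off_sec = audio_dur - 0.1                  # 4.9 s: all but the final 100 ms is prompt
    prompt_end_frame = int(cut_off_sec * 16000)    # 78400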
@@ -1711,7 +1713,7 @@ def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_mar
             times[0],
             times[1],
             times[2],
-        ), (tgt_sr, audio_opt)
+        ), seg_save_fn_gen, (tgt_sr, audio_opt)
     except:
         info = traceback.format_exc()
         print(info)
@@ -1803,22 +1805,25 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
         with gr.Column():
             # with gr.Row():
             #     dropbox = gr.File(label="Drag your audio file and click refresh.")
-
-
+            with gr.Row():
+                record_button=gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
             with gr.Row():
                 input_audio0 = gr.Dropdown(
                    label="2.Choose the audio file.",
                    value="./audios/Test_Audio.mp3",
                    choices=audio_files
-                   )
+                )
+            audio_display = gr.Audio(value=input_audio0.value, label="Selected Audio File", type="filepath")
             # dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
             # dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
             refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
-            transcribed_text = gr.Textbox(label="transcibed text + mfa",
-                                          value="The dogs sat at the door.",
-                                          info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
-
-
+            # transcribed_text = gr.Textbox(label="transcibed text + mfa",
+            #                               value="The dogs sat at the door.",
+            #                               info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
+            record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
+            record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
+            # update audio_display
+            input_audio0.change(fn=lambda x: x, inputs=[input_audio0], outputs=[audio_display])
 
             with gr.Row():
                 # with gr.Column():
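The recording flow added here follows a common Gradio 3.x pattern: the microphone Audio component writes a temp file, one change handler copies it into the app's audio folder and selects it, and a second handler rebuilds the dropdown choices. A self-contained sketch under those assumptions; save_to_wav and change_choices2 stand in for the app's own helpers, whose bodies this diff does not show, and AUDIO_DIR is a guess at the scanned folder:

    import os, shutil
    import gradio as gr

    AUDIO_DIR = "./audios"  # assumption: the folder the dropdown is populated from

    def save_to_wav(tmp_path):
        # Copy the microphone recording into the audio folder and select it.
        if tmp_path is None:               # recording cleared; leave the dropdown alone
            return gr.Dropdown.update()
        dest = os.path.join(AUDIO_DIR, os.path.basename(tmp_path))
        shutil.copy(tmp_path, dest)
        return dest

    def change_choices2():
        # Re-scan the folder so the new recording appears in the dropdown.
        files = [os.path.join(AUDIO_DIR, f) for f in os.listdir(AUDIO_DIR)]
        return gr.Dropdown.update(choices=files)

    with gr.Blocks() as demo:
        record_button = gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
        input_audio0 = gr.Dropdown(label="2.Choose the audio file.", choices=[])
        record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
        record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])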
@@ -1835,11 +1840,6 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
 
             with gr.Column():
                 target_transcript = gr.Textbox(label="target transcript")
-                output_audio_con = gr.Audio(label="Output Audio concatenated")
-                output_audio_gen = gr.Audio(label="Output Audio generated")
-                cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
-                run_btn = gr.Button(value="run")
-                run_btn_joint = gr.Button(value="run with RVC")
 
             # transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
             #                      outputs=[transcribed_text])
@@ -1847,8 +1847,16 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
 
 
             with gr.Column():
+
+                output_audio_gen = gr.Audio(
+                    label="Output Audio generated",
+                    type='filepath',
+                    interactive=False
+                )
+
+
                 vc_output2 = gr.Audio(
-                    label="
+                    label="Voice converted! (Click on the three dots to download the audio)",
                     type='filepath',
                     interactive=False,
                 )
@@ -2003,50 +2011,51 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
         vc_output1 = gr.Textbox("")
         f0_file = gr.File(label="f0 file", visible=False)
 
-        run_btn.click(fn=run,
-                      inputs=[
-                          input_audio0,
-                          seed,
-                          stop_repitition,
-                          sample_batch_size,
-                          left_margin,
-                          right_margin,
-                          codecaudio_sr,
-                          codec_sr,
-                          top_k,
-                          top_p,
-                          temperature,
-                          kvcache,
-                          cutoff_value,
-                          target_transcript,
-                          silence_tokens,
-                          transcribed_text],
-                      outputs=[
-                          output_audio_con,
-                          output_audio_gen])
+        # run_btn.click(fn=run,
+        #               inputs=[
+        #                   input_audio0,
+        #                   seed,
+        #                   stop_repitition,
+        #                   sample_batch_size,
+        #                   left_margin,
+        #                   right_margin,
+        #                   codecaudio_sr,
+        #                   codec_sr,
+        #                   top_k,
+        #                   top_p,
+        #                   temperature,
+        #                   kvcache,
+        #                   cutoff_value,
+        #                   target_transcript,
+        #                   silence_tokens,
+        #                   transcribed_text],
+        #               outputs=[
+        #                   output_audio_con,
+        #                   output_audio_gen
+        #               ])
 
-        but0.click(
-            vc_single,
-            [
-                spk_item,
-                input_audio0,
-                vc_transform0,
-                f0_file,
-                f0method0,
-                file_index1,
-                # file_index2,
-                # file_big_npy1,
-                index_rate1,
-                filter_radius0,
-                resample_sr0,
-                rms_mix_rate0,
-                protect0,
-                crepe_hop_length
-            ],
-            [vc_output1, vc_output2],
-        )
+        # but0.click(
+        #     vc_single,
+        #     [
+        #         spk_item,
+        #         input_audio0,
+        #         vc_transform0,
+        #         f0_file,
+        #         f0method0,
+        #         file_index1,
+        #         # file_index2,
+        #         # file_big_npy1,
+        #         index_rate1,
+        #         filter_radius0,
+        #         resample_sr0,
+        #         rms_mix_rate0,
+        #         protect0,
+        #         crepe_hop_length
+        #     ],
+        #     [vc_output1, vc_output2],
+        #     )
 
-        run_btn_joint.click(
+        but0.click(
             fn=run_joint,
             inputs=[
                 input_audio0,
@@ -2061,10 +2070,8 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
                 top_p,
                 temperature,
                 kvcache,
-                cutoff_value,
                 target_transcript,
                 silence_tokens,
-                transcribed_text,
                 spk_item,
                 vc_transform0,
                 f0_file,
@@ -2079,7 +2086,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
                 protect0,
                 crepe_hop_length
             ],
-            outputs=[vc_output1, vc_output2])
+            outputs=[vc_output1, output_audio_gen, vc_output2])
 
     with gr.Accordion("Batch Conversion",open=False, visible=False):
         with gr.Row():
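A consistency check that ties the hunks together: run_joint now returns three values on success (the info tuple, seg_save_fn_gen, and (tgt_sr, audio_opt)), matching the three components in outputs=[vc_output1, output_audio_gen, vc_output2]. Gradio maps a handler's return values onto outputs positionally, so the except branch, whose return this diff does not show, must keep the same arity. A sketch of the required shape, with return values illustrative:

    import traceback

    def run_joint_sketch():
        # Both paths must return as many values as there are output components.
        try:
            tgt_sr, audio_opt = 16000, None        # stand-ins for the real conversion result
            return "ok", "gen.wav", (tgt_sr, audio_opt)
        except Exception:
            info = traceback.format_exc()
            print(info)
            return info, None, None                # assumption: None clears the two Audio outputs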