ajayarora1235 committed on
Commit 4d7a16e · Parent: 6736ecf

final app for clas

Files changed (1): app.py (+79 -72)
app.py CHANGED
@@ -1398,7 +1398,7 @@ def download_from_url(url, model, associated_user=None):
     os.makedirs("unzips", exist_ok=True)
     zipfile = model + '.zip'
     zipfile_path = './zips/' + zipfile
-    return
+
     try:
         if "drive.google.com" in url or "drive.usercontent.google.com":
             subprocess.run(["gdown", url, "--fuzzy", "-O", zipfile_path])
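Note on an unchanged context line above: `if "drive.google.com" in url or "drive.usercontent.google.com":` always takes the branch, because the second operand is a bare non-empty string rather than a membership test. A minimal sketch of the presumably intended check:

```python
def is_google_drive_url(url: str) -> bool:
    # Both operands are membership tests here; the diff's second operand
    # ("drive.usercontent.google.com" on its own) is always truthy.
    return "drive.google.com" in url or "drive.usercontent.google.com" in url
```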
@@ -1483,7 +1483,8 @@ def transcribe_btn_click(audio_choice):
     transcript_fn = f"{temp_folder}/{filename}.txt"
     if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
         print("Audio and transcript already exist, skipping transcript")
-        return
+        transcript = open(transcript_fn, "r").read()
+        return transcript

     batch_size = 1 # Adjust based on your GPU memory availability
     compute_type = "float16"
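The new early return reads the cached transcript with a bare `open(...).read()`, leaving the file handle for the garbage collector to close. An equivalent sketch that closes it deterministically (same filename variable as the diff; the helper name is hypothetical):

```python
import os
from typing import Optional

def load_cached_transcript(transcript_fn: str) -> Optional[str]:
    # Return the cached transcript if present; None means "run Whisper".
    if not os.path.exists(transcript_fn):
        return None
    with open(transcript_fn, "r", encoding="utf-8") as f:  # closed even on error
        return f.read()
```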
@@ -1523,24 +1524,24 @@ def transcribe_btn_click(audio_choice):
     return result


-def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
+def run(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
         temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text):
     global voicecraft_model, voicecraft_config, phn2num

-    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-    os.environ["USER"] = "USER"
-
     print("Transcribing the input audio")
-    transcribe_btn_click(input_audio_fn)
+    transcribed_text = transcribe_btn_click(input_audio_fn)
     print("Transcription complete")

+    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+    os.environ["USER"] = "USER"
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
     cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
     target_transcript = transcribed_text + target_transcript
     print(target_transcript)
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
+    print(f"Audio_fn num frames: {info.num_frames}, sample rate: {info.sample_rate}")

     print("audio dur s is", audio_dur, "cutoff_sec is", cut_off_sec)
     assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
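This hunk moves the `CUDA_*` assignments from before the transcription call to after it. One caveat worth flagging: `CUDA_VISIBLE_DEVICES` is only read when the CUDA context is initialized, so with `torch` already imported and `voicecraft_model` loaded at module scope, these assignments are likely no-ops by this point. A sketch of the ordering that does take effect:

```python
import os

# Must be set before the first CUDA context is created, i.e. before
# importing torch (or at least before the first CUDA call / model load).
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch  # noqa: E402 -- deliberately imported after the env setup

device = "cuda" if torch.cuda.is_available() else "cpu"
```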
@@ -1584,7 +1585,7 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
     return [seg_save_fn_concat, seg_save_fn_gen]

 def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
-              temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text,
+              temperature, kvcache, target_transcript, silence_tokens,
               sid,
               f0_up_key,
               f0_file,
@@ -1601,19 +1602,20 @@ def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_mar
     global voicecraft_model, voicecraft_config, phn2num

     print("Transcribing the input audio")
-    transcribe_btn_click(input_audio_fn)
-    print("Transcription complete")
+    transcribed_text = transcribe_btn_click(input_audio_fn)
+    print("Transcription complete", transcribed_text)

     os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
     os.environ["CUDA_VISIBLE_DEVICES"] = "0"
     os.environ["USER"] = "USER"
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
-    cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
+    # cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio

     target_transcript = transcribed_text + ' ' + target_transcript
     print(target_transcript)
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
+    cut_off_sec = audio_dur - 0.1

     assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
     prompt_end_frame = int(cut_off_sec * info.sample_rate)
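The dropped `cut_off_sec = cutoff_value` and the new `cut_off_sec = audio_dur - 0.1` change what the prompt cutoff means: instead of a user-supplied forced-alignment boundary, the whole input (minus a 0.1 s margin) now serves as the voice prompt, and the assertion below can only fail for clips shorter than 0.1 s. A worked example with illustrative numbers:

```python
# Illustrative values, not from the app: a 3 s clip at 16 kHz.
num_frames, sample_rate = 48_000, 16_000
audio_dur = num_frames / sample_rate                # 3.0 s
cut_off_sec = audio_dur - 0.1                       # 2.9 s, always < audio_dur
prompt_end_frame = int(cut_off_sec * sample_rate)   # 46_400 frames
assert cut_off_sec < audio_dur
```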
@@ -1711,7 +1713,7 @@ def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_mar
             times[0],
             times[1],
             times[2],
-        ), (tgt_sr, audio_opt)
+        ), seg_save_fn_gen, (tgt_sr, audio_opt)
     except:
         info = traceback.format_exc()
         print(info)
@@ -1803,22 +1805,25 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
        with gr.Column():
            # with gr.Row():
            #     dropbox = gr.File(label="Drag your audio file and click refresh.")
-           # with gr.Row():
-           #     record_button=gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
+           with gr.Row():
+               record_button=gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
            with gr.Row():
                input_audio0 = gr.Dropdown(
                    label="2.Choose the audio file.",
                    value="./audios/Test_Audio.mp3",
                    choices=audio_files
-               )
+               )
+               audio_display = gr.Audio(value=input_audio0.value, label="Selected Audio File", type="filepath")
            # dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
            # dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
            refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
-           transcribed_text = gr.Textbox(label="transcibed text + mfa",
-                                         value="The dogs sat at the door.",
-                                         info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
-           # record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
-           # record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
+           # transcribed_text = gr.Textbox(label="transcibed text + mfa",
+           #                               value="The dogs sat at the door.",
+           #                               info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
+           record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
+           record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
+           # update audio_display
+           input_audio0.change(fn=lambda x: x, inputs=[input_audio0], outputs=[audio_display])

        with gr.Row():
            # with gr.Column():
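`gr.Audio(source="microphone", ...)` is the Gradio 3.x keyword (Gradio 4.x renamed it to `sources=["microphone"]`), so this hunk assumes a 3.x-era install. A self-contained sketch of the same record-then-select pattern; `save_to_wav` and `change_choices2` are the app's own helpers, so a stand-in is used here:

```python
import gradio as gr  # assumes gradio 3.x for the `source=` keyword

def save_recording(path):
    # Stand-in for the app's save_to_wav: return the recorded file path
    # so it becomes the dropdown's current value.
    return path

with gr.Blocks() as demo:
    record_button = gr.Audio(source="microphone", type="filepath",
                             label="Or you can use your microphone!")
    input_audio0 = gr.Dropdown(label="Choose the audio file.", choices=[])
    audio_display = gr.Audio(label="Selected Audio File", type="filepath")
    # Fires whenever a new recording lands in the microphone component.
    record_button.change(fn=save_recording, inputs=[record_button],
                         outputs=[input_audio0])
    # Mirror the dropdown selection into the preview player, as the diff does.
    input_audio0.change(fn=lambda x: x, inputs=[input_audio0],
                        outputs=[audio_display])

demo.launch()
```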
@@ -1835,11 +1840,6 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")

                with gr.Column():
                    target_transcript = gr.Textbox(label="target transcript")
-                   output_audio_con = gr.Audio(label="Output Audio concatenated")
-                   output_audio_gen = gr.Audio(label="Output Audio generated")
-                   cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
-                   run_btn = gr.Button(value="run")
-                   run_btn_joint = gr.Button(value="run with RVC")

                # transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
                #                      outputs=[transcribed_text])
@@ -1847,8 +1847,16 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")


                with gr.Column():
+
+                   output_audio_gen = gr.Audio(
+                       label="Output Audio generated",
+                       type='filepath',
+                       interactive=False
+                   )
+
+
                    vc_output2 = gr.Audio(
-                       label="Final Result! (Click on the three dots to download the audio)",
+                       label="Voice converted! (Click on the three dots to download the audio)",
                        type='filepath',
                        interactive=False,
                    )
@@ -2003,50 +2011,51 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
            vc_output1 = gr.Textbox("")
            f0_file = gr.File(label="f0 file", visible=False)

-           run_btn.click(fn=run,
-                         inputs=[
-                             seed,
-                             stop_repitition,
-                             sample_batch_size,
-                             left_margin,
-                             right_margin,
-                             codecaudio_sr,
-                             codec_sr,
-                             top_k,
-                             top_p,
-                             temperature,
-                             kvcache,
-                             cutoff_value,
-                             target_transcript,
-                             silence_tokens,
-                             transcribed_text],
-                         outputs=[
-                             output_audio_con,
-                             output_audio_gen
-                         ])
+           # run_btn.click(fn=run,
+           #               inputs=[
+           #                   input_audio0,
+           #                   seed,
+           #                   stop_repitition,
+           #                   sample_batch_size,
+           #                   left_margin,
+           #                   right_margin,
+           #                   codecaudio_sr,
+           #                   codec_sr,
+           #                   top_k,
+           #                   top_p,
+           #                   temperature,
+           #                   kvcache,
+           #                   cutoff_value,
+           #                   target_transcript,
+           #                   silence_tokens,
+           #                   transcribed_text],
+           #               outputs=[
+           #                   output_audio_con,
+           #                   output_audio_gen
+           #               ])

-           but0.click(
-               vc_single,
-               [
-                   spk_item,
-                   input_audio0,
-                   vc_transform0,
-                   f0_file,
-                   f0method0,
-                   file_index1,
-                   # file_index2,
-                   # file_big_npy1,
-                   index_rate1,
-                   filter_radius0,
-                   resample_sr0,
-                   rms_mix_rate0,
-                   protect0,
-                   crepe_hop_length
-               ],
-               [vc_output1, vc_output2],
-           )
+           # but0.click(
+           #     vc_single,
+           #     [
+           #         spk_item,
+           #         input_audio0,
+           #         vc_transform0,
+           #         f0_file,
+           #         f0method0,
+           #         file_index1,
+           #         # file_index2,
+           #         # file_big_npy1,
+           #         index_rate1,
+           #         filter_radius0,
+           #         resample_sr0,
+           #         rms_mix_rate0,
+           #         protect0,
+           #         crepe_hop_length
+           #     ],
+           #     [vc_output1, vc_output2],
+           # )

-           run_btn_joint.click(
+           but0.click(
                fn=run_joint,
                inputs=[
                    input_audio0,
@@ -2061,10 +2070,8 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
                    top_p,
                    temperature,
                    kvcache,
-                   cutoff_value,
                    target_transcript,
                    silence_tokens,
-                   transcribed_text,
                    spk_item,
                    vc_transform0,
                    f0_file,
@@ -2079,7 +2086,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
                    protect0,
                    crepe_hop_length
                ],
-               outputs=[vc_output1, vc_output2])
+               outputs=[vc_output1, output_audio_gen, vc_output2])

            with gr.Accordion("Batch Conversion",open=False, visible=False):
                with gr.Row():
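The rewired `but0.click` now drives `run_joint`, whose `return ..., seg_save_fn_gen, (tgt_sr, audio_opt)` (see the 1711 hunk above) yields three values matching `outputs=[vc_output1, output_audio_gen, vc_output2]` one-to-one. A minimal sketch of the arity rule this relies on, with hypothetical components and return values:

```python
import gradio as gr

def fake_run_joint(text):
    # One return value per output component, in order.
    info = f"Success. {text}"      # -> vc_output1 (Textbox)
    generated = "generated.wav"    # -> output_audio_gen (Audio, filepath)
    converted = "converted.wav"    # -> vc_output2 (Audio, filepath)
    return info, generated, converted

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="target transcript")
    vc_output1 = gr.Textbox()
    output_audio_gen = gr.Audio(type="filepath", interactive=False)
    vc_output2 = gr.Audio(type="filepath", interactive=False)
    but0 = gr.Button("Convert")
    but0.click(fn=fake_run_joint, inputs=[prompt],
               outputs=[vc_output1, output_audio_gen, vc_output2])
```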
 