kevinwang676 committed on
Commit 3a8af5e · verified · 1 Parent(s): bec574b

Rename GPT_SoVITS/test.py to GPT_SoVITS/vc_webui.py

Files changed (1)
  1. GPT_SoVITS/{test.py → vc_webui.py} +288 -84
GPT_SoVITS/{test.py → vc_webui.py} RENAMED
@@ -27,13 +27,13 @@ try:
  import gradio.analytics as analytics
  analytics.version_check = lambda:None
  except:...
- version=model_version=os.environ.get("version","v2")
- pretrained_sovits_name=["GPT_SoVITS/pretrained_models/s2G488k.pth", "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth","GPT_SoVITS/pretrained_models/s2Gv3.pth"]
- pretrained_gpt_name=["GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt","GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1v3.ckpt"]
+ version=model_version="v3"
+ pretrained_sovits_name=["GPT_SoVITS/pretrained_models/s2Gv3.pth"]
+ pretrained_gpt_name=["GPT_SoVITS/pretrained_models/s1v3.ckpt"]


  _ =[[],[]]
- for i in range(3):
+ for i in range(1):
  if os.path.exists(pretrained_gpt_name[i]):_[0].append(pretrained_gpt_name[i])
  if os.path.exists(pretrained_sovits_name[i]):_[-1].append(pretrained_sovits_name[i])
  pretrained_gpt_name,pretrained_sovits_name = _
@@ -823,88 +823,292 @@ def html_left(text, label='p'):
  </div>"""


- with gr.Blocks(title="GPT-SoVITS WebUI") as app:
- gr.Markdown(
- value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.")
- )
- with gr.Group():
- gr.Markdown(html_center(i18n("模型切换"),'h3'))
- with gr.Row():
- GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True, scale=14)
- SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True, scale=14)
- refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary", scale=14)
- refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
- gr.Markdown(html_center(i18n("*请上传并填写参考信息"),'h3'))
+ import gradio as gr
+ import torch
+ import torch.nn.functional as F
+ import numpy as np
+ import torchaudio
+ import librosa
+
+ def get_code_from_wav(wav_path):
+ """Extract codes from input speech audio"""
+ wav16k, sr = librosa.load(wav_path, sr=16000)
+ wav16k = torch.from_numpy(wav16k)
+ if is_half:
+ wav16k = wav16k.half().to(device)
+ else:
+ wav16k = wav16k.to(device)
+
+ # Extract SSL features
+ ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)
+
+ # Extract latent codes from SSL features
+ codes = vq_model.extract_latent(ssl_content)
+
+ return codes
+
+ def vc_main(wav_path, text, language, prompt_wav, noise_scale=0.5, top_k=20, top_p=0.6, temperature=0.6, speed=1, sample_steps=8):
+ """
+ Voice Conversion function that supports both v2 and v3 model versions
+
+ Args:
+ wav_path: Path to source audio for conversion
+ text: Corresponding text for phoneme extraction
+ language: Language of the text
+ prompt_wav: Path to target/reference voice
+ noise_scale: Noise scale for v2 models
+ top_k, top_p, temperature: Parameters for v3 models
+ speed: Speed factor for audio playback
+ sample_steps: Number of sample steps for v3 models
+
+ Returns:
+ Sampling rate and converted audio
+ """
+ # Get language format
+ language = dict_language[language]
+
+ # Get phones from text
+ phones, word2ph, norm_text = clean_text_inf(text, language, version)
+
+ # Get reference audio spectrogram
+ refer = get_spepc(hps, prompt_wav).to(dtype).to(device)
+
+ # Get codes from source audio
+ source_codes = get_code_from_wav(wav_path)
+
+ if model_version != "v3":
+ # V1/V2 models voice conversion logic
+ ge = vq_model.ref_enc(refer) # [B, D, T/1]
+ quantized = vq_model.quantizer.decode(source_codes[None, None]) # [B, D, T]
+
+ # Interpolate if necessary for 25hz models
+ if hps.model.semantic_frame_rate == "25hz":
+ quantized = F.interpolate(
+ quantized, size=int(quantized.shape[-1] * 2), mode="nearest"
+ )
+
+ m_p, logs_p, y_mask = vq_model.enc_p(
+ quantized,
+ torch.LongTensor([quantized.shape[-1]]).to(device),
+ torch.LongTensor(phones).to(device).unsqueeze(0),
+ torch.LongTensor([len(phones)]).to(device),
+ ge
+ )
+
+ z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
+ z = vq_model.flow(z_p, y_mask, g=ge, reverse=True)
+ o = vq_model.dec((z * y_mask)[:, :, :], g=ge) # [B, D=1, T], torch.float32 (-1, 1)
+ audio = o.detach().cpu().numpy()[0, 0]
+
+ else:
+ # V3 model voice conversion logic
+ if model is None:
+ init_bigvgan()
+
+ # For v3 models, inspect shape and prepare correctly
+ # The shape problem is in how the codes are being passed to decode_encp
+
+ # The error is: "b n d -> b d n" expects 3D tensor but got 4D: [1, 1, 225, 768]
+ # This suggests source_codes may have a shape like [225, 768] or [1, 225, 768]
+
+ # Prepare the semantic tensor for v3, ensuring it has the correct shape
+ if source_codes.dim() == 3: # If [B, T, D]
+ semantic = source_codes
+ elif source_codes.dim() == 2: # If [T, D]
+ semantic = source_codes.unsqueeze(0) # Add batch dimension [1, T, D]
+ else:
+ # Handle unexpected shapes
+ raise ValueError(f"Unexpected source_codes shape: {source_codes.shape}")
+
+ # Prepare phoneme IDs
+ phoneme_ids = torch.LongTensor(phones).to(device).unsqueeze(0)
+
+ # Get reference audio features and global embedding
+ fea_ref, ge = vq_model.decode_encp(semantic, phoneme_ids, refer)
+
+ # Load and process reference audio
+ ref_audio, sr = torchaudio.load(prompt_wav)
+ ref_audio = ref_audio.to(device).float()
+ if ref_audio.shape[0] == 2: # Convert stereo to mono
+ ref_audio = ref_audio.mean(0).unsqueeze(0)
+ if sr != 24000:
+ ref_audio = resample(ref_audio, sr)
+
+ # Convert to mel spectrogram and normalize
+ mel2 = mel_fn(ref_audio.to(dtype))
+ mel2 = norm_spec(mel2)
+
+ # Adjust time dimensions
+ T_min = min(mel2.shape[2], fea_ref.shape[2])
+ mel2 = mel2[:, :, :T_min]
+ fea_ref = fea_ref[:, :, :T_min]
+
+ if T_min > 468:
+ mel2 = mel2[:, :, -468:]
+ fea_ref = fea_ref[:, :, -468:]
+ T_min = 468
+
+ # Process source audio features with phoneme conditioning
+ fea_todo, ge = vq_model.decode_encp(semantic, phoneme_ids, refer, ge)
+
+ # Process audio in chunks
+ chunk_len = 934 - T_min
+ cfm_resss = []
+ idx = 0
+
+ while True:
+ fea_todo_chunk = fea_todo[:, :, idx:idx + chunk_len]
+ if fea_todo_chunk.shape[-1] == 0:
+ break
+
+ idx += chunk_len
+ fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1)
+ cfm_res = vq_model.cfm.inference(
+ fea,
+ torch.LongTensor([fea.size(1)]).to(fea.device),
+ mel2,
+ sample_steps,
+ inference_cfg_rate=0
+ )
+
+ cfm_res = cfm_res[:, :, mel2.shape[2]:]
+ mel2 = cfm_res[:, :, -T_min:]
+ fea_ref = fea_todo_chunk[:, :, -T_min:]
+ cfm_resss.append(cfm_res)
+
+ # Concatenate results and convert to audio
+ cmf_res = torch.cat(cfm_resss, 2)
+ cmf_res = denorm_spec(cmf_res)
+
+ with torch.inference_mode():
+ wav_gen = model(cmf_res)
+ audio = wav_gen[0][0].cpu().detach().numpy()
+
+ # Normalize audio to prevent clipping
+ max_audio = np.abs(audio).max()
+ if max_audio > 1:
+ audio /= max_audio
+
+ sr = hps.data.sampling_rate if model_version != "v3" else 24000
+ return sr, (audio * 32768).astype(np.int16)
+
+ # Create and launch the standalone Gradio interface for voice conversion
+ def launch_vc_ui():
+ with gr.Blocks(title="GPT-SoVITS Voice Conversion") as vc_app:
+ gr.Markdown("# GPT-SoVITS Voice Conversion")
+ gr.Markdown(f"Current Model Version: {model_version}")
+
  with gr.Row():
- inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath", scale=13)
- with gr.Column(scale=13):
- ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式,使用了会报错。"), value=False, interactive=True, show_label=True,scale=1)
- gr.Markdown(html_left(i18n("使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。")))
- prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5,scale=1)
- with gr.Column(scale=14):
- prompt_language = gr.Dropdown(
- label=i18n("参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文"),
+ with gr.Column():
+ source_audio = gr.Audio(type="filepath", label="Source Audio (to be converted)")
+ text_input = gr.Textbox(label="Text content of the source audio")
+ language_input = gr.Dropdown(
+ choices=list(dict_language.keys()),
+ value=i18n("中文"),
+ label=i18n("语言 / Language")
  )
- inp_refs = gr.File(label=i18n("可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"),file_count="multiple")if model_version!="v3"else gr.File(label=i18n("可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"),file_count="multiple",visible=False)
- sample_steps = gr.Radio(label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),value=64,choices=[4,8,16,32,64],visible=True)if model_version=="v3"else gr.Radio(label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),value=8,choices=[4,8,16,32],visible=False)
- gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"),'h3'))
- with gr.Row():
- with gr.Column(scale=13):
- text = gr.Textbox(label=i18n("需要合成的文本"), value="", lines=26, max_lines=26)
- with gr.Column(scale=7):
- text_language = gr.Dropdown(
- label=i18n("需要合成的语种")+i18n(".限制范围越小判别效果越好。"), choices=list(dict_language.keys()), value=i18n("中文"), scale=1
- )
- how_to_cut = gr.Dropdown(
- label=i18n("怎么切"),
- choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ],
- value=i18n("凑四句一切"),
- interactive=True, scale=1
- )
- gr.Markdown(value=html_center(i18n("语速调整,高为更快")))
- if_freeze=gr.Checkbox(label=i18n("是否直接对上次合成结果调整语速和音色。防止随机性。"), value=False, interactive=True,show_label=True, scale=1)
- speed = gr.Slider(minimum=0.6,maximum=1.65,step=0.05,label=i18n("语速"),value=1,interactive=True, scale=1)
- gr.Markdown(html_center(i18n("GPT采样参数(无参考文本时不要太低。不懂就用默认):")))
- top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=15,interactive=True, scale=1)
- top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True, scale=1)
- temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True, scale=1)
- # with gr.Column():
- # gr.Markdown(value=i18n("手工调整音素。当音素框不为空时使用手工音素输入推理,无视目标文本框。"))
- # phoneme=gr.Textbox(label=i18n("音素框"), value="")
- # get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary")
- with gr.Row():
- inference_button = gr.Button(i18n("合成语音"), variant="primary", size='lg', scale=25)
- output = gr.Audio(label=i18n("输出的语音"), scale=14)
-
- inference_button.click(
- get_tts_wav,
- [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed,if_freeze,inp_refs,sample_steps],
- [output],
+ target_audio = gr.Audio(type="filepath", label="Target Voice (reference)")
+
+ with gr.Accordion("Advanced Settings", open=False):
+ with gr.Row():
+ speed = gr.Slider(
+ minimum=0.1, maximum=5, value=1, step=0.1,
+ label=i18n("语速 / Speed")
+ )
+
+ if model_version != "v3":
+ noise_scale = gr.Slider(
+ minimum=0.1, maximum=1.0, value=0.5, step=0.1,
+ label="Noise Scale (V2 models only)"
+ )
+ else:
+ noise_scale = gr.Slider(
+ minimum=0.1, maximum=1.0, value=0.5, step=0.1,
+ label="Noise Scale (ignored for V3)",
+ visible=False
+ )
+
+ if model_version == "v3":
+ sample_steps = gr.Slider(
+ minimum=1, maximum=30, value=8, step=1,
+ label=i18n("采样步数 / Sample Steps")
+ )
+ top_k = gr.Slider(
+ minimum=1, maximum=100, value=20, step=1,
+ label=i18n("Top K")
+ )
+ top_p = gr.Slider(
+ minimum=0.1, maximum=1.0, value=0.6, step=0.1,
+ label=i18n("Top P")
+ )
+ temperature = gr.Slider(
+ minimum=0.1, maximum=1.0, value=0.6, step=0.1,
+ label=i18n("Temperature")
+ )
+ else:
+ sample_steps = gr.Slider(
+ minimum=1, maximum=30, value=8, step=1,
+ label=i18n("采样步数 / Sample Steps"),
+ visible=False
+ )
+ top_k = gr.Slider(
+ minimum=1, maximum=100, value=20, step=1,
+ label=i18n("Top K"),
+ visible=False
+ )
+ top_p = gr.Slider(
+ minimum=0.1, maximum=1.0, value=0.6, step=0.1,
+ label=i18n("Top P"),
+ visible=False
+ )
+ temperature = gr.Slider(
+ minimum=0.1, maximum=1.0, value=0.6, step=0.1,
+ label=i18n("Temperature"),
+ visible=False
+ )
+
+ go_btn = gr.Button(i18n("开始转换 / Start Conversion"), variant="primary")
+
+ with gr.Column():
+ output_audio = gr.Audio(label=i18n("转换后的声音 / Converted Audio"))
+ status_output = gr.Markdown("Ready")
+
+ def process_vc(source_path, text, lang, target_path, noise, k, p, temp, spd, steps):
+ try:
+ if not source_path:
+ return None, "Error: Source audio is required"
+ if not target_path:
+ return None, "Error: Target audio is required"
+ if not text:
+ return None, "Error: Text content is required"
+
+ return vc_main(
+ source_path, text, lang, target_path,
+ noise_scale=noise,
+ top_k=k,
+ top_p=p,
+ temperature=temp,
+ speed=spd,
+ sample_steps=steps
+ ), "Conversion completed successfully"
+ except Exception as e:
+ import traceback
+ return None, f"Error: {str(e)}\n{traceback.format_exc()}"
+
+ go_btn.click(
+ fn=process_vc,
+ inputs=[
+ source_audio, text_input, language_input, target_audio,
+ noise_scale, top_k, top_p, temperature, speed, sample_steps
+ ],
+ outputs=[output_audio, status_output]
  )
- SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free])
- GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
-
- # gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
- # with gr.Row():
- # text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="")
- # button1 = gr.Button(i18n("凑四句一切"), variant="primary")
- # button2 = gr.Button(i18n("凑50字一切"), variant="primary")
- # button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
- # button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
- # button5 = gr.Button(i18n("按标点符号切"), variant="primary")
- # text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
- # button1.click(cut1, [text_inp], [text_opt])
- # button2.click(cut2, [text_inp], [text_opt])
- # button3.click(cut3, [text_inp], [text_opt])
- # button4.click(cut4, [text_inp], [text_opt])
- # button5.click(cut5, [text_inp], [text_opt])
- # gr.Markdown(html_center(i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")))
-
- if __name__ == '__main__':
- app.queue().launch(#concurrency_count=511, max_size=1022
- server_name="0.0.0.0",
- inbrowser=True,
+
+ # Launch the app with the infer_ttswebui port + 1 to avoid conflicts
+ vc_app.launch(
  share=True,
- server_port=8000,
- quiet=True,
  )
+
+ if __name__ == "__main__":
+ print(f"Launching Voice Conversion UI with model version: {model_version}")
+ launch_vc_ui()
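Run directly (python GPT_SoVITS/vc_webui.py), the renamed script prints the model version and calls launch_vc_ui(). The vc_main() function added in this diff can also be driven without the Gradio UI. Below is a minimal sketch, not part of the commit: it assumes the module loads its models at import time, that GPT_SoVITS is importable as a package from the repository root, and that the file paths and the soundfile dependency are placeholders.

import soundfile as sf                       # hypothetical writer for the result audio

from GPT_SoVITS import vc_webui              # module introduced by this rename (import path assumed)

# Convert source.wav so it is spoken in the voice of target_voice.wav.
# The language argument must be a key of dict_language; the i18n("中文") entry is used
# here because it is the default value in the UI above.
sr, converted = vc_webui.vc_main(
    wav_path="source.wav",                   # hypothetical source audio to convert
    text="你好,世界。",                       # text spoken in the source audio
    language="中文",
    prompt_wav="target_voice.wav",           # hypothetical reference (target) voice
    sample_steps=8,                          # v3 sampling steps, per the docstring above
)
sf.write("converted.wav", converted, sr)     # vc_main returns (sampling_rate, int16 audio)

As the final lines of vc_main() show, the returned audio is int16 PCM at 24 kHz for v3 models and at hps.data.sampling_rate otherwise.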