Rename GPT_SoVITS/test.py to GPT_SoVITS/vc_webui.py
GPT_SoVITS/{test.py → vc_webui.py} RENAMED (+288 -84)
@@ -27,13 +27,13 @@ try:
     import gradio.analytics as analytics
     analytics.version_check = lambda:None
 except:...
-version=model_version=
-pretrained_sovits_name=["GPT_SoVITS/pretrained_models/
-pretrained_gpt_name=["GPT_SoVITS/pretrained_models/
+version=model_version="v3"
+pretrained_sovits_name=["GPT_SoVITS/pretrained_models/s2Gv3.pth"]
+pretrained_gpt_name=["GPT_SoVITS/pretrained_models/s1v3.ckpt"]
 
 
 _ =[[],[]]
-for i in range(
+for i in range(1):
     if os.path.exists(pretrained_gpt_name[i]):_[0].append(pretrained_gpt_name[i])
     if os.path.exists(pretrained_sovits_name[i]):_[-1].append(pretrained_sovits_name[i])
 pretrained_gpt_name,pretrained_sovits_name = _
@@ -823,88 +823,292 @@ def html_left(text, label='p'):
 </div>"""
 
 
-        ...
+import gradio as gr
+import torch
+import torch.nn.functional as F
+import numpy as np
+import torchaudio
+import librosa
+
+def get_code_from_wav(wav_path):
+    """Extract codes from input speech audio"""
+    wav16k, sr = librosa.load(wav_path, sr=16000)
+    wav16k = torch.from_numpy(wav16k)
+    if is_half:
+        wav16k = wav16k.half().to(device)
+    else:
+        wav16k = wav16k.to(device)
+
+    # Extract SSL features
+    ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)
+
+    # Extract latent codes from SSL features
+    codes = vq_model.extract_latent(ssl_content)
+
+    return codes
+
+def vc_main(wav_path, text, language, prompt_wav, noise_scale=0.5, top_k=20, top_p=0.6, temperature=0.6, speed=1, sample_steps=8):
+    """
+    Voice Conversion function that supports both v2 and v3 model versions
+
+    Args:
+        wav_path: Path to source audio for conversion
+        text: Corresponding text for phoneme extraction
+        language: Language of the text
+        prompt_wav: Path to target/reference voice
+        noise_scale: Noise scale for v2 models
+        top_k, top_p, temperature: Parameters for v3 models
+        speed: Speed factor for audio playback
+        sample_steps: Number of sample steps for v3 models
+
+    Returns:
+        Sampling rate and converted audio
+    """
+    # Get language format
+    language = dict_language[language]
+
+    # Get phones from text
+    phones, word2ph, norm_text = clean_text_inf(text, language, version)
+
+    # Get reference audio spectrogram
+    refer = get_spepc(hps, prompt_wav).to(dtype).to(device)
+
+    # Get codes from source audio
+    source_codes = get_code_from_wav(wav_path)
+
+    if model_version != "v3":
+        # V1/V2 models voice conversion logic
+        ge = vq_model.ref_enc(refer)  # [B, D, T/1]
+        quantized = vq_model.quantizer.decode(source_codes[None, None])  # [B, D, T]
+
+        # Interpolate if necessary for 25hz models
+        if hps.model.semantic_frame_rate == "25hz":
+            quantized = F.interpolate(
+                quantized, size=int(quantized.shape[-1] * 2), mode="nearest"
+            )
+
+        m_p, logs_p, y_mask = vq_model.enc_p(
+            quantized,
+            torch.LongTensor([quantized.shape[-1]]).to(device),
+            torch.LongTensor(phones).to(device).unsqueeze(0),
+            torch.LongTensor([len(phones)]).to(device),
+            ge
+        )
+
+        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
+        z = vq_model.flow(z_p, y_mask, g=ge, reverse=True)
+        o = vq_model.dec((z * y_mask)[:, :, :], g=ge)  # [B, D=1, T], torch.float32 (-1, 1)
+        audio = o.detach().cpu().numpy()[0, 0]
+
+    else:
+        # V3 model voice conversion logic
+        if model is None:
+            init_bigvgan()
+
+        # For v3 models, inspect shape and prepare correctly
+        # The shape problem is in how the codes are being passed to decode_encp
+
+        # The error is: "b n d -> b d n" expects 3D tensor but got 4D: [1, 1, 225, 768]
+        # This suggests source_codes may have a shape like [225, 768] or [1, 225, 768]
+
+        # Prepare the semantic tensor for v3, ensuring it has the correct shape
+        if source_codes.dim() == 3:  # If [B, T, D]
+            semantic = source_codes
+        elif source_codes.dim() == 2:  # If [T, D]
+            semantic = source_codes.unsqueeze(0)  # Add batch dimension [1, T, D]
+        else:
+            # Handle unexpected shapes
+            raise ValueError(f"Unexpected source_codes shape: {source_codes.shape}")
+
+        # Prepare phoneme IDs
+        phoneme_ids = torch.LongTensor(phones).to(device).unsqueeze(0)
+
+        # Get reference audio features and global embedding
+        fea_ref, ge = vq_model.decode_encp(semantic, phoneme_ids, refer)
+
+        # Load and process reference audio
+        ref_audio, sr = torchaudio.load(prompt_wav)
+        ref_audio = ref_audio.to(device).float()
+        if ref_audio.shape[0] == 2:  # Convert stereo to mono
+            ref_audio = ref_audio.mean(0).unsqueeze(0)
+        if sr != 24000:
+            ref_audio = resample(ref_audio, sr)
+
+        # Convert to mel spectrogram and normalize
+        mel2 = mel_fn(ref_audio.to(dtype))
+        mel2 = norm_spec(mel2)
+
+        # Adjust time dimensions
+        T_min = min(mel2.shape[2], fea_ref.shape[2])
+        mel2 = mel2[:, :, :T_min]
+        fea_ref = fea_ref[:, :, :T_min]
+
+        if T_min > 468:
+            mel2 = mel2[:, :, -468:]
+            fea_ref = fea_ref[:, :, -468:]
+            T_min = 468
+
+        # Process source audio features with phoneme conditioning
+        fea_todo, ge = vq_model.decode_encp(semantic, phoneme_ids, refer, ge)
+
+        # Process audio in chunks
+        chunk_len = 934 - T_min
+        cfm_resss = []
+        idx = 0
+
+        while True:
+            fea_todo_chunk = fea_todo[:, :, idx:idx + chunk_len]
+            if fea_todo_chunk.shape[-1] == 0:
+                break
+
+            idx += chunk_len
+            fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1)
+            cfm_res = vq_model.cfm.inference(
+                fea,
+                torch.LongTensor([fea.size(1)]).to(fea.device),
+                mel2,
+                sample_steps,
+                inference_cfg_rate=0
+            )
+
+            cfm_res = cfm_res[:, :, mel2.shape[2]:]
+            mel2 = cfm_res[:, :, -T_min:]
+            fea_ref = fea_todo_chunk[:, :, -T_min:]
+            cfm_resss.append(cfm_res)
+
+        # Concatenate results and convert to audio
+        cmf_res = torch.cat(cfm_resss, 2)
+        cmf_res = denorm_spec(cmf_res)
+
+        with torch.inference_mode():
+            wav_gen = model(cmf_res)
+            audio = wav_gen[0][0].cpu().detach().numpy()
+
+    # Normalize audio to prevent clipping
+    max_audio = np.abs(audio).max()
+    if max_audio > 1:
+        audio /= max_audio
+
+    sr = hps.data.sampling_rate if model_version != "v3" else 24000
+    return sr, (audio * 32768).astype(np.int16)
+
+# Create and launch the standalone Gradio interface for voice conversion
+def launch_vc_ui():
+    with gr.Blocks(title="GPT-SoVITS Voice Conversion") as vc_app:
+        gr.Markdown("# GPT-SoVITS Voice Conversion")
+        gr.Markdown(f"Current Model Version: {model_version}")
+
         with gr.Row():
-            ...
-                gr.
-                    label=i18n("参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文"),
+            with gr.Column():
+                source_audio = gr.Audio(type="filepath", label="Source Audio (to be converted)")
+                text_input = gr.Textbox(label="Text content of the source audio")
+                language_input = gr.Dropdown(
+                    choices=list(dict_language.keys()),
+                    value=i18n("中文"),
+                    label=i18n("语言 / Language")
                 )
-                ...
+                target_audio = gr.Audio(type="filepath", label="Target Voice (reference)")
+
+                with gr.Accordion("Advanced Settings", open=False):
+                    with gr.Row():
+                        speed = gr.Slider(
+                            minimum=0.1, maximum=5, value=1, step=0.1,
+                            label=i18n("语速 / Speed")
+                        )
+
+                    if model_version != "v3":
+                        noise_scale = gr.Slider(
+                            minimum=0.1, maximum=1.0, value=0.5, step=0.1,
+                            label="Noise Scale (V2 models only)"
+                        )
+                    else:
+                        noise_scale = gr.Slider(
+                            minimum=0.1, maximum=1.0, value=0.5, step=0.1,
+                            label="Noise Scale (ignored for V3)",
+                            visible=False
+                        )
+
+                    if model_version == "v3":
+                        sample_steps = gr.Slider(
+                            minimum=1, maximum=30, value=8, step=1,
+                            label=i18n("采样步数 / Sample Steps")
+                        )
+                        top_k = gr.Slider(
+                            minimum=1, maximum=100, value=20, step=1,
+                            label=i18n("Top K")
+                        )
+                        top_p = gr.Slider(
+                            minimum=0.1, maximum=1.0, value=0.6, step=0.1,
+                            label=i18n("Top P")
+                        )
+                        temperature = gr.Slider(
+                            minimum=0.1, maximum=1.0, value=0.6, step=0.1,
+                            label=i18n("Temperature")
+                        )
+                    else:
+                        sample_steps = gr.Slider(
+                            minimum=1, maximum=30, value=8, step=1,
+                            label=i18n("采样步数 / Sample Steps"),
+                            visible=False
+                        )
+                        top_k = gr.Slider(
+                            minimum=1, maximum=100, value=20, step=1,
+                            label=i18n("Top K"),
+                            visible=False
+                        )
+                        top_p = gr.Slider(
+                            minimum=0.1, maximum=1.0, value=0.6, step=0.1,
+                            label=i18n("Top P"),
+                            visible=False
+                        )
+                        temperature = gr.Slider(
+                            minimum=0.1, maximum=1.0, value=0.6, step=0.1,
+                            label=i18n("Temperature"),
+                            visible=False
+                        )
+
+                go_btn = gr.Button(i18n("开始转换 / Start Conversion"), variant="primary")
+
+            with gr.Column():
+                output_audio = gr.Audio(label=i18n("转换后的声音 / Converted Audio"))
+                status_output = gr.Markdown("Ready")
+
+        def process_vc(source_path, text, lang, target_path, noise, k, p, temp, spd, steps):
+            try:
+                if not source_path:
+                    return None, "Error: Source audio is required"
+                if not target_path:
+                    return None, "Error: Target audio is required"
+                if not text:
+                    return None, "Error: Text content is required"
+
+                return vc_main(
+                    source_path, text, lang, target_path,
+                    noise_scale=noise,
+                    top_k=k,
+                    top_p=p,
+                    temperature=temp,
+                    speed=spd,
+                    sample_steps=steps
+                ), "Conversion completed successfully"
+            except Exception as e:
+                import traceback
+                return None, f"Error: {str(e)}\n{traceback.format_exc()}"
+
+        go_btn.click(
+            fn=process_vc,
+            inputs=[
+                source_audio, text_input, language_input, target_audio,
+                noise_scale, top_k, top_p, temperature, speed, sample_steps
+            ],
+            outputs=[output_audio, status_output]
         )
-        ...
-        # gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
-        # with gr.Row():
-        #     text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="")
-        #     button1 = gr.Button(i18n("凑四句一切"), variant="primary")
-        #     button2 = gr.Button(i18n("凑50字一切"), variant="primary")
-        #     button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
-        #     button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
-        #     button5 = gr.Button(i18n("按标点符号切"), variant="primary")
-        #     text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
-        #     button1.click(cut1, [text_inp], [text_opt])
-        #     button2.click(cut2, [text_inp], [text_opt])
-        #     button3.click(cut3, [text_inp], [text_opt])
-        #     button4.click(cut4, [text_inp], [text_opt])
-        #     button5.click(cut5, [text_inp], [text_opt])
-        # gr.Markdown(html_center(i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")))
-
-if __name__ == '__main__':
-    app.queue().launch(#concurrency_count=511, max_size=1022
-        server_name="0.0.0.0",
-        inbrowser=True,
+
+    # Launch the app with the infer_ttswebui port + 1 to avoid conflicts
+    vc_app.launch(
         share=True,
-        server_port=8000,
-        quiet=True,
     )
+
+if __name__ == "__main__":
+    print(f"Launching Voice Conversion UI with model version: {model_version}")
+    launch_vc_ui()