kevinwang676 committed on
Commit 3a8af5e · verified · 1 Parent(s): bec574b

Rename GPT_SoVITS/test.py to GPT_SoVITS/vc_webui.py

Files changed (1)
  1. GPT_SoVITS/{test.py → vc_webui.py} +288 -84
GPT_SoVITS/{test.py → vc_webui.py} RENAMED
@@ -27,13 +27,13 @@ try:
  import gradio.analytics as analytics
  analytics.version_check = lambda:None
  except:...
- version=model_version=os.environ.get("version","v2")
- pretrained_sovits_name=["GPT_SoVITS/pretrained_models/s2G488k.pth", "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth","GPT_SoVITS/pretrained_models/s2Gv3.pth"]
- pretrained_gpt_name=["GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt","GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1v3.ckpt"]
+ version=model_version="v3"
+ pretrained_sovits_name=["GPT_SoVITS/pretrained_models/s2Gv3.pth"]
+ pretrained_gpt_name=["GPT_SoVITS/pretrained_models/s1v3.ckpt"]


  _ =[[],[]]
- for i in range(3):
+ for i in range(1):
  if os.path.exists(pretrained_gpt_name[i]):_[0].append(pretrained_gpt_name[i])
  if os.path.exists(pretrained_sovits_name[i]):_[-1].append(pretrained_sovits_name[i])
  pretrained_gpt_name,pretrained_sovits_name = _
@@ -823,88 +823,292 @@ def html_left(text, label='p'):
  </div>"""


- with gr.Blocks(title="GPT-SoVITS WebUI") as app:
- gr.Markdown(
- value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.")
- )
- with gr.Group():
- gr.Markdown(html_center(i18n("模型切换"),'h3'))
- with gr.Row():
- GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True, scale=14)
- SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True, scale=14)
- refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary", scale=14)
- refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
- gr.Markdown(html_center(i18n("*请上传并填写参考信息"),'h3'))
+ import gradio as gr
+ import torch
+ import torch.nn.functional as F
+ import numpy as np
+ import torchaudio
+ import librosa
+
+ def get_code_from_wav(wav_path):
+ """Extract codes from input speech audio"""
+ wav16k, sr = librosa.load(wav_path, sr=16000)
+ wav16k = torch.from_numpy(wav16k)
+ if is_half:
+ wav16k = wav16k.half().to(device)
+ else:
+ wav16k = wav16k.to(device)
+
+ # Extract SSL features
+ ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)
+
+ # Extract latent codes from SSL features
+ codes = vq_model.extract_latent(ssl_content)
+
+ return codes
+
+ def vc_main(wav_path, text, language, prompt_wav, noise_scale=0.5, top_k=20, top_p=0.6, temperature=0.6, speed=1, sample_steps=8):
+ """
+ Voice Conversion function that supports both v2 and v3 model versions
+
+ Args:
+ wav_path: Path to source audio for conversion
+ text: Corresponding text for phoneme extraction
+ language: Language of the text
+ prompt_wav: Path to target/reference voice
+ noise_scale: Noise scale for v2 models
+ top_k, top_p, temperature: Parameters for v3 models
+ speed: Speed factor for audio playback
+ sample_steps: Number of sample steps for v3 models
+
+ Returns:
+ Sampling rate and converted audio
+ """
+ # Get language format
+ language = dict_language[language]
+
+ # Get phones from text
+ phones, word2ph, norm_text = clean_text_inf(text, language, version)
+
+ # Get reference audio spectrogram
+ refer = get_spepc(hps, prompt_wav).to(dtype).to(device)
+
+ # Get codes from source audio
+ source_codes = get_code_from_wav(wav_path)
+
+ if model_version != "v3":
+ # V1/V2 models voice conversion logic
+ ge = vq_model.ref_enc(refer) # [B, D, T/1]
+ quantized = vq_model.quantizer.decode(source_codes[None, None]) # [B, D, T]
+
+ # Interpolate if necessary for 25hz models
+ if hps.model.semantic_frame_rate == "25hz":
+ quantized = F.interpolate(
+ quantized, size=int(quantized.shape[-1] * 2), mode="nearest"
+ )
+
+ m_p, logs_p, y_mask = vq_model.enc_p(
+ quantized,
+ torch.LongTensor([quantized.shape[-1]]).to(device),
+ torch.LongTensor(phones).to(device).unsqueeze(0),
+ torch.LongTensor([len(phones)]).to(device),
+ ge
+ )
+
+ z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
+ z = vq_model.flow(z_p, y_mask, g=ge, reverse=True)
+ o = vq_model.dec((z * y_mask)[:, :, :], g=ge) # [B, D=1, T], torch.float32 (-1, 1)
+ audio = o.detach().cpu().numpy()[0, 0]
+
+ else:
+ # V3 model voice conversion logic
+ if model is None:
+ init_bigvgan()
+
+ # For v3 models, inspect shape and prepare correctly
+ # The shape problem is in how the codes are being passed to decode_encp
+
+ # The error is: "b n d -> b d n" expects 3D tensor but got 4D: [1, 1, 225, 768]
+ # This suggests source_codes may have a shape like [225, 768] or [1, 225, 768]
+
+ # Prepare the semantic tensor for v3, ensuring it has the correct shape
+ if source_codes.dim() == 3: # If [B, T, D]
+ semantic = source_codes
+ elif source_codes.dim() == 2: # If [T, D]
+ semantic = source_codes.unsqueeze(0) # Add batch dimension [1, T, D]
+ else:
+ # Handle unexpected shapes
+ raise ValueError(f"Unexpected source_codes shape: {source_codes.shape}")
+
+ # Prepare phoneme IDs
+ phoneme_ids = torch.LongTensor(phones).to(device).unsqueeze(0)
+
+ # Get reference audio features and global embedding
+ fea_ref, ge = vq_model.decode_encp(semantic, phoneme_ids, refer)
+
+ # Load and process reference audio
+ ref_audio, sr = torchaudio.load(prompt_wav)
+ ref_audio = ref_audio.to(device).float()
+ if ref_audio.shape[0] == 2: # Convert stereo to mono
+ ref_audio = ref_audio.mean(0).unsqueeze(0)
+ if sr != 24000:
+ ref_audio = resample(ref_audio, sr)
+
+ # Convert to mel spectrogram and normalize
+ mel2 = mel_fn(ref_audio.to(dtype))
+ mel2 = norm_spec(mel2)
+
+ # Adjust time dimensions
+ T_min = min(mel2.shape[2], fea_ref.shape[2])
+ mel2 = mel2[:, :, :T_min]
+ fea_ref = fea_ref[:, :, :T_min]
+
+ if T_min > 468:
+ mel2 = mel2[:, :, -468:]
+ fea_ref = fea_ref[:, :, -468:]
+ T_min = 468
+
+ # Process source audio features with phoneme conditioning
+ fea_todo, ge = vq_model.decode_encp(semantic, phoneme_ids, refer, ge)
+
+ # Process audio in chunks
+ chunk_len = 934 - T_min
+ cfm_resss = []
+ idx = 0
+
+ while True:
+ fea_todo_chunk = fea_todo[:, :, idx:idx + chunk_len]
+ if fea_todo_chunk.shape[-1] == 0:
+ break
+
+ idx += chunk_len
+ fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1)
+ cfm_res = vq_model.cfm.inference(
+ fea,
+ torch.LongTensor([fea.size(1)]).to(fea.device),
+ mel2,
+ sample_steps,
+ inference_cfg_rate=0
+ )
+
+ cfm_res = cfm_res[:, :, mel2.shape[2]:]
+ mel2 = cfm_res[:, :, -T_min:]
+ fea_ref = fea_todo_chunk[:, :, -T_min:]
+ cfm_resss.append(cfm_res)
+
+ # Concatenate results and convert to audio
+ cmf_res = torch.cat(cfm_resss, 2)
+ cmf_res = denorm_spec(cmf_res)
+
+ with torch.inference_mode():
+ wav_gen = model(cmf_res)
+ audio = wav_gen[0][0].cpu().detach().numpy()
+
+ # Normalize audio to prevent clipping
+ max_audio = np.abs(audio).max()
+ if max_audio > 1:
+ audio /= max_audio
+
+ sr = hps.data.sampling_rate if model_version != "v3" else 24000
+ return sr, (audio * 32768).astype(np.int16)
+
+ # Create and launch the standalone Gradio interface for voice conversion
+ def launch_vc_ui():
+ with gr.Blocks(title="GPT-SoVITS Voice Conversion") as vc_app:
+ gr.Markdown("# GPT-SoVITS Voice Conversion")
+ gr.Markdown(f"Current Model Version: {model_version}")
+
  with gr.Row():
- inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath", scale=13)
- with gr.Column(scale=13):
- ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式,使用了会报错。"), value=False, interactive=True, show_label=True,scale=1)
- gr.Markdown(html_left(i18n("使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。")))
- prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5,scale=1)
- with gr.Column(scale=14):
- prompt_language = gr.Dropdown(
- label=i18n("参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文"),
+ with gr.Column():
+ source_audio = gr.Audio(type="filepath", label="Source Audio (to be converted)")
+ text_input = gr.Textbox(label="Text content of the source audio")
+ language_input = gr.Dropdown(
+ choices=list(dict_language.keys()),
+ value=i18n("中文"),
+ label=i18n("语言 / Language")
  )
- inp_refs = gr.File(label=i18n("可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"),file_count="multiple")if model_version!="v3"else gr.File(label=i18n("可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"),file_count="multiple",visible=False)
- sample_steps = gr.Radio(label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),value=64,choices=[4,8,16,32,64],visible=True)if model_version=="v3"else gr.Radio(label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),value=8,choices=[4,8,16,32],visible=False)
- gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"),'h3'))
- with gr.Row():
- with gr.Column(scale=13):
- text = gr.Textbox(label=i18n("需要合成的文本"), value="", lines=26, max_lines=26)
- with gr.Column(scale=7):
- text_language = gr.Dropdown(
- label=i18n("需要合成的语种")+i18n(".限制范围越小判别效果越好。"), choices=list(dict_language.keys()), value=i18n("中文"), scale=1
- )
- how_to_cut = gr.Dropdown(
- label=i18n("怎么切"),
- choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ],
- value=i18n("凑四句一切"),
- interactive=True, scale=1
- )
- gr.Markdown(value=html_center(i18n("语速调整,高为更快")))
- if_freeze=gr.Checkbox(label=i18n("是否直接对上次合成结果调整语速和音色。防止随机性。"), value=False, interactive=True,show_label=True, scale=1)
- speed = gr.Slider(minimum=0.6,maximum=1.65,step=0.05,label=i18n("语速"),value=1,interactive=True, scale=1)
- gr.Markdown(html_center(i18n("GPT采样参数(无参考文本时不要太低。不懂就用默认):")))
- top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=15,interactive=True, scale=1)
- top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True, scale=1)
- temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True, scale=1)
- # with gr.Column():
- # gr.Markdown(value=i18n("手工调整音素。当音素框不为空时使用手工音素输入推理,无视目标文本框。"))
- # phoneme=gr.Textbox(label=i18n("音素框"), value="")
- # get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary")
- with gr.Row():
- inference_button = gr.Button(i18n("合成语音"), variant="primary", size='lg', scale=25)
- output = gr.Audio(label=i18n("输出的语音"), scale=14)
-
- inference_button.click(
- get_tts_wav,
- [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed,if_freeze,inp_refs,sample_steps],
- [output],
+ target_audio = gr.Audio(type="filepath", label="Target Voice (reference)")
+
+ with gr.Accordion("Advanced Settings", open=False):
+ with gr.Row():
+ speed = gr.Slider(
+ minimum=0.1, maximum=5, value=1, step=0.1,
+ label=i18n("语速 / Speed")
+ )
+
+ if model_version != "v3":
+ noise_scale = gr.Slider(
+ minimum=0.1, maximum=1.0, value=0.5, step=0.1,
+ label="Noise Scale (V2 models only)"
+ )
+ else:
+ noise_scale = gr.Slider(
+ minimum=0.1, maximum=1.0, value=0.5, step=0.1,
+ label="Noise Scale (ignored for V3)",
+ visible=False
+ )
+
+ if model_version == "v3":
+ sample_steps = gr.Slider(
+ minimum=1, maximum=30, value=8, step=1,
+ label=i18n("采样步数 / Sample Steps")
+ )
+ top_k = gr.Slider(
+ minimum=1, maximum=100, value=20, step=1,
+ label=i18n("Top K")
+ )
+ top_p = gr.Slider(
+ minimum=0.1, maximum=1.0, value=0.6, step=0.1,
+ label=i18n("Top P")
+ )
+ temperature = gr.Slider(
+ minimum=0.1, maximum=1.0, value=0.6, step=0.1,
+ label=i18n("Temperature")
+ )
+ else:
+ sample_steps = gr.Slider(
+ minimum=1, maximum=30, value=8, step=1,
+ label=i18n("采样步数 / Sample Steps"),
+ visible=False
+ )
+ top_k = gr.Slider(
+ minimum=1, maximum=100, value=20, step=1,
+ label=i18n("Top K"),
+ visible=False
+ )
+ top_p = gr.Slider(
+ minimum=0.1, maximum=1.0, value=0.6, step=0.1,
+ label=i18n("Top P"),
+ visible=False
+ )
+ temperature = gr.Slider(
+ minimum=0.1, maximum=1.0, value=0.6, step=0.1,
+ label=i18n("Temperature"),
+ visible=False
+ )
+
+ go_btn = gr.Button(i18n("开始转换 / Start Conversion"), variant="primary")
+
+ with gr.Column():
+ output_audio = gr.Audio(label=i18n("转换后的声音 / Converted Audio"))
+ status_output = gr.Markdown("Ready")
+
+ def process_vc(source_path, text, lang, target_path, noise, k, p, temp, spd, steps):
+ try:
+ if not source_path:
+ return None, "Error: Source audio is required"
+ if not target_path:
+ return None, "Error: Target audio is required"
+ if not text:
+ return None, "Error: Text content is required"
+
+ return vc_main(
+ source_path, text, lang, target_path,
+ noise_scale=noise,
+ top_k=k,
+ top_p=p,
+ temperature=temp,
+ speed=spd,
+ sample_steps=steps
+ ), "Conversion completed successfully"
+ except Exception as e:
+ import traceback
+ return None, f"Error: {str(e)}\n{traceback.format_exc()}"
+
+ go_btn.click(
+ fn=process_vc,
+ inputs=[
+ source_audio, text_input, language_input, target_audio,
+ noise_scale, top_k, top_p, temperature, speed, sample_steps
+ ],
+ outputs=[output_audio, status_output]
  )
- SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free])
- GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
-
- # gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
- # with gr.Row():
- # text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="")
- # button1 = gr.Button(i18n("凑四句一切"), variant="primary")
- # button2 = gr.Button(i18n("凑50字一切"), variant="primary")
- # button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
- # button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
- # button5 = gr.Button(i18n("按标点符号切"), variant="primary")
- # text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
- # button1.click(cut1, [text_inp], [text_opt])
- # button2.click(cut2, [text_inp], [text_opt])
- # button3.click(cut3, [text_inp], [text_opt])
- # button4.click(cut4, [text_inp], [text_opt])
- # button5.click(cut5, [text_inp], [text_opt])
- # gr.Markdown(html_center(i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")))
-
- if __name__ == '__main__':
- app.queue().launch(#concurrency_count=511, max_size=1022
- server_name="0.0.0.0",
- inbrowser=True,
+
+ # Launch the app with the infer_ttswebui port + 1 to avoid conflicts
+ vc_app.launch(
  share=True,
- server_port=8000,
- quiet=True,
  )
+
+ if __name__ == "__main__":
+ print(f"Launching Voice Conversion UI with model version: {model_version}")
+ launch_vc_ui()
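Run directly (python GPT_SoVITS/vc_webui.py), the renamed script prints the model version and calls launch_vc_ui(). The vc_main() function added in this diff can also be driven without the Gradio UI. Below is a minimal sketch, not part of the commit: it assumes the module loads its models at import time, that GPT_SoVITS is importable as a package from the repository root, and that the file paths and the soundfile dependency are placeholders.

import soundfile as sf                       # hypothetical writer for the result audio

from GPT_SoVITS import vc_webui              # module introduced by this rename (import path assumed)

# Convert source.wav so it is spoken in the voice of target_voice.wav.
# The language argument must be a key of dict_language; the i18n("中文") entry is used
# here because it is the default value in the UI above.
sr, converted = vc_webui.vc_main(
    wav_path="source.wav",                   # hypothetical source audio to convert
    text="你好,世界。",                       # text spoken in the source audio
    language="中文",
    prompt_wav="target_voice.wav",           # hypothetical reference (target) voice
    sample_steps=8,                          # v3 sampling steps, per the docstring above
)
sf.write("converted.wav", converted, sr)     # vc_main returns (sampling_rate, int16 audio)

As the final lines of vc_main() show, the returned audio is int16 PCM at 24 kHz for v3 models and at hps.data.sampling_rate otherwise.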