File size: 3,346 Bytes
1785140
1945c48
1785140
 
 
b865d16
 
a627d55
b865d16
 
 
a627d55
1785140
1945c48
a627d55
3cc5048
1785140
a627d55
 
 
 
b865d16
a627d55
b865d16
 
 
 
a627d55
b865d16
 
 
 
 
 
 
 
 
a627d55
b865d16
a627d55
 
 
 
b865d16
 
 
a627d55
33b51a6
b865d16
 
 
a627d55
 
 
 
 
 
b865d16
 
 
 
 
a627d55
b865d16
 
 
 
 
 
 
 
 
 
a627d55
a7161ed
 
b865d16
a627d55
6f17cca
b865d16
a7161ed
 
 
1945c48
a627d55
1945c48
b865d16
6f17cca
 
1945c48
 
a627d55
1945c48
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import spaces
import gradio as gr
import torch
from TTS.api import TTS
import os
import tempfile
import torchaudio
from huggingface_hub import hf_hub_download
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Accept the Coqui TTS terms of service non-interactively (required for XTTS).
os.environ["COQUI_TOS_AGREED"] = "1"

# Target device for inference; this Space runs on CPU.
# NOTE(review): `device` is never read below — kept for future GPU use.
device = "cpu"

# Download the fine-tuned XTTS artifacts from the Hugging Face Hub.
# These calls hit the network at import time and cache locally.
model_path = hf_hub_download(repo_id="RedSparkie/danielmula", filename="model.pth")
config_path = hf_hub_download(repo_id="RedSparkie/danielmula", filename="config.json")
vocab_path = hf_hub_download(repo_id="RedSparkie/danielmula", filename="vocab.json")

# Helper to release cached CUDA memory (harmless no-op on CPU-only hosts).
def clear_gpu_cache():
    """Empty the CUDA allocator cache if a GPU is available; otherwise do nothing."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

# Module-level handle to the loaded XTTS model; populated by load_model().
XTTS_MODEL = None
def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
    """Load an XTTS checkpoint into the module-level XTTS_MODEL.

    Returns an error string when any of the three paths is missing;
    returns None on success (the model is stored globally, not returned).
    """
    global XTTS_MODEL
    clear_gpu_cache()
    # All three artifact paths are required before we can build the model.
    if not (xtts_checkpoint and xtts_config and xtts_vocab):
        return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"
    config = XttsConfig()
    config.load_json(xtts_config)
    XTTS_MODEL = Xtts.init_from_config(config)
    print("Loading XTTS model!")
    XTTS_MODEL.load_checkpoint(
        config,
        checkpoint_path=xtts_checkpoint,
        vocab_path=xtts_vocab,
        use_deepspeed=False,
    )

    # Model intentionally stays on CPU; re-enable a .cuda() move if a GPU is used.

    print("Model Loaded!")

# Run XTTS voice-cloning inference.
def run_tts(lang, tts_text, speaker_audio_file):
    """Synthesize `tts_text` in `lang` using `speaker_audio_file` as the voice reference.

    Returns a 2-tuple `(out_path, speaker_audio_file)` on success, or
    `(error_message, None)` when the model is not loaded / no reference given.

    BUGFIX: the error path previously returned THREE values while the success
    path returned two, so callers unpacking two values raised ValueError on
    the error path instead of surfacing the message.
    """
    if XTTS_MODEL is None or not speaker_audio_file:
        return "You need to run the previous step to load the model !!", None

    # Hoist the repeated config lookups out of the call sites.
    cfg = XTTS_MODEL.config
    gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
        audio_path=speaker_audio_file,
        gpt_cond_len=cfg.gpt_cond_len,
        max_ref_length=cfg.max_ref_len,
        sound_norm_refs=cfg.sound_norm_refs,
    )
    out = XTTS_MODEL.inference(
        text=tts_text,
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=cfg.temperature,
        length_penalty=cfg.length_penalty,
        repetition_penalty=cfg.repetition_penalty,
        top_k=cfg.top_k,
        top_p=cfg.top_p,
    )

    # delete=False on purpose: the file must outlive this function so Gradio
    # can serve it by path. (It is never cleaned up — acceptable for a Space.)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        # XTTS yields a 1-D waveform; torchaudio.save expects (channels, samples).
        out["wav"] = torch.tensor(out["wav"]).unsqueeze(0)
        out_path = fp.name
        torchaudio.save(out_path, out["wav"], 24000)
    print("Speech generated!")

    return out_path, speaker_audio_file

# Gradio entry point: (re)load the model, then clone the reference voice
# to speak `text` in Spanish.
@spaces.GPU(enable_queue=True)
def generate(text, audio):
    """Return the path of a WAV file speaking `text` with the voice in `audio`."""
    load_model(model_path, config_path, vocab_path)
    wav_path, _reference = run_tts(lang='es', tts_text=text, speaker_audio_file=audio)
    return wav_path

# Build the Gradio UI: a text box for the sentence to synthesize and an
# audio upload for the reference voice; output is the generated WAV.
demo = gr.Interface(
    fn=generate, 
    inputs=[gr.Textbox(label='Frase a generar'), gr.Audio(type='filepath', label='Voz de referencia')], 
    outputs=gr.Audio(type='filepath')
)

# Start the web app (blocks until the Space is stopped).
demo.launch()