Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from hyper_parameters import tacotron_params as hparams | |
| from training import load_model | |
| from text import text_to_sequence | |
| from melgan.model.generator import Generator | |
| from melgan.utils.hparams import load_hparam | |
| import torch | |
| import numpy as np | |
# 2**15; presumably the int16 full-scale value used when scaling waveforms.
# NOTE(review): not referenced anywhere in the visible code - confirm it is still needed.
MAX_WAV_VALUE = 32768.0

# Fix PyTorch's global RNG seed so repeated runs start from the same state.
torch.manual_seed(1234)
def init_models(hparams):
    """Load the Tacotron2 + GST synthesizer and the MelGAN vocoder on the CPU.

    Both networks are published as the module-level globals ``model`` and
    ``vocoder_model`` because ``synthesize`` looks them up by those names.
    Without the ``global`` declaration they would stay local to this function
    and the first synthesis request would fail with ``NameError``. They are
    also returned for callers that prefer explicit references.

    Args:
        hparams: Hyper-parameter object forwarded unchanged to ``load_model``.

    Returns:
        A ``(model, vocoder_model)`` tuple; ``eval`` has been called on both.
    """
    global model, vocoder_model

    # load trained tacotron2 + GST model:
    model = load_model(hparams)
    checkpoint_path = "trained_models/checkpoint_78000.model"
    # map_location="cpu" lets the checkpoint load on machines without a GPU.
    model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
    model.eval()

    # load pre trained MelGAN model for mel2audio:
    vocoder_checkpoint_path = "trained_models/nvidia_tacotron2_LJ11_epoch6400.pt"
    checkpoint = torch.load(vocoder_checkpoint_path, map_location="cpu")
    # NOTE(review): the loaded MelGAN hparams are not used below; the call is
    # kept so a missing config file still fails at startup - confirm intent.
    hp_melgan = load_hparam("melgan/config/default.yaml")
    vocoder_model = Generator(80)
    vocoder_model.load_state_dict(checkpoint['model_g'])
    vocoder_model.eval(inference=False)

    return model, vocoder_model
def synthesize(text):
    """Turn ``text`` into speech and return it as a Gradio numpy-audio tuple.

    Reads the module-level ``model`` and ``vocoder_model``; both must have
    been assigned before this function is called.

    Args:
        text: Input string, passed to ``text_to_sequence`` with the
            ``'english_cleaners'`` cleaner list.

    Returns:
        A ``(22050, audio_numpy)`` tuple, where ``audio_numpy`` is the vocoder
        output converted to a NumPy array and 22050 is the value supplied to
        ``gr.Audio`` as the sample rate.
    """
    # [None, :] prepends a batch axis of size 1 to the symbol-id sequence.
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)

    # Fixed weights handed to model.inference as its GST score argument.
    gst_head_scores = np.array([0.5, 0.15, 0.35])
    gst_scores = torch.from_numpy(gst_head_scores).float()

    # Run both stages without autograd: nothing is back-propagated here, so
    # tracking gradients would only cost memory and time on every request.
    with torch.no_grad():
        # Only the post-net mel output (second of four return values) is used.
        _, mel_outputs_postnet, _, _ = model.inference(sequence, gst_scores)
        # mel2wav inference:
        audio = vocoder_model.inference(mel_outputs_postnet)

    audio_numpy = audio.detach().cpu().numpy()
    return (22050, audio_numpy)
# Load both networks once at startup, before the interface starts serving.
init_models(hparams)

# One free-text input, one audio output fed by synthesize()'s return value.
speech_output = gr.Audio(label="Generated Speech", type="numpy")
iface = gr.Interface(fn=synthesize, inputs="text", outputs=[speech_output])
iface.launch()