Spaces:

AlexK-PL
/

Tacotron2_GST_eng

Sleeping

App Files Files Community

Tacotron2_GST_eng / app.py

AlexK-PL

Update app.py

261114e about 2 years ago

raw

history blame

1.83 kB

	import gradio as gr

	from hyper_parameters import tacotron_params as hparams
	from training import load_model

	from text import text_to_sequence

	from melgan.model.generator import Generator
	from melgan.utils.hparams import load_hparam

	import torch
	import numpy as np

	# Seed PyTorch's RNG once, when this module is executed.
	torch.manual_seed(1234)
	# NOTE(review): not referenced anywhere else in the visible file; presumably
	# the 16-bit PCM full-scale value (2**15) -- confirm before removing.
	MAX_WAV_VALUE = 32768.0

	def init_models(hparams):
	    """Load the Tacotron2 + GST model and the MelGAN vocoder onto the CPU.

	    Both networks are loaded from the checkpoints under ``trained_models/``,
	    switched to eval mode, and published as the module-level names ``model``
	    and ``vocoder_model``, which ``synthesize`` reads. They are also returned
	    for callers that prefer explicit handles.

	    Args:
	        hparams: hyper-parameter object handed straight to ``load_model``.

	    Returns:
	        Tuple ``(model, vocoder_model)``.
	    """
	    # synthesize() looks these names up at module scope; without this
	    # declaration they would be function locals, discarded on return, and
	    # the first request would fail with NameError.
	    global model, vocoder_model

	    # Trained Tacotron2 + GST model (weights mapped to CPU).
	    model = load_model(hparams)
	    checkpoint_path = "trained_models/checkpoint_78000.model"
	    state = torch.load(checkpoint_path, map_location="cpu")
	    model.load_state_dict(state['state_dict'])
	    model.eval()

	    # Pre-trained MelGAN generator used for mel -> audio (weights mapped to CPU).
	    vocoder_checkpoint_path = "trained_models/nvidia_tacotron2_LJ11_epoch6400.pt"
	    checkpoint = torch.load(vocoder_checkpoint_path, map_location="cpu")
	    # NOTE(review): the loaded MelGAN config is not used below; the generator
	    # is built with a hard-coded 80 (presumably the mel channel count -- confirm).
	    hp_melgan = load_hparam("melgan/config/default.yaml")
	    vocoder_model = Generator(80)
	    vocoder_model.load_state_dict(checkpoint['model_g'])
	    # Generator.eval takes a project-specific ``inference`` flag; kept as-is.
	    vocoder_model.eval(inference=False)

	    return model, vocoder_model

	def synthesize(text):
	    """Turn ``text`` into audio using the loaded Tacotron2 + GST and MelGAN models.

	    Reads the module-level ``model`` and ``vocoder_model``.

	    Args:
	        text: input string; converted to a symbol-id sequence by
	            ``text_to_sequence`` with the ``'english_cleaners'`` cleaner.

	    Returns:
	        Tuple ``(22050, audio_numpy)``: the sample rate reported to Gradio and
	        the vocoder output converted to a NumPy array.
	    """
	    # Add a leading batch axis of size 1, then convert to an int64 CPU tensor.
	    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
	    sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)

	    # Fixed weights for the three GST heads.
	    gst_head_scores = np.array([0.5, 0.15, 0.35])
	    gst_scores = torch.from_numpy(gst_head_scores).float()

	    # Pure inference: run BOTH stages without autograd so no graph is built
	    # (previously only the vocoder call was inside no_grad).
	    with torch.no_grad():
	        _, mel_outputs_postnet, _, _ = model.inference(sequence, gst_scores)
	        # mel2wav inference:
	        audio = vocoder_model.inference(mel_outputs_postnet)

	    audio_numpy = audio.detach().cpu().numpy()

	    return (22050, audio_numpy)


	# Load both networks once, when the script starts.
	init_models(hparams)

	# One text input, one audio output fed by synthesize().
	iface = gr.Interface(
	    fn=synthesize,
	    inputs="text",
	    outputs=[
	        gr.Audio(label="Generated Speech", type="numpy"),
	    ],
	)
	iface.launch()