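"""Generates ground truth-aligned (GTA) mel spectrograms with a trained Tacotron
synthesizer, for use as vocoder training targets. Processes the train and dev
splits produced by the synthesizer preprocessing step."""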
import platform
from functools import partial
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from synthesizer.hparams import hparams_debug_string
from synthesizer.models.tacotron import Tacotron
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
from synthesizer.utils import data_parallel_workaround
from synthesizer.utils.symbols import symbols


def run_synthesis(in_dir: Path, out_dir: Path, syn_model_fpath: Path, hparams):
    # This generates ground truth-aligned mels for vocoder training
    train_in_dir = in_dir.joinpath("train")
    train_out_dir = out_dir.joinpath("train")
    dev_in_dir = in_dir.joinpath("dev")
    dev_out_dir = out_dir.joinpath("dev")
    train_synth_dir = train_out_dir / "mels_gta"
    train_synth_dir.mkdir(exist_ok=True, parents=True)
    dev_synth_dir = dev_out_dir / "mels_gta"
    dev_synth_dir.mkdir(exist_ok=True, parents=True)
    print(hparams_debug_string())

    # Check for GPU
    if torch.cuda.is_available():
        device = torch.device("cuda")
        if hparams.synthesis_batch_size % torch.cuda.device_count() != 0:
            raise ValueError("`hparams.synthesis_batch_size` must be evenly divisible by n_gpus!")
    else:
        device = torch.device("cpu")
    print("Synthesizer using device:", device)
    # Instantiate Tacotron model
    model = Tacotron(embed_dims=hparams.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hparams.tts_encoder_dims,
                     decoder_dims=hparams.tts_decoder_dims,
                     n_mels=hparams.num_mels,
                     fft_bins=hparams.num_mels,
                     postnet_dims=hparams.tts_postnet_dims,
                     encoder_K=hparams.tts_encoder_K,
                     lstm_dims=hparams.tts_lstm_dims,
                     postnet_K=hparams.tts_postnet_K,
                     num_highways=hparams.tts_num_highways,
                     dropout=0.,  # Use zero dropout for GTA mels
                     stop_threshold=hparams.tts_stop_threshold,
                     speaker_embedding_size=hparams.speaker_embedding_size).to(device)
    # Load the weights
    print("\nLoading weights at %s" % syn_model_fpath)
    model.load(syn_model_fpath)
    print("Tacotron weights loaded from step %d" % model.step)
    # Synthesize using the same reduction factor the model was trained with
    r = np.int32(model.r)

    # Set the model to eval mode (disables dropout and zoneout)
    model.eval()
    # Initialize the datasets
    train_metadata_fpath = train_in_dir.joinpath("train.txt")
    train_mel_dir = train_in_dir.joinpath("mels")
    train_embed_dir = train_in_dir.joinpath("embeds")
    dev_metadata_fpath = dev_in_dir.joinpath("dev.txt")
    dev_mel_dir = dev_in_dir.joinpath("mels")
    dev_embed_dir = dev_in_dir.joinpath("embeds")

    train_dataset = SynthesizerDataset(train_metadata_fpath, train_mel_dir, train_embed_dir, hparams)
    dev_dataset = SynthesizerDataset(dev_metadata_fpath, dev_mel_dir, dev_embed_dir, hparams)
    collate_fn = partial(collate_synthesizer, r=r, hparams=hparams)
    train_data_loader = DataLoader(train_dataset, hparams.synthesis_batch_size, collate_fn=collate_fn, num_workers=2)
    dev_data_loader = DataLoader(dev_dataset, hparams.synthesis_batch_size, collate_fn=collate_fn, num_workers=2)
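    # Metadata rows come from the synthesizer preprocessing step. The only fields this
    # function relies on are the indices used below (assumption, inferred from usage:
    # [1] is the mel filename, [4] the ground-truth mel frame count).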
    # Generate train GTA mels
    train_meta_out_fpath = train_out_dir / "synthesized.txt"
    with train_meta_out_fpath.open("w") as file:
        for i, (texts, mels, embeds, idx) in tqdm(enumerate(train_data_loader), total=len(train_data_loader)):
            texts, mels, embeds = texts.to(device), mels.to(device), embeds.to(device)

            # Parallelize model onto GPUs using workaround due to python bug
            # if device.type == "cuda" and torch.cuda.device_count() > 1:
            #     _, mels_out, _, _ = data_parallel_workaround(model, texts, mels, embeds)
            # else:
            _, mels_out, _, _ = model(texts, mels, embeds)

            for j, k in enumerate(idx):
                # Note: output mel-spectrogram files and target ones have the same names, just different folders
                mel_filename = train_synth_dir.joinpath(train_dataset.metadata[k][1])
                mel_out = mels_out[j].detach().cpu().numpy().T

                # Use the length of the ground truth mel to remove padding from the generated mels
                mel_out = mel_out[:int(train_dataset.metadata[k][4])]

                # Write the spectrogram to disk
                np.save(mel_filename, mel_out, allow_pickle=False)

                # Write metadata into the synthesized file
                file.write("|".join(train_dataset.metadata[k]))
    # Generate dev GTA mels
    dev_meta_out_fpath = dev_out_dir / "synthesized.txt"
    with dev_meta_out_fpath.open("w") as file:
        for i, (texts, mels, embeds, idx) in tqdm(enumerate(dev_data_loader), total=len(dev_data_loader)):
            texts, mels, embeds = texts.to(device), mels.to(device), embeds.to(device)

            # Parallelize model onto GPUs using workaround due to python bug
            # if device.type == "cuda" and torch.cuda.device_count() > 1:
            #     _, mels_out, _, _ = data_parallel_workaround(model, texts, mels, embeds)
            # else:
            _, mels_out, _, _ = model(texts, mels, embeds)

            for j, k in enumerate(idx):
                # Note: output mel-spectrogram files and target ones have the same names, just different folders
                mel_filename = dev_synth_dir.joinpath(dev_dataset.metadata[k][1])
                mel_out = mels_out[j].detach().cpu().numpy().T

                # Use the length of the ground truth mel to remove padding from the generated mels
                mel_out = mel_out[:int(dev_dataset.metadata[k][4])]

                # Write the spectrogram to disk
                np.save(mel_filename, mel_out, allow_pickle=False)

                # Write metadata into the synthesized file
                file.write("|".join(dev_dataset.metadata[k]))
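

# Usage sketch (assumption: the paths below are illustrative and this call is normally made
# from a separate vocoder preprocessing script; also assumes synthesizer.hparams exposes an
# `hparams` object, which is not shown in this file):
#
#     from synthesizer.hparams import hparams
#     run_synthesis(in_dir=Path("datasets/SV2TTS/synthesizer"),
#                   out_dir=Path("datasets/SV2TTS/vocoder"),
#                   syn_model_fpath=Path("saved_models/synthesizer.pt"),
#                   hparams=hparams)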