first commit
- .gitignore +2 -0
- README.md +1 -1
- __pycache__/synthesize.cpython-38.pyc +0 -0
- api_app.py +34 -0
- app.py +46 -0
- checkpoints/checkpoint_0.zip +3 -0
- checkpoints/checkpoint_9000.zip +3 -0
- flagged/log.csv +2 -0
- synthesis/synthesize.py +233 -0
- synthesis/vocoders/__init__.py +1 -0
- synthesis/vocoders/__pycache__/__init__.cpython-38.pyc +0 -0
- synthesis/vocoders/__pycache__/hifigan.cpython-38.pyc +0 -0
- synthesis/vocoders/__pycache__/hifigan_model.cpython-38.pyc +0 -0
- synthesis/vocoders/__pycache__/vocoder.cpython-38.pyc +0 -0
- synthesis/vocoders/hifigan.py +42 -0
- synthesis/vocoders/hifigan_model.py +377 -0
- synthesis/vocoders/vocoder.py +27 -0
- synthesize.py +233 -0
- training/__init__.py +6 -0
- training/__pycache__/__init__.cpython-38.pyc +0 -0
- training/__pycache__/clean_text.cpython-38.pyc +0 -0
- training/clean_text.py +113 -0
- training/tacotron2_model/__init__.py +4 -0
- training/tacotron2_model/__pycache__/__init__.cpython-38.pyc +0 -0
- training/tacotron2_model/__pycache__/audio_processing.cpython-38.pyc +0 -0
- training/tacotron2_model/__pycache__/collate.cpython-38.pyc +0 -0
- training/tacotron2_model/__pycache__/layers.cpython-38.pyc +0 -0
- training/tacotron2_model/__pycache__/loss.cpython-38.pyc +0 -0
- training/tacotron2_model/__pycache__/model.cpython-38.pyc +0 -0
- training/tacotron2_model/__pycache__/stft.cpython-38.pyc +0 -0
- training/tacotron2_model/__pycache__/utils.cpython-38.pyc +0 -0
- training/tacotron2_model/audio_processing.py +123 -0
- training/tacotron2_model/collate.py +78 -0
- training/tacotron2_model/layers.py +128 -0
- training/tacotron2_model/loss.py +49 -0
- training/tacotron2_model/model.py +609 -0
- training/tacotron2_model/stft.py +187 -0
- training/tacotron2_model/utils.py +90 -0
- weights/custom_pctest/config.json +37 -0
- weights/custom_pctest/model.pt +3 -0
- weights/hifiganvocoderdemo/config.json +37 -0
- weights/hifiganvocoderdemo/model.pt +3 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+audio.wav
+*.png
README.md
CHANGED
@@ -10,4 +10,4 @@ pinned: false
 license: mit
 ---
 
-
+Text-to-Speech (TTS) model for Priyanka Chopra's voice
__pycache__/synthesize.cpython-38.pyc
ADDED
Binary file (7.17 kB)
api_app.py
ADDED
@@ -0,0 +1,34 @@
+from flask import Flask, redirect, url_for, request
+import gradio as gr
+from synthesize import synthesize, load_model
+from synthesis.vocoders import Hifigan
+import sounddevice as sd
+import soundfile as sf
+
+model = load_model("checkpoints/checkpoint_9000.zip")
+vocoder = Hifigan("weights/custom_pctest/model.pt", "weights/custom_pctest/config.json")
+
+def inference(text: str):
+    synthesize(
+        model=model,
+        text=text,
+        graph_path="graph.png",
+        audio_path="audio.wav",
+        vocoder=vocoder,
+    )
+    return "audio.wav"
+
+app = Flask(__name__)
+
+@app.route('/process', methods=['POST'])
+def login():
+    if request.method == 'POST':
+        text = request.json['text']
+        inference(text)
+        data, fs = sf.read("audio.wav", dtype='float32')
+        sd.play(data, fs)
+        status = sd.wait()  # Wait until file is done playing
+        return {'success': True}
+
+if __name__ == '__main__':
+    app.run(debug=True)
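As a usage sketch (not part of this commit), the /process endpoint above can be exercised with a small client script. It assumes `python api_app.py` is serving locally on Flask's default port 5000; the URL and sample sentence are illustrative only.

import requests

# Hypothetical client for the /process endpoint defined in api_app.py above.
# Assumes the Flask app is running locally on the default port 5000.
response = requests.post(
    "http://127.0.0.1:5000/process",
    json={"text": "Hello, this is a test sentence."},
)
print(response.json())  # the endpoint returns {'success': True} after playing audio.wav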
app.py
ADDED
@@ -0,0 +1,46 @@
+import tempfile
+import gradio as gr
+from synthesize import synthesize, load_model
+from synthesis.vocoders import Hifigan
+
+model = load_model("checkpoints/checkpoint_9000.zip")
+vocoder = Hifigan("weights/custom_pctest/model.pt", "weights/custom_pctest/config.json")
+
+title = "Text-to-Speech (TTS) model for Priyanka Chopra's voice"
+description = "Generate english speech from text using a Tacotron2 model" \
+
+article = """<p style='text-align: center'>
+<a href='https://rushichaudhari.github.io/posts/2022-01-12-lets-clone-the-voice-of-priyanka-chopra-jonas/'
+target='blank'
+class='footer'>Blog</a> |
+<a href='https://github.com/eugenesiow/practical-ml' target='_blank'
+class='footer'>Github Repo</a></p>"""
+examples = ["Generate english speech from text using a Tacotron2 model.",
+            ""]
+
+def inference(text: str):
+    synthesize(
+        model=model,
+        text=text,
+        graph_path="graph.png",
+        audio_path="audio.wav",
+        vocoder=vocoder,
+    )
+    return "audio.wav"
+
+gr.Interface(
+    fn=inference,
+    inputs=[
+        gr.inputs.Textbox(
+            label="Input",
+            default="你好吗？我很好。",
+        ),
+    ],
+    outputs=gr.outputs.Audio(label="Output"),
+    title=title,
+    description=description,
+    article=article,
+    examples=examples,
+    enable_queue=True,
+    allow_flagging=False,
+).launch(debug=False)
checkpoints/checkpoint_0.zip
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18607124e1a417d9cb60f8d52c360cfda530ead09c1bb5940c2e8b3c9fcd10d1
+size 338411959
checkpoints/checkpoint_9000.zip
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ab42a5e478da1131a4eeed118addb9dbe945d747068ea3cb1cb1ef4584a468b
+size 338412023
flagged/log.csv
ADDED
@@ -0,0 +1,2 @@
+'name','output','flag','username','timestamp'
+'','','','','2022-05-17 13:28:37.822200'
synthesis/synthesize.py
ADDED
@@ -0,0 +1,233 @@
+import argparse
+import os
+import matplotlib.pyplot as plt
+import torch
+import numpy as np
+import matplotlib
+from scipy.io.wavfile import write
+from os.path import dirname, abspath
+import sys
+
+import nltk
+
+nltk.download("punkt")
+
+sys.path.append(dirname(dirname(abspath(__file__))))
+matplotlib.use("Agg")
+
+from training.tacotron2_model import Tacotron2
+from training.clean_text import clean_text
+from training import DEFAULT_ALPHABET
+from synthesis.vocoders import Hifigan
+
+
+def load_model(model_path):
+    """
+    Loads the Tacotron2 model.
+    Uses GPU if available, otherwise uses CPU.
+
+    Parameters
+    ----------
+    model_path : str
+        Path to tacotron2 model
+
+    Returns
+    -------
+    Tacotron2
+        Loaded tacotron2 model
+    """
+    if torch.cuda.is_available():
+        model = Tacotron2().cuda()
+        model.load_state_dict(torch.load(model_path)["state_dict"])
+        _ = model.cuda().eval().half()
+    else:
+        model = Tacotron2()
+        model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu"))["state_dict"])
+    return model
+
+
+def generate_graph(alignments, filepath, heading=""):
+    """
+    Generates synthesis alignment graph image.
+
+    Parameters
+    ----------
+    alignments : list
+        Numpy alignment data
+    filepath : str
+        Path to save image to
+    heading : str (optional)
+        Graph heading
+    """
+    data = alignments.float().data.cpu().numpy()[0].T
+    plt.imshow(data, aspect="auto", origin="lower", interpolation="none")
+    if heading:
+        plt.title(heading)
+    plt.savefig(filepath)
+
+
+def text_to_sequence(text, symbols):
+    """
+    Generates text sequence for audio file
+
+    Parameters
+    ----------
+    text : str
+        Text to synthesize
+    symbols : list
+        List of valid symbols
+    """
+    symbol_to_id = {s: i for i, s in enumerate(symbols)}
+    sequence = np.array([[symbol_to_id[s] for s in text if s in symbol_to_id]])
+    if torch.cuda.is_available():
+        return torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
+    else:
+        return torch.autograd.Variable(torch.from_numpy(sequence)).cpu().long()
+
+
+def join_alignment_graphs(alignments):
+    """
+    Joins multiple alignment graphs.
+
+    Parameters
+    ----------
+    alignments : list
+        List of alignment Tensors
+
+    Returns
+    -------
+    Tensor
+        Combined alignment tensor
+    """
+    alignment_sizes = [a.size() for a in alignments]
+    joined = torch.zeros((1, sum([a[1] for a in alignment_sizes]), sum([a[2] for a in alignment_sizes])))
+    current_x = 0
+    current_y = 0
+    for alignment in alignments:
+        joined[:, current_x : current_x + alignment.size()[1], current_y : current_y + alignment.size()[2]] = alignment
+        current_x += alignment.size()[1]
+        current_y += alignment.size()[2]
+    return joined
+
+
+def synthesize(
+    model,
+    text,
+    symbols=DEFAULT_ALPHABET,
+    graph_path=None,
+    audio_path=None,
+    vocoder=None,
+    silence_padding=0.15,
+    sample_rate=22050,
+    max_decoder_steps=1000,
+    split_text=False,
+):
+    """
+    Synthesise text for a given model.
+    Produces graph and/or audio file when given.
+    Supports multi-line synthesis (separated by \n).
+
+    Parameters
+    ----------
+    model : Tacotron2
+        Tacotron2 model
+    text : str/list
+        Text to synthesize (or list of lines to synthesize)
+    symbols : list
+        List of symbols (default is English)
+    graph_path : str (optional)
+        Path to save alignment graph to
+    audio_path : str (optional)
+        Path to save audio file to
+    vocoder : Object (optional)
+        Vocoder model (required if generating audio)
+    silence_padding : float (optional)
+        Seconds of silence to separate each clip by with multi-line synthesis (default is 0.15)
+    sample_rate : int (optional)
+        Audio sample rate (default is 22050)
+    max_decoder_steps : int (optional)
+        Max decoder steps controls sequence length and memory usage during inference.
+        Increasing this will use more memory but may allow for longer sentences. (default is 1000)
+    split_text : bool (optional)
+        Whether to use the split text tool to convert a block of text into multiple shorter sentences
+        to synthesize (default is False)
+
+    Raises
+    -------
+    AssertionError
+        If audio_path is given without a vocoder
+    """
+    if audio_path:
+        assert vocoder, "Missing vocoder"
+
+    if not isinstance(text, list) and split_text:
+        # Split text into multiple lines
+        text = nltk.tokenize.sent_tokenize(text)
+
+    if isinstance(text, list):
+        # Multi-lines given
+        text = [line.strip() for line in text if line.strip()]
+        mels = []
+        alignments = []
+        for line in text:
+            text = clean_text(line, symbols)
+            sequence = text_to_sequence(text, symbols)
+            _, mel_outputs_postnet, _, alignment = model.inference(sequence, max_decoder_steps)
+            mels.append(mel_outputs_postnet)
+            alignments.append(alignment)
+
+        if graph_path:
+            generate_graph(join_alignment_graphs(alignments), graph_path)
+
+        if audio_path:
+            silence = np.zeros(int(silence_padding * sample_rate)).astype("int16")
+            audio_segments = []
+            for i in range(len(mels)):
+                audio_segments.append(vocoder.generate_audio(mels[i]))
+                if i != len(mels) - 1:
+                    audio_segments.append(silence)
+
+            audio = np.concatenate(audio_segments)
+            write(audio_path, sample_rate, audio)
+    else:
+        # Single sentence
+        text = clean_text(text.strip(), symbols)
+        sequence = text_to_sequence(text, symbols)
+        _, mel_outputs_postnet, _, alignment = model.inference(sequence, max_decoder_steps)
+
+        if graph_path:
+            generate_graph(alignment, graph_path)
+
+        if audio_path:
+            audio = vocoder.generate_audio(mel_outputs_postnet)
+            write(audio_path, sample_rate, audio)
+
+
+if __name__ == "__main__":
+    """Synthesize audio using model and vocoder"""
+    parser = argparse.ArgumentParser(description="Synthesize audio using model and vocoder")
+    parser.add_argument("-m", "--model_path", type=str, help="tacotron2 model path", required=True)
+    parser.add_argument("-vm", "--vocoder_model_path", type=str, help="vocoder model path", required=True)
+    parser.add_argument("-hc", "--hifigan_config_path", type=str, help="hifigan_config path", required=True)
+    parser.add_argument("-t", "--text", type=str, help="text to synthesize", required=True)
+    parser.add_argument("-g", "--graph_output_path", type=str, help="path to save alignment graph to", required=False)
+    parser.add_argument("-a", "--audio_output_path", type=str, help="path to save output audio to", required=False)
+    parser.add_argument("--silence_padding", type=float, help="Padding between sentences in seconds", default=0.15)
+    parser.add_argument("--sample_rate", type=int, help="Audio sample rate", default=22050)
+    args = parser.parse_args()
+
+    assert os.path.isfile(args.model_path), "Model not found"
+    assert os.path.isfile(args.vocoder_model_path), "vocoder model not found"
+
+    model = load_model(args.model_path)
+    vocoder = Hifigan(args.vocoder_model_path, args.hifigan_config_path)
+
+    synthesize(
+        model=model,
+        text=args.text,
+        graph_path=args.graph_output_path,
+        audio_path=args.audio_output_path,
+        vocoder=vocoder,
+        silence_padding=args.silence_padding,
+        sample_rate=args.sample_rate,
+    )
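As a usage sketch (not part of this commit), the synthesize() function above can also be called programmatically with multi-sentence splitting. The import style mirrors app.py; the checkpoint and weight paths below are assumptions taken from the other files in this commit, and the input sentences are illustrative only.

from synthesize import synthesize, load_model
from synthesis.vocoders import Hifigan

# Hypothetical paths; any Tacotron2 checkpoint and HiFi-GAN weights in this layout would do.
model = load_model("checkpoints/checkpoint_9000.zip")
vocoder = Hifigan("weights/custom_pctest/model.pt", "weights/custom_pctest/config.json")

synthesize(
    model=model,
    text="This is the first sentence. This is the second sentence.",
    audio_path="multi_sentence.wav",
    vocoder=vocoder,
    split_text=True,       # sentence-tokenize the text block with nltk before synthesis
    silence_padding=0.15,  # seconds of silence inserted between sentences
)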
synthesis/vocoders/__init__.py
ADDED
@@ -0,0 +1 @@
+from synthesis.vocoders.hifigan import Hifigan  # noqa
synthesis/vocoders/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (209 Bytes)
synthesis/vocoders/__pycache__/hifigan.cpython-38.pyc
ADDED
Binary file (1.75 kB)
synthesis/vocoders/__pycache__/hifigan_model.cpython-38.pyc
ADDED
Binary file (9.11 kB)
synthesis/vocoders/__pycache__/vocoder.cpython-38.pyc
ADDED
Binary file (879 Bytes)
synthesis/vocoders/hifigan.py
ADDED
@@ -0,0 +1,42 @@
+import json
+import torch
+
+from synthesis.vocoders.hifigan_model import Generator
+from synthesis.vocoders.vocoder import Vocoder, MAX_WAV_VALUE
+
+
+class AttrDict(dict):
+    """
+    Credit: https://github.com/jik876/hifi-gan
+    """
+
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
+
+
+class Hifigan(Vocoder):
+    def __init__(self, model_path, config_path):
+        with open(config_path) as f:
+            data = f.read()
+
+        # Use GPU if available
+        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+        h = AttrDict(json.loads(data))
+        self.model = Generator(h).to(device)
+
+        checkpoint_dict = torch.load(model_path, map_location=device)
+        self.model.load_state_dict(checkpoint_dict["generator"])
+        self.model.eval()
+        self.model.remove_weight_norm()
+
+    def generate_audio(self, mel_output):
+        with torch.no_grad():
+            if torch.cuda.is_available():
+                mel_output = mel_output.type(torch.cuda.FloatTensor)
+
+            y_g_hat = self.model(mel_output)
+            audio = y_g_hat.squeeze()
+            audio = audio * MAX_WAV_VALUE
+            audio = audio.cpu().numpy().astype("int16")
+            return audio
synthesis/vocoders/hifigan_model.py
ADDED
@@ -0,0 +1,377 @@
+# Credit: https://github.com/jik876/hifi-gan
+#
+# MIT License
+#
+# Copyright (c) 2020 Jungil Kong
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+import torch
+import torch.nn.functional as F
+import torch.nn as nn
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+
+LRELU_SLOPE = 0.1
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size * dilation - dilation) / 2)
+
+
+class ResBlock1(torch.nn.Module):
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(ResBlock1, self).__init__()
+        self.h = h
+        self.convs1 = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[0],
+                        padding=get_padding(kernel_size, dilation[0]),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[1],
+                        padding=get_padding(kernel_size, dilation[1]),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[2],
+                        padding=get_padding(kernel_size, dilation[2]),
+                    )
+                ),
+            ]
+        )
+        self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))
+                ),
+                weight_norm(
+                    Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))
+                ),
+                weight_norm(
+                    Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))
+                ),
+            ]
+        )
+        self.convs2.apply(init_weights)
+
+    def forward(self, x):
+        for c1, c2 in zip(self.convs1, self.convs2):
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            xt = c1(xt)
+            xt = F.leaky_relu(xt, LRELU_SLOPE)
+            xt = c2(xt)
+            x = xt + x
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            remove_weight_norm(l)
+        for l in self.convs2:
+            remove_weight_norm(l)
+
+
+class ResBlock2(torch.nn.Module):
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
+        super(ResBlock2, self).__init__()
+        self.h = h
+        self.convs = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[0],
+                        padding=get_padding(kernel_size, dilation[0]),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[1],
+                        padding=get_padding(kernel_size, dilation[1]),
+                    )
+                ),
+            ]
+        )
+        self.convs.apply(init_weights)
+
+    def forward(self, x):
+        for c in self.convs:
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            xt = c(xt)
+            x = xt + x
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs:
+            remove_weight_norm(l)
+
+
+class Generator(torch.nn.Module):
+    def __init__(self, h):
+        super(Generator, self).__init__()
+        self.h = h
+        self.num_kernels = len(h.resblock_kernel_sizes)
+        self.num_upsamples = len(h.upsample_rates)
+        self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
+        resblock = ResBlock1 if h.resblock == "1" else ResBlock2
+
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        h.upsample_initial_channel // (2**i),
+                        h.upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = h.upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
+                self.resblocks.append(resblock(h, ch, k, d))
+
+        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+
+    def forward(self, x):
+        x = self.conv_pre(x)
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            x = self.ups[i](x)
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+        remove_weight_norm(self.conv_post)
+
+
+class DiscriminatorP(torch.nn.Module):
+    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+        super(DiscriminatorP, self).__init__()
+        self.period = period
+        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        self.convs = nn.ModuleList(
+            [
+                norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+                norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+                norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+                norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+                norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
+            ]
+        )
+        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+    def forward(self, x):
+        fmap = []
+
+        # 1d to 2d
+        b, c, t = x.shape
+        if t % self.period != 0:  # pad first
+            n_pad = self.period - (t % self.period)
+            x = F.pad(x, (0, n_pad), "reflect")
+            t = t + n_pad
+        x = x.view(b, c, t // self.period, self.period)
+
+        for l in self.convs:
+            x = l(x)
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+
+        return x, fmap
+
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+    def __init__(self):
+        super(MultiPeriodDiscriminator, self).__init__()
+        self.discriminators = nn.ModuleList(
+            [
+                DiscriminatorP(2),
+                DiscriminatorP(3),
+                DiscriminatorP(5),
+                DiscriminatorP(7),
+                DiscriminatorP(11),
+            ]
+        )
+
+    def forward(self, y, y_hat):
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for i, d in enumerate(self.discriminators):
+            y_d_r, fmap_r = d(y)
+            y_d_g, fmap_g = d(y_hat)
+            y_d_rs.append(y_d_r)
+            fmap_rs.append(fmap_r)
+            y_d_gs.append(y_d_g)
+            fmap_gs.append(fmap_g)
+
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class DiscriminatorS(torch.nn.Module):
+    def __init__(self, use_spectral_norm=False):
+        super(DiscriminatorS, self).__init__()
+        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        self.convs = nn.ModuleList(
+            [
+                norm_f(Conv1d(1, 128, 15, 1, padding=7)),
+                norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
+                norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
+                norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
+                norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
+                norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
+                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+            ]
+        )
+        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+    def forward(self, x):
+        fmap = []
+        for l in self.convs:
+            x = l(x)
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+
+        return x, fmap
+
+
+class MultiScaleDiscriminator(torch.nn.Module):
+    def __init__(self):
+        super(MultiScaleDiscriminator, self).__init__()
+        self.discriminators = nn.ModuleList(
+            [
+                DiscriminatorS(use_spectral_norm=True),
+                DiscriminatorS(),
+                DiscriminatorS(),
+            ]
+        )
+        self.meanpools = nn.ModuleList([AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)])
+
+    def forward(self, y, y_hat):
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for i, d in enumerate(self.discriminators):
+            if i != 0:
+                y = self.meanpools[i - 1](y)
+                y_hat = self.meanpools[i - 1](y_hat)
+            y_d_r, fmap_r = d(y)
+            y_d_g, fmap_g = d(y_hat)
+            y_d_rs.append(y_d_r)
+            fmap_rs.append(fmap_r)
+            y_d_gs.append(y_d_g)
+            fmap_gs.append(fmap_g)
+
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+def feature_loss(fmap_r, fmap_g):
+    loss = 0
+    for dr, dg in zip(fmap_r, fmap_g):
+        for rl, gl in zip(dr, dg):
+            loss += torch.mean(torch.abs(rl - gl))
+
+    return loss * 2
+
+
+def discriminator_loss(disc_real_outputs, disc_generated_outputs):
+    loss = 0
+    r_losses = []
+    g_losses = []
+    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
+        r_loss = torch.mean((1 - dr) ** 2)
+        g_loss = torch.mean(dg**2)
+        loss += r_loss + g_loss
+        r_losses.append(r_loss.item())
+        g_losses.append(g_loss.item())
+
+    return loss, r_losses, g_losses
+
+
+def generator_loss(disc_outputs):
+    loss = 0
+    gen_losses = []
+    for dg in disc_outputs:
+        l = torch.mean((1 - dg) ** 2)
+        gen_losses.append(l)
+        loss += l
+
+    return loss, gen_losses
synthesis/vocoders/vocoder.py
ADDED
@@ -0,0 +1,27 @@
+from abc import ABC, abstractmethod
+
+
+MAX_WAV_VALUE = 32768.0
+
+
+class Vocoder(ABC):
+    """
+    Produces audio data for tacotron2 mel spectrogram output
+    """
+
+    @abstractmethod
+    def generate_audio(self, mel_output):
+        """
+        Produces wav audio data for a given mel output.
+
+        Parameters
+        ----------
+        mel_output : Tensor
+            Mel spectrogram output
+
+        Returns
+        -------
+        np.array
+            Generated audio data
+        """
+        pass
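As an illustration (not in this commit), a new vocoder plugs in by subclassing the Vocoder base class above and implementing generate_audio(). The class below is purely hypothetical, and the hop length is an assumption; it only demonstrates the expected int16 numpy output contract that Hifigan also follows.

import numpy as np

from synthesis.vocoders.vocoder import Vocoder, MAX_WAV_VALUE


class DummyVocoder(Vocoder):
    """Hypothetical example subclass: returns silence of a plausible length
    instead of running a neural vocoder, just to show the expected contract."""

    def generate_audio(self, mel_output):
        # mel_output is expected to be a [1, n_mels, n_frames] tensor;
        # a real vocoder (e.g. Hifigan above) turns it into int16 PCM samples.
        n_frames = mel_output.size(-1)
        hop_length = 256  # assumption: samples-per-frame of the mel extraction
        samples = np.zeros(n_frames * hop_length)
        return (samples * MAX_WAV_VALUE).astype("int16")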
synthesize.py
ADDED
@@ -0,0 +1,233 @@
+import argparse
+import os
+import matplotlib.pyplot as plt
+import torch
+import numpy as np
+import matplotlib
+from scipy.io.wavfile import write
+from os.path import dirname, abspath
+import sys
+
+import nltk
+
+nltk.download("punkt")
+
+sys.path.append(dirname(dirname(abspath(__file__))))
+matplotlib.use("Agg")
+
+from training.tacotron2_model import Tacotron2
+from training.clean_text import clean_text
+from training import DEFAULT_ALPHABET
+from synthesis.vocoders import Hifigan
+
+
+def load_model(model_path):
+    """
+    Loads the Tacotron2 model.
+    Uses GPU if available, otherwise uses CPU.
+
+    Parameters
+    ----------
+    model_path : str
+        Path to tacotron2 model
+
+    Returns
+    -------
+    Tacotron2
+        Loaded tacotron2 model
+    """
+    if torch.cuda.is_available():
+        model = Tacotron2().cuda()
+        model.load_state_dict(torch.load(model_path)["state_dict"])
+        _ = model.cuda().eval().half()
+    else:
+        model = Tacotron2()
+        model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu"))["state_dict"])
+    return model
+
+
+def generate_graph(alignments, filepath, heading=""):
+    """
+    Generates synthesis alignment graph image.
+
+    Parameters
+    ----------
+    alignments : list
+        Numpy alignment data
+    filepath : str
+        Path to save image to
+    heading : str (optional)
+        Graph heading
+    """
+    data = alignments.float().data.cpu().numpy()[0].T
+    plt.imshow(data, aspect="auto", origin="lower", interpolation="none")
+    if heading:
+        plt.title(heading)
+    plt.savefig(filepath)
+
+
+def text_to_sequence(text, symbols):
+    """
+    Generates text sequence for audio file
+
+    Parameters
+    ----------
+    text : str
+        Text to synthesize
+    symbols : list
+        List of valid symbols
+    """
+    symbol_to_id = {s: i for i, s in enumerate(symbols)}
+    sequence = np.array([[symbol_to_id[s] for s in text if s in symbol_to_id]])
+    if torch.cuda.is_available():
+        return torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
+    else:
+        return torch.autograd.Variable(torch.from_numpy(sequence)).cpu().long()
+
+
+def join_alignment_graphs(alignments):
+    """
+    Joins multiple alignment graphs.
+
+    Parameters
+    ----------
+    alignments : list
+        List of alignment Tensors
+
+    Returns
+    -------
+    Tensor
+        Combined alignment tensor
+    """
+    alignment_sizes = [a.size() for a in alignments]
+    joined = torch.zeros((1, sum([a[1] for a in alignment_sizes]), sum([a[2] for a in alignment_sizes])))
+    current_x = 0
+    current_y = 0
+    for alignment in alignments:
+        joined[:, current_x : current_x + alignment.size()[1], current_y : current_y + alignment.size()[2]] = alignment
+        current_x += alignment.size()[1]
+        current_y += alignment.size()[2]
+    return joined
+
+
+def synthesize(
+    model,
+    text,
+    symbols=DEFAULT_ALPHABET,
+    graph_path=None,
+    audio_path=None,
+    vocoder=None,
+    silence_padding=0.15,
+    sample_rate=22050,
+    max_decoder_steps=1000,
+    split_text=False,
+):
+    """
+    Synthesise text for a given model.
+    Produces graph and/or audio file when given.
+    Supports multi-line synthesis (separated by \n).
+
+    Parameters
+    ----------
+    model : Tacotron2
+        Tacotron2 model
+    text : str/list
+        Text to synthesize (or list of lines to synthesize)
+    symbols : list
+        List of symbols (default is English)
+    graph_path : str (optional)
+        Path to save alignment graph to
+    audio_path : str (optional)
+        Path to save audio file to
+    vocoder : Object (optional)
+        Vocoder model (required if generating audio)
+    silence_padding : float (optional)
+        Seconds of silence to separate each clip by with multi-line synthesis (default is 0.15)
+    sample_rate : int (optional)
+        Audio sample rate (default is 22050)
+    max_decoder_steps : int (optional)
+        Max decoder steps controls sequence length and memory usage during inference.
+        Increasing this will use more memory but may allow for longer sentences. (default is 1000)
+    split_text : bool (optional)
+        Whether to use the split text tool to convert a block of text into multiple shorter sentences
+        to synthesize (default is False)
+
+    Raises
+    -------
+    AssertionError
+        If audio_path is given without a vocoder
+    """
+    if audio_path:
+        assert vocoder, "Missing vocoder"
+
+    if not isinstance(text, list) and split_text:
+        # Split text into multiple lines
+        text = nltk.tokenize.sent_tokenize(text)
+
+    if isinstance(text, list):
+        # Multi-lines given
+        text = [line.strip() for line in text if line.strip()]
+        mels = []
+        alignments = []
+        for line in text:
+            text = clean_text(line, symbols)
+            sequence = text_to_sequence(text, symbols)
+            _, mel_outputs_postnet, _, alignment = model.inference(sequence, max_decoder_steps)
+            mels.append(mel_outputs_postnet)
+            alignments.append(alignment)
+
+        if graph_path:
+            generate_graph(join_alignment_graphs(alignments), graph_path)
+
+        if audio_path:
+            silence = np.zeros(int(silence_padding * sample_rate)).astype("int16")
+            audio_segments = []
+            for i in range(len(mels)):
+                audio_segments.append(vocoder.generate_audio(mels[i]))
+                if i != len(mels) - 1:
+                    audio_segments.append(silence)
+
+            audio = np.concatenate(audio_segments)
+            write(audio_path, sample_rate, audio)
+    else:
+        # Single sentence
+        text = clean_text(text.strip(), symbols)
+        sequence = text_to_sequence(text, symbols)
+        _, mel_outputs_postnet, _, alignment = model.inference(sequence, max_decoder_steps)
+
+        if graph_path:
+            generate_graph(alignment, graph_path)
+
+        if audio_path:
+            audio = vocoder.generate_audio(mel_outputs_postnet)
+            write(audio_path, sample_rate, audio)
+
+
+if __name__ == "__main__":
+    """Synthesize audio using model and vocoder"""
+    parser = argparse.ArgumentParser(description="Synthesize audio using model and vocoder")
+    parser.add_argument("-m", "--model_path", type=str, help="tacotron2 model path", required=True)
+    parser.add_argument("-vm", "--vocoder_model_path", type=str, help="vocoder model path", required=True)
+    parser.add_argument("-hc", "--hifigan_config_path", type=str, help="hifigan_config path", required=True)
+    parser.add_argument("-t", "--text", type=str, help="text to synthesize", required=True)
+    parser.add_argument("-g", "--graph_output_path", type=str, help="path to save alignment graph to", required=False)
+    parser.add_argument("-a", "--audio_output_path", type=str, help="path to save output audio to", required=False)
+    parser.add_argument("--silence_padding", type=float, help="Padding between sentences in seconds", default=0.15)
+    parser.add_argument("--sample_rate", type=int, help="Audio sample rate", default=22050)
+    args = parser.parse_args()
+
+    assert os.path.isfile(args.model_path), "Model not found"
+    assert os.path.isfile(args.vocoder_model_path), "vocoder model not found"
+
+    model = load_model(args.model_path)
+    vocoder = Hifigan(args.vocoder_model_path, args.hifigan_config_path)
+
+    synthesize(
+        model=model,
+        text=args.text,
+        graph_path=args.graph_output_path,
+        audio_path=args.audio_output_path,
+        vocoder=vocoder,
+        silence_padding=args.silence_padding,
+        sample_rate=args.sample_rate,
+    )
training/__init__.py
ADDED
@@ -0,0 +1,6 @@
+SEED = 1234
+PUNCTUATION = list("_-!'(),.:;?")
+BASE_SYMBOLS = PUNCTUATION + [" "]
+DEFAULT_ALPHABET = list("_-!'(),.:;? ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")
+TRAIN_FILE = "trainlist.txt"
+VALIDATION_FILE = "vallist.txt"
training/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (387 Bytes)
training/__pycache__/clean_text.cpython-38.pyc
ADDED
Binary file (3.18 kB)
training/clean_text.py
ADDED
@@ -0,0 +1,113 @@
+import argparse
+import re
+
+import inflect
+from training import DEFAULT_ALPHABET
+
+INFLECT_ENGINE = inflect.engine()
+COMMA_NUMBER_RE = re.compile(r"([0-9][0-9\,]+[0-9])")
+DECIMAL_NUMBER_RE = re.compile(r"([0-9]+\.[0-9]+)")
+NUMBER_RE = re.compile(r"[0-9]+")
+ORDINALS = re.compile(r"([0-9]+[st|nd|rd|th]+)")
+CURRENCY = re.compile(r"([£|$|€]+[0-9]+)")
+WHITESPACE_RE = re.compile(r"\s+")
+ALLOWED_CHARACTERS_RE = re.compile("[^a-z ,.!?'-]+")
+MONETARY_REPLACEMENT = {"$": " dollars", "£": " pounds", "€": " euros"}
+ABBREVIATION_REPLACEMENT = {
+    "mr.": "mister",
+    "mrs.": "misess",
+    "dr.": "doctor",
+    "no.": "number",
+    "st.": "saint",
+    "co.": "company",
+    "jr.": "junior",
+    "maj.": "major",
+    "gen.": "general",
+    "drs.": "doctors",
+    "rev.": "reverend",
+    "lt.": "lieutenant",
+    "hon.": "honorable",
+    "sgt.": "sergeant",
+    "capt.": "captain",
+    "esq.": "esquire",
+    "ltd.": "limited",
+    "col.": "colonel",
+    "ft.": "fort",
+}
+
+
+def clean_text(text, symbols=DEFAULT_ALPHABET, remove_invalid_characters=True):
+    """
+    Cleans text. This includes:
+    - Replacing monetary terms (i.e. $ -> dollars)
+    - Converting ordinals to full words (i.e. 1st -> first)
+    - Converting numbers to their full word format (i.e. 100 -> one hundred)
+    - Replacing abbreviations (i.e. dr. -> doctor)
+    - Removing invalid characters (non utf-8 or invalid punctuation)
+
+    Parameters
+    ----------
+    text : str
+        Text to clean
+    symbols : list (optional)
+        List of valid symbols in text (default is English alphabet & punctuation)
+    remove_invalid_characters : bool (optional)
+        Whether to remove characters not in symbols list (default is True)
+
+    Returns
+    -------
+    str
+        Cleaned text
+    """
+    text = text.strip()
+    text = text.lower()
+    # Convert currency to words
+    money = re.findall(CURRENCY, text)
+    for amount in money:
+        for key, value in MONETARY_REPLACEMENT.items():
+            if key in amount:
+                text = text.replace(amount, amount[1:] + value)
+    # Convert ordinals to words
+    ordinals = re.findall(ORDINALS, text)
+    for ordinal in ordinals:
+        text = text.replace(ordinal, INFLECT_ENGINE.number_to_words(ordinal))
+    # Convert comma & decimal numbers to words
+    numbers = re.findall(COMMA_NUMBER_RE, text) + re.findall(DECIMAL_NUMBER_RE, text)
+    for number in numbers:
+        text = text.replace(number, INFLECT_ENGINE.number_to_words(number))
+    # Convert standard numbers to words
+    numbers = re.findall(NUMBER_RE, text)
+    for number in numbers:
+        text = text.replace(number, INFLECT_ENGINE.number_to_words(number))
+    # Replace abbreviations
+    for key, value in ABBREVIATION_REPLACEMENT.items():
+        text = text.replace(" " + key + " ", " " + value + " ")
+    # Collapse whitespace
+    text = re.sub(WHITESPACE_RE, " ", text)
+    # Remove banned characters
+    if remove_invalid_characters:
+        text = "".join([c for c in text if c in symbols])
+    return text
+
+
+if __name__ == "__main__":
+    """Script to clean text for training"""
+    parser = argparse.ArgumentParser(description="Clean & improve text for training")
+    parser.add_argument("-f", "--file", help="Text file path", type=str, required=True)
+    parser.add_argument("-o", "--output", help="Output text file path", type=str, required=True)
+    args = parser.parse_args()
+
+    with open(args.file) as f:
+        rows = f.readlines()
+
+    cleaned_text = []
+
+    for row in rows:
+        filename, text = row.split("|")
+        text = clean_text(text)
+        cleaned_text.append(f"{filename}|{text}")
+
+    with open(args.output, "w") as f:
+        for line in cleaned_text:
+            f.write(line)
+            f.write("\n")
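A quick usage sketch (not part of the commit) of the clean_text() helper above. The input sentence is made up, and the exact wording of the number expansion comes from the inflect library, so the output shown is only approximate.

from training.clean_text import clean_text

# Hypothetical input; currency, ordinals, numbers and mid-sentence abbreviations are expanded.
print(clean_text("I saw Dr. Smith on the 2nd of May, and paid him $100."))
# roughly: "i saw doctor smith on the second of may, and paid him one hundred dollars."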
training/tacotron2_model/__init__.py
ADDED
@@ -0,0 +1,4 @@
+from training.tacotron2_model.model import Tacotron2  # noqa
+from training.tacotron2_model.loss import Tacotron2Loss  # noqa
+from training.tacotron2_model.collate import TextMelCollate  # noqa
+from training.tacotron2_model.stft import TacotronSTFT  # noqa
training/tacotron2_model/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (425 Bytes)
training/tacotron2_model/__pycache__/audio_processing.cpython-38.pyc
ADDED
Binary file (4.33 kB)
training/tacotron2_model/__pycache__/collate.cpython-38.pyc
ADDED
Binary file (3.37 kB)
training/tacotron2_model/__pycache__/layers.cpython-38.pyc
ADDED
Binary file (5.01 kB)
training/tacotron2_model/__pycache__/loss.cpython-38.pyc
ADDED
Binary file (2.5 kB)
training/tacotron2_model/__pycache__/model.cpython-38.pyc
ADDED
Binary file (17 kB)
training/tacotron2_model/__pycache__/stft.cpython-38.pyc
ADDED
Binary file (6.47 kB)
training/tacotron2_model/__pycache__/utils.cpython-38.pyc
ADDED
Binary file (3.78 kB)
training/tacotron2_model/audio_processing.py
ADDED
@@ -0,0 +1,123 @@
+"""
+BSD 3-Clause License
+
+Copyright (c) 2018, NVIDIA Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+import torch
+import numpy as np
+from scipy.signal import get_window
+import librosa.util as librosa_util
+
+
+def window_sumsquare(window, n_frames, hop_length=200, win_length=800, n_fft=800, dtype=np.float32, norm=None):
+    """
+    # from librosa 0.6
+    Compute the sum-square envelope of a window function at a given hop length.
+
+    This is used to estimate modulation effects induced by windowing
+    observations in short-time fourier transforms.
+
+    Parameters
+    ----------
+    window : string, tuple, number, callable, or list-like
+        Window specification, as in `get_window`
+
+    n_frames : int > 0
+        The number of analysis frames
+
+    hop_length : int > 0
+        The number of samples to advance between frames
+
+    win_length : [optional]
+        The length of the window function. By default, this matches `n_fft`.
+
+    n_fft : int > 0
+        The length of each analysis frame.
+
+    dtype : np.dtype
+        The data type of the output
+
+    Returns
+    -------
+    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
+        The sum-squared envelope of the window function
+    """
+    if win_length is None:
+        win_length = n_fft
+
+    n = n_fft + hop_length * (n_frames - 1)
+    x = np.zeros(n, dtype=dtype)
+
+    # Compute the squared window at the desired length
+    win_sq = get_window(window, win_length, fftbins=True)
+    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
+    win_sq = librosa_util.pad_center(win_sq, n_fft)
+
+    # Fill the envelope
+    for i in range(n_frames):
+        sample = i * hop_length
+        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
+    return x
+
+
+def griffin_lim(magnitudes, stft_fn, n_iters=30):
+    """
+    PARAMS
+    ------
+    magnitudes: spectrogram magnitudes
+    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
+    """
+
+    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
+    angles = angles.astype(np.float32)
+    angles = torch.autograd.Variable(torch.from_numpy(angles))
+    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
+
+    for i in range(n_iters):
+        _, angles = stft_fn.transform(signal)
+        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
+    return signal
+
+
+def dynamic_range_compression(x, C=1, clip_val=1e-5):
+    """
+    PARAMS
+    ------
+    C: compression factor
+    """
+    return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def dynamic_range_decompression(x, C=1):
+    """
+    PARAMS
+    ------
+    C: compression factor used to compress
+    """
+    return torch.exp(x) / C
training/tacotron2_model/collate.py
ADDED
@@ -0,0 +1,78 @@
"""
BSD 3-Clause License

Copyright (c) 2018, NVIDIA Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import torch


class TextMelCollate:
    """Zero-pads model inputs and targets based on number of frames per step"""

    def __init__(self):
        self.n_frames_per_step = 1

    def __call__(self, batch):
        """Collates a training batch from normalized text and mel-spectrograms
        PARAMS
        ------
        batch: [text_normalized, mel_normalized]
        """
        # Right zero-pad all one-hot text sequences to max input length
        input_lengths, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([len(x[0]) for x in batch]), dim=0, descending=True
        )
        max_input_len = input_lengths[0]

        text_padded = torch.LongTensor(len(batch), max_input_len)
        text_padded.zero_()
        for i in range(len(ids_sorted_decreasing)):
            text = batch[ids_sorted_decreasing[i]][0]
            text_padded[i, : text.size(0)] = text

        # Right zero-pad mel-spec
        num_mels = batch[0][1].size(0)
        max_target_len = max([x[1].size(1) for x in batch])
        if max_target_len % self.n_frames_per_step != 0:
            max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step
            assert max_target_len % self.n_frames_per_step == 0

        # include mel padded and gate padded
        mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len)
        mel_padded.zero_()
        gate_padded = torch.FloatTensor(len(batch), max_target_len)
        gate_padded.zero_()
        output_lengths = torch.LongTensor(len(batch))
        for i in range(len(ids_sorted_decreasing)):
            mel = batch[ids_sorted_decreasing[i]][1]
            mel_padded[i, :, : mel.size(1)] = mel
            gate_padded[i, mel.size(1) - 1 :] = 1
            output_lengths[i] = mel.size(1)

        return text_padded, input_lengths, mel_padded, gate_padded, output_lengths
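TextMelCollate pads variable-length (text, mel) pairs into fixed-size batch tensors and derives the stop-token targets from each utterance's length. A hedged sketch of how it could be plugged into a PyTorch DataLoader; `dataset` here is a hypothetical Dataset yielding (LongTensor of symbol ids, FloatTensor mel of shape (n_mels, T)) pairs:

    from torch.utils.data import DataLoader
    from training.tacotron2_model.collate import TextMelCollate

    # dataset is assumed, not part of this commit
    loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=TextMelCollate())
    text_padded, input_lengths, mel_padded, gate_padded, output_lengths = next(iter(loader))
    # gate_padded is 1.0 from each utterance's last mel frame onward: the stop-token target.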
training/tacotron2_model/layers.py
ADDED
@@ -0,0 +1,128 @@
"""
BSD 3-Clause License

Copyright (c) 2018, NVIDIA Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import torch
from librosa.filters import mel as librosa_mel_fn
from training.tacotron2_model.audio_processing import dynamic_range_compression
from training.tacotron2_model.audio_processing import dynamic_range_decompression
from training.tacotron2_model.stft import STFT


class LinearNorm(torch.nn.Module):
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain="linear"):
        super(LinearNorm, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)


class ConvNorm(torch.nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=1,
        stride=1,
        padding=None,
        dilation=1,
        bias=True,
        w_init_gain="linear",
    ):
        super(ConvNorm, self).__init__()
        if padding is None:
            assert kernel_size % 2 == 1
            padding = int(dilation * (kernel_size - 1) / 2)

        self.conv = torch.nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )

        torch.nn.init.xavier_uniform_(self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, signal):
        conv_signal = self.conv(signal)
        return conv_signal


class TacotronSTFT(torch.nn.Module):
    def __init__(
        self,
        filter_length=1024,
        hop_length=256,
        win_length=1024,
        n_mel_channels=80,
        sampling_rate=22050,
        mel_fmin=0.0,
        mel_fmax=8000.0,
    ):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        mel_basis = librosa_mel_fn(sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer("mel_basis", mel_basis)

    def spectral_normalize(self, magnitudes):
        output = dynamic_range_compression(magnitudes)
        return output

    def spectral_de_normalize(self, magnitudes):
        output = dynamic_range_decompression(magnitudes)
        return output

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        assert torch.min(y.data) >= -1
        assert torch.max(y.data) <= 1

        magnitudes, phases = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        mel_output = self.spectral_normalize(mel_output)
        return mel_output
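TacotronSTFT combines the STFT module with a librosa mel filterbank and the dynamic-range compression above, turning a batch of [-1, 1] waveforms into log-mel frames. A small usage sketch under the same defaults (22050 Hz, 80 mels, hop length 256), with noise standing in for real audio:

    import torch
    from training.tacotron2_model.layers import TacotronSTFT

    stft = TacotronSTFT()                  # defaults match the settings used throughout this repo
    wav = torch.rand(1, 22050) * 2 - 1     # one second of fake audio in [-1, 1]
    mel = stft.mel_spectrogram(wav)
    print(mel.shape)                       # (1, 80, T), roughly 87 frames at hop_length=256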
training/tacotron2_model/loss.py
ADDED
@@ -0,0 +1,49 @@
"""
BSD 3-Clause License

Copyright (c) 2018, NVIDIA Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from torch import nn


class Tacotron2Loss(nn.Module):
    def __init__(self):
        super(Tacotron2Loss, self).__init__()

    def forward(self, model_output, targets):
        mel_target, gate_target = targets[0], targets[1]
        mel_target.requires_grad = False
        gate_target.requires_grad = False
        gate_target = gate_target.view(-1, 1)

        mel_out, mel_out_postnet, gate_out, _ = model_output
        gate_out = gate_out.view(-1, 1)
        mel_loss = nn.MSELoss()(mel_out, mel_target) + nn.MSELoss()(mel_out_postnet, mel_target)
        gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target)
        return mel_loss + gate_loss
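Tacotron2Loss sums an MSE term over the pre-net and post-net mel predictions with a binary cross-entropy term on the raw stop-token logits (the sigmoid is applied inside BCEWithLogitsLoss). A shape-level sketch with random tensors, just to show the expected inputs:

    import torch
    from training.tacotron2_model.loss import Tacotron2Loss

    B, n_mels, T = 2, 80, 100
    mel_out = torch.rand(B, n_mels, T)
    mel_out_postnet = torch.rand(B, n_mels, T)
    gate_out = torch.rand(B, T)                      # raw logits, one per decoder frame
    alignments = torch.rand(B, T, 50)                # unused by the loss

    mel_target = torch.rand(B, n_mels, T)
    gate_target = (torch.rand(B, T) > 0.9).float()   # 1.0 marks stop frames

    criterion = Tacotron2Loss()
    loss = criterion((mel_out, mel_out_postnet, gate_out, alignments), (mel_target, gate_target))
    print(loss.item())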
training/tacotron2_model/model.py
ADDED
@@ -0,0 +1,609 @@
"""
BSD 3-Clause License

Copyright (c) 2018, NVIDIA Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from math import sqrt
import torch
from torch.autograd import Variable
from torch import nn
from torch.nn import functional as F
from training.tacotron2_model.layers import ConvNorm, LinearNorm
from training.tacotron2_model.utils import to_gpu, get_mask_from_lengths, get_x


class LocationLayer(nn.Module):
    def __init__(self, attention_n_filters, attention_kernel_size, attention_dim):
        super(LocationLayer, self).__init__()
        padding = int((attention_kernel_size - 1) / 2)
        self.location_conv = ConvNorm(
            2, attention_n_filters, kernel_size=attention_kernel_size, padding=padding, bias=False, stride=1, dilation=1
        )
        self.location_dense = LinearNorm(attention_n_filters, attention_dim, bias=False, w_init_gain="tanh")

    def forward(self, attention_weights_cat):
        processed_attention = self.location_conv(attention_weights_cat)
        processed_attention = processed_attention.transpose(1, 2)
        processed_attention = self.location_dense(processed_attention)
        return processed_attention


class Attention(nn.Module):
    def __init__(
        self,
        attention_rnn_dim,
        embedding_dim,
        attention_dim,
        attention_location_n_filters,
        attention_location_kernel_size,
    ):
        super(Attention, self).__init__()
        self.query_layer = LinearNorm(attention_rnn_dim, attention_dim, bias=False, w_init_gain="tanh")
        self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False, w_init_gain="tanh")
        self.v = LinearNorm(attention_dim, 1, bias=False)
        self.location_layer = LocationLayer(attention_location_n_filters, attention_location_kernel_size, attention_dim)
        self.score_mask_value = -float("inf")

    def get_alignment_energies(self, query, processed_memory, attention_weights_cat):
        """
        PARAMS
        ------
        query: decoder output (batch, n_mel_channels * n_frames_per_step)
        processed_memory: processed encoder outputs (B, T_in, attention_dim)
        attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)

        RETURNS
        -------
        alignment (batch, max_time)
        """

        processed_query = self.query_layer(query.unsqueeze(1))
        processed_attention_weights = self.location_layer(attention_weights_cat)
        energies = self.v(torch.tanh(processed_query + processed_attention_weights + processed_memory))

        energies = energies.squeeze(-1)
        return energies

    def forward(self, attention_hidden_state, memory, processed_memory, attention_weights_cat, mask):
        """
        PARAMS
        ------
        attention_hidden_state: attention rnn last output
        memory: encoder outputs
        processed_memory: processed encoder outputs
        attention_weights_cat: previous and cumulative attention weights
        mask: binary mask for padded data
        """
        alignment = self.get_alignment_energies(attention_hidden_state, processed_memory, attention_weights_cat)

        if mask is not None:
            alignment.data.masked_fill_(mask, self.score_mask_value)

        attention_weights = F.softmax(alignment, dim=1)
        attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
        attention_context = attention_context.squeeze(1)

        return attention_context, attention_weights


class Prenet(nn.Module):
    def __init__(self, in_dim, sizes):
        super(Prenet, self).__init__()
        in_sizes = [in_dim] + sizes[:-1]
        self.layers = nn.ModuleList(
            [LinearNorm(in_size, out_size, bias=False) for (in_size, out_size) in zip(in_sizes, sizes)]
        )

    def forward(self, x):
        for linear in self.layers:
            x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
        return x


class Postnet(nn.Module):
    """Postnet
    - Five 1-d convolution with 512 channels and kernel size 5
    """

    def __init__(self, n_mel_channels, postnet_embedding_dim, postnet_kernel_size, postnet_n_convolutions):
        super(Postnet, self).__init__()
        self.convolutions = nn.ModuleList()

        self.convolutions.append(
            nn.Sequential(
                ConvNorm(
                    n_mel_channels,
                    postnet_embedding_dim,
                    kernel_size=postnet_kernel_size,
                    stride=1,
                    padding=int((postnet_kernel_size - 1) / 2),
                    dilation=1,
                    w_init_gain="tanh",
                ),
                nn.BatchNorm1d(postnet_embedding_dim),
            )
        )

        for i in range(1, postnet_n_convolutions - 1):
            self.convolutions.append(
                nn.Sequential(
                    ConvNorm(
                        postnet_embedding_dim,
                        postnet_embedding_dim,
                        kernel_size=postnet_kernel_size,
                        stride=1,
                        padding=int((postnet_kernel_size - 1) / 2),
                        dilation=1,
                        w_init_gain="tanh",
                    ),
                    nn.BatchNorm1d(postnet_embedding_dim),
                )
            )

        self.convolutions.append(
            nn.Sequential(
                ConvNorm(
                    postnet_embedding_dim,
                    n_mel_channels,
                    kernel_size=postnet_kernel_size,
                    stride=1,
                    padding=int((postnet_kernel_size - 1) / 2),
                    dilation=1,
                    w_init_gain="linear",
                ),
                nn.BatchNorm1d(n_mel_channels),
            )
        )

    def forward(self, x):
        for i in range(len(self.convolutions) - 1):
            x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training)
        x = F.dropout(self.convolutions[-1](x), 0.5, self.training)

        return x


class Encoder(nn.Module):
    """Encoder module:
    - Three 1-d convolution banks
    - Bidirectional LSTM
    """

    def __init__(self, encoder_kernel_size, encoder_n_convolutions, encoder_embedding_dim):
        super(Encoder, self).__init__()

        convolutions = []
        for _ in range(encoder_n_convolutions):
            conv_layer = nn.Sequential(
                ConvNorm(
                    encoder_embedding_dim,
                    encoder_embedding_dim,
                    kernel_size=encoder_kernel_size,
                    stride=1,
                    padding=int((encoder_kernel_size - 1) / 2),
                    dilation=1,
                    w_init_gain="relu",
                ),
                nn.BatchNorm1d(encoder_embedding_dim),
            )
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)

        self.lstm = nn.LSTM(
            encoder_embedding_dim, int(encoder_embedding_dim / 2), 1, batch_first=True, bidirectional=True
        )

    def forward(self, x, input_lengths):
        for conv in self.convolutions:
            x = F.dropout(F.relu(conv(x)), 0.5, self.training)

        x = x.transpose(1, 2)

        # pytorch tensor are not reversible, hence the conversion
        input_lengths = input_lengths.cpu().numpy()
        x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True)

        self.lstm.flatten_parameters()
        outputs, _ = self.lstm(x)

        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)

        return outputs

    def inference(self, x):
        for conv in self.convolutions:
            x = F.dropout(F.relu(conv(x)), 0.5, self.training)

        x = x.transpose(1, 2)

        self.lstm.flatten_parameters()
        outputs, _ = self.lstm(x)

        return outputs


class Decoder(nn.Module):
    def __init__(
        self,
        n_mel_channels,
        n_frames_per_step,
        encoder_embedding_dim,
        attention_dim,
        attention_rnn_dim,
        attention_location_n_filters,
        attention_location_kernel_size,
        decoder_rnn_dim,
        prenet_dim,
        max_decoder_steps,
        gate_threshold,
        p_attention_dropout,
        p_decoder_dropout,
    ):
        super(Decoder, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.n_frames_per_step = n_frames_per_step
        self.encoder_embedding_dim = encoder_embedding_dim
        self.attention_rnn_dim = attention_rnn_dim
        self.decoder_rnn_dim = decoder_rnn_dim
        self.prenet_dim = prenet_dim
        self.max_decoder_steps = max_decoder_steps
        self.gate_threshold = gate_threshold
        self.p_attention_dropout = p_attention_dropout
        self.p_decoder_dropout = p_decoder_dropout

        self.prenet = Prenet(n_mel_channels * n_frames_per_step, [prenet_dim, prenet_dim])

        self.attention_rnn = nn.LSTMCell(prenet_dim + encoder_embedding_dim, attention_rnn_dim)

        self.attention_layer = Attention(
            attention_rnn_dim,
            encoder_embedding_dim,
            attention_dim,
            attention_location_n_filters,
            attention_location_kernel_size,
        )

        self.decoder_rnn = nn.LSTMCell(attention_rnn_dim + encoder_embedding_dim, decoder_rnn_dim, 1)

        self.linear_projection = LinearNorm(decoder_rnn_dim + encoder_embedding_dim, n_mel_channels * n_frames_per_step)

        self.gate_layer = LinearNorm(decoder_rnn_dim + encoder_embedding_dim, 1, bias=True, w_init_gain="sigmoid")

    def get_go_frame(self, memory):
        """Gets all zeros frames to use as first decoder input
        PARAMS
        ------
        memory: decoder outputs

        RETURNS
        -------
        decoder_input: all zeros frames
        """
        B = memory.size(0)
        decoder_input = Variable(memory.data.new(B, self.n_mel_channels * self.n_frames_per_step).zero_())
        return decoder_input

    def initialize_decoder_states(self, memory, mask):
        """Initializes attention rnn states, decoder rnn states, attention
        weights, attention cumulative weights, attention context, stores memory
        and stores processed memory
        PARAMS
        ------
        memory: Encoder outputs
        mask: Mask for padded data if training, expects None for inference
        """
        B = memory.size(0)
        MAX_TIME = memory.size(1)

        self.attention_hidden = Variable(memory.data.new(B, self.attention_rnn_dim).zero_())
        self.attention_cell = Variable(memory.data.new(B, self.attention_rnn_dim).zero_())

        self.decoder_hidden = Variable(memory.data.new(B, self.decoder_rnn_dim).zero_())
        self.decoder_cell = Variable(memory.data.new(B, self.decoder_rnn_dim).zero_())

        self.attention_weights = Variable(memory.data.new(B, MAX_TIME).zero_())
        self.attention_weights_cum = Variable(memory.data.new(B, MAX_TIME).zero_())
        self.attention_context = Variable(memory.data.new(B, self.encoder_embedding_dim).zero_())

        self.memory = memory
        self.processed_memory = self.attention_layer.memory_layer(memory)
        self.mask = mask

    def parse_decoder_inputs(self, decoder_inputs):
        """Prepares decoder inputs, i.e. mel outputs
        PARAMS
        ------
        decoder_inputs: inputs used for teacher-forced training, i.e. mel-specs

        RETURNS
        -------
        inputs: processed decoder inputs

        """
        # (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels)
        decoder_inputs = decoder_inputs.transpose(1, 2)
        decoder_inputs = decoder_inputs.view(
            decoder_inputs.size(0), int(decoder_inputs.size(1) / self.n_frames_per_step), -1
        )
        # (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels)
        decoder_inputs = decoder_inputs.transpose(0, 1)
        return decoder_inputs

    def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments):
        """Prepares decoder outputs for output
        PARAMS
        ------
        mel_outputs:
        gate_outputs: gate output energies
        alignments:

        RETURNS
        -------
        mel_outputs:
        gate_outputs: gate output energies
        alignments:
        """
        # (T_out, B) -> (B, T_out)
        alignments = torch.stack(alignments).transpose(0, 1)
        # (T_out, B) -> (B, T_out)
        gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
        gate_outputs = gate_outputs.contiguous()
        # (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels)
        mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous()
        # decouple frames per step
        mel_outputs = mel_outputs.view(mel_outputs.size(0), -1, self.n_mel_channels)
        # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out)
        mel_outputs = mel_outputs.transpose(1, 2)

        return mel_outputs, gate_outputs, alignments

    def decode(self, decoder_input):
        """Decoder step using stored states, attention and memory
        PARAMS
        ------
        decoder_input: previous mel output

        RETURNS
        -------
        mel_output:
        gate_output: gate output energies
        attention_weights:
        """
        cell_input = torch.cat((decoder_input, self.attention_context), -1)
        self.attention_hidden, self.attention_cell = self.attention_rnn(
            cell_input, (self.attention_hidden, self.attention_cell)
        )
        self.attention_hidden = F.dropout(self.attention_hidden, self.p_attention_dropout, self.training)

        attention_weights_cat = torch.cat(
            (self.attention_weights.unsqueeze(1), self.attention_weights_cum.unsqueeze(1)), dim=1
        )
        self.attention_context, self.attention_weights = self.attention_layer(
            self.attention_hidden, self.memory, self.processed_memory, attention_weights_cat, self.mask
        )

        self.attention_weights_cum += self.attention_weights
        decoder_input = torch.cat((self.attention_hidden, self.attention_context), -1)
        self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
            decoder_input, (self.decoder_hidden, self.decoder_cell)
        )
        self.decoder_hidden = F.dropout(self.decoder_hidden, self.p_decoder_dropout, self.training)

        decoder_hidden_attention_context = torch.cat((self.decoder_hidden, self.attention_context), dim=1)
        decoder_output = self.linear_projection(decoder_hidden_attention_context)

        gate_prediction = self.gate_layer(decoder_hidden_attention_context)
        return decoder_output, gate_prediction, self.attention_weights

    def forward(self, memory, decoder_inputs, memory_lengths, device):
        """Decoder forward pass for training
        PARAMS
        ------
        memory: Encoder outputs
        decoder_inputs: Decoder inputs for teacher forcing. i.e. mel-specs
        memory_lengths: Encoder output lengths for attention masking.

        RETURNS
        -------
        mel_outputs: mel outputs from the decoder
        gate_outputs: gate outputs from the decoder
        alignments: sequence of attention weights from the decoder
        """

        decoder_input = self.get_go_frame(memory).unsqueeze(0)
        decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
        decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
        decoder_inputs = self.prenet(decoder_inputs)

        self.initialize_decoder_states(memory, mask=~get_mask_from_lengths(memory_lengths, device))

        mel_outputs, gate_outputs, alignments = [], [], []
        while len(mel_outputs) < decoder_inputs.size(0) - 1:
            decoder_input = decoder_inputs[len(mel_outputs)]
            mel_output, gate_output, attention_weights = self.decode(decoder_input)
            mel_outputs += [mel_output.squeeze(1)]
            gate_outputs += [gate_output.squeeze(1)]
            alignments += [attention_weights]

        mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(mel_outputs, gate_outputs, alignments)

        return mel_outputs, gate_outputs, alignments

    def inference(self, memory, max_decoder_steps=None):
        """Decoder inference
        PARAMS
        ------
        memory: Encoder outputs

        RETURNS
        -------
        mel_outputs: mel outputs from the decoder
        gate_outputs: gate outputs from the decoder
        alignments: sequence of attention weights from the decoder
        """
        if not max_decoder_steps:
            # Use default max decoder steps if not given
            max_decoder_steps = self.max_decoder_steps

        decoder_input = self.get_go_frame(memory)

        self.initialize_decoder_states(memory, mask=None)

        mel_outputs, gate_outputs, alignments = [], [], []
        while True:
            decoder_input = self.prenet(decoder_input)
            mel_output, gate_output, alignment = self.decode(decoder_input)

            mel_outputs += [mel_output.squeeze(1)]
            gate_outputs += [gate_output]
            alignments += [alignment]

            if torch.sigmoid(gate_output.data) > self.gate_threshold:
                break
            elif len(mel_outputs) == max_decoder_steps:
                raise Exception(
                    "Warning! Reached max decoder steps. Either the model is low quality or the given sentence is too short/long"
                )

            decoder_input = mel_output

        mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(mel_outputs, gate_outputs, alignments)

        return mel_outputs, gate_outputs, alignments


class Tacotron2(nn.Module):
    def __init__(
        self,
        mask_padding=True,
        fp16_run=False,
        n_mel_channels=80,
        n_symbols=148,
        symbols_embedding_dim=512,
        encoder_kernel_size=5,
        encoder_n_convolutions=3,
        encoder_embedding_dim=512,
        attention_rnn_dim=1024,
        attention_dim=128,
        attention_location_n_filters=32,
        attention_location_kernel_size=31,
        decoder_rnn_dim=1024,
        prenet_dim=256,
        max_decoder_steps=1000,
        gate_threshold=0.5,
        p_attention_dropout=0.1,
        p_decoder_dropout=0.1,
        postnet_embedding_dim=512,
        postnet_kernel_size=5,
        postnet_n_convolutions=5,
    ):
        super(Tacotron2, self).__init__()
        self.mask_padding = mask_padding
        self.fp16_run = fp16_run
        self.n_mel_channels = n_mel_channels
        self.n_frames_per_step = 1
        self.embedding = nn.Embedding(n_symbols, symbols_embedding_dim)
        std = sqrt(2.0 / (n_symbols + symbols_embedding_dim))
        val = sqrt(3.0) * std  # uniform bounds for std
        self.embedding.weight.data.uniform_(-val, val)
        self.encoder = Encoder(encoder_kernel_size, encoder_n_convolutions, encoder_embedding_dim)
        self.decoder = Decoder(
            n_mel_channels,
            self.n_frames_per_step,
            encoder_embedding_dim,
            attention_dim,
            attention_rnn_dim,
            attention_location_n_filters,
            attention_location_kernel_size,
            decoder_rnn_dim,
            prenet_dim,
            max_decoder_steps,
            gate_threshold,
            p_attention_dropout,
            p_decoder_dropout,
        )
        self.postnet = Postnet(n_mel_channels, postnet_embedding_dim, postnet_kernel_size, postnet_n_convolutions)

    def parse_batch(self, batch):
        text_padded, input_lengths, mel_padded, gate_padded, output_lengths = batch
        text_padded = to_gpu(text_padded).long()
        input_lengths = to_gpu(input_lengths).long()
        max_len = torch.max(input_lengths.data).item()
        mel_padded = to_gpu(mel_padded).float()
        gate_padded = to_gpu(gate_padded).float()
        output_lengths = to_gpu(output_lengths).long()

        return ((text_padded, input_lengths, mel_padded, max_len, output_lengths), (mel_padded, gate_padded))

    def parse_output(self, outputs, output_lengths, mask_size, alignment_mask_size, device):
        if self.mask_padding:
            mask = ~get_mask_from_lengths(output_lengths, device, mask_size)
            mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
            mask = mask.permute(1, 0, 2)

            outputs[0].data.masked_fill_(mask, 0.0)
            outputs[1].data.masked_fill_(mask, 0.0)
            outputs[2].data.masked_fill_(mask[:, 0, :], 1e3)  # gate energies
        if outputs[3].size(2) != alignment_mask_size:
            outputs[3] = nn.ConstantPad1d((0, alignment_mask_size - outputs[3].size(2)), 0)(outputs[3])

        return outputs

    def forward(self, inputs, mask_size, alignment_mask_size):
        text_inputs, text_lengths, mels, output_lengths = get_x(inputs)
        device = text_inputs.device

        text_lengths, output_lengths = text_lengths.data, output_lengths.data
        embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
        encoder_outputs = self.encoder(embedded_inputs, text_lengths)
        mel_outputs, gate_outputs, alignments = self.decoder(
            encoder_outputs, mels, memory_lengths=text_lengths, device=device
        )
        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        return self.parse_output(
            [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
            output_lengths,
            mask_size,
            alignment_mask_size,
            device,
        )

    def inference(self, inputs, max_decoder_steps=None):
        embedded_inputs = self.embedding(inputs).transpose(1, 2)
        encoder_outputs = self.encoder.inference(embedded_inputs)
        mel_outputs, gate_outputs, alignments = self.decoder.inference(encoder_outputs, max_decoder_steps)

        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        return [mel_outputs, mel_outputs_postnet, gate_outputs, alignments]
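For synthesis only Tacotron2.inference is needed: symbol ids are embedded, encoded once, and then decoded autoregressively until the gate prediction crosses gate_threshold or max_decoder_steps is reached. A sketch of the call signature only; the weight path and the symbol ids below are placeholders, and trained weights are required for the gate to fire before the step limit:

    import torch
    from training.tacotron2_model.model import Tacotron2

    model = Tacotron2()
    model.load_state_dict(torch.load("tacotron2_statedict.pt", map_location="cpu"))  # hypothetical trained weights
    model.eval()

    sequence = torch.LongTensor([[12, 5, 33, 7]])   # placeholder symbol ids, shape (1, T_text)
    with torch.no_grad():
        mel, mel_postnet, gate, alignments = model.inference(sequence, max_decoder_steps=1000)
    # mel_postnet has shape (1, 80, T_mel); a vocoder such as HiFi-GAN converts it to audio.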
training/tacotron2_model/stft.py
ADDED
@@ -0,0 +1,187 @@
"""
BSD 3-Clause License

Copyright (c) 2017, Prem Seetharaman
All rights reserved.

* Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import torch
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable
from scipy.signal import get_window
from librosa.util import pad_center, tiny
from librosa.filters import mel as librosa_mel_fn
from training.tacotron2_model.audio_processing import (
    window_sumsquare,
    dynamic_range_compression,
    dynamic_range_decompression,
)


class STFT(torch.nn.Module):
    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""

    def __init__(self, filter_length=800, hop_length=200, win_length=800, window="hann"):
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])])

        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        inverse_basis = torch.FloatTensor(np.linalg.pinv(scale * fourier_basis).T[:, None, :])

        if window is not None:
            assert filter_length >= win_length
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            fft_window = pad_center(fft_window, filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer("forward_basis", forward_basis.float())
        self.register_buffer("inverse_basis", inverse_basis.float())

    def transform(self, input_data):
        num_batches = input_data.size(0)
        num_samples = input_data.size(1)

        self.num_samples = num_samples

        # similar to librosa, reflect-pad the input
        input_data = input_data.view(num_batches, 1, num_samples)
        input_data = F.pad(
            input_data.unsqueeze(1), (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), mode="reflect"
        )
        input_data = input_data.squeeze(1)

        forward_transform = F.conv1d(
            input_data, Variable(self.forward_basis, requires_grad=False), stride=self.hop_length, padding=0
        )

        cutoff = int((self.filter_length / 2) + 1)
        real_part = forward_transform[:, :cutoff, :]
        imag_part = forward_transform[:, cutoff:, :]

        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))

        return magnitude, phase

    def inverse(self, magnitude, phase):
        recombine_magnitude_phase = torch.cat([magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1)

        inverse_transform = F.conv_transpose1d(
            recombine_magnitude_phase,
            Variable(self.inverse_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0,
        )

        if self.window is not None:
            window_sum = window_sumsquare(
                self.window,
                magnitude.size(-1),
                hop_length=self.hop_length,
                win_length=self.win_length,
                n_fft=self.filter_length,
                dtype=np.float32,
            )
            # remove modulation effects
            approx_nonzero_indices = torch.from_numpy(np.where(window_sum > tiny(window_sum))[0])
            window_sum = torch.autograd.Variable(torch.from_numpy(window_sum), requires_grad=False)
            window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum
            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]

            # scale by hop ratio
            inverse_transform *= float(self.filter_length) / self.hop_length

        inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
        inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :]

        return inverse_transform

    def forward(self, input_data):
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction


class TacotronSTFT(torch.nn.Module):
    def __init__(
        self,
        filter_length=1024,
        hop_length=256,
        win_length=1024,
        n_mel_channels=80,
        sampling_rate=22050,
        mel_fmin=0.0,
        mel_fmax=8000.0,
    ):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        mel_basis = librosa_mel_fn(sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer("mel_basis", mel_basis)

    def spectral_normalize(self, magnitudes):
        output = dynamic_range_compression(magnitudes)
        return output

    def spectral_de_normalize(self, magnitudes):
        output = dynamic_range_decompression(magnitudes)
        return output

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        assert torch.min(y.data) >= -1
        assert torch.max(y.data) <= 1

        magnitudes, phases = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        mel_output = self.spectral_normalize(mel_output)
        return mel_output
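The STFT module implements both analysis and synthesis as 1-d convolutions against fixed Fourier bases, so the same registered buffers serve transform and inverse. A quick round-trip sketch (CPU is fine; the CUDA branch is only taken when the input already lives on the GPU):

    import torch
    from training.tacotron2_model.stft import STFT

    stft = STFT(filter_length=1024, hop_length=256, win_length=1024)
    audio = torch.rand(1, 22050) * 2 - 1            # one second of noise in [-1, 1]
    magnitude, phase = stft.transform(audio)
    reconstruction = stft.inverse(magnitude, phase)
    print(magnitude.shape, reconstruction.shape)    # (1, 513, T) and (1, 1, ~22050)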
training/tacotron2_model/utils.py
ADDED
@@ -0,0 +1,90 @@
"""
BSD 3-Clause License

Copyright (c) 2018, NVIDIA Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import numpy as np
from scipy.io.wavfile import read
import torch


def get_mask_from_lengths(lengths, device, max_len=None):
    if not max_len:
        max_len = torch.max(lengths).item()
    ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len)).to(device)
    mask = (ids < lengths.to(device).unsqueeze(1)).bool()
    return mask


def load_wav_to_torch(full_path):
    sampling_rate, data = read(full_path)
    return torch.FloatTensor(data.astype(np.float32)), sampling_rate


def load_filepaths_and_text(filename, split="|"):
    with open(filename, encoding="utf-8") as f:
        filepaths_and_text = [line.strip().split(split) for line in f]
    return filepaths_and_text


def to_gpu(x):
    x = x.contiguous().cuda()
    return torch.autograd.Variable(x)


def get_sizes(data):
    _, input_lengths, _, _, output_lengths = data
    output_length_size = torch.max(output_lengths.data).item()
    input_length_size = torch.max(input_lengths.data).item()
    return input_length_size, output_length_size


def get_y(data):
    _, _, mel_padded, gate_padded, _ = data
    mel_padded = to_gpu(mel_padded).float()
    gate_padded = to_gpu(gate_padded).float()
    return mel_padded, gate_padded


def get_x(data):
    text_padded, input_lengths, mel_padded, _, output_lengths = data
    text_padded = to_gpu(text_padded).long()
    input_lengths = to_gpu(input_lengths).long()
    mel_padded = to_gpu(mel_padded).float()
    output_lengths = to_gpu(output_lengths).long()

    return text_padded, input_lengths, mel_padded, output_lengths


def process_batch(batch, model):
    input_length_size, output_length_size = get_sizes(batch)
    y = get_y(batch)
    y_pred = model(batch, mask_size=output_length_size, alignment_mask_size=input_length_size)

    return y, y_pred
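get_mask_from_lengths marks the valid (non-padded) positions of each sequence in a batch; note that it allocates its index tensor with torch.cuda.LongTensor, so these helpers assume a CUDA-enabled install. An equivalent CPU-only illustration of the mask it produces:

    import torch

    lengths = torch.tensor([3, 5, 2])
    max_len = int(lengths.max())
    ids = torch.arange(max_len)
    mask = ids.unsqueeze(0) < lengths.unsqueeze(1)   # True where a frame is real, False where it is padding
    print(mask)
    # tensor([[ True,  True,  True, False, False],
    #         [ True,  True,  True,  True,  True],
    #         [ True,  True, False, False, False]])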
weights/custom_pctest/config.json
ADDED
@@ -0,0 +1,37 @@
{
    "resblock": "1",
    "num_gpus": 0,
    "batch_size": 16,
    "learning_rate": 0.0002,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,

    "upsample_rates": [8,8,2,2],
    "upsample_kernel_sizes": [16,16,4,4],
    "upsample_initial_channel": 512,
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],

    "segment_size": 8192,
    "num_mels": 80,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,

    "sampling_rate": 22050,

    "fmin": 0,
    "fmax": 8000,
    "fmax_for_loss": null,

    "num_workers": 4,

    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "world_size": 1
    }
}
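This HiFi-GAN config mirrors the Tacotron 2 mel settings (80 mels, 1024-point FFT, hop 256 at 22050 Hz), and the product of upsample_rates must equal hop_size so that one mel frame expands to exactly 256 audio samples. A small consistency-check sketch over this file:

    import json

    with open("weights/custom_pctest/config.json") as f:
        cfg = json.load(f)

    upsampling = 1
    for r in cfg["upsample_rates"]:
        upsampling *= r
    assert upsampling == cfg["hop_size"]    # 8*8*2*2 == 256: one mel frame -> 256 samples
    assert cfg["sampling_rate"] == 22050    # matches the Tacotron 2 mel extraction above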
weights/custom_pctest/model.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0a81953d408ea577ffdef9e4a6ba3d17feb3197db930032ae795ac0663d38fd7
size 55823149
weights/hifiganvocoderdemo/config.json
ADDED
@@ -0,0 +1,37 @@
{
    "resblock": "1",
    "num_gpus": 0,
    "batch_size": 16,
    "learning_rate": 0.0002,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,

    "upsample_rates": [8,8,2,2],
    "upsample_kernel_sizes": [16,16,4,4],
    "upsample_initial_channel": 512,
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],

    "segment_size": 8192,
    "num_mels": 80,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,

    "sampling_rate": 22050,

    "fmin": 0,
    "fmax": 8000,
    "fmax_for_loss": null,

    "num_workers": 4,

    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "world_size": 1
    }
}
weights/hifiganvocoderdemo/model.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:771eaf4876485a35e25577563d390c262e23c2421e4a8c929eacfde34a5b7a60
size 55788858