Add files using upload-large-folder tool
- third_party/Matcha-TTS/configs/logger/csv.yaml +7 -0
- third_party/Matcha-TTS/configs/trainer/default.yaml +20 -0
- third_party/Matcha-TTS/matcha/app.py +357 -0
- third_party/Matcha-TTS/matcha/hifigan/README.md +101 -0
- third_party/Matcha-TTS/matcha/hifigan/__init__.py +0 -0
- third_party/Matcha-TTS/matcha/hifigan/config.py +28 -0
- third_party/Matcha-TTS/matcha/hifigan/denoiser.py +64 -0
- third_party/Matcha-TTS/matcha/hifigan/env.py +17 -0
- third_party/Matcha-TTS/matcha/hifigan/models.py +368 -0
- third_party/Matcha-TTS/matcha/models/__init__.py +0 -0
- third_party/Matcha-TTS/matcha/models/baselightningmodule.py +209 -0
- third_party/Matcha-TTS/matcha/models/components/__init__.py +0 -0
- third_party/Matcha-TTS/matcha/models/components/flow_matching.py +132 -0
- third_party/Matcha-TTS/matcha/models/matcha_tts.py +239 -0
- third_party/Matcha-TTS/matcha/onnx/__init__.py +0 -0
- third_party/Matcha-TTS/matcha/onnx/export.py +181 -0
- third_party/Matcha-TTS/matcha/text/cleaners.py +116 -0
- third_party/Matcha-TTS/matcha/utils/__init__.py +5 -0
- third_party/Matcha-TTS/matcha/utils/instantiators.py +56 -0
- third_party/Matcha-TTS/matcha/utils/model.py +90 -0
- third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py +7 -0
- third_party/Matcha-TTS/matcha/utils/rich_utils.py +101 -0
- third_party/Matcha-TTS/matcha/utils/utils.py +219 -0
third_party/Matcha-TTS/configs/logger/csv.yaml
ADDED
@@ -0,0 +1,7 @@
# csv logger built in lightning

csv:
  _target_: lightning.pytorch.loggers.csv_logs.CSVLogger
  save_dir: "${paths.output_dir}"
  name: "csv/"
  prefix: ""
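For context, configs like this are consumed via Hydra's `instantiate`, which builds the object named by `_target_` with the remaining keys as keyword arguments. A minimal sketch, assuming `hydra-core`, `omegaconf`, and `lightning` are installed; the literal `save_dir` stands in for the `${paths.output_dir}` interpolation resolved by the full config tree:

```python
# Sketch: how a Hydra _target_ config such as this CSV logger is instantiated.
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "_target_": "lightning.pytorch.loggers.csv_logs.CSVLogger",
        "save_dir": "outputs/run",  # stand-in for ${paths.output_dir}
        "name": "csv/",
        "prefix": "",
    }
)
logger = instantiate(cfg)  # returns a lightning CSVLogger instance
```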
third_party/Matcha-TTS/configs/trainer/default.yaml
ADDED
@@ -0,0 +1,20 @@
_target_: lightning.pytorch.trainer.Trainer

default_root_dir: ${paths.output_dir}

max_epochs: -1

accelerator: gpu
devices: [0]

# mixed precision for extra speed-up
precision: 16-mixed

# perform a validation loop every N training epochs
check_val_every_n_epoch: 1

# set True to ensure deterministic results
# makes training slower but gives more reproducibility than just setting seeds
deterministic: False

gradient_clip_val: 5.0
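The YAML above maps one-to-one onto the Trainer constructor. A sketch of the equivalent direct construction (the project itself builds this via Hydra; `outputs/run` is a stand-in for `${paths.output_dir}`):

```python
from lightning.pytorch import Trainer

trainer = Trainer(
    default_root_dir="outputs/run",   # stand-in for ${paths.output_dir}
    max_epochs=-1,                    # train until interrupted
    accelerator="gpu",
    devices=[0],
    precision="16-mixed",             # mixed precision for extra speed-up
    check_val_every_n_epoch=1,
    deterministic=False,
    gradient_clip_val=5.0,
)
```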
third_party/Matcha-TTS/matcha/app.py
ADDED
@@ -0,0 +1,357 @@
import tempfile
from argparse import Namespace
from pathlib import Path

import gradio as gr
import soundfile as sf
import torch

from matcha.cli import (
    MATCHA_URLS,
    VOCODER_URLS,
    assert_model_downloaded,
    get_device,
    load_matcha,
    load_vocoder,
    process_text,
    to_waveform,
)
from matcha.utils.utils import get_user_data_dir, plot_tensor

LOCATION = Path(get_user_data_dir())

args = Namespace(
    cpu=False,
    model="matcha_vctk",
    vocoder="hifigan_univ_v1",
    spk=0,
)

CURRENTLY_LOADED_MODEL = args.model


def MATCHA_TTS_LOC(x):
    return LOCATION / f"{x}.ckpt"


def VOCODER_LOC(x):
    return LOCATION / f"{x}"


LOGO_URL = "https://shivammehta25.github.io/Matcha-TTS/images/logo.png"
RADIO_OPTIONS = {
    "Multi Speaker (VCTK)": {
        "model": "matcha_vctk",
        "vocoder": "hifigan_univ_v1",
    },
    "Single Speaker (LJ Speech)": {
        "model": "matcha_ljspeech",
        "vocoder": "hifigan_T2_v1",
    },
}

# Ensure all the required models are downloaded
assert_model_downloaded(MATCHA_TTS_LOC("matcha_ljspeech"), MATCHA_URLS["matcha_ljspeech"])
assert_model_downloaded(VOCODER_LOC("hifigan_T2_v1"), VOCODER_URLS["hifigan_T2_v1"])
assert_model_downloaded(MATCHA_TTS_LOC("matcha_vctk"), MATCHA_URLS["matcha_vctk"])
assert_model_downloaded(VOCODER_LOC("hifigan_univ_v1"), VOCODER_URLS["hifigan_univ_v1"])

device = get_device(args)

# Load default model
model = load_matcha(args.model, MATCHA_TTS_LOC(args.model), device)
vocoder, denoiser = load_vocoder(args.vocoder, VOCODER_LOC(args.vocoder), device)


def load_model(model_name, vocoder_name):
    model = load_matcha(model_name, MATCHA_TTS_LOC(model_name), device)
    vocoder, denoiser = load_vocoder(vocoder_name, VOCODER_LOC(vocoder_name), device)
    return model, vocoder, denoiser


def load_model_ui(model_type, textbox):
    model_name, vocoder_name = RADIO_OPTIONS[model_type]["model"], RADIO_OPTIONS[model_type]["vocoder"]

    global model, vocoder, denoiser, CURRENTLY_LOADED_MODEL  # pylint: disable=global-statement
    if CURRENTLY_LOADED_MODEL != model_name:
        model, vocoder, denoiser = load_model(model_name, vocoder_name)
        CURRENTLY_LOADED_MODEL = model_name

    if model_name == "matcha_ljspeech":
        spk_slider = gr.update(visible=False, value=-1)
        single_speaker_examples = gr.update(visible=True)
        multi_speaker_examples = gr.update(visible=False)
        length_scale = gr.update(value=0.95)
    else:
        spk_slider = gr.update(visible=True, value=0)
        single_speaker_examples = gr.update(visible=False)
        multi_speaker_examples = gr.update(visible=True)
        length_scale = gr.update(value=0.85)

    return (
        textbox,
        gr.update(interactive=True),
        spk_slider,
        single_speaker_examples,
        multi_speaker_examples,
        length_scale,
    )


@torch.inference_mode()
def process_text_gradio(text):
    output = process_text(1, text, device)
    return output["x_phones"][1::2], output["x"], output["x_lengths"]


@torch.inference_mode()
def synthesise_mel(text, text_length, n_timesteps, temperature, length_scale, spk):
    spk = torch.tensor([spk], device=device, dtype=torch.long) if spk >= 0 else None
    output = model.synthesise(
        text,
        text_length,
        n_timesteps=n_timesteps,
        temperature=temperature,
        spks=spk,
        length_scale=length_scale,
    )
    output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        sf.write(fp.name, output["waveform"], 22050, "PCM_24")

    return fp.name, plot_tensor(output["mel"].squeeze().cpu().numpy())


def multispeaker_example_cacher(text, n_timesteps, mel_temp, length_scale, spk):
    global CURRENTLY_LOADED_MODEL  # pylint: disable=global-statement
    if CURRENTLY_LOADED_MODEL != "matcha_vctk":
        global model, vocoder, denoiser  # pylint: disable=global-statement
        model, vocoder, denoiser = load_model("matcha_vctk", "hifigan_univ_v1")
        CURRENTLY_LOADED_MODEL = "matcha_vctk"

    phones, text, text_lengths = process_text_gradio(text)
    audio, mel_spectrogram = synthesise_mel(text, text_lengths, n_timesteps, mel_temp, length_scale, spk)
    return phones, audio, mel_spectrogram


def ljspeech_example_cacher(text, n_timesteps, mel_temp, length_scale, spk=-1):
    global CURRENTLY_LOADED_MODEL  # pylint: disable=global-statement
    if CURRENTLY_LOADED_MODEL != "matcha_ljspeech":
        global model, vocoder, denoiser  # pylint: disable=global-statement
        model, vocoder, denoiser = load_model("matcha_ljspeech", "hifigan_T2_v1")
        CURRENTLY_LOADED_MODEL = "matcha_ljspeech"

    phones, text, text_lengths = process_text_gradio(text)
    audio, mel_spectrogram = synthesise_mel(text, text_lengths, n_timesteps, mel_temp, length_scale, spk)
    return phones, audio, mel_spectrogram


def main():
    description = """# 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching
    ### [Shivam Mehta](https://www.kth.se/profile/smehta), [Ruibo Tu](https://www.kth.se/profile/ruibo), [Jonas Beskow](https://www.kth.se/profile/beskow), [Éva Székely](https://www.kth.se/profile/szekely), and [Gustav Eje Henter](https://people.kth.se/~ghe/)
    We propose 🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up ODE-based speech synthesis. Our method:


    * Is probabilistic
    * Has compact memory footprint
    * Sounds highly natural
    * Is very fast to synthesise from


    Check out our [demo page](https://shivammehta25.github.io/Matcha-TTS). Read our [arXiv preprint for more details](https://arxiv.org/abs/2309.03199).
    Code is available in our [GitHub repository](https://github.com/shivammehta25/Matcha-TTS), along with pre-trained models.

    Cached examples are available at the bottom of the page.
    """

    with gr.Blocks(title="🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching") as demo:
        processed_text = gr.State(value=None)
        processed_text_len = gr.State(value=None)

        with gr.Box():
            with gr.Row():
                gr.Markdown(description, scale=3)
                with gr.Column():
                    gr.Image(LOGO_URL, label="Matcha-TTS logo", height=50, width=50, scale=1, show_label=False)
                    html = '<br><iframe width="560" height="315" src="https://www.youtube.com/embed/xmvJkz3bqw0?si=jN7ILyDsbPwJCGoa" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>'
                    gr.HTML(html)

        with gr.Box():
            radio_options = list(RADIO_OPTIONS.keys())
            model_type = gr.Radio(
                radio_options, value=radio_options[0], label="Choose a Model", interactive=True, container=False
            )

            with gr.Row():
                gr.Markdown("# Text Input")
            with gr.Row():
                text = gr.Textbox(value="", lines=2, label="Text to synthesise", scale=3)
                spk_slider = gr.Slider(
                    minimum=0, maximum=107, step=1, value=args.spk, label="Speaker ID", interactive=True, scale=1
                )

            with gr.Row():
                gr.Markdown("### Hyper parameters")
            with gr.Row():
                n_timesteps = gr.Slider(
                    label="Number of ODE steps",
                    minimum=1,
                    maximum=100,
                    step=1,
                    value=10,
                    interactive=True,
                )
                length_scale = gr.Slider(
                    label="Length scale (Speaking rate)",
                    minimum=0.5,
                    maximum=1.5,
                    step=0.05,
                    value=1.0,
                    interactive=True,
                )
                mel_temp = gr.Slider(
                    label="Sampling temperature",
                    minimum=0.00,
                    maximum=2.001,
                    step=0.16675,
                    value=0.667,
                    interactive=True,
                )

            synth_btn = gr.Button("Synthesise")

        with gr.Box():
            with gr.Row():
                gr.Markdown("### Phonetised text")
                phonetised_text = gr.Textbox(interactive=False, scale=10, label="Phonetised text")

        with gr.Box():
            with gr.Row():
                mel_spectrogram = gr.Image(interactive=False, label="mel spectrogram")

            # with gr.Row():
            audio = gr.Audio(interactive=False, label="Audio")

        with gr.Row(visible=False) as example_row_lj_speech:
            examples = gr.Examples(  # pylint: disable=unused-variable
                examples=[
                    [
                        "We propose Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up O D E-based speech synthesis.",
                        50,
                        0.677,
                        0.95,
                    ],
                    [
                        "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.",
                        2,
                        0.677,
                        0.95,
                    ],
                    [
                        "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.",
                        4,
                        0.677,
                        0.95,
                    ],
                    [
                        "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.",
                        10,
                        0.677,
                        0.95,
                    ],
                    [
                        "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.",
                        50,
                        0.677,
                        0.95,
                    ],
                    [
                        "The narrative of these events is based largely on the recollections of the participants.",
                        10,
                        0.677,
                        0.95,
                    ],
                    [
                        "The jury did not believe him, and the verdict was for the defendants.",
                        10,
                        0.677,
                        0.95,
                    ],
                ],
                fn=ljspeech_example_cacher,
                inputs=[text, n_timesteps, mel_temp, length_scale],
                outputs=[phonetised_text, audio, mel_spectrogram],
                cache_examples=True,
            )

        with gr.Row() as example_row_multispeaker:
            multi_speaker_examples = gr.Examples(  # pylint: disable=unused-variable
                examples=[
                    [
                        "Hello everyone! I am speaker 0 and I am here to tell you that Matcha-TTS is amazing!",
                        10,
                        0.677,
                        0.85,
                        0,
                    ],
                    [
                        "Hello everyone! I am speaker 16 and I am here to tell you that Matcha-TTS is amazing!",
                        10,
                        0.677,
                        0.85,
                        16,
                    ],
                    [
                        "Hello everyone! I am speaker 44 and I am here to tell you that Matcha-TTS is amazing!",
                        50,
                        0.677,
                        0.85,
                        44,
                    ],
                    [
                        "Hello everyone! I am speaker 45 and I am here to tell you that Matcha-TTS is amazing!",
                        50,
                        0.677,
                        0.85,
                        45,
                    ],
                    [
                        "Hello everyone! I am speaker 58 and I am here to tell you that Matcha-TTS is amazing!",
                        4,
                        0.677,
                        0.85,
                        58,
                    ],
                ],
                fn=multispeaker_example_cacher,
                inputs=[text, n_timesteps, mel_temp, length_scale, spk_slider],
                outputs=[phonetised_text, audio, mel_spectrogram],
                cache_examples=True,
                label="Multi Speaker Examples",
            )

        model_type.change(lambda x: gr.update(interactive=False), inputs=[synth_btn], outputs=[synth_btn]).then(
            load_model_ui,
            inputs=[model_type, text],
            outputs=[text, synth_btn, spk_slider, example_row_lj_speech, example_row_multispeaker, length_scale],
        )

        synth_btn.click(
            fn=process_text_gradio,
            inputs=[
                text,
            ],
            outputs=[phonetised_text, processed_text, processed_text_len],
            api_name="matcha_tts",
            queue=True,
        ).then(
            fn=synthesise_mel,
            inputs=[processed_text, processed_text_len, n_timesteps, mel_temp, length_scale, spk_slider],
            outputs=[audio, mel_spectrogram],
        )

    demo.queue().launch(share=True)


if __name__ == "__main__":
    main()
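Running `python matcha/app.py` launches the Gradio demo; note that importing the module already downloads the checkpoints and loads the default model. The two-stage click handler (text processing, then synthesis) can also be driven headlessly. A sketch reusing the module's own functions; the input sentence and speaker id 0 are arbitrary choices for the default multi-speaker model:

```python
# Sketch only: importing matcha.app triggers the module-level checkpoint
# download and model load, exactly as when serving the demo.
from matcha.app import process_text_gradio, synthesise_mel

phones, x, x_lengths = process_text_gradio("Matcha-TTS synthesises speech quickly.")
wav_path, mel_image = synthesise_mel(
    x, x_lengths, n_timesteps=10, temperature=0.667, length_scale=0.85, spk=0
)
print(phones, wav_path)
```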
third_party/Matcha-TTS/matcha/hifigan/README.md
ADDED
@@ -0,0 +1,101 @@
# HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis

### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae

In our [paper](https://arxiv.org/abs/2010.05646),
we proposed HiFi-GAN: a GAN-based model capable of generating high fidelity speech efficiently.<br/>
We provide our implementation and pretrained models as open source in this repository.

**Abstract:**
Several recent work on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms.
Although such methods improve the sampling efficiency and memory usage,
their sample quality has not yet reached that of autoregressive and flow-based generative models.
In this work, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis.
As speech audio consists of sinusoidal signals with various periods,
we demonstrate that modeling periodic patterns of an audio is crucial for enhancing sample quality.
A subjective human evaluation (mean opinion score, MOS) of a single speaker dataset indicates that our proposed method
demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than
real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen
speakers and end-to-end speech synthesis. Finally, a small footprint version of HiFi-GAN generates samples 13.4 times
faster than real-time on CPU with comparable quality to an autoregressive counterpart.

Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples.

## Pre-requisites

1. Python >= 3.6
2. Clone this repository.
3. Install python requirements. Please refer to [requirements.txt](requirements.txt).
4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/),
   and move all wav files to `LJSpeech-1.1/wavs`.

## Training

```
python train.py --config config_v1.json
```

To train the V2 or V3 generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.<br>
Checkpoints and a copy of the configuration file are saved in the `cp_hifigan` directory by default.<br>
You can change the path by adding the `--checkpoint_path` option.

Validation loss during training with the V1 generator.<br>
![validation loss](./validation_loss.png)

## Pretrained Model

You can also use pretrained models we provide.<br/>
[Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)<br/>
Details of each folder are as follows:

| Folder Name  | Generator | Dataset   | Fine-Tuned                                             |
| ------------ | --------- | --------- | ------------------------------------------------------ |
| LJ_V1        | V1        | LJSpeech  | No                                                     |
| LJ_V2        | V2        | LJSpeech  | No                                                     |
| LJ_V3        | V3        | LJSpeech  | No                                                     |
| LJ_FT_T2_V1  | V1        | LJSpeech  | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) |
| LJ_FT_T2_V2  | V2        | LJSpeech  | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) |
| LJ_FT_T2_V3  | V3        | LJSpeech  | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) |
| VCTK_V1      | V1        | VCTK      | No                                                     |
| VCTK_V2      | V2        | VCTK      | No                                                     |
| VCTK_V3      | V3        | VCTK      | No                                                     |
| UNIVERSAL_V1 | V1        | Universal | No                                                     |

We provide the universal model with discriminator weights that can be used as a base for transfer learning to other datasets.

## Fine-Tuning

1. Generate mel-spectrograms in numpy format using [Tacotron2](https://github.com/NVIDIA/tacotron2) with teacher-forcing.<br/>
   The file name of the generated mel-spectrogram should match the audio file, and the extension should be `.npy`.<br/>
   Example:
   ```
   Audio File : LJ001-0001.wav
   Mel-Spectrogram File : LJ001-0001.npy
   ```
2. Create the `ft_dataset` folder and copy the generated mel-spectrogram files into it.<br/>
3. Run the following command.
   ```
   python train.py --fine_tuning True --config config_v1.json
   ```
   For other command line options, please refer to the training section.

## Inference from wav file

1. Make a `test_files` directory and copy wav files into the directory.
2. Run the following command.
   ```
   python inference.py --checkpoint_file [generator checkpoint file path]
   ```
   Generated wav files are saved in `generated_files` by default.<br>
   You can change the path by adding the `--output_dir` option.

## Inference for end-to-end speech synthesis

1. Make a `test_mel_files` directory and copy generated mel-spectrogram files into the directory.<br>
   You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2),
   [Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth.
2. Run the following command.
   ```
   python inference_e2e.py --checkpoint_file [generator checkpoint file path]
   ```
   Generated wav files are saved in `generated_files_from_mel` by default.<br>
   You can change the path by adding the `--output_dir` option.

## Acknowledgements

We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips)
and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this.
third_party/Matcha-TTS/matcha/hifigan/__init__.py
ADDED
File without changes
third_party/Matcha-TTS/matcha/hifigan/config.py
ADDED
@@ -0,0 +1,28 @@
v1 = {
    "resblock": "1",
    "num_gpus": 0,
    "batch_size": 16,
    "learning_rate": 0.0004,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,
    "upsample_rates": [8, 8, 2, 2],
    "upsample_kernel_sizes": [16, 16, 4, 4],
    "upsample_initial_channel": 512,
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "resblock_initial_channel": 256,
    "segment_size": 8192,
    "num_mels": 80,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,
    "sampling_rate": 22050,
    "fmin": 0,
    "fmax": 8000,
    "fmax_loss": None,
    "num_workers": 4,
    "dist_config": {"dist_backend": "nccl", "dist_url": "tcp://localhost:54321", "world_size": 1},
}
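This `v1` dict is meant to be wrapped in an attribute-style dict and handed to the HiFi-GAN generator (see `env.AttrDict` and `models.Generator` in this directory). A sketch with random, untrained weights; the mel tensor shape is illustrative:

```python
import torch

from matcha.hifigan.config import v1
from matcha.hifigan.env import AttrDict
from matcha.hifigan.models import Generator

h = AttrDict(v1)                       # dict keys become attributes (h.resblock, ...)
generator = Generator(h)

mel = torch.randn(1, h.num_mels, 88)   # (batch, 80 mel bins, frames)
with torch.no_grad():
    wav = generator(mel)               # upsample_rates 8*8*2*2 = 256 = hop_size,
print(wav.shape)                       # so output is (1, 1, 88 * 256) samples
```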
third_party/Matcha-TTS/matcha/hifigan/denoiser.py
ADDED
@@ -0,0 +1,64 @@
# Code modified from Rafael Valle's implementation https://github.com/NVIDIA/waveglow/blob/5bc2a53e20b3b533362f974cfa1ea0267ae1c2b1/denoiser.py

"""Waveglow style denoiser can be used to remove the artifacts from the HiFiGAN generated audio."""
import torch


class Denoiser(torch.nn.Module):
    """Removes model bias from audio produced with waveglow"""

    def __init__(self, vocoder, filter_length=1024, n_overlap=4, win_length=1024, mode="zeros"):
        super().__init__()
        self.filter_length = filter_length
        self.hop_length = int(filter_length / n_overlap)
        self.win_length = win_length

        dtype, device = next(vocoder.parameters()).dtype, next(vocoder.parameters()).device
        self.device = device
        if mode == "zeros":
            mel_input = torch.zeros((1, 80, 88), dtype=dtype, device=device)
        elif mode == "normal":
            mel_input = torch.randn((1, 80, 88), dtype=dtype, device=device)
        else:
            raise Exception(f"Mode {mode} is not supported")

        def stft_fn(audio, n_fft, hop_length, win_length, window):
            spec = torch.stft(
                audio,
                n_fft=n_fft,
                hop_length=hop_length,
                win_length=win_length,
                window=window,
                return_complex=True,
            )
            spec = torch.view_as_real(spec)
            return torch.sqrt(spec.pow(2).sum(-1)), torch.atan2(spec[..., -1], spec[..., 0])

        self.stft = lambda x: stft_fn(
            audio=x,
            n_fft=self.filter_length,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=torch.hann_window(self.win_length, device=device),
        )
        self.istft = lambda x, y: torch.istft(
            torch.complex(x * torch.cos(y), x * torch.sin(y)),
            n_fft=self.filter_length,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=torch.hann_window(self.win_length, device=device),
        )

        with torch.no_grad():
            bias_audio = vocoder(mel_input).float().squeeze(0)
            bias_spec, _ = self.stft(bias_audio)

        self.register_buffer("bias_spec", bias_spec[:, :, 0][:, :, None])

    @torch.inference_mode()
    def forward(self, audio, strength=0.0005):
        audio_spec, audio_angles = self.stft(audio)
        audio_spec_denoised = audio_spec - self.bias_spec.to(audio.device) * strength
        audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0)
        audio_denoised = self.istft(audio_spec_denoised, audio_angles)
        return audio_denoised
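The denoiser estimates the vocoder's bias spectrum once (from an all-zeros or random mel input) and subtracts a scaled copy of it from synthesized audio in the STFT magnitude domain. A self-contained usage sketch; the `strength` value is illustrative, and in this repo `matcha.cli.to_waveform` is what pairs the vocoder with this class:

```python
import torch

from matcha.hifigan.config import v1
from matcha.hifigan.denoiser import Denoiser
from matcha.hifigan.env import AttrDict
from matcha.hifigan.models import Generator

vocoder = Generator(AttrDict(v1))
denoiser = Denoiser(vocoder, mode="zeros")   # bias spectrum estimated once, in __init__

mel = torch.randn(1, 80, 88)                 # (batch, n_mels, frames)
with torch.no_grad():
    audio = vocoder(mel).squeeze(1)          # (batch, samples)
clean = denoiser(audio, strength=0.00025)    # strength is illustrative
```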
third_party/Matcha-TTS/matcha/hifigan/env.py
ADDED
@@ -0,0 +1,17 @@
""" from https://github.com/jik876/hifi-gan """

import os
import shutil


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__dict__ = self


def build_env(config, config_name, path):
    t_path = os.path.join(path, config_name)
    if config != t_path:
        os.makedirs(path, exist_ok=True)
        shutil.copyfile(config, os.path.join(path, config_name))
third_party/Matcha-TTS/matcha/hifigan/models.py
ADDED
@@ -0,0 +1,368 @@
""" from https://github.com/jik876/hifi-gan """

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm

from .xutils import get_padding, init_weights

LRELU_SLOPE = 0.1


class ResBlock1(torch.nn.Module):
    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()
        self.h = h
        self.convs1 = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[0],
                        padding=get_padding(kernel_size, dilation[0]),
                    )
                ),
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[1],
                        padding=get_padding(kernel_size, dilation[1]),
                    )
                ),
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[2],
                        padding=get_padding(kernel_size, dilation[2]),
                    )
                ),
            ]
        )
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1),
                    )
                ),
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1),
                    )
                ),
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1),
                    )
                ),
            ]
        )
        self.convs2.apply(init_weights)

    def forward(self, x):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            xt = c2(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)


class ResBlock2(torch.nn.Module):
    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()
        self.h = h
        self.convs = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[0],
                        padding=get_padding(kernel_size, dilation[0]),
                    )
                ),
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[1],
                        padding=get_padding(kernel_size, dilation[1]),
                    )
                ),
            ]
        )
        self.convs.apply(init_weights)

    def forward(self, x):
        for c in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs:
            remove_weight_norm(l)


class Generator(torch.nn.Module):
    def __init__(self, h):
        super().__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
        resblock = ResBlock1 if h.resblock == "1" else ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        h.upsample_initial_channel // (2**i),
                        h.upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = h.upsample_initial_channel // (2 ** (i + 1))
            for _, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
                self.resblocks.append(resblock(h, ch, k, d))

        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def forward(self, x):
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        print("Removing weight norm...")
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)


class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super().__init__()
        self.period = period
        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList(
            [
                norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
                norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
                norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
                norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
                norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
            ]
        )
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.discriminators = nn.ModuleList(
            [
                DiscriminatorP(2),
                DiscriminatorP(3),
                DiscriminatorP(5),
                DiscriminatorP(7),
                DiscriminatorP(11),
            ]
        )

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for _, d in enumerate(self.discriminators):
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super().__init__()
        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(1, 128, 15, 1, padding=7)),
                norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
                norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
                norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
                norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
                norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        fmap = []
        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiScaleDiscriminator(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.discriminators = nn.ModuleList(
            [
                DiscriminatorS(use_spectral_norm=True),
                DiscriminatorS(),
                DiscriminatorS(),
            ]
        )
        self.meanpools = nn.ModuleList([AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)])

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            if i != 0:
                y = self.meanpools[i - 1](y)
                y_hat = self.meanpools[i - 1](y_hat)
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


def feature_loss(fmap_r, fmap_g):
    loss = 0
    for dr, dg in zip(fmap_r, fmap_g):
        for rl, gl in zip(dr, dg):
            loss += torch.mean(torch.abs(rl - gl))

    return loss * 2


def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    loss = 0
    r_losses = []
    g_losses = []
    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
        r_loss = torch.mean((1 - dr) ** 2)
        g_loss = torch.mean(dg**2)
        loss += r_loss + g_loss
        r_losses.append(r_loss.item())
        g_losses.append(g_loss.item())

    return loss, r_losses, g_losses


def generator_loss(disc_outputs):
    loss = 0
    gen_losses = []
    for dg in disc_outputs:
        l = torch.mean((1 - dg) ** 2)
        gen_losses.append(l)
        loss += l

    return loss, gen_losses
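For orientation, the discriminators and loss helpers above implement the least-squares GAN plus feature-matching objective from the HiFi-GAN paper. A sketch of how they compose during training; random tensors stand in for real and generated audio, and the mel reconstruction loss and optimizer steps are omitted:

```python
import torch

from matcha.hifigan.models import (
    MultiPeriodDiscriminator,
    MultiScaleDiscriminator,
    discriminator_loss,
    feature_loss,
    generator_loss,
)

mpd, msd = MultiPeriodDiscriminator(), MultiScaleDiscriminator()
y = torch.randn(2, 1, 8192)        # real waveform segments
y_hat = torch.randn(2, 1, 8192)    # generator output (stand-in)

# Discriminator side: push real scores toward 1, fake scores toward 0
y_df_r, y_df_g, _, _ = mpd(y, y_hat.detach())
y_ds_r, y_ds_g, _, _ = msd(y, y_hat.detach())
loss_d = discriminator_loss(y_df_r, y_df_g)[0] + discriminator_loss(y_ds_r, y_ds_g)[0]

# Generator side: adversarial + feature-matching terms
_, y_df_g, fmap_f_r, fmap_f_g = mpd(y, y_hat)
_, y_ds_g, fmap_s_r, fmap_s_g = msd(y, y_hat)
loss_g = generator_loss(y_df_g)[0] + generator_loss(y_ds_g)[0]
loss_g += feature_loss(fmap_f_r, fmap_f_g) + feature_loss(fmap_s_r, fmap_s_g)
```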
third_party/Matcha-TTS/matcha/models/__init__.py
ADDED
File without changes
third_party/Matcha-TTS/matcha/models/baselightningmodule.py
ADDED
@@ -0,0 +1,209 @@
"""
This is a base lightning module that can be used to train a model.
The benefit of this abstraction is that all the logic outside of model definition can be reused for different models.
"""
import inspect
from abc import ABC
from typing import Any, Dict

import torch
from lightning import LightningModule
from lightning.pytorch.utilities import grad_norm

from matcha import utils
from matcha.utils.utils import plot_tensor

log = utils.get_pylogger(__name__)


class BaseLightningClass(LightningModule, ABC):
    def update_data_statistics(self, data_statistics):
        if data_statistics is None:
            data_statistics = {
                "mel_mean": 0.0,
                "mel_std": 1.0,
            }

        self.register_buffer("mel_mean", torch.tensor(data_statistics["mel_mean"]))
        self.register_buffer("mel_std", torch.tensor(data_statistics["mel_std"]))

    def configure_optimizers(self) -> Any:
        optimizer = self.hparams.optimizer(params=self.parameters())
        if self.hparams.scheduler not in (None, {}):
            scheduler_args = {}
            # Manage last epoch for exponential schedulers
            if "last_epoch" in inspect.signature(self.hparams.scheduler.scheduler).parameters:
                if hasattr(self, "ckpt_loaded_epoch"):
                    current_epoch = self.ckpt_loaded_epoch - 1
                else:
                    current_epoch = -1

            scheduler_args.update({"optimizer": optimizer})
            scheduler = self.hparams.scheduler.scheduler(**scheduler_args)
            scheduler.last_epoch = current_epoch
            return {
                "optimizer": optimizer,
                "lr_scheduler": {
                    "scheduler": scheduler,
                    "interval": self.hparams.scheduler.lightning_args.interval,
                    "frequency": self.hparams.scheduler.lightning_args.frequency,
                    "name": "learning_rate",
                },
            }

        return {"optimizer": optimizer}

    def get_losses(self, batch):
        x, x_lengths = batch["x"], batch["x_lengths"]
        y, y_lengths = batch["y"], batch["y_lengths"]
        spks = batch["spks"]

        dur_loss, prior_loss, diff_loss = self(
            x=x,
            x_lengths=x_lengths,
            y=y,
            y_lengths=y_lengths,
            spks=spks,
            out_size=self.out_size,
        )
        return {
            "dur_loss": dur_loss,
            "prior_loss": prior_loss,
            "diff_loss": diff_loss,
        }

    def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
        self.ckpt_loaded_epoch = checkpoint["epoch"]  # pylint: disable=attribute-defined-outside-init

    def training_step(self, batch: Any, batch_idx: int):
        loss_dict = self.get_losses(batch)
        self.log(
            "step",
            float(self.global_step),
            on_step=True,
            prog_bar=True,
            logger=True,
            sync_dist=True,
        )

        self.log(
            "sub_loss/train_dur_loss",
            loss_dict["dur_loss"],
            on_step=True,
            on_epoch=True,
            logger=True,
            sync_dist=True,
        )
        self.log(
            "sub_loss/train_prior_loss",
            loss_dict["prior_loss"],
            on_step=True,
            on_epoch=True,
            logger=True,
            sync_dist=True,
        )
        self.log(
            "sub_loss/train_diff_loss",
            loss_dict["diff_loss"],
            on_step=True,
            on_epoch=True,
            logger=True,
            sync_dist=True,
        )

        total_loss = sum(loss_dict.values())
        self.log(
            "loss/train",
            total_loss,
            on_step=True,
            on_epoch=True,
            logger=True,
            prog_bar=True,
            sync_dist=True,
        )

        return {"loss": total_loss, "log": loss_dict}

    def validation_step(self, batch: Any, batch_idx: int):
        loss_dict = self.get_losses(batch)
        self.log(
            "sub_loss/val_dur_loss",
            loss_dict["dur_loss"],
            on_step=True,
            on_epoch=True,
            logger=True,
            sync_dist=True,
        )
        self.log(
            "sub_loss/val_prior_loss",
            loss_dict["prior_loss"],
            on_step=True,
            on_epoch=True,
            logger=True,
            sync_dist=True,
        )
        self.log(
            "sub_loss/val_diff_loss",
            loss_dict["diff_loss"],
            on_step=True,
            on_epoch=True,
            logger=True,
            sync_dist=True,
        )

        total_loss = sum(loss_dict.values())
        self.log(
            "loss/val",
            total_loss,
            on_step=True,
            on_epoch=True,
            logger=True,
            prog_bar=True,
            sync_dist=True,
        )

        return total_loss

    def on_validation_end(self) -> None:
        if self.trainer.is_global_zero:
            one_batch = next(iter(self.trainer.val_dataloaders))
            if self.current_epoch == 0:
                log.debug("Plotting original samples")
                for i in range(2):
                    y = one_batch["y"][i].unsqueeze(0).to(self.device)
                    self.logger.experiment.add_image(
                        f"original/{i}",
                        plot_tensor(y.squeeze().cpu()),
                        self.current_epoch,
                        dataformats="HWC",
                    )

            log.debug("Synthesising...")
            for i in range(2):
                x = one_batch["x"][i].unsqueeze(0).to(self.device)
                x_lengths = one_batch["x_lengths"][i].unsqueeze(0).to(self.device)
                spks = one_batch["spks"][i].unsqueeze(0).to(self.device) if one_batch["spks"] is not None else None
                output = self.synthesise(x[:, :x_lengths], x_lengths, n_timesteps=10, spks=spks)
                y_enc, y_dec = output["encoder_outputs"], output["decoder_outputs"]
                attn = output["attn"]
                self.logger.experiment.add_image(
                    f"generated_enc/{i}",
                    plot_tensor(y_enc.squeeze().cpu()),
                    self.current_epoch,
                    dataformats="HWC",
                )
                self.logger.experiment.add_image(
                    f"generated_dec/{i}",
                    plot_tensor(y_dec.squeeze().cpu()),
                    self.current_epoch,
                    dataformats="HWC",
                )
                self.logger.experiment.add_image(
                    f"alignment/{i}",
                    plot_tensor(attn.squeeze().cpu()),
                    self.current_epoch,
                    dataformats="HWC",
                )

    def on_before_optimizer_step(self, optimizer):
        self.log_dict({f"grad_norm/{k}": v for k, v in grad_norm(self, norm_type=2).items()})
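`configure_optimizers` above expects `hparams.optimizer` to be a callable accepting a `params` keyword, and `hparams.scheduler` to expose `.scheduler` (a callable accepting `optimizer`) plus `.lightning_args`. In the project these are supplied by the Hydra config tree (typically via partially-instantiated objects); a shape-only sketch of compatible values, with all names and hyperparameters illustrative:

```python
import functools
from types import SimpleNamespace

import torch

# Shape-only sketch of what the config ultimately hands to the module.
optimizer = functools.partial(torch.optim.Adam, lr=1e-4)
scheduler = SimpleNamespace(
    scheduler=functools.partial(torch.optim.lr_scheduler.ExponentialLR, gamma=0.999),
    lightning_args=SimpleNamespace(interval="epoch", frequency=1),
)
# configure_optimizers then calls optimizer(params=self.parameters()) and
# scheduler.scheduler(optimizer=...), fixing last_epoch when resuming from
# a checkpoint (see on_load_checkpoint).
```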
third_party/Matcha-TTS/matcha/models/components/__init__.py
ADDED
File without changes
third_party/Matcha-TTS/matcha/models/components/flow_matching.py
ADDED
@@ -0,0 +1,132 @@
from abc import ABC

import torch
import torch.nn.functional as F

from matcha.models.components.decoder import Decoder
from matcha.utils.pylogger import get_pylogger

log = get_pylogger(__name__)


class BASECFM(torch.nn.Module, ABC):
    def __init__(
        self,
        n_feats,
        cfm_params,
        n_spks=1,
        spk_emb_dim=128,
    ):
        super().__init__()
        self.n_feats = n_feats
        self.n_spks = n_spks
        self.spk_emb_dim = spk_emb_dim
        self.solver = cfm_params.solver
        if hasattr(cfm_params, "sigma_min"):
            self.sigma_min = cfm_params.sigma_min
        else:
            self.sigma_min = 1e-4

        self.estimator = None

    @torch.inference_mode()
    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
        """Forward diffusion

        Args:
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            n_timesteps (int): number of diffusion steps
            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
            spks (torch.Tensor, optional): speaker ids. Defaults to None.
                shape: (batch_size, spk_emb_dim)
            cond: Not used but kept for future purposes

        Returns:
            sample: generated mel-spectrogram
                shape: (batch_size, n_feats, mel_timesteps)
        """
        z = torch.randn_like(mu) * temperature
        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
        return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)

    def solve_euler(self, x, t_span, mu, mask, spks, cond):
        """
        Fixed euler solver for ODEs.
        Args:
            x (torch.Tensor): random noise
            t_span (torch.Tensor): n_timesteps interpolated
                shape: (n_timesteps + 1,)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            spks (torch.Tensor, optional): speaker ids. Defaults to None.
                shape: (batch_size, spk_emb_dim)
            cond: Not used but kept for future purposes
        """
        t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]

        # I am storing this because I can later plot it by putting a debugger here and saving it to a file
        # Or in future might add like a return_all_steps flag
        sol = []

        for step in range(1, len(t_span)):
            dphi_dt = self.estimator(x, mask, mu, t, spks, cond)

            x = x + dt * dphi_dt
            t = t + dt
            sol.append(x)
            if step < len(t_span) - 1:
                dt = t_span[step + 1] - t

        return sol[-1]

    def compute_loss(self, x1, mask, mu, spks=None, cond=None):
        """Computes diffusion loss

        Args:
            x1 (torch.Tensor): Target
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): target mask
                shape: (batch_size, 1, mel_timesteps)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
                shape: (batch_size, spk_emb_dim)

        Returns:
            loss: conditional flow matching loss
            y: conditional flow
                shape: (batch_size, n_feats, mel_timesteps)
        """
        b, _, t = mu.shape

        # random timestep
        t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
        # sample noise p(x_0)
        z = torch.randn_like(x1)

        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
        u = x1 - (1 - self.sigma_min) * z

        loss = F.mse_loss(self.estimator(y, mask, mu, t.squeeze(), spks), u, reduction="sum") / (
            torch.sum(mask) * u.shape[1]
        )
        return loss, y


class CFM(BASECFM):
    def __init__(self, in_channels, out_channel, cfm_params, decoder_params, n_spks=1, spk_emb_dim=64):
        super().__init__(
            n_feats=in_channels,
            cfm_params=cfm_params,
            n_spks=n_spks,
            spk_emb_dim=spk_emb_dim,
        )

        in_channels = in_channels + (spk_emb_dim if n_spks > 1 else 0)
        # Just change the architecture of the estimator here
        self.estimator = Decoder(in_channels=in_channels, out_channels=out_channel, **decoder_params)
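In symbols, `compute_loss` implements the conditional flow-matching objective used by Matcha-TTS: with $x_1$ the target mel, $z \sim \mathcal{N}(0, I)$, and $t \sim \mathcal{U}(0, 1)$,

$$y = \phi_t(z) = \bigl(1 - (1 - \sigma_{\min})\,t\bigr)\,z + t\,x_1, \qquad u_t = x_1 - (1 - \sigma_{\min})\,z,$$

and the estimator $v_\theta$ is regressed onto the target velocity with a masked MSE,

$$\mathcal{L} = \bigl\lVert v_\theta(y, t \mid \mu) - u_t \bigr\rVert^2 \;\text{averaged over unmasked mel elements},$$

with $\sigma_{\min} = 10^{-4}$ by default. `solve_euler` then integrates $dx/dt = v_\theta$ from $t = 0$ to $1$ in `n_timesteps` fixed Euler steps, which is why few ODE steps suffice at inference.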
third_party/Matcha-TTS/matcha/models/matcha_tts.py
ADDED
@@ -0,0 +1,239 @@
import datetime as dt
import math
import random

import torch

import matcha.utils.monotonic_align as monotonic_align
from matcha import utils
from matcha.models.baselightningmodule import BaseLightningClass
from matcha.models.components.flow_matching import CFM
from matcha.models.components.text_encoder import TextEncoder
from matcha.utils.model import (
    denormalize,
    duration_loss,
    fix_len_compatibility,
    generate_path,
    sequence_mask,
)

log = utils.get_pylogger(__name__)


class MatchaTTS(BaseLightningClass):  # 🍵
    def __init__(
        self,
        n_vocab,
        n_spks,
        spk_emb_dim,
        n_feats,
        encoder,
        decoder,
        cfm,
        data_statistics,
        out_size,
        optimizer=None,
        scheduler=None,
        prior_loss=True,
    ):
        super().__init__()

        self.save_hyperparameters(logger=False)

        self.n_vocab = n_vocab
        self.n_spks = n_spks
        self.spk_emb_dim = spk_emb_dim
        self.n_feats = n_feats
        self.out_size = out_size
        self.prior_loss = prior_loss

        if n_spks > 1:
            self.spk_emb = torch.nn.Embedding(n_spks, spk_emb_dim)

        self.encoder = TextEncoder(
            encoder.encoder_type,
            encoder.encoder_params,
            encoder.duration_predictor_params,
            n_vocab,
            n_spks,
            spk_emb_dim,
        )

        self.decoder = CFM(
            in_channels=2 * encoder.encoder_params.n_feats,
            out_channel=encoder.encoder_params.n_feats,
            cfm_params=cfm,
            decoder_params=decoder,
            n_spks=n_spks,
            spk_emb_dim=spk_emb_dim,
        )

        self.update_data_statistics(data_statistics)

    @torch.inference_mode()
    def synthesise(self, x, x_lengths, n_timesteps, temperature=1.0, spks=None, length_scale=1.0):
        """
        Generates mel-spectrogram from text. Returns:
            1. encoder outputs
            2. decoder outputs
            3. generated alignment

        Args:
            x (torch.Tensor): batch of texts, converted to a tensor with phoneme embedding ids.
                shape: (batch_size, max_text_length)
            x_lengths (torch.Tensor): lengths of texts in batch.
                shape: (batch_size,)
            n_timesteps (int): number of steps to use for reverse diffusion in decoder.
            temperature (float, optional): controls variance of terminal distribution.
            spks (bool, optional): speaker ids.
                shape: (batch_size,)
            length_scale (float, optional): controls speech pace.
                Increase value to slow down generated speech and vice versa.

        Returns:
            dict: {
                "encoder_outputs": torch.Tensor, shape: (batch_size, n_feats, max_mel_length),
                # Average mel spectrogram generated by the encoder
                "decoder_outputs": torch.Tensor, shape: (batch_size, n_feats, max_mel_length),
                # Refined mel spectrogram improved by the CFM
                "attn": torch.Tensor, shape: (batch_size, max_text_length, max_mel_length),
                # Alignment map between text and mel spectrogram
                "mel": torch.Tensor, shape: (batch_size, n_feats, max_mel_length),
                # Denormalized mel spectrogram
                "mel_lengths": torch.Tensor, shape: (batch_size,),
                # Lengths of mel spectrograms
                "rtf": float,
                # Real-time factor
        """
        # For RTF computation
        t = dt.datetime.now()

        if self.n_spks > 1:
            # Get speaker embedding
            spks = self.spk_emb(spks.long())

        # Get encoder_outputs `mu_x` and log-scaled token durations `logw`
        mu_x, logw, x_mask = self.encoder(x, x_lengths, spks)

        w = torch.exp(logw) * x_mask
        w_ceil = torch.ceil(w) * length_scale
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        y_max_length = y_lengths.max()
        y_max_length_ = fix_len_compatibility(y_max_length)

        # Using obtained durations `w` construct alignment map `attn`
        y_mask = sequence_mask(y_lengths, y_max_length_).unsqueeze(1).to(x_mask.dtype)
y_mask = sequence_mask(y_lengths, y_max_length_).unsqueeze(1).to(x_mask.dtype)
|
126 |
+
attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)
|
127 |
+
attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)
|
128 |
+
|
129 |
+
# Align encoded text and get mu_y
|
130 |
+
mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2))
|
131 |
+
mu_y = mu_y.transpose(1, 2)
|
132 |
+
encoder_outputs = mu_y[:, :, :y_max_length]
|
133 |
+
|
134 |
+
# Generate sample tracing the probability flow
|
135 |
+
decoder_outputs = self.decoder(mu_y, y_mask, n_timesteps, temperature, spks)
|
136 |
+
decoder_outputs = decoder_outputs[:, :, :y_max_length]
|
137 |
+
|
138 |
+
t = (dt.datetime.now() - t).total_seconds()
|
139 |
+
rtf = t * 22050 / (decoder_outputs.shape[-1] * 256)
|
140 |
+
|
141 |
+
return {
|
142 |
+
"encoder_outputs": encoder_outputs,
|
143 |
+
"decoder_outputs": decoder_outputs,
|
144 |
+
"attn": attn[:, :, :y_max_length],
|
145 |
+
"mel": denormalize(decoder_outputs, self.mel_mean, self.mel_std),
|
146 |
+
"mel_lengths": y_lengths,
|
147 |
+
"rtf": rtf,
|
148 |
+
}
|
149 |
+
|
150 |
+
def forward(self, x, x_lengths, y, y_lengths, spks=None, out_size=None, cond=None):
|
151 |
+
"""
|
152 |
+
Computes 3 losses:
|
153 |
+
1. duration loss: loss between predicted token durations and those extracted by Monotinic Alignment Search (MAS).
|
154 |
+
2. prior loss: loss between mel-spectrogram and encoder outputs.
|
155 |
+
3. flow matching loss: loss between mel-spectrogram and decoder outputs.
|
156 |
+
|
157 |
+
Args:
|
158 |
+
x (torch.Tensor): batch of texts, converted to a tensor with phoneme embedding ids.
|
159 |
+
shape: (batch_size, max_text_length)
|
160 |
+
x_lengths (torch.Tensor): lengths of texts in batch.
|
161 |
+
shape: (batch_size,)
|
162 |
+
y (torch.Tensor): batch of corresponding mel-spectrograms.
|
163 |
+
shape: (batch_size, n_feats, max_mel_length)
|
164 |
+
y_lengths (torch.Tensor): lengths of mel-spectrograms in batch.
|
165 |
+
shape: (batch_size,)
|
166 |
+
out_size (int, optional): length (in mel's sampling rate) of segment to cut, on which decoder will be trained.
|
167 |
+
Should be divisible by 2^{num of UNet downsamplings}. Needed to increase batch size.
|
168 |
+
spks (torch.Tensor, optional): speaker ids.
|
169 |
+
shape: (batch_size,)
|
170 |
+
"""
|
171 |
+
if self.n_spks > 1:
|
172 |
+
# Get speaker embedding
|
173 |
+
spks = self.spk_emb(spks)
|
174 |
+
|
175 |
+
# Get encoder_outputs `mu_x` and log-scaled token durations `logw`
|
176 |
+
mu_x, logw, x_mask = self.encoder(x, x_lengths, spks)
|
177 |
+
y_max_length = y.shape[-1]
|
178 |
+
|
179 |
+
y_mask = sequence_mask(y_lengths, y_max_length).unsqueeze(1).to(x_mask)
|
180 |
+
attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)
|
181 |
+
|
182 |
+
# Use MAS to find most likely alignment `attn` between text and mel-spectrogram
|
183 |
+
with torch.no_grad():
|
184 |
+
const = -0.5 * math.log(2 * math.pi) * self.n_feats
|
185 |
+
factor = -0.5 * torch.ones(mu_x.shape, dtype=mu_x.dtype, device=mu_x.device)
|
186 |
+
y_square = torch.matmul(factor.transpose(1, 2), y**2)
|
187 |
+
y_mu_double = torch.matmul(2.0 * (factor * mu_x).transpose(1, 2), y)
|
188 |
+
mu_square = torch.sum(factor * (mu_x**2), 1).unsqueeze(-1)
|
189 |
+
log_prior = y_square - y_mu_double + mu_square + const
|
190 |
+
|
191 |
+
attn = monotonic_align.maximum_path(log_prior, attn_mask.squeeze(1))
|
192 |
+
attn = attn.detach()
|
193 |
+
|
194 |
+
# Compute loss between predicted log-scaled durations and those obtained from MAS
|
195 |
+
# refered to as prior loss in the paper
|
196 |
+
logw_ = torch.log(1e-8 + torch.sum(attn.unsqueeze(1), -1)) * x_mask
|
197 |
+
dur_loss = duration_loss(logw, logw_, x_lengths)
|
198 |
+
|
199 |
+
# Cut a small segment of mel-spectrogram in order to increase batch size
|
200 |
+
# - "Hack" taken from Grad-TTS, in case of Grad-TTS, we cannot train batch size 32 on a 24GB GPU without it
|
201 |
+
# - Do not need this hack for Matcha-TTS, but it works with it as well
|
202 |
+
if not isinstance(out_size, type(None)):
|
203 |
+
max_offset = (y_lengths - out_size).clamp(0)
|
204 |
+
offset_ranges = list(zip([0] * max_offset.shape[0], max_offset.cpu().numpy()))
|
205 |
+
out_offset = torch.LongTensor(
|
206 |
+
[torch.tensor(random.choice(range(start, end)) if end > start else 0) for start, end in offset_ranges]
|
207 |
+
).to(y_lengths)
|
208 |
+
attn_cut = torch.zeros(attn.shape[0], attn.shape[1], out_size, dtype=attn.dtype, device=attn.device)
|
209 |
+
y_cut = torch.zeros(y.shape[0], self.n_feats, out_size, dtype=y.dtype, device=y.device)
|
210 |
+
|
211 |
+
y_cut_lengths = []
|
212 |
+
for i, (y_, out_offset_) in enumerate(zip(y, out_offset)):
|
213 |
+
y_cut_length = out_size + (y_lengths[i] - out_size).clamp(None, 0)
|
214 |
+
y_cut_lengths.append(y_cut_length)
|
215 |
+
cut_lower, cut_upper = out_offset_, out_offset_ + y_cut_length
|
216 |
+
y_cut[i, :, :y_cut_length] = y_[:, cut_lower:cut_upper]
|
217 |
+
attn_cut[i, :, :y_cut_length] = attn[i, :, cut_lower:cut_upper]
|
218 |
+
|
219 |
+
y_cut_lengths = torch.LongTensor(y_cut_lengths)
|
220 |
+
y_cut_mask = sequence_mask(y_cut_lengths).unsqueeze(1).to(y_mask)
|
221 |
+
|
222 |
+
attn = attn_cut
|
223 |
+
y = y_cut
|
224 |
+
y_mask = y_cut_mask
|
225 |
+
|
226 |
+
# Align encoded text with mel-spectrogram and get mu_y segment
|
227 |
+
mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2))
|
228 |
+
mu_y = mu_y.transpose(1, 2)
|
229 |
+
|
230 |
+
# Compute loss of the decoder
|
231 |
+
diff_loss, _ = self.decoder.compute_loss(x1=y, mask=y_mask, mu=mu_y, spks=spks, cond=cond)
|
232 |
+
|
233 |
+
if self.prior_loss:
|
234 |
+
prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask)
|
235 |
+
prior_loss = prior_loss / (torch.sum(y_mask) * self.n_feats)
|
236 |
+
else:
|
237 |
+
prior_loss = 0
|
238 |
+
|
239 |
+
return dur_loss, prior_loss, diff_loss
|
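The MAS step above avoids materializing a (batch, n_feats, t_x, t_y) difference tensor by expanding the squared distance into three matmuls. A small numerical check of that identity, with toy shapes and no model required:

import math
import torch

b, d, t_x, t_y = 1, 4, 3, 5
mu_x = torch.randn(b, d, t_x)   # encoder means, one per text token
y = torch.randn(b, d, t_y)      # mel frames

# Factorized form, as in forward() above
const = -0.5 * math.log(2 * math.pi) * d
factor = -0.5 * torch.ones_like(mu_x)
y_square = torch.matmul(factor.transpose(1, 2), y**2)
y_mu_double = torch.matmul(2.0 * (factor * mu_x).transpose(1, 2), y)
mu_square = torch.sum(factor * mu_x**2, 1).unsqueeze(-1)
log_prior = y_square - y_mu_double + mu_square + const

# Direct form: log N(y_j; mu_i, I) = -0.5 * ||y_j - mu_i||^2 - 0.5 * d * log(2*pi)
direct = -0.5 * ((y.unsqueeze(2) - mu_x.unsqueeze(-1)) ** 2).sum(1) + const

assert torch.allclose(log_prior, direct, atol=1e-5)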
third_party/Matcha-TTS/matcha/onnx/__init__.py
ADDED
File without changes
third_party/Matcha-TTS/matcha/onnx/export.py
ADDED
@@ -0,0 +1,181 @@
+import argparse
+import random
+from pathlib import Path
+
+import numpy as np
+import torch
+from lightning import LightningModule
+
+from matcha.cli import VOCODER_URLS, load_matcha, load_vocoder
+
+DEFAULT_OPSET = 15
+
+SEED = 1234
+random.seed(SEED)
+np.random.seed(SEED)
+torch.manual_seed(SEED)
+torch.cuda.manual_seed(SEED)
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = False
+
+
+class MatchaWithVocoder(LightningModule):
+    def __init__(self, matcha, vocoder):
+        super().__init__()
+        self.matcha = matcha
+        self.vocoder = vocoder
+
+    def forward(self, x, x_lengths, scales, spks=None):
+        mel, mel_lengths = self.matcha(x, x_lengths, scales, spks)
+        wavs = self.vocoder(mel).clamp(-1, 1)
+        lengths = mel_lengths * 256
+        return wavs.squeeze(1), lengths
+
+
+def get_exportable_module(matcha, vocoder, n_timesteps):
+    """
+    Return an appropriate `LightningModule` and output-node names
+    based on whether the vocoder is embedded in the final graph
+    """
+
+    def onnx_forward_func(x, x_lengths, scales, spks=None):
+        """
+        Custom forward function for accepting
+        scaler parameters as tensors
+        """
+        # Extract scaler parameters from tensors
+        temperature = scales[0]
+        length_scale = scales[1]
+        output = matcha.synthesise(x, x_lengths, n_timesteps, temperature, spks, length_scale)
+        return output["mel"], output["mel_lengths"]
+
+    # Monkey-patch Matcha's forward function
+    matcha.forward = onnx_forward_func
+
+    if vocoder is None:
+        model, output_names = matcha, ["mel", "mel_lengths"]
+    else:
+        model = MatchaWithVocoder(matcha, vocoder)
+        output_names = ["wav", "wav_lengths"]
+    return model, output_names
+
+
+def get_inputs(is_multi_speaker):
+    """
+    Create dummy inputs for tracing
+    """
+    dummy_input_length = 50
+    x = torch.randint(low=0, high=20, size=(1, dummy_input_length), dtype=torch.long)
+    x_lengths = torch.LongTensor([dummy_input_length])
+
+    # Scales
+    temperature = 0.667
+    length_scale = 1.0
+    scales = torch.Tensor([temperature, length_scale])
+
+    model_inputs = [x, x_lengths, scales]
+    input_names = [
+        "x",
+        "x_lengths",
+        "scales",
+    ]
+
+    if is_multi_speaker:
+        spks = torch.LongTensor([1])
+        model_inputs.append(spks)
+        input_names.append("spks")
+
+    return tuple(model_inputs), input_names
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Export 🍵 Matcha-TTS to ONNX")
+
+    parser.add_argument(
+        "checkpoint_path",
+        type=str,
+        help="Path to the model checkpoint",
+    )
+    parser.add_argument("output", type=str, help="Path to output `.onnx` file")
+    parser.add_argument(
+        "--n-timesteps", type=int, default=5, help="Number of steps to use for reverse diffusion in decoder (default 5)"
+    )
+    parser.add_argument(
+        "--vocoder-name",
+        type=str,
+        choices=list(VOCODER_URLS.keys()),
+        default=None,
+        help="Name of the vocoder to embed in the ONNX graph",
+    )
+    parser.add_argument(
+        "--vocoder-checkpoint-path",
+        type=str,
+        default=None,
+        help="Vocoder checkpoint to embed in the ONNX graph for an `e2e` like experience",
+    )
+    parser.add_argument("--opset", type=int, default=DEFAULT_OPSET, help="ONNX opset version to use (default 15)")
+
+    args = parser.parse_args()
+
+    print(f"[🍵] Loading Matcha checkpoint from {args.checkpoint_path}")
+    print(f"Setting n_timesteps to {args.n_timesteps}")
+
+    checkpoint_path = Path(args.checkpoint_path)
+    matcha = load_matcha(checkpoint_path.stem, checkpoint_path, "cpu")
+
+    if args.vocoder_name or args.vocoder_checkpoint_path:
+        assert (
+            args.vocoder_name and args.vocoder_checkpoint_path
+        ), "Both --vocoder-name and --vocoder-checkpoint-path are required when embedding the vocoder in the ONNX graph."
+        vocoder, _ = load_vocoder(args.vocoder_name, args.vocoder_checkpoint_path, "cpu")
+    else:
+        vocoder = None
+
+    is_multi_speaker = matcha.n_spks > 1
+
+    dummy_input, input_names = get_inputs(is_multi_speaker)
+    model, output_names = get_exportable_module(matcha, vocoder, args.n_timesteps)
+
+    # Set dynamic shape for inputs/outputs
+    dynamic_axes = {
+        "x": {0: "batch_size", 1: "time"},
+        "x_lengths": {0: "batch_size"},
+    }
+
+    if vocoder is None:
+        dynamic_axes.update(
+            {
+                "mel": {0: "batch_size", 2: "time"},
+                "mel_lengths": {0: "batch_size"},
+            }
+        )
+    else:
+        print("Embedding the vocoder in the ONNX graph")
+        dynamic_axes.update(
+            {
+                "wav": {0: "batch_size", 1: "time"},
+                "wav_lengths": {0: "batch_size"},
+            }
+        )
+
+    if is_multi_speaker:
+        dynamic_axes["spks"] = {0: "batch_size"}
+
+    # Create the output directory (if not exists)
+    Path(args.output).parent.mkdir(parents=True, exist_ok=True)
+
+    model.to_onnx(
+        args.output,
+        dummy_input,
+        input_names=input_names,
+        output_names=output_names,
+        dynamic_axes=dynamic_axes,
+        opset_version=args.opset,
+        export_params=True,
+        do_constant_folding=True,
+    )
+    print(f"[🍵] ONNX model exported to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
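Once exported, the graph can be driven from onnxruntime. A sketch assuming a single-speaker model exported without an embedded vocoder (the file name matcha.onnx is illustrative; the input/output names come from the export script above):

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("matcha.onnx", providers=["CPUExecutionProvider"])

x = np.random.randint(0, 20, size=(1, 50), dtype=np.int64)  # phoneme ids
x_lengths = np.array([50], dtype=np.int64)
scales = np.array([0.667, 1.0], dtype=np.float32)           # [temperature, length_scale]

# Outputs follow the exported names: "mel" and "mel_lengths"
mel, mel_lengths = sess.run(None, {"x": x, "x_lengths": x_lengths, "scales": scales})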
third_party/Matcha-TTS/matcha/text/cleaners.py
ADDED
@@ -0,0 +1,116 @@
+""" from https://github.com/keithito/tacotron
+
+Cleaners are transformations that run over the input text at both training and eval time.
+
+Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+  1. "english_cleaners" for English text
+  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+     the symbols in symbols.py to match your data).
+"""
+
+import logging
+import re
+
+import phonemizer
+import piper_phonemize
+from unidecode import unidecode
+
+# To avoid excessive logging we set the log level of the phonemizer package to Critical
+critical_logger = logging.getLogger("phonemizer")
+critical_logger.setLevel(logging.CRITICAL)
+
+# Initializing the phonemizer globally significantly improves speed,
+# since the phonemizer is no longer initialized at every call.
+# Might be less flexible, but it is much, much faster.
+global_phonemizer = phonemizer.backend.EspeakBackend(
+    language="en-us",
+    preserve_punctuation=True,
+    with_stress=True,
+    language_switch="remove-flags",
+    logger=critical_logger,
+)
+
+
+# Regular expression matching whitespace:
+_whitespace_re = re.compile(r"\s+")
+
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [
+    (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+    for x in [
+        ("mrs", "misess"),
+        ("mr", "mister"),
+        ("dr", "doctor"),
+        ("st", "saint"),
+        ("co", "company"),
+        ("jr", "junior"),
+        ("maj", "major"),
+        ("gen", "general"),
+        ("drs", "doctors"),
+        ("rev", "reverend"),
+        ("lt", "lieutenant"),
+        ("hon", "honorable"),
+        ("sgt", "sergeant"),
+        ("capt", "captain"),
+        ("esq", "esquire"),
+        ("ltd", "limited"),
+        ("col", "colonel"),
+        ("ft", "fort"),
+    ]
+]
+
+
+def expand_abbreviations(text):
+    for regex, replacement in _abbreviations:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def lowercase(text):
+    return text.lower()
+
+
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, " ", text)
+
+
+def convert_to_ascii(text):
+    return unidecode(text)
+
+
+def basic_cleaners(text):
+    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+
+
+def transliteration_cleaners(text):
+    """Pipeline for non-English text that transliterates to ASCII."""
+    text = convert_to_ascii(text)
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+
+
+def english_cleaners2(text):
+    """Pipeline for English text, including abbreviation expansion, punctuation and stress."""
+    text = convert_to_ascii(text)
+    text = lowercase(text)
+    text = expand_abbreviations(text)
+    phonemes = global_phonemizer.phonemize([text], strip=True, njobs=1)[0]
+    phonemes = collapse_whitespace(phonemes)
+    return phonemes
+
+
+def english_cleaners_piper(text):
+    """Pipeline for English text, including abbreviation expansion, punctuation and stress."""
+    text = convert_to_ascii(text)
+    text = lowercase(text)
+    text = expand_abbreviations(text)
+    phonemes = "".join(piper_phonemize.phonemize_espeak(text=text, voice="en-US")[0])
+    phonemes = collapse_whitespace(phonemes)
+    return phonemes
third_party/Matcha-TTS/matcha/utils/__init__.py
ADDED
@@ -0,0 +1,5 @@
+from matcha.utils.instantiators import instantiate_callbacks, instantiate_loggers
+from matcha.utils.logging_utils import log_hyperparameters
+from matcha.utils.pylogger import get_pylogger
+from matcha.utils.rich_utils import enforce_tags, print_config_tree
+from matcha.utils.utils import extras, get_metric_value, task_wrapper
third_party/Matcha-TTS/matcha/utils/instantiators.py
ADDED
@@ -0,0 +1,56 @@
+from typing import List
+
+import hydra
+from lightning import Callback
+from lightning.pytorch.loggers import Logger
+from omegaconf import DictConfig
+
+from matcha.utils import pylogger
+
+log = pylogger.get_pylogger(__name__)
+
+
+def instantiate_callbacks(callbacks_cfg: DictConfig) -> List[Callback]:
+    """Instantiates callbacks from config.
+
+    :param callbacks_cfg: A DictConfig object containing callback configurations.
+    :return: A list of instantiated callbacks.
+    """
+    callbacks: List[Callback] = []
+
+    if not callbacks_cfg:
+        log.warning("No callback configs found! Skipping...")
+        return callbacks
+
+    if not isinstance(callbacks_cfg, DictConfig):
+        raise TypeError("Callbacks config must be a DictConfig!")
+
+    for _, cb_conf in callbacks_cfg.items():
+        if isinstance(cb_conf, DictConfig) and "_target_" in cb_conf:
+            log.info(f"Instantiating callback <{cb_conf._target_}>")  # pylint: disable=protected-access
+            callbacks.append(hydra.utils.instantiate(cb_conf))
+
+    return callbacks
+
+
+def instantiate_loggers(logger_cfg: DictConfig) -> List[Logger]:
+    """Instantiates loggers from config.
+
+    :param logger_cfg: A DictConfig object containing logger configurations.
+    :return: A list of instantiated loggers.
+    """
+    logger: List[Logger] = []
+
+    if not logger_cfg:
+        log.warning("No logger configs found! Skipping...")
+        return logger
+
+    if not isinstance(logger_cfg, DictConfig):
+        raise TypeError("Logger config must be a DictConfig!")
+
+    for _, lg_conf in logger_cfg.items():
+        if isinstance(lg_conf, DictConfig) and "_target_" in lg_conf:
+            log.info(f"Instantiating logger <{lg_conf._target_}>")  # pylint: disable=protected-access
+            logger.append(hydra.utils.instantiate(lg_conf))
+
+    return logger
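Both helpers accept a plain OmegaConf dict, so they can be tried without composing a full Hydra config. A minimal sketch; the config values are illustrative, but lightning.pytorch.callbacks.EarlyStopping is a real target:

from omegaconf import OmegaConf

callbacks_cfg = OmegaConf.create(
    {
        "early_stopping": {
            "_target_": "lightning.pytorch.callbacks.EarlyStopping",
            "monitor": "val_loss",
            "patience": 3,
        }
    }
)
# Assuming instantiate_callbacks from this module is in scope:
callbacks = instantiate_callbacks(callbacks_cfg)  # -> [EarlyStopping(...)]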
third_party/Matcha-TTS/matcha/utils/model.py
ADDED
@@ -0,0 +1,90 @@
+""" from https://github.com/jaywalnut310/glow-tts """
+
+import numpy as np
+import torch
+
+
+def sequence_mask(length, max_length=None):
+    if max_length is None:
+        max_length = length.max()
+    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+    return x.unsqueeze(0) < length.unsqueeze(1)
+
+
+def fix_len_compatibility(length, num_downsamplings_in_unet=2):
+    factor = torch.scalar_tensor(2).pow(num_downsamplings_in_unet)
+    length = (length / factor).ceil() * factor
+    if not torch.onnx.is_in_onnx_export():
+        return length.int().item()
+    else:
+        return length
+
+
+def convert_pad_shape(pad_shape):
+    inverted_shape = pad_shape[::-1]
+    pad_shape = [item for sublist in inverted_shape for item in sublist]
+    return pad_shape
+
+
+def generate_path(duration, mask):
+    device = duration.device
+
+    b, t_x, t_y = mask.shape
+    cum_duration = torch.cumsum(duration, 1)
+    path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device)
+
+    cum_duration_flat = cum_duration.view(b * t_x)
+    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+    path = path.view(b, t_x, t_y)
+    path = path - torch.nn.functional.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+    path = path * mask
+    return path
+
+
+def duration_loss(logw, logw_, lengths):
+    loss = torch.sum((logw - logw_) ** 2) / torch.sum(lengths)
+    return loss
+
+
+def normalize(data, mu, std):
+    if not isinstance(mu, (float, int)):
+        if isinstance(mu, list):
+            mu = torch.tensor(mu, dtype=data.dtype, device=data.device)
+        elif isinstance(mu, torch.Tensor):
+            mu = mu.to(data.device)
+        elif isinstance(mu, np.ndarray):
+            mu = torch.from_numpy(mu).to(data.device)
+        mu = mu.unsqueeze(-1)
+
+    if not isinstance(std, (float, int)):
+        if isinstance(std, list):
+            std = torch.tensor(std, dtype=data.dtype, device=data.device)
+        elif isinstance(std, torch.Tensor):
+            std = std.to(data.device)
+        elif isinstance(std, np.ndarray):
+            std = torch.from_numpy(std).to(data.device)
+        std = std.unsqueeze(-1)
+
+    return (data - mu) / std
+
+
+def denormalize(data, mu, std):
+    if not isinstance(mu, float):
+        if isinstance(mu, list):
+            mu = torch.tensor(mu, dtype=data.dtype, device=data.device)
+        elif isinstance(mu, torch.Tensor):
+            mu = mu.to(data.device)
+        elif isinstance(mu, np.ndarray):
+            mu = torch.from_numpy(mu).to(data.device)
+        mu = mu.unsqueeze(-1)
+
+    if not isinstance(std, float):
+        if isinstance(std, list):
+            std = torch.tensor(std, dtype=data.dtype, device=data.device)
+        elif isinstance(std, torch.Tensor):
+            std = std.to(data.device)
+        elif isinstance(std, np.ndarray):
+            std = torch.from_numpy(std).to(data.device)
+        std = std.unsqueeze(-1)
+
+    return data * std + mu
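generate_path turns per-token durations into a hard monotonic alignment by differencing cumulative-duration masks. A tiny check, assuming sequence_mask and generate_path from this module are in scope:

import torch

lengths = torch.tensor([3])                  # 3 text tokens
duration = torch.tensor([[2.0, 1.0, 3.0]])   # frames assigned to each token
t_y = int(duration.sum())                    # 6 mel frames

x_mask = sequence_mask(lengths, 3).unsqueeze(1).float()            # (1, 1, 3)
y_mask = sequence_mask(torch.tensor([t_y]), t_y).unsqueeze(1).float()
attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)             # (1, 1, 3, 6)

path = generate_path(duration, attn_mask.squeeze(1))
# path[0] is the hard alignment:
# [[1., 1., 0., 0., 0., 0.],
#  [0., 0., 1., 0., 0., 0.],
#  [0., 0., 0., 1., 1., 1.]]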
third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py
ADDED
@@ -0,0 +1,7 @@
+# from distutils.core import setup
+# from Cython.Build import cythonize
+# import numpy
+
+# setup(name='monotonic_align',
+#       ext_modules=cythonize("core.pyx"),
+#       include_dirs=[numpy.get_include()])
third_party/Matcha-TTS/matcha/utils/rich_utils.py
ADDED
@@ -0,0 +1,101 @@
+from pathlib import Path
+from typing import Sequence
+
+import rich
+import rich.syntax
+import rich.tree
+from hydra.core.hydra_config import HydraConfig
+from lightning.pytorch.utilities import rank_zero_only
+from omegaconf import DictConfig, OmegaConf, open_dict
+from rich.prompt import Prompt
+
+from matcha.utils import pylogger
+
+log = pylogger.get_pylogger(__name__)
+
+
+@rank_zero_only
+def print_config_tree(
+    cfg: DictConfig,
+    print_order: Sequence[str] = (
+        "data",
+        "model",
+        "callbacks",
+        "logger",
+        "trainer",
+        "paths",
+        "extras",
+    ),
+    resolve: bool = False,
+    save_to_file: bool = False,
+) -> None:
+    """Prints the contents of a DictConfig as a tree structure using the Rich library.
+
+    :param cfg: A DictConfig composed by Hydra.
+    :param print_order: Determines in what order config components are printed. Default is ``("data", "model",
+        "callbacks", "logger", "trainer", "paths", "extras")``.
+    :param resolve: Whether to resolve reference fields of DictConfig. Default is ``False``.
+    :param save_to_file: Whether to export config to the hydra output folder. Default is ``False``.
+    """
+    style = "dim"
+    tree = rich.tree.Tree("CONFIG", style=style, guide_style=style)
+
+    queue = []
+
+    # add fields from `print_order` to queue
+    for field in print_order:
+        _ = (
+            queue.append(field)
+            if field in cfg
+            else log.warning(f"Field '{field}' not found in config. Skipping '{field}' config printing...")
+        )
+
+    # add all the other fields to queue (not specified in `print_order`)
+    for field in cfg:
+        if field not in queue:
+            queue.append(field)
+
+    # generate config tree from queue
+    for field in queue:
+        branch = tree.add(field, style=style, guide_style=style)
+
+        config_group = cfg[field]
+        if isinstance(config_group, DictConfig):
+            branch_content = OmegaConf.to_yaml(config_group, resolve=resolve)
+        else:
+            branch_content = str(config_group)
+
+        branch.add(rich.syntax.Syntax(branch_content, "yaml"))
+
+    # print config tree
+    rich.print(tree)
+
+    # save config tree to file
+    if save_to_file:
+        with open(Path(cfg.paths.output_dir, "config_tree.log"), "w") as file:
+            rich.print(tree, file=file)
+
+
+@rank_zero_only
+def enforce_tags(cfg: DictConfig, save_to_file: bool = False) -> None:
+    """Prompts user to input tags from command line if no tags are provided in config.
+
+    :param cfg: A DictConfig composed by Hydra.
+    :param save_to_file: Whether to export tags to the hydra output folder. Default is ``False``.
+    """
+    if not cfg.get("tags"):
+        if "id" in HydraConfig().cfg.hydra.job:
+            raise ValueError("Specify tags before launching a multirun!")
+
+        log.warning("No tags provided in config. Prompting user to input tags...")
+        tags = Prompt.ask("Enter a list of comma separated tags", default="dev")
+        tags = [t.strip() for t in tags.split(",") if t != ""]
+
+        with open_dict(cfg):
+            cfg.tags = tags
+
+        log.info(f"Tags: {cfg.tags}")
+
+    if save_to_file:
+        with open(Path(cfg.paths.output_dir, "tags.log"), "w") as file:
+            rich.print(cfg.tags, file=file)
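print_config_tree can be exercised with a hand-built config (normally Hydra composes it, and rank_zero_only is a no-op wrapper on a single process). A sketch with illustrative field values mirroring the repo's config groups:

from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "model": {"n_feats": 80, "n_spks": 1},
        "trainer": {"max_epochs": -1, "precision": "16-mixed"},
    }
)
# Only print fields that exist; missing ones just log a warning
print_config_tree(cfg, print_order=("model", "trainer"), resolve=True)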
third_party/Matcha-TTS/matcha/utils/utils.py
ADDED
@@ -0,0 +1,219 @@
+import os
+import sys
+import warnings
+from importlib.util import find_spec
+from pathlib import Path
+from typing import Any, Callable, Dict, Tuple
+
+import gdown
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+import wget
+from omegaconf import DictConfig
+
+from matcha.utils import pylogger, rich_utils
+
+log = pylogger.get_pylogger(__name__)
+
+
+def extras(cfg: DictConfig) -> None:
+    """Applies optional utilities before the task is started.
+
+    Utilities:
+        - Ignoring python warnings
+        - Setting tags from command line
+        - Rich config printing
+
+    :param cfg: A DictConfig object containing the config tree.
+    """
+    # return if no `extras` config
+    if not cfg.get("extras"):
+        log.warning("Extras config not found! <cfg.extras=null>")
+        return
+
+    # disable python warnings
+    if cfg.extras.get("ignore_warnings"):
+        log.info("Disabling python warnings! <cfg.extras.ignore_warnings=True>")
+        warnings.filterwarnings("ignore")
+
+    # prompt user to input tags from command line if none are provided in the config
+    if cfg.extras.get("enforce_tags"):
+        log.info("Enforcing tags! <cfg.extras.enforce_tags=True>")
+        rich_utils.enforce_tags(cfg, save_to_file=True)
+
+    # pretty print config tree using Rich library
+    if cfg.extras.get("print_config"):
+        log.info("Printing config tree with Rich! <cfg.extras.print_config=True>")
+        rich_utils.print_config_tree(cfg, resolve=True, save_to_file=True)
+
+
+def task_wrapper(task_func: Callable) -> Callable:
+    """Optional decorator that controls the failure behavior when executing the task function.
+
+    This wrapper can be used to:
+        - make sure loggers are closed even if the task function raises an exception (prevents multirun failure)
+        - save the exception to a `.log` file
+        - mark the run as failed with a dedicated file in the `logs/` folder (so we can find and rerun it later)
+        - etc. (adjust depending on your needs)
+
+    Example:
+    ```
+    @utils.task_wrapper
+    def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        ...
+        return metric_dict, object_dict
+    ```
+
+    :param task_func: The task function to be wrapped.
+
+    :return: The wrapped task function.
+    """
+
+    def wrap(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        # execute the task
+        try:
+            metric_dict, object_dict = task_func(cfg=cfg)
+
+        # things to do if exception occurs
+        except Exception as ex:
+            # save exception to `.log` file
+            log.exception("")
+
+            # some hyperparameter combinations might be invalid or cause out-of-memory errors
+            # so when using hparam search plugins like Optuna, you might want to disable
+            # raising the below exception to avoid multirun failure
+            raise ex
+
+        # things to always do after either success or exception
+        finally:
+            # display output dir path in terminal
+            log.info(f"Output dir: {cfg.paths.output_dir}")
+
+            # always close wandb run (even if exception occurs so multirun won't fail)
+            if find_spec("wandb"):  # check if wandb is installed
+                import wandb
+
+                if wandb.run:
+                    log.info("Closing wandb!")
+                    wandb.finish()
+
+        return metric_dict, object_dict
+
+    return wrap
+
+
+def get_metric_value(metric_dict: Dict[str, Any], metric_name: str) -> float:
+    """Safely retrieves value of the metric logged in LightningModule.
+
+    :param metric_dict: A dict containing metric values.
+    :param metric_name: The name of the metric to retrieve.
+    :return: The value of the metric.
+    """
+    if not metric_name:
+        log.info("Metric name is None! Skipping metric value retrieval...")
+        return None
+
+    if metric_name not in metric_dict:
+        raise ValueError(
+            f"Metric value not found! <metric_name={metric_name}>\n"
+            "Make sure metric name logged in LightningModule is correct!\n"
+            "Make sure `optimized_metric` name in `hparams_search` config is correct!"
+        )
+
+    metric_value = metric_dict[metric_name].item()
+    log.info(f"Retrieved metric value! <{metric_name}={metric_value}>")
+
+    return metric_value
+
+
+def intersperse(lst, item):
+    # Adds blank symbol
+    result = [item] * (len(lst) * 2 + 1)
+    result[1::2] = lst
+    return result
+
+
+def save_figure_to_numpy(fig):
+    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="")
+    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+    return data
+
+
+def plot_tensor(tensor):
+    plt.style.use("default")
+    fig, ax = plt.subplots(figsize=(12, 3))
+    im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation="none")
+    plt.colorbar(im, ax=ax)
+    plt.tight_layout()
+    fig.canvas.draw()
+    data = save_figure_to_numpy(fig)
+    plt.close()
+    return data
+
+
+def save_plot(tensor, savepath):
+    plt.style.use("default")
+    fig, ax = plt.subplots(figsize=(12, 3))
+    im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation="none")
+    plt.colorbar(im, ax=ax)
+    plt.tight_layout()
+    fig.canvas.draw()
+    plt.savefig(savepath)
+    plt.close()
+
+
+def to_numpy(tensor):
+    if isinstance(tensor, np.ndarray):
+        return tensor
+    elif isinstance(tensor, torch.Tensor):
+        return tensor.detach().cpu().numpy()
+    elif isinstance(tensor, list):
+        return np.array(tensor)
+    else:
+        raise TypeError("Unsupported type for conversion to numpy array")
+
+
+def get_user_data_dir(appname="matcha_tts"):
+    """
+    Args:
+        appname (str): Name of application
+
+    Returns:
+        Path: path to user data directory
+    """
+
+    MATCHA_HOME = os.environ.get("MATCHA_HOME")
+    if MATCHA_HOME is not None:
+        ans = Path(MATCHA_HOME).expanduser().resolve(strict=False)
+    elif sys.platform == "win32":
+        import winreg  # pylint: disable=import-outside-toplevel
+
+        key = winreg.OpenKey(
+            winreg.HKEY_CURRENT_USER,
+            r"Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders",
+        )
+        dir_, _ = winreg.QueryValueEx(key, "Local AppData")
+        ans = Path(dir_).resolve(strict=False)
+    elif sys.platform == "darwin":
+        ans = Path("~/Library/Application Support/").expanduser()
+    else:
+        ans = Path.home().joinpath(".local/share")
+
+    final_path = ans.joinpath(appname)
+    final_path.mkdir(parents=True, exist_ok=True)
+    return final_path
+
+
+def assert_model_downloaded(checkpoint_path, url, use_wget=True):
+    if Path(checkpoint_path).exists():
+        log.debug(f"[+] Model already present at {checkpoint_path}!")
+        print(f"[+] Model already present at {checkpoint_path}!")
+        return
+    log.info(f"[-] Model not found at {checkpoint_path}! Will download it")
+    print(f"[-] Model not found at {checkpoint_path}! Will download it")
+    checkpoint_path = str(checkpoint_path)
+    if not use_wget:
+        gdown.download(url=url, output=checkpoint_path, quiet=False, fuzzy=True)
+    else:
+        wget.download(url=url, out=checkpoint_path)
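Two of the small helpers above in action (assuming they are imported from this module; the Linux default path shown is illustrative and MATCHA_HOME overrides it):

# intersperse() pads a phoneme id sequence with a blank token (id 0 here),
# yielding a sequence of length 2 * len(lst) + 1:
print(intersperse([5, 6, 7], 0))  # -> [0, 5, 0, 6, 0, 7, 0]

# get_user_data_dir() resolves (and creates) the per-user cache directory,
# e.g. ~/.local/share/matcha_tts on Linux:
print(get_user_data_dir())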