diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..fdd121fa19601c78d55ec603ab80be3986afcc16 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+vocos/data/filelist.train filter=lfs diff=lfs merge=lfs -text
+vocos/data/filelist2.train filter=lfs diff=lfs merge=lfs -text
diff --git a/vocos/.github/workflows/pypi-release.yml b/vocos/.github/workflows/pypi-release.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f184b085bc478acfac31aa41202813ce555c1637
--- /dev/null
+++ b/vocos/.github/workflows/pypi-release.yml
@@ -0,0 +1,26 @@
+# Builds the sdist and wheel, then uploads them to PyPI whenever a GitHub release is published.
+name: Publish Python package
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  publish:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.x"
+      # PyPA's `build` front end replaces the deprecated direct `setup.py sdist bdist_wheel` call.
+      - name: Install build tooling
+        run: python -m pip install --upgrade build
+      - name: Build sdist and wheel
+        run: python -m build
+      # The token is read from the repository secret PYPI_API_TOKEN; it is never stored in this file.
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          password: ${{ secrets.PYPI_API_TOKEN }}
\ No newline at end of file
diff --git a/vocos/.gitignore b/vocos/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..1b8d1cc3b4cf941644cd0353bc7c26da53c0780b
--- /dev/null
+++ b/vocos/.gitignore
@@ -0,0 +1,164 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+logs/
+*.pt
+*.ckpt
\ No newline at end of file
diff --git a/vocos/LICENSE b/vocos/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..c37bdaf99c6921f5849425d546069e972f52d7fa
--- /dev/null
+++ b/vocos/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Charactr Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/vocos/README.md b/vocos/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cc471c1aa0ce066e2a49574fad1eb8c94a25f792
--- /dev/null
+++ b/vocos/README.md
@@ -0,0 +1,124 @@
+# Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis
+
+[Audio samples](https://gemelo-ai.github.io/vocos/) |
+Paper [[abs]](https://arxiv.org/abs/2306.00814) [[pdf]](https://arxiv.org/pdf/2306.00814.pdf)
+
+Vocos is a fast neural vocoder designed to synthesize audio waveforms from acoustic features. Trained using a Generative
+Adversarial Network (GAN) objective, Vocos can generate waveforms in a single forward pass. Unlike other typical
+GAN-based vocoders, Vocos does not model audio samples in the time domain. Instead, it generates spectral
+coefficients, facilitating rapid audio reconstruction through inverse Fourier transform.
+
+## Installation
+
+To use Vocos only in inference mode, install it using:
+
+```bash
+pip install vocos
+```
+
+If you wish to train the model, install it with additional dependencies:
+
+```bash
+pip install vocos[train]
+```
+
+## Usage
+
+### Reconstruct audio from mel-spectrogram
+
+```python
+import torch
+
+from vocos import Vocos
+
+vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
+
+mel = torch.randn(1, 100, 256) # B, C, T
+audio = vocos.decode(mel)
+```
+
+Copy-synthesis from a file:
+
+```python
+import torchaudio
+
+y, sr = torchaudio.load(YOUR_AUDIO_FILE)
+if y.size(0) > 1: # mix to mono
+ y = y.mean(dim=0, keepdim=True)
+y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=24000)
+y_hat = vocos(y)
+```
+
+### Reconstruct audio from EnCodec tokens
+
+Additionally, you need to provide a `bandwidth_id`: the index of the bandwidth embedding to use, taken from the
+list `[1.5, 3.0, 6.0, 12.0]` (kbps); for example, `2` selects 6 kbps.
+
+```python
+vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz")
+
+audio_tokens = torch.randint(low=0, high=1024, size=(8, 200)) # 8 codebooks, 200 frames
+features = vocos.codes_to_features(audio_tokens)
+bandwidth_id = torch.tensor([2]) # 6 kbps
+
+audio = vocos.decode(features, bandwidth_id=bandwidth_id)
+```
+
+Copy-synthesis from a file: It extracts and quantizes features with EnCodec, then reconstructs them with Vocos in a
+single forward pass.
+
+```python
+y, sr = torchaudio.load(YOUR_AUDIO_FILE)
+if y.size(0) > 1: # mix to mono
+ y = y.mean(dim=0, keepdim=True)
+y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=24000)
+
+y_hat = vocos(y, bandwidth_id=bandwidth_id)
+```
+
+### Integrate with 🐶 [Bark](https://github.com/suno-ai/bark) text-to-audio model
+
+See [example notebook](notebooks%2FBark%2BVocos.ipynb).
+
+## Pre-trained models
+
+| Model Name | Dataset | Training Iterations | Parameters
+|-------------------------------------------------------------------------------------|---------------|-------------------|------------|
+| [charactr/vocos-mel-24khz](https://huggingface.co/charactr/vocos-mel-24khz) | LibriTTS | 1M | 13.5M
+| [charactr/vocos-encodec-24khz](https://huggingface.co/charactr/vocos-encodec-24khz) | DNS Challenge | 2M | 7.9M
+
+## Training
+
+Prepare a filelist of audio files for the training and validation set:
+
+```bash
+find "$TRAIN_DATASET_DIR" -name "*.wav" > filelist.train  # pattern is quoted so find, not the shell, expands it
+find "$VAL_DATASET_DIR" -name "*.wav" > filelist.val
+```
+
+Fill a config file, e.g. [vocos.yaml](configs%2Fvocos.yaml), with your filelist paths and start training with:
+
+```bash
+python train.py -c configs/vocos.yaml
+```
+
+Refer to the [PyTorch Lightning documentation](https://lightning.ai/docs/pytorch/stable/) for details about customizing the
+training pipeline.
+
+## Citation
+
+If this code contributes to your research, please cite our work:
+
+```
+@article{siuzdak2023vocos,
+ title={Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis},
+ author={Siuzdak, Hubert},
+ journal={arXiv preprint arXiv:2306.00814},
+ year={2023}
+}
+```
+
+## License
+
+The code in this repository is released under the MIT license as found in the
+[LICENSE](LICENSE) file.
diff --git a/vocos/configs/vocos-encodec.yaml b/vocos/configs/vocos-encodec.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3d696ab70cd30def01cab5824b246b3f5286eb99
--- /dev/null
+++ b/vocos/configs/vocos-encodec.yaml
@@ -0,0 +1,86 @@
+# pytorch_lightning==1.8.6 | Training config: VocosEncodecExp (EncodecFeatures -> VocosBackbone -> ISTFTHead)
+seed_everything: 4444
+
+data:
+ class_path: vocos.dataset.VocosDataModule
+ init_args:
+ train_params:
+ filelist_path: ???  # placeholder: replace with the path to your training filelist
+ sampling_rate: 24000
+ num_samples: 24000
+ batch_size: 16
+ num_workers: 8
+
+ val_params:
+ filelist_path: ???  # placeholder: replace with the path to your validation filelist
+ sampling_rate: 24000
+ num_samples: 24000
+ batch_size: 16
+ num_workers: 8
+
+model:
+ class_path: vocos.experiment.VocosEncodecExp
+ init_args:
+ sample_rate: 24000
+ initial_learning_rate: 5e-4
+ mel_loss_coeff: 45
+ mrd_loss_coeff: 1.0
+ num_warmup_steps: 0 # Optimizers warmup steps
+ pretrain_mel_steps: 0 # 0 means GAN objective from the first iteration
+
+ # automatic evaluation
+ evaluate_utmos: true
+ evaluate_pesq: true
+ evaluate_periodicty: true  # NOTE(review): key is spelled "periodicty"; presumably matches the experiment class argument - confirm before renaming
+
+ feature_extractor:
+ class_path: vocos.feature_extractors.EncodecFeatures
+ init_args:
+ encodec_model: encodec_24khz
+ bandwidths: [1.5, 3.0, 6.0, 12.0]
+ train_codebooks: false
+
+ backbone:
+ class_path: vocos.models.VocosBackbone
+ init_args:
+ input_channels: 128
+ dim: 384
+ intermediate_dim: 1152
+ num_layers: 8
+ adanorm_num_embeddings: 4  # len(bandwidths): the feature_extractor above lists 4 bandwidths
+
+ head:
+ class_path: vocos.heads.ISTFTHead
+ init_args:
+ dim: 384  # same value as the backbone dim above; presumably the two must match - verify
+ n_fft: 1280
+ hop_length: 320
+ padding: same
+
+trainer:
+ logger:
+ class_path: pytorch_lightning.loggers.TensorBoardLogger
+ init_args:
+ save_dir: logs/
+ callbacks:
+ - class_path: pytorch_lightning.callbacks.LearningRateMonitor
+ - class_path: pytorch_lightning.callbacks.ModelSummary
+ init_args:
+ max_depth: 2
+ - class_path: pytorch_lightning.callbacks.ModelCheckpoint
+ init_args:
+ monitor: val_loss
+ filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
+ save_top_k: 3
+ save_last: true
+ - class_path: vocos.helpers.GradNormCallback
+
+ # Lightning counts max_steps across all optimizer steps (rather than number of batches),
+ # so 2000000 here corresponds to 1M generator steps plus 1M discriminator steps.
+ max_steps: 2000000
+ # You might want to limit val batches when evaluating all the metrics, as they are time-consuming
+ limit_val_batches: 100
+ accelerator: gpu
+ strategy: ddp
+ devices: [0]
+ log_every_n_steps: 100
diff --git a/vocos/configs/vocos-imdct.yaml b/vocos/configs/vocos-imdct.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6b8f9b57d73faff121986683bfc38845b84b71b9
--- /dev/null
+++ b/vocos/configs/vocos-imdct.yaml
@@ -0,0 +1,86 @@
+# pytorch_lightning==1.8.6 | Training config: VocosExp (MelSpectrogramFeatures -> VocosBackbone -> IMDCTCosHead)
+seed_everything: 4444
+
+data:
+ class_path: vocos.dataset.VocosDataModule
+ init_args:
+ train_params:
+ filelist_path: ???  # placeholder: replace with the path to your training filelist
+ sampling_rate: 24000
+ num_samples: 16384
+ batch_size: 16
+ num_workers: 8
+
+ val_params:
+ filelist_path: ???  # placeholder: replace with the path to your validation filelist
+ sampling_rate: 24000
+ num_samples: 48384
+ batch_size: 16
+ num_workers: 8
+
+model:
+ class_path: vocos.experiment.VocosExp
+ init_args:
+ sample_rate: 24000
+ initial_learning_rate: 5e-4
+ mel_loss_coeff: 45
+ mrd_loss_coeff: 0.1
+ num_warmup_steps: 0 # Optimizers warmup steps
+ pretrain_mel_steps: 0 # 0 means GAN objective from the first iteration
+
+ # automatic evaluation
+ evaluate_utmos: true
+ evaluate_pesq: true
+ evaluate_periodicty: true  # NOTE(review): key is spelled "periodicty"; presumably matches the experiment class argument - confirm before renaming
+
+ feature_extractor:
+ class_path: vocos.feature_extractors.MelSpectrogramFeatures
+ init_args:
+ sample_rate: 24000
+ n_fft: 1024
+ hop_length: 256
+ n_mels: 100
+ padding: center
+
+ backbone:
+ class_path: vocos.models.VocosBackbone
+ init_args:
+ input_channels: 100  # same value as feature_extractor n_mels; presumably must stay in sync - verify
+ dim: 512
+ intermediate_dim: 1536
+ num_layers: 8
+
+ head:
+ class_path: vocos.heads.IMDCTCosHead
+ init_args:
+ dim: 512  # same value as the backbone dim above; presumably the two must match - verify
+ mdct_frame_len: 512  # mel-spec hop_length * 2 (256 * 2)
+ padding: center
+
+trainer:
+ logger:
+ class_path: pytorch_lightning.loggers.TensorBoardLogger
+ init_args:
+ save_dir: logs/
+ callbacks:
+ - class_path: pytorch_lightning.callbacks.LearningRateMonitor
+ - class_path: pytorch_lightning.callbacks.ModelSummary
+ init_args:
+ max_depth: 2
+ - class_path: pytorch_lightning.callbacks.ModelCheckpoint
+ init_args:
+ monitor: val_loss
+ filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
+ save_top_k: 3
+ save_last: true
+ - class_path: vocos.helpers.GradNormCallback
+
+ # Lightning counts max_steps across all optimizer steps (rather than number of batches),
+ # so 2000000 here corresponds to 1M generator steps plus 1M discriminator steps.
+ max_steps: 2000000
+ # You might want to limit val batches when evaluating all the metrics, as they are time-consuming
+ limit_val_batches: 100
+ accelerator: gpu
+ strategy: ddp
+ devices: [0]
+ log_every_n_steps: 100
diff --git a/vocos/configs/vocos-resnet.yaml b/vocos/configs/vocos-resnet.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..db7d25a4ba013f06a366efa79f049186817e735b
--- /dev/null
+++ b/vocos/configs/vocos-resnet.yaml
@@ -0,0 +1,86 @@
+# pytorch_lightning==1.8.6 | Training config: VocosExp (MelSpectrogramFeatures -> VocosResNetBackbone -> ISTFTHead)
+seed_everything: 4444
+
+data:
+ class_path: vocos.dataset.VocosDataModule
+ init_args:
+ train_params:
+ filelist_path: ???  # placeholder: replace with the path to your training filelist
+ sampling_rate: 24000
+ num_samples: 16384
+ batch_size: 16
+ num_workers: 8
+
+ val_params:
+ filelist_path: ???  # placeholder: replace with the path to your validation filelist
+ sampling_rate: 24000
+ num_samples: 48384
+ batch_size: 16
+ num_workers: 8
+
+model:
+ class_path: vocos.experiment.VocosExp
+ init_args:
+ sample_rate: 24000
+ initial_learning_rate: 5e-4
+ mel_loss_coeff: 45
+ mrd_loss_coeff: 0.1
+ num_warmup_steps: 0 # Optimizers warmup steps
+ pretrain_mel_steps: 0 # 0 means GAN objective from the first iteration
+
+ # automatic evaluation
+ evaluate_utmos: true
+ evaluate_pesq: true
+ evaluate_periodicty: true  # NOTE(review): key is spelled "periodicty"; presumably matches the experiment class argument - confirm before renaming
+
+ feature_extractor:
+ class_path: vocos.feature_extractors.MelSpectrogramFeatures
+ init_args:
+ sample_rate: 24000
+ n_fft: 1024
+ hop_length: 256
+ n_mels: 100
+ padding: center
+
+ backbone:
+ class_path: vocos.models.VocosResNetBackbone
+ init_args:
+ input_channels: 100  # same value as feature_extractor n_mels; presumably must stay in sync - verify
+ dim: 512
+ num_blocks: 3
+
+ head:
+ class_path: vocos.heads.ISTFTHead
+ init_args:
+ dim: 512  # same value as the backbone dim above; presumably the two must match - verify
+ n_fft: 1024  # same value as feature_extractor n_fft
+ hop_length: 256  # same value as feature_extractor hop_length
+ padding: center
+
+trainer:
+ logger:
+ class_path: pytorch_lightning.loggers.TensorBoardLogger
+ init_args:
+ save_dir: logs/
+ callbacks:
+ - class_path: pytorch_lightning.callbacks.LearningRateMonitor
+ - class_path: pytorch_lightning.callbacks.ModelSummary
+ init_args:
+ max_depth: 2
+ - class_path: pytorch_lightning.callbacks.ModelCheckpoint
+ init_args:
+ monitor: val_loss
+ filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
+ save_top_k: 3
+ save_last: true
+ - class_path: vocos.helpers.GradNormCallback
+
+ # Lightning counts max_steps across all optimizer steps (rather than number of batches),
+ # so 2000000 here corresponds to 1M generator steps plus 1M discriminator steps.
+ max_steps: 2000000
+ # You might want to limit val batches when evaluating all the metrics, as they are time-consuming
+ limit_val_batches: 100
+ accelerator: gpu
+ strategy: ddp
+ devices: [0]
+ log_every_n_steps: 100
diff --git a/vocos/configs/vocos.yaml b/vocos/configs/vocos.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1c393c1adf27e1fd6ea5c0be5c6f7f950ba96a1d
--- /dev/null
+++ b/vocos/configs/vocos.yaml
@@ -0,0 +1,90 @@
+# pytorch_lightning==1.8.6 | Training config: VocosExp at 44100 Hz (MelSpectrogramFeatures -> VocosBackbone -> ISTFTHead)
+seed_everything: 4444
+
+data:
+ class_path: vocos.dataset.VocosDataModule
+ init_args:
+ train_params:
+ filelist_path: "/home/ubuntu/vocos/data/filelist.train"  # machine-specific absolute path; point this at your own filelist
+ sampling_rate: 44100
+ num_samples: 65536
+ batch_size: 58
+ num_workers: 8
+
+ val_params:
+ filelist_path: "/home/ubuntu/vocos/data/filelist.val"  # machine-specific absolute path; point this at your own filelist
+ sampling_rate: 44100
+ num_samples: 65536
+ batch_size: 16
+ num_workers: 8
+
+model:
+ class_path: vocos.experiment.VocosExp
+ init_args:
+ sample_rate: 44100
+ initial_learning_rate: 5e-4
+ mel_loss_coeff: 45
+ mrd_loss_coeff: 0.1
+ num_warmup_steps: 0 # Optimizers warmup steps
+ pretrain_mel_steps: 0 # 0 means GAN objective from the first iteration
+
+ # automatic evaluation
+ evaluate_utmos: true
+ evaluate_pesq: true
+ evaluate_periodicty: true  # NOTE(review): key is spelled "periodicty"; presumably matches the experiment class argument - confirm before renaming
+
+ feature_extractor:
+ class_path: vocos.feature_extractors.MelSpectrogramFeatures
+ init_args:
+ sample_rate: 44100
+ n_fft: 2048
+ hop_length: 512
+ win_length: 2048
+ n_mels: 128
+ padding: center
+
+ backbone:
+ class_path: vocos.models.VocosBackbone
+ init_args:
+ input_channels: 128  # same value as feature_extractor n_mels; presumably must stay in sync - verify
+ dim: 512
+ intermediate_dim: 1536
+ num_layers: 8
+
+ head:
+ class_path: vocos.heads.ISTFTHead
+ init_args:
+ dim: 512  # same value as the backbone dim above; presumably the two must match - verify
+ n_fft: 2048  # same value as feature_extractor n_fft
+ hop_length: 512  # same value as feature_extractor hop_length
+ padding: center
+
+trainer:
+ logger:
+ class_path: pytorch_lightning.loggers.TensorBoardLogger
+ init_args:
+ save_dir: logs/
+ callbacks:
+ - class_path: pytorch_lightning.callbacks.LearningRateMonitor
+ - class_path: pytorch_lightning.callbacks.ModelSummary
+ init_args:
+ max_depth: 2
+ - class_path: pytorch_lightning.callbacks.ModelCheckpoint
+ init_args:
+ # every_n_train_steps: 5000
+ # filename: vocos_checkpoint_step_{step}
+ monitor: val_loss
+ filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
+ save_top_k: 3
+ save_last: true
+ - class_path: vocos.helpers.GradNormCallback
+
+ # Lightning counts max_steps across all optimizer steps (rather than number of batches),
+ # so 2000000 here corresponds to 1M generator steps plus 1M discriminator steps.
+ max_steps: 2000000
+ # You might want to limit val batches when evaluating all the metrics, as they are time-consuming
+ limit_val_batches: 50
+ accelerator: gpu
+ strategy: ddp
+ devices: [0,1]
+ log_every_n_steps: 100
diff --git a/vocos/configs/vocos24.yaml b/vocos/configs/vocos24.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e0a305507e44826d2b5e83d397296f8e6b0ec159
--- /dev/null
+++ b/vocos/configs/vocos24.yaml
@@ -0,0 +1,90 @@
+# pytorch_lightning==1.8.6 | Training config: VocosExp at 24000 Hz (MelSpectrogramFeatures -> VocosBackbone -> ISTFTHead)
+seed_everything: 4444
+
+data:
+ class_path: vocos.dataset.VocosDataModule
+ init_args:
+ train_params:
+ filelist_path: "/home/ubuntu/vocos/data/filelist2.train"  # machine-specific absolute path; point this at your own filelist
+ sampling_rate: 24000
+ num_samples: 57600
+ batch_size: 64
+ num_workers: 8
+
+ val_params:
+ filelist_path: "/home/ubuntu/vocos/data/filelist.val"  # machine-specific absolute path; point this at your own filelist
+ sampling_rate: 24000
+ num_samples: 57600
+ batch_size: 16
+ num_workers: 8
+
+model:
+ class_path: vocos.experiment.VocosExp
+ init_args:
+ sample_rate: 24000
+ initial_learning_rate: 5e-4
+ mel_loss_coeff: 45
+ mrd_loss_coeff: 0.1
+ num_warmup_steps: 0 # Optimizers warmup steps
+ pretrain_mel_steps: 0 # 0 means GAN objective from the first iteration
+
+ # automatic evaluation
+ evaluate_utmos: true
+ evaluate_pesq: true
+ evaluate_periodicty: true  # NOTE(review): key is spelled "periodicty"; presumably matches the experiment class argument - confirm before renaming
+
+ feature_extractor:
+ class_path: vocos.feature_extractors.MelSpectrogramFeatures
+ init_args:
+ sample_rate: 24000
+ n_fft: 2048
+ hop_length: 300
+ win_length: 1200  # NOTE(review): shorter than n_fft (2048), and the head below sets no win_length - confirm the mismatch is intended
+ n_mels: 80
+ padding: center
+
+ backbone:
+ class_path: vocos.models.VocosBackbone
+ init_args:
+ input_channels: 80  # same value as feature_extractor n_mels; presumably must stay in sync - verify
+ dim: 512
+ intermediate_dim: 1536
+ num_layers: 8
+
+ head:
+ class_path: vocos.heads.ISTFTHead
+ init_args:
+ dim: 512  # same value as the backbone dim above; presumably the two must match - verify
+ n_fft: 2048  # same value as feature_extractor n_fft
+ hop_length: 300  # same value as feature_extractor hop_length
+ padding: center
+
+trainer:
+ logger:
+ class_path: pytorch_lightning.loggers.TensorBoardLogger
+ init_args:
+ save_dir: logs/
+ callbacks:
+ - class_path: pytorch_lightning.callbacks.LearningRateMonitor
+ - class_path: pytorch_lightning.callbacks.ModelSummary
+ init_args:
+ max_depth: 2
+ - class_path: pytorch_lightning.callbacks.ModelCheckpoint
+ init_args:
+ # every_n_train_steps: 5000
+ # filename: vocos_checkpoint_step_{step}
+ monitor: val_loss
+ filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
+ save_top_k: 3
+ save_last: true
+ - class_path: vocos.helpers.GradNormCallback
+
+ # Lightning counts max_steps across all optimizer steps (rather than number of batches),
+ # so 2000000 here corresponds to 1M generator steps plus 1M discriminator steps.
+ max_steps: 2000000
+ # You might want to limit val batches when evaluating all the metrics, as they are time-consuming
+ limit_val_batches: 50
+ accelerator: gpu
+ strategy: ddp
+ devices: [0,1]
+ log_every_n_steps: 100
diff --git a/vocos/data/filelist.train b/vocos/data/filelist.train
new file mode 100644
index 0000000000000000000000000000000000000000..e250af3992c8595708b2cd8e2d031d494a29a6bf
--- /dev/null
+++ b/vocos/data/filelist.train
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:186a28c6524b34bcdfbbe01ed42257db16753ce1e35770385e004fb6ea7219b8
+size 150129204
diff --git a/vocos/data/filelist.val b/vocos/data/filelist.val
new file mode 100644
index 0000000000000000000000000000000000000000..f34bd9f66ee9902085174dffcfb03b1d37a05444
--- /dev/null
+++ b/vocos/data/filelist.val
@@ -0,0 +1,500 @@
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Gale/BG3 Voice Lines: Gale [CpasYgOyyl4]_part_003/BG3 Voice Lines: Gale [CpasYgOyyl4]_part_003_chunk1424.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/النوم عند قدمي الجبل/النوم عند قدمي الجبل_chunk854.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/حجرتان وصالة/حجرتان وصالة_chunk715.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/СЕМЕЙНЫЙ ЮРИСТ. Развод, алименты и раздел имущества [gp2-6PZBZmU]/СЕМЕ_speaker_SPEAKER_00/СЕМЕ_speaker_SPEAKER_00_chunk513.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/أبابيل/أبابيل_chunk979.mp3
+/home/ubuntu/respair/jpn/moe/2cf01874/wav/2cf01874_1845.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Is Amazon Good For Small Business? [x6hj-XeDKD4]/Is A_speaker_SPEAKER_05/Is A_speaker_SPEAKER_05_chunk93.wav
+/home/ubuntu/respair/data_cache/Final_Persian/رویا میرعلمی/dfe42e8c-acda-4b52-b0e6-8f02cdc9dbde/dfe42e8c-acda-4b52-b0e6-8f02cdc9dbde_chunk182.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/امیررضا علی زاده/1a762047-afb4-4b6a-a2ae-62b6e80edcbd/1a762047-afb4-4b6a-a2ae-62b6e80edcbd_chunk165.mp3
+/home/ubuntu/respair/jpn/moe/917feebd/wav/917feebd_2750.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/ЛОГОПЕД. Развитие речи, постановка звуков, массаж и подрезание уздечки [ra3U7s-VZzI]/ЛОГО_speaker_SPEAKER_03/ЛОГО_speaker_SPEAKER_03_chunk183.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/НАСКОЛЬКО ТЫ МУЖИК? Про барбершопы, лысые яйца и давление общества [y2CUqJKfAY8]/НАСК_speaker_SPEAKER_05/НАСК_speaker_SPEAKER_05_chunk414.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/mayu/mayu_cgss/mayu_cgss_card_100270/mayu_cgss_voice_100270_2_03.wav
+/home/ubuntu/respair/data_cache/English_Final/with_ids/Ani_speech/audio_13604.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Сарко Де Рази - Украденный свет/Сарк_speaker_SPEAKER_00/Сарк_speaker_SPEAKER_00_chunk277.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/نون/نون_chunk636.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/vedud/Муратов – что происходит с Россией [z1C01Gc9w-w]/Мура_speaker_SPEAKER_01/Мура_speaker_SPEAKER_01_chunk410.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Laezel/BG3 Voice Lines: Lae'zel [q-Z-9hfSirg]_part_003/BG3 Voice Lines: Lae'zel [q-Z-9hfSirg]_part_003_chunk866.mp3
+/home/ubuntu/respair/data_cache/English_Final/with_ids/ylac/ylacombe_expresso/Ylacombe_Expresso_audio_3273_P3.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/اليهودي والفتاة العربية/اليهودي والفتاة العربية_chunk1513.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/علامات الحب السبعة/علامات الحب السبعة_chunk560.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Василий Головачев - Спасатели Веера 1 - Посланник_part_002/Васи_speaker_SPEAKER_01/Васи_speaker_SPEAKER_01_chunk455.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/هدا صدر/7bdf73a0-1db6-413c-a363-4e0f55aa4433/7bdf73a0-1db6-413c-a363-4e0f55aa4433_chunk103.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/ranko/ranko_mobamas/ranko_mobamasu_0018/ranko_mobamasu_0018_chunk72.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/امرأة في مكان آخر/امرأة في مكان آخر_chunk1577.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/monogatari/monogatari_voices/monogatari_split/sakurai_takahiro/Sakurai_Takahiro_01/Sakurai_Takahiro_01_chunk1470.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/قميص تكويه إمرأتان/قميص تكويه إمرأتان_chunk801.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/امیرمحمد صمصامی/5ad3f710-83cd-4775-9ec4-b72e514bebfc/5ad3f710-83cd-4775-9ec4-b72e514bebfc_chunk162.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/ruski_mix/Становясь волшебницей (1-13 серия) [Dreamcast] [BD 1080] -185087421_456245995 audio only/Стан_speaker_SPEAKER_07/Стан_speaker_SPEAKER_07_chunk946.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/ranko/ranko_cgss/ranko_card_200796/ranko_voice_200796_4_02.wav
+/home/ubuntu/respair/data_cache/Final_Persian/آیلار محمدی/5e44498f-d33b-46ae-b18b-dfd841a4b949/5e44498f-d33b-46ae-b18b-dfd841a4b949_chunk362.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/whispering_chunks/ASMR - ANNUAL EAR CLEANING [Hk9dtOkOPro]/ASMR - ANNUAL EAR CLEANING [Hk9dtOkOPro]_chunk262.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/wth_ids/ru_youtube/ru_youtube_dataset_audio_2010.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/طرق سرية للجموح/طرق سرية للجموح_chunk203.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/لیلا ولی پور/6437de3d-b9ae-4c82-a00e-991a14e3731b/6437de3d-b9ae-4c82-a00e-991a14e3731b_chunk82.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Thinking Twice: Revenge of the Tipping Point with Malcolm Gladwell [l2tCLI29S4k]/Thin_speaker_SPEAKER_04/Thin_speaker_SPEAKER_04_chunk410.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/تقتلني أو أكتبها/تقتلني أو أكتبها_chunk32.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Kim Kitsuragi Voice Lines (Disco Elyisum) [c5qHI57fkFE]_part_002/Kim Kitsuragi Voice Lines (Disco Elyisum) [c5qHI57fkFE]_part_002_chunk1446.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/امیررضا علی زاده/6dfaaafb-65c5-4dbe-b908-40bc822d74bc/6dfaaafb-65c5-4dbe-b908-40bc822d74bc_chunk401.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/دفتر الغربة/دفتر الغربة_chunk1173.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Валерий Цуркан - Пять веков туда и обратно/Вале_speaker_SPEAKER_02/Вале_speaker_SPEAKER_02_chunk218.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/shiki/shiki_cgss/shiki_card_100101/shiki_voice_100101_2_05.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/inga/inga_clean/Анна Джейн_08/Анна Джейн_08_chunk276.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/فاطمه ساعدی/f658ca03-a5ef-4ee5-91e9-541c82206bd3/f658ca03-a5ef-4ee5-91e9-541c82206bd3_chunk305.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/ЖИЗНЬ С ШИЗОФРЕНИЕЙ-2. Aline in Wonderland. Принудительная госпитализация [iTysDG98Tw8]/ЖИЗН_speaker_SPEAKER_01/ЖИЗН_speaker_SPEAKER_01_chunk530.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/ساعد باقری/99aaa9f7-ad53-449d-94d0-506c4557c240/99aaa9f7-ad53-449d-94d0-506c4557c240_chunk369.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/اللعبة/اللعبة_chunk265.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/inga/inga_clean/Тодд_P_44_Samye yarkie zvYozdy/Тодд_P_44_Samye yarkie zvYozdy_chunk16.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/سندريلات مسقط/سندريلات مسقط_chunk229.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/بیژن ارژن/54c54f7a-36c5-45be-97a3-47f5fb2e74b2/54c54f7a-36c5-45be-97a3-47f5fb2e74b2_chunk9.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Вадим Ечеистов - В тумане/Вади_speaker_SPEAKER_00/Вади_speaker_SPEAKER_00_chunk729.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/vedud/ещенепознер – до и во время войны ⧸ before and during the war [mh-7jvePXF4]/ещен_speaker_SPEAKER_09/ещен_speaker_SPEAKER_09_chunk405.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Composure Voice Lines (Disco Elysium) [88ib4sPXm2Q]/Composure Voice Lines (Disco Elysium) [88ib4sPXm2Q]_chunk962.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/بیتا خداداد/633ebac7-ba61-491c-8941-cbc0946f708d/633ebac7-ba61-491c-8941-cbc0946f708d_chunk144.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Inland Empire Voice Lines (Disco Elysium) [qY45lCTGmKc]/Inland Empire Voice Lines (Disco Elysium) [qY45lCTGmKc]_chunk1892.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/مونا فرجاد/b2f3bae3-15fc-43ed-8d2e-2fa8eb65e55e/b2f3bae3-15fc-43ed-8d2e-2fa8eb65e55e_chunk40.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/پویا پورهمدانی/6d7aa3f8-764c-4020-967f-d795940c405c/6d7aa3f8-764c-4020-967f-d795940c405c_chunk595.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/monogatari/monogatari_voices/monogatari_split/shinichiro_miki/Shinichiro_Miki__02/Shinichiro_Miki__02_chunk1525.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Kim Kitsuragi Voice Lines (Disco Elyisum) [c5qHI57fkFE]_part_002/Kim Kitsuragi Voice Lines (Disco Elyisum) [c5qHI57fkFE]_part_002_chunk1519.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/إيفوريا/إيفوريا_chunk1570.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Astrion/BG3 Voice Lines: Astarion (part 2) [yZaGVMN9zGM]/BG3 Voice Lines: Astarion (part 2) [yZaGVMN9zGM]_chunk4878.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/سیاوش رستمی/9334a34f-dc29-4c00-81ec-51ee8f63d45f/9334a34f-dc29-4c00-81ec-51ee8f63d45f_chunk173.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/inga/inga_clean/Тодд_P_70_Samye yarkie zvYozdy/Тодд_P_70_Samye yarkie zvYozdy_chunk28.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/طارىء/طارىء_chunk753.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/arisu/arisu_mobamasu/2_arisu__0002_(Vocals)/2_arisu__0002_(Vocals)_chunk31.wav
+/home/ubuntu/respair/jpn/moe/cc948b89/wav/cc948b89_1994.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/ظلال الكولوسيوم/ظلال الكولوسيوم_chunk126.mp3
+/home/ubuntu/respair/jpn/moe/b8b5fe66/wav/b8b5fe66_0999.wav
+/home/ubuntu/respair/data_cache/Final_Persian/سحر چوبدار/2d00c6fb-d3d8-4b60-a90f-ffdb45c8e3e5/2d00c6fb-d3d8-4b60-a90f-ffdb45c8e3e5_chunk6.mp3
+/home/ubuntu/respair/data_cache/English_Final/with_ids/ylac/ylacombe_podcast/Ylacombe_podcast_audio_1325_P2.wav
+/home/ubuntu/respair/jpn/soshy/Japanese/tsujido/vo/010/S010_F_0022.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/لأنها استثناء/لأنها استثناء_chunk873.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/رسائل الأحزان في فلسفة الجمال والحب/رسائل الأحزان في فلسفة الجمال والحب_chunk856.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/wth_ids/22khz/22khz/audiobook_rus_dataset_22khz_audio_5777.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Election 2024: Are Identity Politics Holding Us Back? Coleman Hughes vs. Alicia Garza [3G0_0WRV7KM]/Elec_speaker_SPEAKER_00/Elec_speaker_SPEAKER_00_chunk105.wav
+/home/ubuntu/respair/data_cache/Final_Persian/پژمان ابوالقاسمی/dc3e4253-0d7d-43a5-9fc1-ec500f633a21/dc3e4253-0d7d-43a5-9fc1-ec500f633a21_chunk270.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/محیا ساعدی/64717f7d-20a3-4993-8422-c34b24f20815/64717f7d-20a3-4993-8422-c34b24f20815_chunk135.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/wth_ids/ru_youtube/ru_youtube_dataset_audio_1704.wav
+/home/ubuntu/respair/jpn/moe/773a4156/wav/773a4156_2111.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/علي السوري/علي السوري_chunk527.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/سارا فیض/46f33750-c90b-4237-863c-2f557965e990/46f33750-c90b-4237-863c-2f557965e990_chunk144.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/قميص تكويه إمرأتان/قميص تكويه إمرأتان_chunk935.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/frame_turner/КУКОЯКА ‒ ОТВЕТ ИНСТАСАМКЕ, ПОЧЕМУ МЫ ТЕРЯЕМ ДРУЗЕЙ И ПЫТАЕМСЯ ЗАСЛУЖИТЬ ЛЮБОВЬ [6xa3X9ztHXQ]/КУКО_speaker_SPEAKER_02/КУКО_speaker_SPEAKER_02_chunk301.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/Karen/karen_cgss/karen_cgss_card_200463/karen_cgss_voice_200463_1_06.wav
+/home/ubuntu/respair/data_cache/Final_Persian/احمد پوری/6fec69f8-4c30-4061-a516-025768b09302/6fec69f8-4c30-4061-a516-025768b09302_chunk443.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/مریم محبوب/b3d13507-b03f-41c8-ace2-62f47fd9077f/b3d13507-b03f-41c8-ace2-62f47fd9077f_chunk395.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/شيروفوبيا/شيروفوبيا_chunk730.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/نون/نون_chunk1170.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Джек Лондон - Конец сказки/Джек_speaker_SPEAKER_00/Джек_speaker_SPEAKER_00_chunk37.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Патрик Несс - Поступь Хаоса_part_002/Патр_speaker_SPEAKER_00/Патр_speaker_SPEAKER_00_chunk50.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/#Debate: Are Men Finished and Should We Help Them? Richard Reeves vs. Hanna Rosin [FlT5clM4WfA]/#Deb_speaker_SPEAKER_02/#Deb_speaker_SPEAKER_02_chunk18.wav
+/home/ubuntu/respair/jpn/soshy/Japanese/monogatari/monogatari_voices/monogatari_split/sawashiro_miyuki/Sawashiro_Miyuki_03/Sawashiro_Miyuki_03_chunk1018.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Кейт Лаумер - Король города/Кейт_speaker_SPEAKER_02/Кейт_speaker_SPEAKER_02_chunk1248.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/شهيا كفراق/شهيا كفراق_chunk801.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/أبابيل/أبابيل_chunk1144.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/vedud/Костюченко – история современной России ⧸ вДудь [CobxH2gH4pM]/Кост_speaker_SPEAKER_05/Кост_speaker_SPEAKER_05_chunk2022.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/رضا عمرانی/f9c885e4-adf2-41f5-9d1f-e795f9a11fcc/f9c885e4-adf2-41f5-9d1f-e795f9a11fcc_chunk260.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/#1 АВТОТРЁП. Первые отношения, первый поцелуй, первый секс и личные границы [3wGtRRPTJOc]/#1 А_speaker_SPEAKER_02/#1 А_speaker_SPEAKER_02_chunk162.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/فریناز ثریا/d0cc8097-cbae-428f-a050-270596451802/d0cc8097-cbae-428f-a050-270596451802_chunk198.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/inga/inga_clean/Анна Джейн_12/Анна Джейн_12_chunk220.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/أسيرة الحب/أسيرة الحب_chunk1310.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/irina/dataset_Yakutenko_Irina_10h_44100_16bit_mono/dataset_Yakutenko_Irina_10h_44100_16bit_mono_chunk5336.wav
+/home/ubuntu/respair/data_cache/Final_Persian/مهرانه امروانی/56697d4e-f417-4872-b1c5-b5186a9989be/56697d4e-f417-4872-b1c5-b5186a9989be_chunk24.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Алексей Калугин - Лабиринт 3 - Мир без солнца_part_000/Алек_speaker_SPEAKER_00/Алек_speaker_SPEAKER_00_chunk13.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/ساعد باقری/fd0ac02c-34f6-4ffc-a9bf-7c1baa2f95f6/fd0ac02c-34f6-4ffc-a9bf-7c1baa2f95f6_chunk721.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/frame_turner/MORGENSHTERN ‒ ЛИЗА, ГАЛЛЮЦИНАЦИИ, ДЕТСКИЕ ТРАВМЫ, ПРИЧИНА РАЗВОДА И СЕКРЕТ УСПЕХА [g1nVX1oPxHE]/MORG_speaker_SPEAKER_11/MORG_speaker_SPEAKER_11_chunk682.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/monogatari/monogatari_voices/monogatari_split/chiwa_saito/Chiwa_Saito_01/Chiwa_Saito_01_chunk1988_chunks/chunk_1.wav
+/home/ubuntu/respair/data_cache/Final_Persian/لیلا ولی پور/c2887b5a-e92a-46b5-8934-9c537c8d121f/c2887b5a-e92a-46b5-8934-9c537c8d121f_chunk150.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/سمعة شریفة/سمعة شریفة_chunk207.mp3
+/home/ubuntu/respair/jpn/moe/84be23bd/wav/84be23bd_0099.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Харлан Эллисон - Бегство к звёздам/Харл_speaker_SPEAKER_00/Харл_speaker_SPEAKER_00_chunk295.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/نزهة مارشال/نزهة مارشال_chunk918.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Debate: Are DEI Mandates for University Faculties a Bad Idea? [eKay5lcv7Ic]/Deba_speaker_SPEAKER_03/Deba_speaker_SPEAKER_03_chunk37.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/vedud/Маша Гессен – стыдные вопросы про Америку ⧸ вДудь [Q0oRii7zV9A]/Маша_speaker_SPEAKER_01/Маша_speaker_SPEAKER_01_chunk1001.mp3
+/home/ubuntu/respair/jpn/moe/9febd2ae/wav/9febd2ae_0483.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Брайдер Юрий - Против течения/Брай_speaker_SPEAKER_00/Брай_speaker_SPEAKER_00_chunk414.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/BG3 Voice Lines: Jaheira [r7HxfsGG0zo]/BG3 Voice Lines: Jaheira [r7HxfsGG0zo]_chunk4152.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/BG3 Voice Lines: Minsc [5wO9k1rgRcY]/BG3 Voice Lines: Minsc [5wO9k1rgRcY]_chunk121.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/بهار کاتوزی/eb02fc7d-bdfb-45a0-99ff-62bc5e1c0035/eb02fc7d-bdfb-45a0-99ff-62bc5e1c0035_chunk300.mp3
+/home/ubuntu/respair/jpn/moe/773a4156/wav/773a4156_0261.wav
+/home/ubuntu/respair/jpn/moe/8b6e7173/wav/8b6e7173_0839.wav
+/home/ubuntu/respair/jpn/moe/8b6e7173/wav/8b6e7173_1981.wav
+/home/ubuntu/respair/data_cache/Final_Persian/رضا عمرانی/c77d9712-8a2e-4b82-b096-7dca3886d08c/c77d9712-8a2e-4b82-b096-7dca3886d08c_chunk51.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/tsujido/vo/003/S003_F_0219.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/НУТРИЦИОЛОГ Мария Кардакова. Кето диета, питание на ГВ и дисбактериоз [790CGV-gcRg]/НУТР_speaker_SPEAKER_02/НУТР_speaker_SPEAKER_02_chunk151.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Gale/BG3 Voice Lines: Gale [CpasYgOyyl4]_part_002/BG3 Voice Lines: Gale [CpasYgOyyl4]_part_002_chunk3413.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Is Legalizing Marijuana a Mistake? Live Debate [v_U4CxB0MlA]/Is L_speaker_SPEAKER_12/Is L_speaker_SPEAKER_12_chunk129.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/МАРКЕТОЛОГ. Личный бренд, продвижение в соцсетях и инфопродукты [b-nbsSigkKo]/МАРК_speaker_SPEAKER_01/МАРК_speaker_SPEAKER_01_chunk154_chunks/МАРК_speaker_SPEAKER_01_chunk154_chunk_2.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/اعظم حبیبی/8bf4e08c-28b4-4781-8b72-6ce2d8cdb714/8bf4e08c-28b4-4781-8b72-6ce2d8cdb714_chunk30.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/سمعة شریفة/سمعة شریفة_chunk330.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Conceptualization Voice Lines (Disco Elysium) [4cFbMy0snYQ]/Conceptualization Voice Lines (Disco Elysium) [4cFbMy0snYQ]_chunk1161.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/مدينة الحب لا يسكنها العقلاء/مدينة الحب لا يسكنها العقلاء_chunk1150.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/بهرام ابراهیمی/97fff66d-e81e-4e91-8782-9db5118ecbc7/97fff66d-e81e-4e91-8782-9db5118ecbc7_chunk24.mp3
+/home/ubuntu/respair/data_cache/English_Final/with_ids/ylac/ylacombe_podcast/Ylacombe_podcast_audio_1636.wav
+/home/ubuntu/respair/data_cache/English_Final/with_ids/ylac/ylacombe_expresso/Ylacombe_Expresso_audio_11326_P3.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Сьюзен Коллинз 1 - Голодные игры_part_001/Сьюз_speaker_SPEAKER_00/Сьюз_speaker_SPEAKER_00_chunk583.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Клиффорд Саймак - Кукла судьбы_part_003/Клиф_speaker_SPEAKER_05/Клиф_speaker_SPEAKER_05_chunk1632.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Does the Effective Altruism Movement Get Giving Right? [e1e_TUbRdlA]/Does_speaker_SPEAKER_02/Does_speaker_SPEAKER_02_chunk364.wav
+/home/ubuntu/respair/jpn/soshy/Japanese/sakura_moyu/01/01011190.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/في ديسمبر تنتهي كل الأحلام/في ديسمبر تنتهي كل الأحلام_chunk1404.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/علامات الحب السبعة/علامات الحب السبعة_chunk1041.mp3
+/home/ubuntu/respair/data_cache/English_Final/with_ids/Ani_speech/audio_6750.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/vedud/Гордон - Украина, Россия, Ukraine, Russia (English subs) [in7tepc2shg]/Горд_speaker_SPEAKER_03/Горд_speaker_SPEAKER_03_chunk170.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Astrion/BG3 Voice Lines: Astarion (part 1) [THs2r-xB-Rw]_part_000/BG3 Voice Lines: Astarion (part 1) [THs2r-xB-Rw]_part_000_chunk277.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/أرغبه رجلا يا أبي/أرغبه رجلا يا أبي_chunk302.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/nagi/nagi_cgss/nagi_chara_309/nagi_voice_309_4_13.wav
+/home/ubuntu/respair/data_cache/Final_Persian/شهرزاد عالی/4c413c0b-ed80-4d4a-95c4-21147a612d8a/4c413c0b-ed80-4d4a-95c4-21147a612d8a_chunk56.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Этногенез - Маруся 3_part_001/Этно_speaker_SPEAKER_05/Этно_speaker_SPEAKER_05_chunk295.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/ساعد باقری/b842c75e-7251-441c-852d-fc8cc5c20558/b842c75e-7251-441c-852d-fc8cc5c20558_chunk282.mp3
+/home/ubuntu/respair/jpn/moe/b8b5fe66/wav/b8b5fe66_1892.wav
+/home/ubuntu/respair/data_cache/English_Final/with_ids/ylac/ylacombe_expresso/Ylacombe_Expresso_audio_8528_P3.wav
+/home/ubuntu/respair/data_cache/Final_Persian/اطهر کلانتری/e9d04f1c-5687-422e-8ab7-4db6e95fd8b4/e9d04f1c-5687-422e-8ab7-4db6e95fd8b4_chunk176.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/tsujido/vo/013/S013_A_0244.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/دفتر الغربة/دفتر الغربة_chunk920.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/vedud/Сергей Супонев - друг всех детей ⧸ вДудь [ckyW08MpmHs]/Серг_speaker_SPEAKER_07/Серг_speaker_SPEAKER_07_chunk84.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Karlach/BG3 Voice Lines: Karlach [Enn8zDfS6Es]_part_002/BG3 Voice Lines: Karlach [Enn8zDfS6Es]_part_002_chunk2517.mp3
+/home/ubuntu/respair/data_cache/English_Final/with_ids/Ani_speech/audio_1754.wav
+/home/ubuntu/respair/data_cache/Final_Persian/نازنین آذرسا/3b014b6b-29ee-4ae1-846e-f889cdd2206b/3b014b6b-29ee-4ae1-846e-f889cdd2206b_chunk120.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Владимир Яценко - Старик и дети/Влад_speaker_SPEAKER_00/Влад_speaker_SPEAKER_00_chunk278.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/لأنها استثناء/لأنها استثناء_chunk347.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/frederica/fredrica_cgss/fredrica_card_100747/fredrica_voice_100747_1_11.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/يا سلمى أنا الآن وحيد /يا سلمى أنا الآن وحيد _chunk798.mp3
+/home/ubuntu/respair/jpn/moe/6d565f54/wav/6d565f54_1248.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/BG3 Voice Lines: Minsc [5wO9k1rgRcY]/BG3 Voice Lines: Minsc [5wO9k1rgRcY]_chunk3356.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/اشکان عقیلی پور/ef422f57-99b8-40cd-8e22-553025f09d3d/ef422f57-99b8-40cd-8e22-553025f09d3d_chunk131.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Джек Андерсон - Игра в лево-право/Джек_speaker_SPEAKER_04/Джек_speaker_SPEAKER_04_chunk4173.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/الميلاد/الميلاد_chunk595.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/wth_ids/Taiga/Taiga_rus_dataset_audio_5866.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Сьюзен Коллинз 1 - Голодные игры_part_001/Сьюз_speaker_SPEAKER_00/Сьюз_speaker_SPEAKER_00_chunk1319.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/monogatari/monogatari_voices/monogatari_split/sawashiro_miyuki/Sawashiro_Miyuki_03/Sawashiro_Miyuki_03_chunk752.wav
+/home/ubuntu/respair/data_cache/Final_Persian/بهراد رضازاده/5e04ac82-5ecf-4af5-976f-7fa319e5926d/5e04ac82-5ecf-4af5-976f-7fa319e5926d_chunk187.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/امیررضا علی زاده/d17ab7bd-ebb2-4442-9730-e84726cb90a4/d17ab7bd-ebb2-4442-9730-e84726cb90a4_chunk88.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/ذئاب لا تغفر/ذئاب لا تغفر_chunk1130.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Karlach/BG3 Voice Lines: Karlach [Enn8zDfS6Es]_part_001/BG3 Voice Lines: Karlach [Enn8zDfS6Es]_part_001_chunk2549.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/عشقني عفريت من الجن/عشقني عفريت من الجن_chunk303.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Authority Voice Lines (Disco Elysium) [lsWdvLZ9Wac]/Authority Voice Lines (Disco Elysium) [lsWdvLZ9Wac]_chunk363.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/مرضیه رحماندوست/3f460bef-ebe3-4407-89ec-61d7956b8d4c/3f460bef-ebe3-4407-89ec-61d7956b8d4c_chunk204.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/فوضى الحواس/فوضى الحواس_chunk1030.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/كلب عائلة باسكرفيل/كلب عائلة باسكرفيل_chunk1217.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/АЛЕКСАНДР ПАНЧИН. Агностики, соционика и вакцины от коронавируса [PeS8Bh9vizI]/АЛЕК_speaker_SPEAKER_00/АЛЕК_speaker_SPEAKER_00_chunk452.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/ساعد باقری/a37f3bf0-3061-4cd7-a2df-446e507439dc/a37f3bf0-3061-4cd7-a2df-446e507439dc_chunk460.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/BG3 Voice Lines: Jaheira [r7HxfsGG0zo]/BG3 Voice Lines: Jaheira [r7HxfsGG0zo]_chunk6181.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/monogatari/monogatari_voices/monogatari_split/sawashiro_miyuki/Sawashiro_Miyuki_02/Sawashiro_Miyuki_02_chunk489.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/السكابندو وقصص أخرى/السكابندو وقصص أخرى_chunk225.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/حامد فعال/863f0f5f-7f71-4d7a-b286-0587b223d7bf/863f0f5f-7f71-4d7a-b286-0587b223d7bf_chunk214.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/شوق الدرويش/شوق الدرويش_chunk1194.mp3
+/home/ubuntu/respair/jpn/moe/ee093a4f/wav/ee093a4f_1537.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/ЛОГОПЕД. Развитие речи, постановка звуков, массаж и подрезание уздечки [ra3U7s-VZzI]/ЛОГО_speaker_SPEAKER_00/ЛОГО_speaker_SPEAKER_00_chunk921.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/کاوه فولادی نسب/4152c4fa-1b13-4854-8e48-fb7914bdea8f/4152c4fa-1b13-4854-8e48-fb7914bdea8f_chunk100.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/سيدة في خدمتك/سيدة في خدمتك_chunk850.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/فاطمه کمالی/180ad9ff-c82a-45ff-9b34-8d62c4d142fb/180ad9ff-c82a-45ff-9b34-8d62c4d142fb_chunk343.mp3
+/home/ubuntu/respair/data_cache/English_Final/with_ids/ylac/ylacombe_podcast/Ylacombe_podcast_audio_1060_P3.wav
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/Syuuko/Syuuko_Events_and_Card/Card_Commyuu/Work_Comyu/Work_Comyu_chunk55.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/inga/inga_clean/Джейн_Анна_15/Джейн_Анна_15_chunk107.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/حسین تسلیمی/f723254c-afd5-45b9-bb2d-f6e24f1d0a69/f723254c-afd5-45b9-bb2d-f6e24f1d0a69_chunk416.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/لاله اکبری/ff57cb36-b8db-4039-bd12-7c37e6011f80/ff57cb36-b8db-4039-bd12-7c37e6011f80_chunk74.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Morell, the Cryptozoologist Voice Lines (Disco Elysium) [O-ESHKnBNGo]/Morell, the Cryptozoologist Voice Lines (Disco Elysium) [O-ESHKnBNGo]_chunk355.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/بئر الحرمان/بئر الحرمان_chunk1537.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/frame_turner/MORGENSHTERN ‒ ЛИЗА, ГАЛЛЮЦИНАЦИИ, ДЕТСКИЕ ТРАВМЫ, ПРИЧИНА РАЗВОДА И СЕКРЕТ УСПЕХА [g1nVX1oPxHE]/MORG_speaker_SPEAKER_02/MORG_speaker_SPEAKER_02_chunk1023.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/إحدى عشرة دقيقة/إحدى عشرة دقيقة_chunk451.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/قدري أنت/قدري أنت_chunk36.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/inga/inga_clean/Джейн_Анна_04/Джейн_Анна_04_chunk118.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/monogatari/monogatari_voices/monogatari_split/shinichiro_miki/Shinichiro_Miki__01/Shinichiro_Miki__01_chunk270.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/vedud/Лошак – как оставаться журналистом ⧸ Loshak – how to stay a journalist [PWt27h_scaY]/Лоша_speaker_SPEAKER_04/Лоша_speaker_SPEAKER_04_chunk299.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Debate: Is Wokeness Killing Comedy? Live - Lou Perez vs. Michael Ian Black [J4Vb53s4I0A]/Deba_speaker_SPEAKER_06/Deba_speaker_SPEAKER_06_chunk23.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/أنشودة المقهى الحزين/أنشودة المقهى الحزين_chunk204.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/تارا تیمورزاده/ac11ca31-2fc5-498c-8c94-5bcd49a76fdb/ac11ca31-2fc5-498c-8c94-5bcd49a76fdb_chunk410.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/inga/inga_clean/Анна Джейн_21/Анна Джейн_21_chunk358.mp3
+/home/ubuntu/respair/jpn/moe/ad28b91b/wav/ad28b91b_2228.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/سوف أحكي عنك /سوف أحكي عنك _chunk297.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/whispering_chunks/ASMR - JANUARY 2023 - Monthly Favourites [9rv-PFYRAho]/ASMR - JANUARY 2023 - Monthly Favourites [9rv-PFYRAho]_chunk280.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Evrart Claire Voice Lines (Disco Elysium) [3fyCrwXUM3c]/Evrart Claire Voice Lines (Disco Elysium) [3fyCrwXUM3c]_chunk1318.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Karlach/BG3 Voice Lines: Karlach [Enn8zDfS6Es]_part_001/BG3 Voice Lines: Karlach [Enn8zDfS6Es]_part_001_chunk2221.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/tsujido/vo/004/S004_E_0006.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Kim Kitsuragi Voice Lines (Disco Elyisum) [c5qHI57fkFE]_part_001/Kim Kitsuragi Voice Lines (Disco Elyisum) [c5qHI57fkFE]_part_001_chunk16.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Should Courts, Not Campuses, Decide Sexual Assault Cases? [XdH7X9i5NpM]/Shou_speaker_SPEAKER_05/Shou_speaker_SPEAKER_05_chunk63_chunks/Shou_speaker_SPEAKER_05_chunk63_chunk_1.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/آرش راسخ/af8053f2-6a13-4c77-a2a4-c6b01997d6f6/af8053f2-6a13-4c77-a2a4-c6b01997d6f6_chunk509.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Алексей Калугин - Подмененный 3 - Осколки реальности_part_000/Алек_speaker_SPEAKER_03/Алек_speaker_SPEAKER_03_chunk18.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/shadow_heart/BG3 Voice Lines: Shadowheart (part 1) [u95hd47w8pM]_part_000/BG3 Voice Lines: Shadowheart (part 1) [u95hd47w8pM]_part_000_chunk2321.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/نازنین آذرسا/50c24690-436c-475c-93ec-ded918f2f09e/50c24690-436c-475c-93ec-ded918f2f09e_chunk252.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/ГЕШТАЛЬТ ТЕРАПИЯ и осознанность в эмоциях. Как незакрытый гештальт переходит в невроз [_EXkjzgfCi8]/ГЕШТ_speaker_SPEAKER_02/ГЕШТ_speaker_SPEAKER_02_chunk42.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/أوراق الورد/أوراق الورد_chunk440.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/СЕМЕЙНЫЙ ПСИХОЛОГ. Развод, абьюз и феминизм. Сохранять ли брак ради ребёнка? [AeKXCnoEKSA]/СЕМЕ_speaker_SPEAKER_04/СЕМЕ_speaker_SPEAKER_04_chunk760.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/frame_turner/ЯНЧИК ‒ ГОЛОСА В ГОЛОВЕ, СТРАХ ОШИБОК, ОКР И ВЫЖИВАНИЕ В ЛЕСУ [czFc_d2tp8I]/ЯНЧИ_speaker_SPEAKER_06/ЯНЧИ_speaker_SPEAKER_06_chunk993.mp3
+/home/ubuntu/respair/jpn/moe/1a5a3db8/wav/1a5a3db8_0215.wav
+/home/ubuntu/respair/jpn/moe/6d565f54/wav/6d565f54_1467.wav
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/shiburin/shiburin_cgss/shiburin_card_200071/shiburin_voice_200071_2_11.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Сьюзен Коллинз 1 - Голодные игры_part_002/Сьюз_speaker_SPEAKER_00/Сьюз_speaker_SPEAKER_00_chunk717.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/إيفوريا/إيفوريا_chunk1102.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Laezel/BG3 Voice Lines: Lae'zel [q-Z-9hfSirg]_part_002/BG3 Voice Lines: Lae'zel [q-Z-9hfSirg]_part_002_chunk37.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Global Leaders Debate Money, Truth, and Power [ygW1PsuaipY]/Glob_speaker_SPEAKER_07/Glob_speaker_SPEAKER_07_chunk34.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/حسناء في المستنقع/حسناء في المستنقع_chunk709.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/صياد النسيم/صياد النسيم_chunk1384.mp3
+/home/ubuntu/respair/jpn/moe/5d68aedf/wav/5d68aedf_1891.wav
+/home/ubuntu/respair/data_cache/Final_Persian/اشکان عقیلی پور/642bad82-6b0b-474d-b989-b0508cb76adf/642bad82-6b0b-474d-b989-b0508cb76adf_chunk83.mp3
+/home/ubuntu/respair/jpn/moe/5d68aedf/wav/5d68aedf_1025.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/شوق الدرويش/شوق الدرويش_chunk504.mp3
+/home/ubuntu/respair/jpn/moe/df6c208e/wav/df6c208e_1504.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/العذراء والشعر الأبيض/العذراء والشعر الأبيض_chunk186.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/گلاره عباسی/bb4764d9-a6bb-44e5-b80d-05033943ea22/bb4764d9-a6bb-44e5-b80d-05033943ea22_chunk25.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Astrion/BG3 Voice Lines: Astarion (part 1) [THs2r-xB-Rw]_part_001/BG3 Voice Lines: Astarion (part 1) [THs2r-xB-Rw]_part_001_chunk15.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/أيام فاتت/أيام فاتت_chunk669.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/گلچهر دامغانی/ed978bcb-b73c-43b9-8feb-eedb63c9a006/ed978bcb-b73c-43b9-8feb-eedb63c9a006_chunk951.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/مرضیه رحماندوست/a0e913cd-32e4-4410-a613-196de8204050/a0e913cd-32e4-4410-a613-196de8204050_chunk12.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/tsujido/vo/001/S001_F_0129.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/shadow_heart/BG3 Voice Lines: Shadowheart (part 2) [PoureCLZNxg]/BG3 Voice Lines: Shadowheart (part 2) [PoureCLZNxg]_chunk2395.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/ЖИЗНЬ С ШИЗОФРЕНИЕЙ-2. Aline in Wonderland. Принудительная госпитализация [iTysDG98Tw8]/ЖИЗН_speaker_SPEAKER_01/ЖИЗН_speaker_SPEAKER_01_chunk699.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/شقایق خاکی پور/d8ade414-0f5a-4333-a9f5-89895d617e4a/d8ade414-0f5a-4333-a9f5-89895d617e4a_chunk631.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/ДОКТОР УТИН. Кофе и сердце, что такое инфаркт, как правильно измерять давление [uXgDhDdDW2g]/ДОКТ_speaker_SPEAKER_04/ДОКТ_speaker_SPEAKER_04_chunk129.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/ПОДРОСТОК о СЕКСЕ. Надо ли родителям разговаривать с детьми? Проблемы в школах [Wo3ZqS01R2w]/ПОДР_speaker_SPEAKER_04/ПОДР_speaker_SPEAKER_04_chunk121.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/سايكو 2/سايكو 2_chunk494.mp3
+/home/ubuntu/respair/jpn/moe/9febd2ae/wav/9febd2ae_0753.wav
+/home/ubuntu/respair/jpn/soshy/Japanese/monogatari/monogatari_voices/monogatari_split/horie_yui/Horie_Yui_02/Horie_Yui_02_chunk184.wav
+/home/ubuntu/respair/jpn/moe/449d5a0a/wav/449d5a0a_0949.wav
+/home/ubuntu/respair/jpn/moe/95c3bdd8/wav/95c3bdd8_0871.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/لكنك لن تعرفي/لكنك لن تعرفي_chunk650.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/inga/inga_clean/Степанова_41/Степанова_41_chunk1.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/شیكولاتة بیضاء/شیكولاتة بیضاء_chunk402.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/Kanade/Kanade_voice_home_shinaido_room/kanade_card_200580/kanade_voice_200580_6_05.wav
+/home/ubuntu/respair/data_cache/Final_Persian/نازنین آذرسا/b3aa3650-ab3d-4fef-8c43-76abac9ee044/b3aa3650-ab3d-4fef-8c43-76abac9ee044_chunk148.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/ДОКТОР УТИН. Кофе и сердце, что такое инфаркт, как правильно измерять давление [uXgDhDdDW2g]/ДОКТ_speaker_SPEAKER_02/ДОКТ_speaker_SPEAKER_02_chunk451.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/inga/inga_clean/Степанова_40/Степанова_40_chunk43.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/مصطفی هرآیینی/e4cc5f9b-523f-49d4-88a3-e432c19fdadb/e4cc5f9b-523f-49d4-88a3-e432c19fdadb_chunk105.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Should Courts, Not Campuses, Decide Sexual Assault Cases? [XdH7X9i5NpM]/Shou_speaker_SPEAKER_05/Shou_speaker_SPEAKER_05_chunk58.wav
+/home/ubuntu/respair/jpn/soshy/Japanese/sakura_moyu/05/05000390.wav
+/home/ubuntu/respair/jpn/moe/1cc3c6c0/wav/1cc3c6c0_0223.wav
+/home/ubuntu/respair/jpn/moe/cbe5080e/wav/cbe5080e_0793.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/سيدة في خدمتك/سيدة في خدمتك_chunk999.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Agree-to-Disagree: Is True Love a Myth? [78PXedWyBAA]/Agre_speaker_SPEAKER_02/Agre_speaker_SPEAKER_02_chunk27.wav
+/home/ubuntu/respair/jpn/moe/bc778ddb/wav/bc778ddb_0956.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/ما تخبئه لنا النجوم/ما تخبئه لنا النجوم_chunk570.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/شهین دخت نجف زاده/d65e3143-dd32-4720-816f-7cbd92fd8bac/d65e3143-dd32-4720-816f-7cbd92fd8bac_chunk98.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/راضیه هاشمی/4d542086-331f-4f79-902f-9e27e5e5217d/4d542086-331f-4f79-902f-9e27e5e5217d_chunk439.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/شیكولاتة بیضاء/شیكولاتة بیضاء_chunk471.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Composure Voice Lines (Disco Elysium) [88ib4sPXm2Q]/Composure Voice Lines (Disco Elysium) [88ib4sPXm2Q]_chunk346.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/wth_ids/22khz/22khz/audiobook_rus_dataset_22khz_audio_12940.wav
+/home/ubuntu/respair/data_cache/Final_Persian/نرگس رحیمیان/acf1cb92-a813-4580-bf54-1544bd24410d/acf1cb92-a813-4580-bf54-1544bd24410d_chunk878.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/minami/minami_cgss/minami_card_201307/minami_voice_201307_2_02.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/بيت ح دد/بيت ح دد_chunk821.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/monogatari/monogatari_voices/monogatari_split/shinichiro_miki/Shinichiro_Miki_03/Shinichiro_Miki_03_chunk2211.wav
+/home/ubuntu/respair/data_cache/Final_Persian/مهبد قناعت پیشه/8542aeb6-372a-4dfe-970c-c8376bf2a8b6/8542aeb6-372a-4dfe-970c-c8376bf2a8b6_chunk43.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/ردني إليك/ردني إليك_chunk365.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Shivers Voice Lines (Disco Elysium) [PgHIM3M1Al4]/Shivers Voice Lines (Disco Elysium) [PgHIM3M1Al4]_chunk1312.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/تقتلني أو أكتبها/تقتلني أو أكتبها_chunk1142.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/tsujido/vo/004/S004_C_0070.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/shadow_heart/BG3 Voice Lines: Shadowheart (part 1) [u95hd47w8pM]_part_001/BG3 Voice Lines: Shadowheart (part 1) [u95hd47w8pM]_part_001_chunk894.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/صفا آقاجانی/59968192-7770-4c9f-83ac-83ab686d5649/59968192-7770-4c9f-83ac-83ab686d5649_chunk111.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/مارال نوحی/98447c9c-684c-4043-8f67-95c2d8977053/98447c9c-684c-4043-8f67-95c2d8977053_chunk176.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/پژمان ابوالقاسمی/08226e7f-f4dc-4ba7-a71a-fd526e8572e2/08226e7f-f4dc-4ba7-a71a-fd526e8572e2_chunk717.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Debate: Is the FDA Too Cautious? [FSkyfsTL_xw]/Deba_speaker_SPEAKER_03/Deba_speaker_SPEAKER_03_chunk90.wav
+/home/ubuntu/respair/data_cache/Final_Persian/یاسر دعاگو/c2baaef2-faaa-4110-903e-1c5e422a13b1/c2baaef2-faaa-4110-903e-1c5e422a13b1_chunk658.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/باريس بلا ايفل/باريس بلا ايفل_chunk592.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Сергей Павин - Лавка песочных часов/Серг_speaker_SPEAKER_00/Серг_speaker_SPEAKER_00_chunk139.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/wth_ids/Taiga/Taiga_rus_dataset_audio_7198.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/بلاد الطاخ طاخ/بلاد الطاخ طاخ_chunk1234.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/شقایق خاکی پور/d528184f-99d0-4136-99a1-3eab3c4a4166/d528184f-99d0-4136-99a1-3eab3c4a4166_chunk161.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/inga/inga_clean/Степанова_16/Степанова_16_chunk39.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/لن أنسى/لن أنسى_chunk540.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/frame_turner/ЯНЧИК ‒ ГОЛОСА В ГОЛОВЕ, СТРАХ ОШИБОК, ОКР И ВЫЖИВАНИЕ В ЛЕСУ [czFc_d2tp8I]/ЯНЧИ_speaker_SPEAKER_07/ЯНЧИ_speaker_SPEAKER_07_chunk94.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/sakura_moyu/01/01014470.wav
+/home/ubuntu/respair/data_cache/Final_Persian/بابک مینایی/a519dfd7-140c-4537-bd63-8b71bb70845b/a519dfd7-140c-4537-bd63-8b71bb70845b_chunk114.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/رقصة المرمر/رقصة المرمر_chunk1366.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/سأقذف نفسي أمامك/سأقذف نفسي أمامك_chunk722.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/wth_ids/Taiga/Taiga_rus_dataset_audio_3655.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/خبايا العرب/خبايا العرب_chunk1176.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Клиффорд Саймак - Кукла судьбы_part_003/Клиф_speaker_SPEAKER_05/Клиф_speaker_SPEAKER_05_chunk1177.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/سألقاك هناك/سألقاك هناك_chunk1101.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/inga/inga_clean/Glava_03/Glava_03_chunk562.mp3
+/home/ubuntu/respair/data_cache/English_Final/with_ids/Ani_speech/audio_6555.wav
+/home/ubuntu/respair/data_cache/Final_Persian/تارا تیمورزاده/ad22083b-b26c-4b6e-9908-3faefdbc298f/ad22083b-b26c-4b6e-9908-3faefdbc298f_chunk364.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/ranko/ranko_cgss/ranko_card_200097/ranko_voice_200097_1_09.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/ولد قليل الأدب/ولد قليل الأدب_chunk1820.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Laezel/BG3 Voice Lines: Lae'zel [q-Z-9hfSirg]_part_003/BG3 Voice Lines: Lae'zel [q-Z-9hfSirg]_part_003_chunk344.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/shadow_heart/BG3 Voice Lines: Shadowheart (part 1) [u95hd47w8pM]_part_001/BG3 Voice Lines: Shadowheart (part 1) [u95hd47w8pM]_part_001_chunk631.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/محمد امرایی/0f6832d4-b027-4fba-bcd6-5ef85242bd99/0f6832d4-b027-4fba-bcd6-5ef85242bd99_chunk230.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/أنا سنية و أنت شيعي/أنا سنية و أنت شيعي_chunk641.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/ПЕРИНАТАЛЬНЫЙ ПСИХОЛОГ. Про тикающие часики, ЭКО и аборты. Как пережить потерю ребенка [qhxe6nZ4p4E]/ПЕРИ_speaker_SPEAKER_02/ПЕРИ_speaker_SPEAKER_02_chunk844.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/wth_ids/22khz/22khz/audiobook_rus_dataset_22khz_audio_7589.wav
+/home/ubuntu/respair/data_cache/Final_Persian/مهدی صفری/48094592-83d7-4ced-a375-29a7f1a348fe/48094592-83d7-4ced-a375-29a7f1a348fe_chunk260.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/الیاس گرجی/9e40cee4-7270-44f9-8264-052106027a01/9e40cee4-7270-44f9-8264-052106027a01_chunk81.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/vedud/Лошак – как оставаться журналистом ⧸ Loshak – how to stay a journalist [PWt27h_scaY]/Лоша_speaker_SPEAKER_03/Лоша_speaker_SPEAKER_03_chunk238.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Jean Vicquemare Voice Lines (Disco Elysium) [z_7OvqdkmKI]/Jean Vicquemare Voice Lines (Disco Elysium) [z_7OvqdkmKI]_chunk54.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/اللعبة/اللعبة_chunk520.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/السكابندو وقصص أخرى/السكابندو وقصص أخرى_chunk1226.mp3
+/home/ubuntu/respair/jpn/moe/46d6bf83/wav/46d6bf83_1389.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/قصص الكتب الخمسة/قصص الكتب الخمسة_chunk253.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/یاسر دعاگو/33a8dea0-1d4c-4e08-9097-3f3906a7b488/33a8dea0-1d4c-4e08-9097-3f3906a7b488_chunk370.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/monogatari/monogatari_voices/monogatari_split/sawashiro_miyuki/Sawashiro_Miyuki_02/Sawashiro_Miyuki_02_chunk1671.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Karlach/BG3 Voice Lines: Karlach [Enn8zDfS6Es]_part_001/BG3 Voice Lines: Karlach [Enn8zDfS6Es]_part_001_chunk211.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/فاطمه کمالی/9046daac-a4ae-4879-b7a0-1f99a8ea757f/9046daac-a4ae-4879-b7a0-1f99a8ea757f_chunk11.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/إفطار عند تيفاني/إفطار عند تيفاني_chunk1416.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Neha, the Novelty Dicemaker Voice Lines (Disco Elysium) [bSZCBFjQOfo]/Neha, the Novelty Dicemaker Voice Lines (Disco Elysium) [bSZCBFjQOfo]_chunk209.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/saori/merged_vocals_chunk678.wav
+/home/ubuntu/respair/jpn/moe/00163dc9/wav/00163dc9_1552.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Кеннет Балмер - Страна которой нет на карте/Кенн_speaker_SPEAKER_01/Кенн_speaker_SPEAKER_01_chunk3664.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/آزاده رادمهر/f431b0f5-133a-406f-b8a6-cb42c507ca5d/f431b0f5-133a-406f-b8a6-cb42c507ca5d_chunk134.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Пол Бэттейджер - Ледяной ад/Пол _speaker_SPEAKER_01/Пол _speaker_SPEAKER_01_chunk564.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/راما قویدل/944535f5-bf56-441d-805a-a48f6fddfc60/944535f5-bf56-441d-805a-a48f6fddfc60_chunk95.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/رضا عمرانی/f2eb7ae4-7e24-45ba-8912-abe6273107bb/f2eb7ae4-7e24-45ba-8912-abe6273107bb_chunk430.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Unresolved: The Iran Threat [rdRcyieKBVE]/Unre_speaker_SPEAKER_08/Unre_speaker_SPEAKER_08_chunk156.wav
+/home/ubuntu/respair/data_cache/Final_Persian/مرضیه رحماندوست/981c9d9c-0da8-449c-b826-5ba6ea45311d/981c9d9c-0da8-449c-b826-5ba6ea45311d_chunk135.mp3
+/home/ubuntu/respair/data_cache/English_Final/with_ids/Ani_speech/audio_15398.wav
+/home/ubuntu/respair/data_cache/Final_Persian/یاسین ولی نژاد/a6df8917-be49-4e7f-a52a-3e69b65fdeb6/a6df8917-be49-4e7f-a52a-3e69b65fdeb6_chunk14_chunks/a6df8917-be49-4e7f-a52a-3e69b65fdeb6_chunk14_chunk_3.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/ليثيوم/ليثيوم_chunk735.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/inga/inga_clean/Анна Джейн_19/Анна Джейн_19_chunk8.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Laezel/BG3 Voice Lines: Lae'zel [q-Z-9hfSirg]_part_002/BG3 Voice Lines: Lae'zel [q-Z-9hfSirg]_part_002_chunk1108.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/frame_turner/MARCELO MIRACLES - 200.000.000 НА ОДЕЖДЕ, МАГАЗИН В ПАРИЖЕ, ПРОДАЖА ТРАВЫ И МИФЫ О ПСИХОЛОГАХ [C9d8v1bv8Sw]/MARC_speaker_SPEAKER_09/MARC_speaker_SPEAKER_09_chunk245.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Брэндон Сандерсон - Стоп-кадр/Брэн_speaker_SPEAKER_01/Брэн_speaker_SPEAKER_01_chunk353.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/یاشار ابراهیمی/e7181295-35ad-4953-a64d-de56a2cdd063/e7181295-35ad-4953-a64d-de56a2cdd063_chunk171.mp3
+/home/ubuntu/respair/data_cache/English_Final/with_ids/Ani_speech/audio_17443.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/بره الدنيا/بره الدنيا_chunk386.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/shadow_heart/BG3 Voice Lines: Shadowheart (part 1) [u95hd47w8pM]_part_001/BG3 Voice Lines: Shadowheart (part 1) [u95hd47w8pM]_part_001_chunk837.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/رفقاء الليل/رفقاء الليل_chunk494.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/مكتوب/مكتوب_chunk468.mp3
+/home/ubuntu/respair/jpn/moe/18460462/wav/18460462_2321.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/ولك العودة/ولك العودة_chunk825.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Этногенез - Маруся 3_part_003/Этно_speaker_SPEAKER_02/Этно_speaker_SPEAKER_02_chunk1223.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/malady/Baldur's Gate 3 Voice Lines: The Narrator [BcdezgUEnLM]/Baldur's Gate 3 Voice Lines: The Narrator [BcdezgUEnLM]_chunk868.mp3
+/home/ubuntu/respair/jpn/moe/8b6e7173/wav/8b6e7173_0246.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/إليك قلبي/إليك قلبي_chunk818.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Артем Каменистый - Практикантка_part_003/Арте_speaker_SPEAKER_00/Арте_speaker_SPEAKER_00_chunk0.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/wth_ids/22khz/22khz/audiobook_rus_dataset_22khz_audio_982.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/روح/روح_chunk867.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/پویا پورهمدانی/01d03ce5-d53e-4dad-bf21-78afca9f0183/01d03ce5-d53e-4dad-bf21-78afca9f0183_chunk361.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/fumika/fumika_cgss/fumika_card_200282/fumika_voice_200282_2_09.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/مطلوب حبيب/مطلوب حبيب_chunk937.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Laezel/BG3 Voice Lines: Lae'zel [q-Z-9hfSirg]_part_000/BG3 Voice Lines: Lae'zel [q-Z-9hfSirg]_part_000_chunk1623.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/shadow_heart/BG3 Voice Lines: Shadowheart (part 1) [u95hd47w8pM]_part_002/BG3 Voice Lines: Shadowheart (part 1) [u95hd47w8pM]_part_002_chunk616.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/أكوان/أكوان_chunk12.mp3
+/home/ubuntu/respair/data_cache/English_Final/with_ids/Ani_speech/audio_16020.wav
+/home/ubuntu/respair/data_cache/Final_Persian/ونوس صفری/5e65388c-dd61-4f0c-a7c2-af511dc4b401/5e65388c-dd61-4f0c-a7c2-af511dc4b401_chunk0_chunks/5e65388c-dd61-4f0c-a7c2-af511dc4b401_chunk0_chunk_3.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/شهرزاد عالی/c332cbcf-3eee-4f81-a168-2f3c032dfa4f/c332cbcf-3eee-4f81-a168-2f3c032dfa4f_chunk142.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/monogatari/monogatari_voices/monogatari_split/sakurai_takahiro/Sakurai_Takahiro_02/Sakurai_Takahiro_02_chunk1544.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Debate: Will AI Kill the Future of the Creative Arts? Jonathan Taplin vs. Rebecca Fiebrink [J5cz-v0j3D0]/Deba_speaker_SPEAKER_00/Deba_speaker_SPEAKER_00_chunk90.wav
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/usamin/usamin_mobamas/usamin_mobamasu_0007/usamin_mobamasu_0007_chunk15.wav
+/home/ubuntu/respair/data_cache/Final_Persian/ساعد باقری/b240e95c-208a-4bb9-8e14-ef924efa9047/b240e95c-208a-4bb9-8e14-ef924efa9047_chunk431.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/tsujido/vo/009/S009_A_0055.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Клиффорд Саймак - Кукла судьбы_part_002/Клиф_speaker_SPEAKER_04/Клиф_speaker_SPEAKER_04_chunk69.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/МАРКЕТОЛОГ. Личный бренд, продвижение в соцсетях и инфопродукты [b-nbsSigkKo]/МАРК_speaker_SPEAKER_01/МАРК_speaker_SPEAKER_01_chunk194.mp3
+/home/ubuntu/respair/data_cache/English_Final/with_ids/Ani_speech/audio_8349.wav
+/home/ubuntu/respair/data_cache/English_Final/with_ids/ylac/ylacombe_expresso/Ylacombe_Expresso_audio_10948_P3.wav
+/home/ubuntu/respair/jpn/soshy/Japanese/monogatari/monogatari_voices/monogatari_split/shinichiro_miki/Shinichiro_Miki_03/Shinichiro_Miki_03_chunk2008.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Брайдер Юрий - Против течения/Брай_speaker_SPEAKER_00/Брай_speaker_SPEAKER_00_chunk92.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/Syuuko/Syuuko_Mobamas/Syuko Voice/【モバマス】[シンデレラドリーム]塩見周子【ボイス集】 - Niconico Video/【モバマス】[シンデレラドリーム]塩見周子【ボイス集】 - Niconico Video_chunk12.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/مجدولين/مجدولين_chunk361.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/أهواك/أهواك_chunk1278.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Cuno Voice Lines (Disco Elysium) [GI3nSMhAHgU]/Cuno Voice Lines (Disco Elysium) [GI3nSMhAHgU]_chunk235.mp3
+/home/ubuntu/respair/jpn/moe/bbd90363/wav/bbd90363_1058.wav
+/home/ubuntu/respair/data_cache/English_Final/with_ids/Ani_speech/audio_17836.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Gale/BG3 Voice Lines: Gale [CpasYgOyyl4]_part_003/BG3 Voice Lines: Gale [CpasYgOyyl4]_part_003_chunk3147.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/inga/inga_clean/Анна Джейн_18/Анна Джейн_18_chunk25.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/monogatari/monogatari_voices/monogatari_split/horie_yui/Horie_Yui_03/Horie_Yui_03_chunk582.wav
+/home/ubuntu/respair/jpn/moe/46d6bf83/wav/46d6bf83_0757.wav
+/home/ubuntu/respair/jpn/moe/8b6e7173/wav/8b6e7173_0888.wav
+/home/ubuntu/respair/data_cache/Final_Persian/مریم محبوب/7232519c-7ce3-45fb-9d4b-01fd9ddaef6e/7232519c-7ce3-45fb-9d4b-01fd9ddaef6e_chunk817.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Алексей Калугин - Лабиринт 3 - Мир без солнца_part_000/Алек_speaker_SPEAKER_00/Алек_speaker_SPEAKER_00_chunk67.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Logic Voice Lines (Disco Elysium) [01lD5K990NY]/Logic Voice Lines (Disco Elysium) [01lD5K990NY]_chunk675.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/tsujido/vo/002/S002_A_0633.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/الكرسي الهزاز/الكرسي الهزاز_chunk1063.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Debate: We Should Expand the Supreme Court [eb6o-mTZm4o]/Deba_speaker_SPEAKER_02/Deba_speaker_SPEAKER_02_chunk160.wav
+/home/ubuntu/respair/data_cache/Final_Persian/مهرداد محمدپور/7b29e20a-1fda-4c2c-897d-31024969a2ef/7b29e20a-1fda-4c2c-897d-31024969a2ef_chunk23.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/Kanade/Kanade_Events_and_Card/Kanade_Events/Monochrome_lily/Monochrome_lily_chunk245.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Kim Kitsuragi Voice Lines (Disco Elyisum) [c5qHI57fkFE]_part_003/Kim Kitsuragi Voice Lines (Disco Elyisum) [c5qHI57fkFE]_part_003_chunk180.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/الخيط الرفيع/الخيط الرفيع_chunk1325.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Egg Head Voice Lines (Disco Elysium) [jpEuRSw-R0U]/Egg Head Voice Lines (Disco Elysium) [jpEuRSw-R0U]_chunk76.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Open to Debate Mock Trial: Is Trump Guilty in the January 6th Case? Lanny Davis vs Sara Azari [rN5klE6Ul_A]/Open_speaker_SPEAKER_00/Open_speaker_SPEAKER_00_chunk116.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Джек Андерсон - Игра в лево-право/Джек_speaker_SPEAKER_04/Джек_speaker_SPEAKER_04_chunk1898.mp3
+/home/ubuntu/respair/data_cache/English_Final/with_ids/Ani_speech/audio_1783.wav
+/home/ubuntu/respair/jpn/moe/00163dc9/wav/00163dc9_0264.wav
+/home/ubuntu/respair/data_cache/Final_Persian/رضا عمرانی/6809b0ed-aa4f-4ff2-a3e1-78825d4a7146/6809b0ed-aa4f-4ff2-a3e1-78825d4a7146_chunk325.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Сарко Де Рази - Украденный свет/Сарк_speaker_SPEAKER_00/Сарк_speaker_SPEAKER_00_chunk405.mp3
+/home/ubuntu/respair/jpn/moe/95c3bdd8/wav/95c3bdd8_2219.wav
+/home/ubuntu/respair/data_cache/Final_Persian/تینا میرکریمی/059a1c87-6624-4f75-98c8-f9f92c83fd9c/059a1c87-6624-4f75-98c8-f9f92c83fd9c_chunk238.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/vedud/Парфенов - о преемнике, Серебренникове и мате ⧸ вДудь [t6i4ElZV1K0]/Парф_speaker_SPEAKER_04/Парф_speaker_SPEAKER_04_chunk83.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/عشقني عفريت من الجن/عشقني عفريت من الجن_chunk1010.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/محسن زرآبادی پور/e1013cd9-b7fc-4373-9060-404b3fff1e32/e1013cd9-b7fc-4373-9060-404b3fff1e32_chunk161.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/احسان چریکی/1cd3f6cb-2936-42dc-a5f9-f90591f00e96/1cd3f6cb-2936-42dc-a5f9-f90591f00e96_chunk85.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Gale/BG3 Voice Lines: Gale [CpasYgOyyl4]_part_000/BG3 Voice Lines: Gale [CpasYgOyyl4]_part_000_chunk1669.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/monogatari/monogatari_voices/monogatari_split/shinichiro_miki/Shinichiro_Miki__01/Shinichiro_Miki__01_chunk863.wav
+/home/ubuntu/respair/data_cache/Final_Persian/ساعد باقری/b842c75e-7251-441c-852d-fc8cc5c20558/b842c75e-7251-441c-852d-fc8cc5c20558_chunk800.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Этногенез - Маруся 3_part_003/Этно_speaker_SPEAKER_02/Этно_speaker_SPEAKER_02_chunk1007.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/ونوس صفری/ddcbcb63-4285-44d9-b305-700e86ae98a6/ddcbcb63-4285-44d9-b305-700e86ae98a6_chunk14_chunks/ddcbcb63-4285-44d9-b305-700e86ae98a6_chunk14_chunk_5.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Should Congress Stop Funding the War in Ukraine? [rdtCyiKHtqE]/Shou_speaker_SPEAKER_01/Shou_speaker_SPEAKER_01_chunk12.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/МРАКОБЕСИЕ В ПСИХИАТРИИ. Ипохондрия, расстановки по Хеллингеру и холотропное дыхание [T-oKbZtgRN8]/МРАК_speaker_SPEAKER_01/МРАК_speaker_SPEAKER_01_chunk790.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/frame_turner/MORGENSHTERN ‒ ЛИЗА, ГАЛЛЮЦИНАЦИИ, ДЕТСКИЕ ТРАВМЫ, ПРИЧИНА РАЗВОДА И СЕКРЕТ УСПЕХА [g1nVX1oPxHE]/MORG_speaker_SPEAKER_02/MORG_speaker_SPEAKER_02_chunk2015.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/ذاكرة الجسد/ذاكرة الجسد_chunk526.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/imas_split/mio/mio_honda_cgss/mio_honda_card_301000/mio_honda_voice_301000_1_12.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/ФАРМАЦЕВТ. Что подмешивают в БАДы? Вакцина от коронавируса [IYPGDU8Am1I]/ФАРМ_speaker_SPEAKER_01/ФАРМ_speaker_SPEAKER_01_chunk21.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/Жизнь с БИПОЛЯРНЫМ РАССТРОЙСТВОМ. Депрессии, гипомании и смешанные фазы [aP1S8MK7lrI]/Жизн_speaker_SPEAKER_03/Жизн_speaker_SPEAKER_03_chunk199.mp3
+/home/ubuntu/respair/data_cache/English_Final/with_ids/ylac/ylacombe_podcast/Ylacombe_podcast_audio_183.wav
+/home/ubuntu/respair/data_cache/English_Final/with_ids/ylac/ylacombe_podcast/Ylacombe_podcast_audio_810_P3.wav
+/home/ubuntu/respair/data_cache/English_Final/with_ids/ylac/ylacombe_expresso/Ylacombe_Expresso_audio_1731_P3.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Esprit de Corps Voice Lines (Disco Elysium) [Dnzg0hMiPls]/Esprit de Corps Voice Lines (Disco Elysium) [Dnzg0hMiPls]_chunk439.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/inga/inga_clean/Анна Джейн_35/Анна Джейн_35_chunk203.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Fast Fashion: Shop or Stop? [GluaE-_C0-Q]/Fast_speaker_SPEAKER_03/Fast_speaker_SPEAKER_03_chunk109.wav
+/home/ubuntu/respair/jpn/soshy/Japanese/tsujido/vo/001/S001_C_0288.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Gale/BG3 Voice Lines: Gale [CpasYgOyyl4]_part_001/BG3 Voice Lines: Gale [CpasYgOyyl4]_part_001_chunk2855.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/tsujido/vo/011/S011_A_3111.wav
+/home/ubuntu/respair/jpn/soshy/Japanese/monogatari/monogatari_voices/monogatari_split/sakurai_takahiro/Sakurai_Takahiro_02/Sakurai_Takahiro_02_chunk1844.wav
+/home/ubuntu/respair/jpn/moe/917feebd/wav/917feebd_2407.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/#Debate: Is Florida Eating New York's Lunch? Bill de Blasio vs. Reihan Salam [Poj2dDr3n-M]/#Deb_speaker_SPEAKER_07/#Deb_speaker_SPEAKER_07_chunk111.wav
+/home/ubuntu/respair/data_cache/English_Final/with_ids/ylac/ylacombe_podcast/Ylacombe_podcast_audio_2207_P2.wav
+/home/ubuntu/respair/data_cache/Final_Persian/حسن آزادی/ab1eea53-b347-42c1-8663-a4691bd04a1a/ab1eea53-b347-42c1-8663-a4691bd04a1a_chunk567.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/sakura_moyu/03/03013290.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/malady/Baldur's Gate 3 Voice Lines: The Narrator [BcdezgUEnLM]/Baldur's Gate 3 Voice Lines: The Narrator [BcdezgUEnLM]_chunk1527.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Limbic System Voice Lines (Disco Elysium) [ug5DR5ylSPc]/Limbic System Voice Lines (Disco Elysium) [ug5DR5ylSPc]_chunk302.mp3
+/home/ubuntu/respair/jpn/moe/9febd2ae/wav/9febd2ae_1198.wav
+/home/ubuntu/respair/data_cache/Final_Persian/محمد امرایی/a4c4b25a-d6b2-48e2-af9f-c3606a69eae0/a4c4b25a-d6b2-48e2-af9f-c3606a69eae0_chunk382.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/ساعد باقری/8ed4d9cb-ebf4-4f5e-8960-2024091c1b55/8ed4d9cb-ebf4-4f5e-8960-2024091c1b55_chunk482.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/هكذا تكلم الذئب/هكذا تكلم الذئب_chunk1004.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/ليتنا لم نلتق/ليتنا لم نلتق_chunk1035.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/عشق في بلاد الجليد/عشق في بلاد الجليد_chunk585.mp3
+/home/ubuntu/respair/jpn/moe/ee093a4f/wav/ee093a4f_0084.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/inga/inga_clean/Glava_02/Glava_02_chunk273.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/tsujido/vo/003/S003_E_0015.wav
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/sychev/Жизнь с БИПОЛЯРНЫМ РАССТРОЙСТВОМ 2 типа. Урбанистика, TEDx и психоанализ [umei-N44qqI]/Жизн_speaker_SPEAKER_06/Жизн_speaker_SPEAKER_06_chunk388.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/شفاة صامتة/شفاة صامتة_chunk1274.mp3
+/home/ubuntu/respair/data_cache/English_Final/with_ids/ylac/ylacombe_podcast/Ylacombe_podcast_audio_5478.wav
+/home/ubuntu/respair/data_cache/Final_Persian/منصور ضابطیان/e15de6bb-d0c4-4b73-a72a-bfd153b92c9d/e15de6bb-d0c4-4b73-a72a-bfd153b92c9d_chunk42.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/أنا سنية و أنت شيعي/أنا سنية و أنت شيعي_chunk887.mp3
+/home/ubuntu/respair/jpn/moe/4e2f4ba6/wav/4e2f4ba6_1288.wav
+/home/ubuntu/respair/data_cache/Final_Persian/تینا میرکریمی/aebfec67-14a2-4672-9f31-27cd3b7febc4/aebfec67-14a2-4672-9f31-27cd3b7febc4_chunk178.mp3
+/home/ubuntu/respair/jpn/moe/ad28b91b/wav/ad28b91b_1167.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Cuno Voice Lines (Disco Elysium) [GI3nSMhAHgU]/Cuno Voice Lines (Disco Elysium) [GI3nSMhAHgU]_chunk1663.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/احسان چریکی/ead295e2-d936-4425-89db-150aa5325763/ead295e2-d936-4425-89db-150aa5325763_chunk493.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/tsujido/vo/003/S003_C_0011.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/All Vicious Mockery Insults [QhA1NwgnFN8]/All Vicious Mockery Insults [QhA1NwgnFN8]_chunk1488.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/بنت الباشا/بنت الباشا_chunk726.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/BG3 Voice Lines: Jaheira [r7HxfsGG0zo]/BG3 Voice Lines: Jaheira [r7HxfsGG0zo]_chunk1383.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/debate_chunks/Will Kamala Harris or Donald Trump be Better for America? [hfoSRFGNykQ]/Will_speaker_SPEAKER_04/Will_speaker_SPEAKER_04_chunk50.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/مريض لن انساه/مريض لن انساه_chunk1308.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Кейт Лаумер - Король города/Кейт_speaker_SPEAKER_02/Кейт_speaker_SPEAKER_02_chunk931.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/الحصان الشارد/الحصان الشارد_chunk555.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/wth_ids/22khz/22khz/audiobook_rus_dataset_22khz_audio_16379.wav
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Gale/BG3 Voice Lines: Gale [CpasYgOyyl4]_part_000/BG3 Voice Lines: Gale [CpasYgOyyl4]_part_000_chunk374.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Сергей и Марина Дьяченко - Vita Nostra_part_000/Серг_speaker_SPEAKER_01/Серг_speaker_SPEAKER_01_chunk21.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/BG3 Voice Lines: Minsc [5wO9k1rgRcY]/BG3 Voice Lines: Minsc [5wO9k1rgRcY]_chunk5377.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/wth_ids/22khz/22khz/audiobook_rus_dataset_22khz_audio_304.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/رقصة المرمر/رقصة المرمر_chunk738.mp3
+/home/ubuntu/respair/data_cache/English_Final/with_ids/Ani_speech/audio_18630.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/ظلال الكولوسيوم/ظلال الكولوسيوم_chunk619.mp3
+/home/ubuntu/respair/data_cache/English_Final/with_ids/Ani_speech/audio_12749.wav
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/أرجوك أعطني هذا الدواء/أرجوك أعطني هذا الدواء_chunk367.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/أنا في أنتظبارك/أنا في أنتظبارك_chunk448.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/حامد فعال/487218ea-d36f-421a-b03d-898cbc3b1bc8/487218ea-d36f-421a-b03d-898cbc3b1bc8_chunk141.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/امیرمحمد صمصامی/92dc1a19-f918-40a5-84ce-ae0ba60aa1e5/92dc1a19-f918-40a5-84ce-ae0ba60aa1e5_chunk229.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/تحت سقف واحد/تحت سقف واحد_chunk1215.mp3
+/home/ubuntu/respair/data_cache/Final_Persian/محمدرضا علی اکبری/f6e7c8b3-f5be-48cb-8a08-34691ab81af6/f6e7c8b3-f5be-48cb-8a08-34691ab81af6_chunk937.mp3
+/home/ubuntu/respair/jpn/soshy/Japanese/tsujido/vo/003/S003_A_0346.wav
+/home/ubuntu/respair/data_cache/Final_Persian/یاسر دعاگو/6f570bc1-d61d-4bee-ad9c-602c6197091c/6f570bc1-d61d-4bee-ad9c-602c6197091c_chunk408.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/انتحار حمار/انتحار حمار_chunk1353.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/وصال الروح/وصال الروح_chunk241.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Кеннет Балмер - Страна которой нет на карте/Кенн_speaker_SPEAKER_01/Кенн_speaker_SPEAKER_01_chunk398.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/النساء لهن أسنان بيضاء/النساء لهن أسنان بيضاء_chunk683.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/audiobooks_25P/Роберт Шекли - Носитель Инфекции/Робе_speaker_SPEAKER_00/Робе_speaker_SPEAKER_00_chunk742.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/BD3/BG3_Rest/Gale/BG3 Voice Lines: Gale [CpasYgOyyl4]_part_001/BG3 Voice Lines: Gale [CpasYgOyyl4]_part_001_chunk2653.mp3
+/home/ubuntu/respair/data_cache/English_Final/no_ids/HF_temp/games/Disco/Ancient Reptilian Brain Voice Lines (Disco Elysium) [MOYMRFmEqcg]/Ancient Reptilian Brain Voice Lines (Disco Elysium) [MOYMRFmEqcg]_chunk82.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم القصص/٣٣ بلونة حب/٣٣ بلونة حب_chunk935.mp3
+/home/ubuntu/respair/data_cache/Final_Rusiki/no_ids/Ruski_scraped_chunks/youtube/vedud/Невзоров – о Фараоне и ориентации Милонова ⧸ Alexandr Nevzorov's big interview [zcjKJ7FHDLM]/Невз_speaker_SPEAKER_03/Невз_speaker_SPEAKER_03_chunk730.mp3
+/home/ubuntu/respair/data_cache/Arabic/Processed_chnks/مجموعة كتب صوتية Storytel قسم روايات رومانسية/م لك يوسف/م لك يوسف_chunk619.mp3
diff --git a/vocos/data/filelist2.train b/vocos/data/filelist2.train
new file mode 100644
index 0000000000000000000000000000000000000000..b84e535b9aca873dfd046c6724cf52d33ebd4056
--- /dev/null
+++ b/vocos/data/filelist2.train
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c912110b830ea4d3d5b7e6a6c3a5660d7c99a36cf2d5fb12033c8abce311b259
+size 90047319
diff --git a/vocos/logs/lightning_logs/version_24/config.yaml b/vocos/logs/lightning_logs/version_24/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..058106217c4efc46dcf7556a7f517e774b24f98a
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_24/config.yaml
@@ -0,0 +1,152 @@
+# pytorch_lightning==1.8.6
+seed_everything: 4444
+trainer:
+ logger:
+ class_path: pytorch_lightning.loggers.TensorBoardLogger
+ init_args:
+ save_dir: logs/
+ name: lightning_logs
+ version: null
+ log_graph: false
+ default_hp_metric: true
+ prefix: ''
+ sub_dir: null
+ logdir: null
+ comment: ''
+ purge_step: null
+ max_queue: 10
+ flush_secs: 120
+ filename_suffix: ''
+ write_to_disk: true
+ comet_config:
+ disabled: true
+ enable_checkpointing: true
+ callbacks:
+ - class_path: pytorch_lightning.callbacks.LearningRateMonitor
+ init_args:
+ logging_interval: null
+ log_momentum: false
+ - class_path: pytorch_lightning.callbacks.ModelSummary
+ init_args:
+ max_depth: 2
+ - class_path: pytorch_lightning.callbacks.ModelCheckpoint
+ init_args:
+ dirpath: null
+ filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
+ monitor: val_loss
+ verbose: false
+ save_last: true
+ save_top_k: 3
+ save_weights_only: false
+ mode: min
+ auto_insert_metric_name: true
+ every_n_train_steps: null
+ train_time_interval: null
+ every_n_epochs: null
+ save_on_train_epoch_end: null
+ - class_path: vocos.helpers.GradNormCallback
+ default_root_dir: null
+ gradient_clip_val: null
+ gradient_clip_algorithm: null
+ num_nodes: 1
+ num_processes: null
+ devices:
+ - 0
+ - 1
+ gpus: null
+ auto_select_gpus: false
+ tpu_cores: null
+ ipus: null
+ enable_progress_bar: true
+ overfit_batches: 0.0
+ track_grad_norm: -1
+ check_val_every_n_epoch: 1
+ fast_dev_run: false
+ accumulate_grad_batches: null
+ max_epochs: null
+ min_epochs: null
+ max_steps: 2000000
+ min_steps: null
+ max_time: null
+ limit_train_batches: null
+ limit_val_batches: 50
+ limit_test_batches: null
+ limit_predict_batches: null
+ val_check_interval: null
+ log_every_n_steps: 100
+ accelerator: gpu
+ strategy: ddp
+ sync_batchnorm: false
+ precision: 32
+ enable_model_summary: true
+ num_sanity_val_steps: 2
+ resume_from_checkpoint: null
+ profiler: null
+ benchmark: null
+ deterministic: null
+ reload_dataloaders_every_n_epochs: 0
+ auto_lr_find: false
+ replace_sampler_ddp: true
+ detect_anomaly: false
+ auto_scale_batch_size: false
+ plugins: null
+ amp_backend: native
+ amp_level: null
+ move_metrics_to_cpu: false
+ multiple_trainloader_mode: max_size_cycle
+ inference_mode: true
+model:
+ class_path: vocos.experiment.VocosExp
+ init_args:
+ feature_extractor:
+ class_path: vocos.feature_extractors.MelSpectrogramFeatures
+ init_args:
+ sample_rate: 44100
+ n_fft: 2048
+ hop_length: 512
+ win_length: 2048
+ n_mels: 128
+ padding: center
+ backbone:
+ class_path: vocos.models.VocosBackbone
+ init_args:
+ input_channels: 128
+ dim: 512
+ intermediate_dim: 1536
+ num_layers: 8
+ layer_scale_init_value: null
+ adanorm_num_embeddings: null
+ head:
+ class_path: vocos.heads.ISTFTHead
+ init_args:
+ dim: 512
+ n_fft: 2048
+ hop_length: 512
+ padding: center
+ sample_rate: 44100
+ initial_learning_rate: 0.0005
+ num_warmup_steps: 0
+ mel_loss_coeff: 45.0
+ mrd_loss_coeff: 0.1
+ pretrain_mel_steps: 0
+ decay_mel_coeff: false
+ evaluate_utmos: true
+ evaluate_pesq: true
+ evaluate_periodicty: true
+data:
+ class_path: vocos.dataset.VocosDataModule
+ init_args:
+ train_params:
+ filelist_path: /home/ubuntu/vocos/data/filelist.train
+ sampling_rate: 44100
+ num_samples: 65536
+ batch_size: 58
+ num_workers: 8
+ val_params:
+ filelist_path: /home/ubuntu/vocos/data/filelist.val
+ sampling_rate: 44100
+ num_samples: 65536
+ batch_size: 16
+ num_workers: 8
+optimizer: null
+lr_scheduler: null
diff --git a/vocos/logs/lightning_logs/version_24/events.out.tfevents.1738102892.104-171-202-79 b/vocos/logs/lightning_logs/version_24/events.out.tfevents.1738102892.104-171-202-79
new file mode 100644
index 0000000000000000000000000000000000000000..9659b931effb849525e6f39a32dca3ed28612caf
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_24/events.out.tfevents.1738102892.104-171-202-79
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd6ece1c3c30eafe201be7ae47f1915108cbcaa4c5c86e5ca9a3cc0d73d08d12
+size 824
diff --git a/vocos/logs/lightning_logs/version_24/hparams.yaml b/vocos/logs/lightning_logs/version_24/hparams.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8311f1e45d6f32b6a7d395ec763260e8cf8f58c0
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_24/hparams.yaml
@@ -0,0 +1,10 @@
+sample_rate: 44100
+initial_learning_rate: 0.0005
+num_warmup_steps: 0
+mel_loss_coeff: 45.0
+mrd_loss_coeff: 0.1
+pretrain_mel_steps: 0
+decay_mel_coeff: false
+evaluate_utmos: true
+evaluate_pesq: true
+evaluate_periodicty: true
diff --git a/vocos/logs/lightning_logs/version_25/checkpoints/last.ckpt b/vocos/logs/lightning_logs/version_25/checkpoints/last.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..a51f79f6e18ab62e9802bb193dae4eb1ab6f5908
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_25/checkpoints/last.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b5b8380daa833cc2e56b967c6eeab7c3d4afb2839a0ebcbf0e69ce2ef0caf82
+size 681716271
diff --git a/vocos/logs/lightning_logs/version_25/checkpoints/vocos_checkpoint_epoch=0_step=16634_val_loss=6.2461.ckpt b/vocos/logs/lightning_logs/version_25/checkpoints/vocos_checkpoint_epoch=0_step=16634_val_loss=6.2461.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..01e95c0d2e4440e1f950443b340a5ea4305dd891
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_25/checkpoints/vocos_checkpoint_epoch=0_step=16634_val_loss=6.2461.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:47ff3d96147a65ce0868c0d5314dd5a47a644aa4849cb05d64c619294c91de86
+size 681715824
diff --git a/vocos/logs/lightning_logs/version_25/checkpoints/vocos_checkpoint_epoch=1_step=33268_val_loss=5.4846.ckpt b/vocos/logs/lightning_logs/version_25/checkpoints/vocos_checkpoint_epoch=1_step=33268_val_loss=5.4846.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..a51f79f6e18ab62e9802bb193dae4eb1ab6f5908
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_25/checkpoints/vocos_checkpoint_epoch=1_step=33268_val_loss=5.4846.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b5b8380daa833cc2e56b967c6eeab7c3d4afb2839a0ebcbf0e69ce2ef0caf82
+size 681716271
diff --git a/vocos/logs/lightning_logs/version_25/config.yaml b/vocos/logs/lightning_logs/version_25/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..058106217c4efc46dcf7556a7f517e774b24f98a
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_25/config.yaml
@@ -0,0 +1,152 @@
+# pytorch_lightning==1.8.6
+seed_everything: 4444
+trainer:
+ logger:
+ class_path: pytorch_lightning.loggers.TensorBoardLogger
+ init_args:
+ save_dir: logs/
+ name: lightning_logs
+ version: null
+ log_graph: false
+ default_hp_metric: true
+ prefix: ''
+ sub_dir: null
+ logdir: null
+ comment: ''
+ purge_step: null
+ max_queue: 10
+ flush_secs: 120
+ filename_suffix: ''
+ write_to_disk: true
+ comet_config:
+ disabled: true
+ enable_checkpointing: true
+ callbacks:
+ - class_path: pytorch_lightning.callbacks.LearningRateMonitor
+ init_args:
+ logging_interval: null
+ log_momentum: false
+ - class_path: pytorch_lightning.callbacks.ModelSummary
+ init_args:
+ max_depth: 2
+ - class_path: pytorch_lightning.callbacks.ModelCheckpoint
+ init_args:
+ dirpath: null
+ filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
+ monitor: val_loss
+ verbose: false
+ save_last: true
+ save_top_k: 3
+ save_weights_only: false
+ mode: min
+ auto_insert_metric_name: true
+ every_n_train_steps: null
+ train_time_interval: null
+ every_n_epochs: null
+ save_on_train_epoch_end: null
+ - class_path: vocos.helpers.GradNormCallback
+ default_root_dir: null
+ gradient_clip_val: null
+ gradient_clip_algorithm: null
+ num_nodes: 1
+ num_processes: null
+ devices:
+ - 0
+ - 1
+ gpus: null
+ auto_select_gpus: false
+ tpu_cores: null
+ ipus: null
+ enable_progress_bar: true
+ overfit_batches: 0.0
+ track_grad_norm: -1
+ check_val_every_n_epoch: 1
+ fast_dev_run: false
+ accumulate_grad_batches: null
+ max_epochs: null
+ min_epochs: null
+ max_steps: 2000000
+ min_steps: null
+ max_time: null
+ limit_train_batches: null
+ limit_val_batches: 50
+ limit_test_batches: null
+ limit_predict_batches: null
+ val_check_interval: null
+ log_every_n_steps: 100
+ accelerator: gpu
+ strategy: ddp
+ sync_batchnorm: false
+ precision: 32
+ enable_model_summary: true
+ num_sanity_val_steps: 2
+ resume_from_checkpoint: null
+ profiler: null
+ benchmark: null
+ deterministic: null
+ reload_dataloaders_every_n_epochs: 0
+ auto_lr_find: false
+ replace_sampler_ddp: true
+ detect_anomaly: false
+ auto_scale_batch_size: false
+ plugins: null
+ amp_backend: native
+ amp_level: null
+ move_metrics_to_cpu: false
+ multiple_trainloader_mode: max_size_cycle
+ inference_mode: true
+model:
+ class_path: vocos.experiment.VocosExp
+ init_args:
+ feature_extractor:
+ class_path: vocos.feature_extractors.MelSpectrogramFeatures
+ init_args:
+ sample_rate: 44100
+ n_fft: 2048
+ hop_length: 512
+ win_length: 2048
+ n_mels: 128
+ padding: center
+ backbone:
+ class_path: vocos.models.VocosBackbone
+ init_args:
+ input_channels: 128
+ dim: 512
+ intermediate_dim: 1536
+ num_layers: 8
+ layer_scale_init_value: null
+ adanorm_num_embeddings: null
+ head:
+ class_path: vocos.heads.ISTFTHead
+ init_args:
+ dim: 512
+ n_fft: 2048
+ hop_length: 512
+ padding: center
+ sample_rate: 44100
+ initial_learning_rate: 0.0005
+ num_warmup_steps: 0
+ mel_loss_coeff: 45.0
+ mrd_loss_coeff: 0.1
+ pretrain_mel_steps: 0
+ decay_mel_coeff: false
+ evaluate_utmos: true
+ evaluate_pesq: true
+ evaluate_periodicty: true
+data:
+ class_path: vocos.dataset.VocosDataModule
+ init_args:
+ train_params:
+ filelist_path: /home/ubuntu/vocos/data/filelist.train
+ sampling_rate: 44100
+ num_samples: 65536
+ batch_size: 58
+ num_workers: 8
+ val_params:
+ filelist_path: /home/ubuntu/vocos/data/filelist.val
+ sampling_rate: 44100
+ num_samples: 65536
+ batch_size: 16
+ num_workers: 8
+optimizer: null
+lr_scheduler: null
diff --git a/vocos/logs/lightning_logs/version_25/events.out.tfevents.1738103019.104-171-202-79 b/vocos/logs/lightning_logs/version_25/events.out.tfevents.1738103019.104-171-202-79
new file mode 100644
index 0000000000000000000000000000000000000000..9e178fcb38649aa7dc123a890a4bf54f4903ea66
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_25/events.out.tfevents.1738103019.104-171-202-79
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f02506aa015b3928338d546f306e461971e99dcb31e66a265d647f63ce46295
+size 7661756
diff --git a/vocos/logs/lightning_logs/version_25/hparams.yaml b/vocos/logs/lightning_logs/version_25/hparams.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8311f1e45d6f32b6a7d395ec763260e8cf8f58c0
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_25/hparams.yaml
@@ -0,0 +1,10 @@
+sample_rate: 44100
+initial_learning_rate: 0.0005
+num_warmup_steps: 0
+mel_loss_coeff: 45.0
+mrd_loss_coeff: 0.1
+pretrain_mel_steps: 0
+decay_mel_coeff: false
+evaluate_utmos: true
+evaluate_pesq: true
+evaluate_periodicty: true
diff --git a/vocos/logs/lightning_logs/version_26/checkpoints/last.ckpt b/vocos/logs/lightning_logs/version_26/checkpoints/last.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..be2e1253b698a0e4476d1fbcaa40cea15a2515d8
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_26/checkpoints/last.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3720eb1e5ee6c1ba76d45f73b1661c0285705d570ed08ee78263fdd2bb16954
+size 681715888
diff --git a/vocos/logs/lightning_logs/version_26/checkpoints/vocos_checkpoint_epoch=0_step=16634_val_loss=6.8451.ckpt b/vocos/logs/lightning_logs/version_26/checkpoints/vocos_checkpoint_epoch=0_step=16634_val_loss=6.8451.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..df06caed6c4887df3fbea686667ba9120c9ff7ce
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_26/checkpoints/vocos_checkpoint_epoch=0_step=16634_val_loss=6.8451.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2499f5ade99900680bcc17d2e14987f91661c14cd925c7435c8fcd071323942a
+size 681715824
diff --git a/vocos/logs/lightning_logs/version_26/config.yaml b/vocos/logs/lightning_logs/version_26/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..058106217c4efc46dcf7556a7f517e774b24f98a
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_26/config.yaml
@@ -0,0 +1,152 @@
+# pytorch_lightning==1.8.6
+seed_everything: 4444
+trainer:
+ logger:
+ class_path: pytorch_lightning.loggers.TensorBoardLogger
+ init_args:
+ save_dir: logs/
+ name: lightning_logs
+ version: null
+ log_graph: false
+ default_hp_metric: true
+ prefix: ''
+ sub_dir: null
+ logdir: null
+ comment: ''
+ purge_step: null
+ max_queue: 10
+ flush_secs: 120
+ filename_suffix: ''
+ write_to_disk: true
+ comet_config:
+ disabled: true
+ enable_checkpointing: true
+ callbacks:
+ - class_path: pytorch_lightning.callbacks.LearningRateMonitor
+ init_args:
+ logging_interval: null
+ log_momentum: false
+ - class_path: pytorch_lightning.callbacks.ModelSummary
+ init_args:
+ max_depth: 2
+ - class_path: pytorch_lightning.callbacks.ModelCheckpoint
+ init_args:
+ dirpath: null
+ filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
+ monitor: val_loss
+ verbose: false
+ save_last: true
+ save_top_k: 3
+ save_weights_only: false
+ mode: min
+ auto_insert_metric_name: true
+ every_n_train_steps: null
+ train_time_interval: null
+ every_n_epochs: null
+ save_on_train_epoch_end: null
+ - class_path: vocos.helpers.GradNormCallback
+ default_root_dir: null
+ gradient_clip_val: null
+ gradient_clip_algorithm: null
+ num_nodes: 1
+ num_processes: null
+ devices:
+ - 0
+ - 1
+ gpus: null
+ auto_select_gpus: false
+ tpu_cores: null
+ ipus: null
+ enable_progress_bar: true
+ overfit_batches: 0.0
+ track_grad_norm: -1
+ check_val_every_n_epoch: 1
+ fast_dev_run: false
+ accumulate_grad_batches: null
+ max_epochs: null
+ min_epochs: null
+ max_steps: 2000000
+ min_steps: null
+ max_time: null
+ limit_train_batches: null
+ limit_val_batches: 50
+ limit_test_batches: null
+ limit_predict_batches: null
+ val_check_interval: null
+ log_every_n_steps: 100
+ accelerator: gpu
+ strategy: ddp
+ sync_batchnorm: false
+ precision: 32
+ enable_model_summary: true
+ num_sanity_val_steps: 2
+ resume_from_checkpoint: null
+ profiler: null
+ benchmark: null
+ deterministic: null
+ reload_dataloaders_every_n_epochs: 0
+ auto_lr_find: false
+ replace_sampler_ddp: true
+ detect_anomaly: false
+ auto_scale_batch_size: false
+ plugins: null
+ amp_backend: native
+ amp_level: null
+ move_metrics_to_cpu: false
+ multiple_trainloader_mode: max_size_cycle
+ inference_mode: true
+model:
+ class_path: vocos.experiment.VocosExp
+ init_args:
+ feature_extractor:
+ class_path: vocos.feature_extractors.MelSpectrogramFeatures
+ init_args:
+ sample_rate: 44100
+ n_fft: 2048
+ hop_length: 512
+ win_length: 2048
+ n_mels: 128
+ padding: center
+ backbone:
+ class_path: vocos.models.VocosBackbone
+ init_args:
+ input_channels: 128
+ dim: 512
+ intermediate_dim: 1536
+ num_layers: 8
+ layer_scale_init_value: null
+ adanorm_num_embeddings: null
+ head:
+ class_path: vocos.heads.ISTFTHead
+ init_args:
+ dim: 512
+ n_fft: 2048
+ hop_length: 512
+ padding: center
+ sample_rate: 44100
+ initial_learning_rate: 0.0005
+ num_warmup_steps: 0
+ mel_loss_coeff: 45.0
+ mrd_loss_coeff: 0.1
+ pretrain_mel_steps: 0
+ decay_mel_coeff: false
+ evaluate_utmos: true
+ evaluate_pesq: true
+ evaluate_periodicty: true
+data:
+ class_path: vocos.dataset.VocosDataModule
+ init_args:
+ train_params:
+ filelist_path: /home/ubuntu/vocos/data/filelist.train
+ sampling_rate: 44100
+ num_samples: 65536
+ batch_size: 58
+ num_workers: 8
+ val_params:
+ filelist_path: /home/ubuntu/vocos/data/filelist.val
+ sampling_rate: 44100
+ num_samples: 65536
+ batch_size: 16
+ num_workers: 8
+optimizer: null
+lr_scheduler: null
diff --git a/vocos/logs/lightning_logs/version_26/events.out.tfevents.1738103077.104-171-202-79 b/vocos/logs/lightning_logs/version_26/events.out.tfevents.1738103077.104-171-202-79
new file mode 100644
index 0000000000000000000000000000000000000000..edb4978a645059336a83d554d4229b96d36c225c
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_26/events.out.tfevents.1738103077.104-171-202-79
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48c17ee2d5df22160ac1d62a41c9db5957f60652875309e4b5f315bf25c0452b
+size 5746663
diff --git a/vocos/logs/lightning_logs/version_26/hparams.yaml b/vocos/logs/lightning_logs/version_26/hparams.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8311f1e45d6f32b6a7d395ec763260e8cf8f58c0
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_26/hparams.yaml
@@ -0,0 +1,10 @@
+sample_rate: 44100
+initial_learning_rate: 0.0005
+num_warmup_steps: 0
+mel_loss_coeff: 45.0
+mrd_loss_coeff: 0.1
+pretrain_mel_steps: 0
+decay_mel_coeff: false
+evaluate_utmos: true
+evaluate_pesq: true
+evaluate_periodicty: true
diff --git a/vocos/logs/lightning_logs/version_27/checkpoints/last.ckpt b/vocos/logs/lightning_logs/version_27/checkpoints/last.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..2d4cf7a32229e086075fa565121d5a93a71574f4
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_27/checkpoints/last.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9c52539e953c4675d8f68db881f7c7f3afd6abeecc9932b288c71adf3ab487a
+size 681715888
diff --git a/vocos/logs/lightning_logs/version_27/checkpoints/vocos_checkpoint_epoch=0_step=16634_val_loss=6.0317.ckpt b/vocos/logs/lightning_logs/version_27/checkpoints/vocos_checkpoint_epoch=0_step=16634_val_loss=6.0317.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..64a5a0b50327e69d89f78a4c0f4a5aea55baf085
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_27/checkpoints/vocos_checkpoint_epoch=0_step=16634_val_loss=6.0317.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3196e7caa913048385c6806a8fce5b7ee675a77bdacc83c3b84f50a9d557581e
+size 681715824
diff --git a/vocos/logs/lightning_logs/version_27/config.yaml b/vocos/logs/lightning_logs/version_27/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..058106217c4efc46dcf7556a7f517e774b24f98a
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_27/config.yaml
@@ -0,0 +1,152 @@
+# pytorch_lightning==1.8.6
+seed_everything: 4444
+trainer:
+ logger:
+ class_path: pytorch_lightning.loggers.TensorBoardLogger
+ init_args:
+ save_dir: logs/
+ name: lightning_logs
+ version: null
+ log_graph: false
+ default_hp_metric: true
+ prefix: ''
+ sub_dir: null
+ logdir: null
+ comment: ''
+ purge_step: null
+ max_queue: 10
+ flush_secs: 120
+ filename_suffix: ''
+ write_to_disk: true
+ comet_config:
+ disabled: true
+ enable_checkpointing: true
+ callbacks:
+ - class_path: pytorch_lightning.callbacks.LearningRateMonitor
+ init_args:
+ logging_interval: null
+ log_momentum: false
+ - class_path: pytorch_lightning.callbacks.ModelSummary
+ init_args:
+ max_depth: 2
+ - class_path: pytorch_lightning.callbacks.ModelCheckpoint
+ init_args:
+ dirpath: null
+ filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
+ monitor: val_loss
+ verbose: false
+ save_last: true
+ save_top_k: 3
+ save_weights_only: false
+ mode: min
+ auto_insert_metric_name: true
+ every_n_train_steps: null
+ train_time_interval: null
+ every_n_epochs: null
+ save_on_train_epoch_end: null
+ - class_path: vocos.helpers.GradNormCallback
+ default_root_dir: null
+ gradient_clip_val: null
+ gradient_clip_algorithm: null
+ num_nodes: 1
+ num_processes: null
+ devices:
+ - 0
+ - 1
+ gpus: null
+ auto_select_gpus: false
+ tpu_cores: null
+ ipus: null
+ enable_progress_bar: true
+ overfit_batches: 0.0
+ track_grad_norm: -1
+ check_val_every_n_epoch: 1
+ fast_dev_run: false
+ accumulate_grad_batches: null
+ max_epochs: null
+ min_epochs: null
+ max_steps: 2000000
+ min_steps: null
+ max_time: null
+ limit_train_batches: null
+ limit_val_batches: 50
+ limit_test_batches: null
+ limit_predict_batches: null
+ val_check_interval: null
+ log_every_n_steps: 100
+ accelerator: gpu
+ strategy: ddp
+ sync_batchnorm: false
+ precision: 32
+ enable_model_summary: true
+ num_sanity_val_steps: 2
+ resume_from_checkpoint: null
+ profiler: null
+ benchmark: null
+ deterministic: null
+ reload_dataloaders_every_n_epochs: 0
+ auto_lr_find: false
+ replace_sampler_ddp: true
+ detect_anomaly: false
+ auto_scale_batch_size: false
+ plugins: null
+ amp_backend: native
+ amp_level: null
+ move_metrics_to_cpu: false
+ multiple_trainloader_mode: max_size_cycle
+ inference_mode: true
+model:
+ class_path: vocos.experiment.VocosExp
+ init_args:
+ feature_extractor:
+ class_path: vocos.feature_extractors.MelSpectrogramFeatures
+ init_args:
+ sample_rate: 44100
+ n_fft: 2048
+ hop_length: 512
+ win_length: 2048
+ n_mels: 128
+ padding: center
+ backbone:
+ class_path: vocos.models.VocosBackbone
+ init_args:
+ input_channels: 128
+ dim: 512
+ intermediate_dim: 1536
+ num_layers: 8
+ layer_scale_init_value: null
+ adanorm_num_embeddings: null
+ head:
+ class_path: vocos.heads.ISTFTHead
+ init_args:
+ dim: 512
+ n_fft: 2048
+ hop_length: 512
+ padding: center
+ sample_rate: 44100
+ initial_learning_rate: 0.0005
+ num_warmup_steps: 0
+ mel_loss_coeff: 45.0
+ mrd_loss_coeff: 0.1
+ pretrain_mel_steps: 0
+ decay_mel_coeff: false
+ evaluate_utmos: true
+ evaluate_pesq: true
+ evaluate_periodicty: true
+data:
+ class_path: vocos.dataset.VocosDataModule
+ init_args:
+ train_params:
+ filelist_path: /home/ubuntu/vocos/data/filelist.train
+ sampling_rate: 44100
+ num_samples: 65536
+ batch_size: 58
+ num_workers: 8
+ val_params:
+ filelist_path: /home/ubuntu/vocos/data/filelist.val
+ sampling_rate: 44100
+ num_samples: 65536
+ batch_size: 16
+ num_workers: 8
+optimizer: null
+lr_scheduler: null
diff --git a/vocos/logs/lightning_logs/version_27/events.out.tfevents.1738146591.104-171-202-79 b/vocos/logs/lightning_logs/version_27/events.out.tfevents.1738146591.104-171-202-79
new file mode 100644
index 0000000000000000000000000000000000000000..6f08e01847aaebd0ab5ed4777d6f5f5c09063766
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_27/events.out.tfevents.1738146591.104-171-202-79
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57d65dca6e2d50879800973f412ed4da5c334ebfbd70f475dea1988880d1d01e
+size 4136929
diff --git a/vocos/logs/lightning_logs/version_27/hparams.yaml b/vocos/logs/lightning_logs/version_27/hparams.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8311f1e45d6f32b6a7d395ec763260e8cf8f58c0
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_27/hparams.yaml
@@ -0,0 +1,10 @@
+sample_rate: 44100
+initial_learning_rate: 0.0005
+num_warmup_steps: 0
+mel_loss_coeff: 45.0
+mrd_loss_coeff: 0.1
+pretrain_mel_steps: 0
+decay_mel_coeff: false
+evaluate_utmos: true
+evaluate_pesq: true
+evaluate_periodicty: true
diff --git a/vocos/logs/lightning_logs/version_28/config.yaml b/vocos/logs/lightning_logs/version_28/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6c56570e5f1c1d64ab988a2aee02a7a1ca73bda9
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_28/config.yaml
@@ -0,0 +1,152 @@
+# pytorch_lightning==1.8.6
+seed_everything: 4444
+trainer:
+ logger:
+ class_path: pytorch_lightning.loggers.TensorBoardLogger
+ init_args:
+ save_dir: logs/
+ name: lightning_logs
+ version: null
+ log_graph: false
+ default_hp_metric: true
+ prefix: ''
+ sub_dir: null
+ logdir: null
+ comment: ''
+ purge_step: null
+ max_queue: 10
+ flush_secs: 120
+ filename_suffix: ''
+ write_to_disk: true
+ comet_config:
+ disabled: true
+ enable_checkpointing: true
+ callbacks:
+ - class_path: pytorch_lightning.callbacks.LearningRateMonitor
+ init_args:
+ logging_interval: null
+ log_momentum: false
+ - class_path: pytorch_lightning.callbacks.ModelSummary
+ init_args:
+ max_depth: 2
+ - class_path: pytorch_lightning.callbacks.ModelCheckpoint
+ init_args:
+ dirpath: null
+ filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
+ monitor: val_loss
+ verbose: false
+ save_last: true
+ save_top_k: 3
+ save_weights_only: false
+ mode: min
+ auto_insert_metric_name: true
+ every_n_train_steps: null
+ train_time_interval: null
+ every_n_epochs: null
+ save_on_train_epoch_end: null
+ - class_path: vocos.helpers.GradNormCallback
+ default_root_dir: null
+ gradient_clip_val: null
+ gradient_clip_algorithm: null
+ num_nodes: 1
+ num_processes: null
+ devices:
+ - 0
+ - 1
+ gpus: null
+ auto_select_gpus: false
+ tpu_cores: null
+ ipus: null
+ enable_progress_bar: true
+ overfit_batches: 0.0
+ track_grad_norm: -1
+ check_val_every_n_epoch: 1
+ fast_dev_run: false
+ accumulate_grad_batches: null
+ max_epochs: null
+ min_epochs: null
+ max_steps: 2000000
+ min_steps: null
+ max_time: null
+ limit_train_batches: null
+ limit_val_batches: 50
+ limit_test_batches: null
+ limit_predict_batches: null
+ val_check_interval: null
+ log_every_n_steps: 100
+ accelerator: gpu
+ strategy: ddp
+ sync_batchnorm: false
+ precision: 32
+ enable_model_summary: true
+ num_sanity_val_steps: 2
+ resume_from_checkpoint: null
+ profiler: null
+ benchmark: null
+ deterministic: null
+ reload_dataloaders_every_n_epochs: 0
+ auto_lr_find: false
+ replace_sampler_ddp: true
+ detect_anomaly: false
+ auto_scale_batch_size: false
+ plugins: null
+ amp_backend: native
+ amp_level: null
+ move_metrics_to_cpu: false
+ multiple_trainloader_mode: max_size_cycle
+ inference_mode: true
+model:
+ class_path: vocos.experiment.VocosExp
+ init_args:
+ feature_extractor:
+ class_path: vocos.feature_extractors.MelSpectrogramFeatures
+ init_args:
+ sample_rate: 24000
+ n_fft: 2048
+ hop_length: 300
+ win_length: 1200
+ n_mels: 80
+ padding: center
+ backbone:
+ class_path: vocos.models.VocosBackbone
+ init_args:
+ input_channels: 80
+ dim: 512
+ intermediate_dim: 1536
+ num_layers: 8
+ layer_scale_init_value: null
+ adanorm_num_embeddings: null
+ head:
+ class_path: vocos.heads.ISTFTHead
+ init_args:
+ dim: 512
+ n_fft: 2048
+ hop_length: 300
+ padding: center
+ sample_rate: 24000
+ initial_learning_rate: 0.0005
+ num_warmup_steps: 0
+ mel_loss_coeff: 45.0
+ mrd_loss_coeff: 0.1
+ pretrain_mel_steps: 0
+ decay_mel_coeff: false
+ evaluate_utmos: true
+ evaluate_pesq: true
+ evaluate_periodicty: true
+data:
+ class_path: vocos.dataset.VocosDataModule
+ init_args:
+ train_params:
+ filelist_path: /home/ubuntu/vocos/data/filelist2.train
+ sampling_rate: 24000
+ num_samples: 57600
+ batch_size: 64
+ num_workers: 8
+ val_params:
+ filelist_path: /home/ubuntu/vocos/data/filelist.val
+ sampling_rate: 24000
+ num_samples: 57600
+ batch_size: 16
+ num_workers: 8
+optimizer: null
+lr_scheduler: null
diff --git a/vocos/logs/lightning_logs/version_28/events.out.tfevents.1738155430.104-171-202-79 b/vocos/logs/lightning_logs/version_28/events.out.tfevents.1738155430.104-171-202-79
new file mode 100644
index 0000000000000000000000000000000000000000..3d882cb551f5a073a3a7b625db4f2e94ea55d58e
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_28/events.out.tfevents.1738155430.104-171-202-79
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b4a7316956f5ead36f26b752c57b4666029b303700a235499d5430b0e1075d2
+size 698760
diff --git a/vocos/logs/lightning_logs/version_28/hparams.yaml b/vocos/logs/lightning_logs/version_28/hparams.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..613901794afd33a59fd578a51f8fda5d9f6e44cc
--- /dev/null
+++ b/vocos/logs/lightning_logs/version_28/hparams.yaml
@@ -0,0 +1,10 @@
+sample_rate: 24000
+initial_learning_rate: 0.0005
+num_warmup_steps: 0
+mel_loss_coeff: 45.0
+mrd_loss_coeff: 0.1
+pretrain_mel_steps: 0
+decay_mel_coeff: false
+evaluate_utmos: true
+evaluate_pesq: true
+evaluate_periodicty: true
diff --git a/vocos/metrics/UTMOS.py b/vocos/metrics/UTMOS.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c42e8a956ba5787b06c1d5cab7d47194416e759
--- /dev/null
+++ b/vocos/metrics/UTMOS.py
@@ -0,0 +1,223 @@
+import os
+
+import fairseq
+import pytorch_lightning as pl
+import requests
+import torch
+import torch.nn as nn
+from tqdm import tqdm
+
+UTMOS_CKPT_URL = "https://huggingface.co/spaces/sarulab-speech/UTMOS-demo/resolve/main/epoch%3D3-step%3D7459.ckpt"
+WAV2VEC_URL = "https://huggingface.co/spaces/sarulab-speech/UTMOS-demo/resolve/main/wav2vec_small.pt"
+
+"""
+UTMOS score, automatic Mean Opinion Score (MOS) prediction system,
+adapted from https://huggingface.co/spaces/sarulab-speech/UTMOS-demo
+"""
+
+
+class UTMOSScore:
+    """Predicting score for each audio clip.
+
+    Args:
+        device: Device on which the scoring model and its inputs are placed.
+        ckpt_path (str): Checkpoint file name, resolved relative to this
+            module's directory. The file is fetched from ``UTMOS_CKPT_URL``
+            when it is not present.
+    """
+
+    def __init__(self, device, ckpt_path="epoch=3-step=7459.ckpt"):
+        self.device = device
+        filepath = os.path.join(os.path.dirname(__file__), ckpt_path)
+        if not os.path.exists(filepath):
+            download_file(UTMOS_CKPT_URL, filepath)
+        self.model = BaselineLightningModule.load_from_checkpoint(filepath).eval().to(device)
+
+    def score(self, wavs: torch.Tensor) -> torch.Tensor:
+        """
+        Scores one clip or a batch of clips.
+
+        Args:
+            wavs: audio waveform to be evaluated. A 1-D or 2-D tensor is
+                processed as a single audio clip (leading dimensions are
+                added until it is 3-D); a 3-D tensor is processed as a batch.
+
+        Returns:
+            Tensor: CPU tensor holding the model output averaged over dim 1
+            and mapped through ``x * 2 + 3``.
+
+        Raises:
+            ValueError: If ``wavs`` does not have 1, 2 or 3 dimensions.
+        """
+        num_dims = wavs.dim()
+        if num_dims == 1:
+            out_wavs = wavs.unsqueeze(0).unsqueeze(0)
+        elif num_dims == 2:
+            out_wavs = wavs.unsqueeze(0)
+        elif num_dims == 3:
+            out_wavs = wavs
+        else:
+            raise ValueError(f"Input tensor must have 1, 2 or 3 dimensions, got {num_dims}.")
+        # The conditioning ids below are created on self.device, so the
+        # waveform has to live there too (no-op when it already does).
+        out_wavs = out_wavs.to(self.device)
+        bs = out_wavs.shape[0]
+        batch = {
+            "wav": out_wavs,
+            "domains": torch.zeros(bs, dtype=torch.int).to(self.device),
+            "judge_id": torch.ones(bs, dtype=torch.int).to(self.device) * 288,
+        }
+        with torch.no_grad():
+            output = self.model(batch)
+
+        return output.mean(dim=1).squeeze(1).cpu().detach() * 2 + 3
+
+
+def download_file(url, filename):
+    """
+    Downloads a file from the given URL, showing a progress bar.
+
+    The payload is streamed into ``<filename>.part`` and renamed onto
+    ``filename`` only once the transfer has finished. An interrupted or failed
+    download therefore never leaves a truncated file at ``filename`` that a
+    later ``os.path.exists`` check would mistake for a complete one.
+
+    Args:
+        url (str): The URL of the file to download.
+        filename (str): The name to save the file as.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+    """
+    print(f"Downloading file {filename}...")
+    partial_filename = f"{filename}.part"
+    try:
+        # Context managers release the connection, the file handle and the
+        # progress bar even when the transfer fails midway.
+        with requests.get(url, stream=True) as response:
+            response.raise_for_status()
+            total_size_in_bytes = int(response.headers.get("content-length", 0))
+            with open(partial_filename, "wb") as f, tqdm(
+                total=total_size_in_bytes, unit="iB", unit_scale=True
+            ) as progress_bar:
+                for chunk in response.iter_content(chunk_size=8192):
+                    progress_bar.update(len(chunk))
+                    f.write(chunk)
+        os.replace(partial_filename, filename)
+    finally:
+        # Only present here if the download did not complete.
+        if os.path.exists(partial_filename):
+            os.remove(partial_filename)
+
+
+def load_ssl_model(ckpt_path="wav2vec_small.pt"):
+    """
+    Loads the SSL checkpoint with fairseq and wraps it in an ``SSL_model``.
+
+    Args:
+        ckpt_path (str): Checkpoint file name, resolved relative to this
+            module's directory; downloaded from ``WAV2VEC_URL`` if missing.
+
+    Returns:
+        SSL_model: Wrapper around the first model of the loaded ensemble,
+        with its pretraining modules removed and an output size of 768.
+    """
+    ssl_out_dim = 768
+    checkpoint_file = os.path.join(os.path.dirname(__file__), ckpt_path)
+    if not os.path.exists(checkpoint_file):
+        download_file(WAV2VEC_URL, checkpoint_file)
+    ensemble, _cfg, _task = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_file])
+    encoder = ensemble[0]
+    encoder.remove_pretraining_modules()
+    return SSL_model(encoder, ssl_out_dim)
+
+
+class BaselineLightningModule(pl.LightningModule):
+    """Feature extractors followed by a chain of output layers."""
+
+    def __init__(self, cfg):
+        super().__init__()
+        self.cfg = cfg
+        self.construct_model()
+        self.save_hyperparameters()
+
+    def construct_model(self):
+        """Builds the feature extractors and the output layers."""
+        ssl_extractor = load_ssl_model(ckpt_path="wav2vec_small.pt")
+        domain_extractor = DomainEmbedding(3, 128)
+        self.feature_extractors = nn.ModuleList([ssl_extractor, domain_extractor])
+
+        # The conditioner consumes the concatenation of all extractor outputs.
+        feature_dim = sum(extractor.get_output_dim() for extractor in self.feature_extractors)
+        conditioner = LDConditioner(judge_dim=128, num_judges=3000, input_dim=feature_dim)
+        projection = Projection(
+            hidden_dim=2048,
+            activation=torch.nn.ReLU(),
+            range_clipping=False,
+            input_dim=conditioner.get_output_dim(),
+        )
+
+        self.output_layers = nn.ModuleList([conditioner, projection])
+
+    def forward(self, inputs):
+        """Merges every extractor's feature dict, then applies the output layers in order."""
+        features = {}
+        for extractor in self.feature_extractors:
+            features.update(extractor(inputs))
+
+        hidden = features
+        for layer in self.output_layers:
+            hidden = layer(hidden, inputs)
+        return hidden
+
+
+class SSL_model(nn.Module):
+    """Wraps an SSL encoder and exposes its ``"x"`` output as ``"ssl-feature"``."""
+
+    def __init__(self, ssl_model, ssl_out_dim) -> None:
+        super().__init__()
+        self.ssl_model = ssl_model
+        self.ssl_out_dim = ssl_out_dim
+
+    def forward(self, batch):
+        """Runs the encoder on ``batch["wav"]`` with dim 1 squeezed away."""
+        waveform = batch["wav"].squeeze(1)  # [batches, audio_len]
+        encoded = self.ssl_model(waveform, mask=False, features_only=True)
+        return {"ssl-feature": encoded["x"]}
+
+    def get_output_dim(self):
+        """Returns the feature size given at construction."""
+        return self.ssl_out_dim
+
+
+class DomainEmbedding(nn.Module):
+    """Looks up a learned embedding for each id in ``batch["domains"]``."""
+
+    def __init__(self, n_domains, domain_dim) -> None:
+        super().__init__()
+        self.output_dim = domain_dim
+        self.embedding = nn.Embedding(n_domains, domain_dim)
+
+    def forward(self, batch):
+        """Returns the embeddings under the ``"domain-feature"`` key."""
+        domain_ids = batch["domains"]
+        return {"domain-feature": self.embedding(domain_ids)}
+
+    def get_output_dim(self):
+        """Returns the embedding size."""
+        return self.output_dim
+
+
+class LDConditioner(nn.Module):
+    """
+    Conditions ssl output by listener embedding
+
+    Args:
+        input_dim (int): Size of the concatenated input features, excluding
+            the judge embedding.
+        judge_dim (int): Size of the judge (listener) embedding.
+        num_judges (int): Number of rows in the judge embedding table. Must
+            not be None.
+    """
+
+    def __init__(self, input_dim, judge_dim, num_judges=None):
+        super().__init__()
+        self.input_dim = input_dim
+        self.judge_dim = judge_dim
+        self.num_judges = num_judges
+        assert num_judges is not None
+        self.judge_embedding = nn.Embedding(num_judges, self.judge_dim)
+
+        # The LSTM consumes the input features concatenated with the judge embedding.
+        self.decoder_rnn = nn.LSTM(
+            input_size=self.input_dim + self.judge_dim,
+            hidden_size=512,
+            num_layers=1,
+            batch_first=True,
+            bidirectional=True,
+        )  # linear?
+        # Bidirectional: forward and backward hidden states are concatenated.
+        self.out_dim = self.decoder_rnn.hidden_size * 2
+
+    def get_output_dim(self):
+        """Returns the size of the last dimension of ``forward``'s output."""
+        return self.out_dim
+
+    def forward(self, x, batch):
+        """
+        Concatenates the available features per frame and runs the LSTM.
+
+        Args:
+            x (dict): Must contain ``"ssl-feature"``; may contain
+                ``"phoneme-feature"`` and ``"domain-feature"``, which are
+                repeated along dim 1 to match the ssl feature.
+            batch (dict): Must contain ``"judge_id"``; when it is not None the
+                judge embedding is appended in the same way.
+
+        Returns:
+            Tensor: The LSTM output sequence.
+        """
+        judge_ids = batch["judge_id"]
+        ssl_feature = x["ssl-feature"]
+        num_frames = ssl_feature.size(1)
+
+        def per_frame(feature):
+            # Repeat a per-clip vector for every frame of the ssl feature.
+            return feature.unsqueeze(1).expand(-1, num_frames, -1)
+
+        features = [ssl_feature]
+        if "phoneme-feature" in x:
+            features.append(per_frame(x["phoneme-feature"]))
+        if "domain-feature" in x:
+            features.append(per_frame(x["domain-feature"]))
+        # Identity check: `!=` against a tensor is not a reliable None test.
+        if judge_ids is not None:
+            features.append(per_frame(self.judge_embedding(judge_ids)))
+
+        concatenated_feature = torch.cat(features, dim=2)
+        decoder_output, _ = self.decoder_rnn(concatenated_feature)
+        return decoder_output
+
+
+class Projection(nn.Module):
+    """Two-layer MLP with dropout that maps features to one value per position."""
+
+    def __init__(self, input_dim, hidden_dim, activation, range_clipping=False):
+        super().__init__()
+        self.range_clipping = range_clipping
+        self.output_dim = 1
+        if range_clipping:
+            self.proj = nn.Tanh()
+
+        self.net = nn.Sequential(
+            nn.Linear(input_dim, hidden_dim),
+            activation,
+            nn.Dropout(0.3),
+            nn.Linear(hidden_dim, self.output_dim),
+        )
+
+    def forward(self, x, batch):
+        """Applies the MLP; with range clipping the result is ``tanh(.) * 2.0 + 3``."""
+        scores = self.net(x)
+        if not self.range_clipping:
+            return scores
+        # range clipping
+        return self.proj(scores) * 2.0 + 3
+
+    def get_output_dim(self):
+        """Returns the size of the output's last dimension (always 1)."""
+        return self.output_dim
diff --git a/vocos/metrics/__pycache__/UTMOS.cpython-311.pyc b/vocos/metrics/__pycache__/UTMOS.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ebbd87c093acdbc75b60bbcb62f06a77721a1bf4
Binary files /dev/null and b/vocos/metrics/__pycache__/UTMOS.cpython-311.pyc differ
diff --git a/vocos/metrics/__pycache__/periodicity.cpython-311.pyc b/vocos/metrics/__pycache__/periodicity.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b5005299075b6ee9ec5c0dd9f7ece1e6e46f71c
Binary files /dev/null and b/vocos/metrics/__pycache__/periodicity.cpython-311.pyc differ
diff --git a/vocos/metrics/epoch=3-step=7459.ckpt b/vocos/metrics/epoch=3-step=7459.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..504f08896292458bfc24308dacb95cf9ddb4a92b
--- /dev/null
+++ b/vocos/metrics/epoch=3-step=7459.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44c57e3e4135a243b43d2c82b6a693fcd56f15f9ad0e1eb2a8b31fdecd3a49b8
+size 1238128841
diff --git a/vocos/metrics/periodicity.py b/vocos/metrics/periodicity.py
new file mode 100644
index 0000000000000000000000000000000000000000..728017cc79098661a843294005bbc0059bc4d862
--- /dev/null
+++ b/vocos/metrics/periodicity.py
@@ -0,0 +1,105 @@
+import librosa
+import numpy as np
+import torch
+import torchaudio
+import torchcrepe
+from torchcrepe.loudness import REF_DB
+
+SILENCE_THRESHOLD = -60
+UNVOICED_THRESHOLD = 0.21
+
+"""
+Periodicity metrics adapted from https://github.com/descriptinc/cargan
+"""
+
+
+def predict_pitch(
+    audio: torch.Tensor, silence_threshold: float = SILENCE_THRESHOLD, unvoiced_treshold: float = UNVOICED_THRESHOLD
+):
+    """
+    Estimates pitch and periodicity with torchcrepe and masks weak frames.
+
+    Frames whose mean perceptually weighted level is below
+    ``silence_threshold`` get periodicity 0; frames whose periodicity is below
+    ``unvoiced_treshold`` get pitch ``torchcrepe.UNVOICED``.
+
+    Args:
+        audio (Tensor): The audio waveform.
+        silence_threshold (float): The threshold for silence detection.
+        unvoiced_treshold (float): The threshold for unvoiced detection.
+
+    Returns:
+        pitch (ndarray): The predicted pitch.
+        periodicity (ndarray): The predicted periodicity.
+    """
+    sample_rate = torchcrepe.SAMPLE_RATE
+    window_size = torchcrepe.WINDOW_SIZE
+
+    # torchcrepe inference
+    pitch_tensor, periodicity_tensor = torchcrepe.predict(
+        audio,
+        fmin=50.0,
+        fmax=550,
+        sample_rate=sample_rate,
+        model="full",
+        return_periodicity=True,
+        device=audio.device,
+        pad=False,
+    )
+    pitch = pitch_tensor.cpu().numpy()
+    periodicity = periodicity_tensor.cpu().numpy()
+
+    # Power spectrogram on the same window and hop as the CREPE frames
+    power_spectrogram = torchaudio.functional.spectrogram(
+        audio,
+        window=torch.hann_window(window_size, device=audio.device),
+        n_fft=window_size,
+        hop_length=sample_rate // 100,  # default CREPE
+        win_length=window_size,
+        power=2,
+        normalized=False,
+        pad=0,
+        center=False,
+    )
+
+    # Perceptual weighting, then flag low-energy frames as silent
+    frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=window_size)
+    weighted_db = librosa.perceptual_weighting(power_spectrogram.cpu().numpy(), frequencies) - REF_DB
+    is_silent = weighted_db.mean(axis=1) < silence_threshold
+
+    periodicity[is_silent] = 0
+    pitch[periodicity < unvoiced_treshold] = torchcrepe.UNVOICED
+
+    return pitch, periodicity
+
+
+def _safe_ratio(numerator, denominator):
+    """Returns ``numerator / denominator``, or 0.0 when the denominator is zero."""
+    return numerator / denominator if denominator > 0 else 0.0
+
+
+def calculate_periodicity_metrics(y: torch.Tensor, y_hat: torch.Tensor):
+    """
+    Calculates periodicity metrics for the predicted and true audio data.
+
+    Degenerate inputs are handled explicitly instead of dividing by zero:
+    precision, recall and F1 fall back to 0.0 when their denominator is zero,
+    and the pitch loss is NaN when no frame is voiced in both signals.
+
+    Args:
+        y (Tensor): The true audio data.
+        y_hat (Tensor): The predicted audio data.
+
+    Returns:
+        periodicity_loss (float): The periodicity loss.
+        pitch_loss (float): The pitch loss.
+        f1 (float): The F1 score for voiced/unvoiced classification
+    """
+    true_pitch, true_periodicity = predict_pitch(y)
+    pred_pitch, pred_periodicity = predict_pitch(y_hat)
+
+    # predict_pitch marks unvoiced frames with torchcrepe.UNVOICED; NaN pitch
+    # is treated as "unvoiced" here.
+    true_voiced = ~np.isnan(true_pitch)
+    pred_voiced = ~np.isnan(pred_pitch)
+
+    periodicity_loss = np.sqrt(((pred_periodicity - true_periodicity) ** 2).mean(axis=1)).mean()
+
+    # Pitch RMSE in cents, over frames voiced in both signals
+    voiced = true_voiced & pred_voiced
+    if voiced.any():
+        difference_cents = 1200 * (np.log2(true_pitch[voiced]) - np.log2(pred_pitch[voiced]))
+        pitch_loss = np.sqrt((difference_cents ** 2).mean())
+    else:
+        # Nothing to compare: report NaN without taking the mean of an empty slice.
+        pitch_loss = np.nan
+
+    # voiced/unvoiced precision and recall
+    true_positives = voiced.sum()
+    false_positives = (~true_voiced & pred_voiced).sum()
+    false_negatives = (true_voiced & ~pred_voiced).sum()
+
+    precision = _safe_ratio(true_positives, true_positives + false_positives)
+    recall = _safe_ratio(true_positives, true_positives + false_negatives)
+    f1 = _safe_ratio(2 * precision * recall, precision + recall)
+
+    return periodicity_loss, pitch_loss, f1
diff --git a/vocos/metrics/wav2vec_small.pt b/vocos/metrics/wav2vec_small.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4eb311443cb3c8ecc1d379b5d93dc10569fb8c9e
--- /dev/null
+++ b/vocos/metrics/wav2vec_small.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c66c39eaed1b79a61ea8573f71e08f6641ff156b6a8f458cfaab53877dfa4a26
+size 950500491
diff --git a/vocos/notebooks/Bark+Vocos.ipynb b/vocos/notebooks/Bark+Vocos.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..e8bc85c5d3c4545c481b60acabc124dae5fbe47a
--- /dev/null
+++ b/vocos/notebooks/Bark+Vocos.ipynb
@@ -0,0 +1,264 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "private_outputs": true,
+ "provenance": [],
+ "gpuType": "T4",
+ "authorship_tag": "ABX9TyMC53IsYoVJIVijVzw3ADvX",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "accelerator": "GPU"
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Text-to-Audio Synthesis using Bark and Vocos"
+ ],
+ "metadata": {
+ "id": "NuRzVtHDZ_Gl"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "In this notebook, we use [Bark](https://github.com/suno-ai/bark) generative model to turn a text prompt into EnCodec audio tokens. These tokens then go through two decoders, EnCodec and Vocos, to reconstruct the audio waveform. Compare the results to discover the differences in audio quality and characteristics."
+ ],
+ "metadata": {
+ "id": "zJFDte0daDAz"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Make sure you have Bark and Vocos installed:"
+ ],
+ "metadata": {
+ "id": "c9omqGDYnajY"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip install git+https://github.com/suno-ai/bark.git\n",
+ "!pip install vocos"
+ ],
+ "metadata": {
+ "id": "voH44g90NvtV"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Download and load Bark models"
+ ],
+ "metadata": {
+ "id": "s3cEjOIuj6tq"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from bark import preload_models\n",
+ "\n",
+ "preload_models()"
+ ],
+ "metadata": {
+ "id": "1H7XtXRMjxUM"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Download and load Vocos."
+ ],
+ "metadata": {
+ "id": "YO1m0dJ1j-F5"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from vocos import Vocos\n",
+ "import torch\n",
+ "\n",
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+ "vocos = Vocos.from_pretrained(\"charactr/vocos-encodec-24khz\").to(device)"
+ ],
+ "metadata": {
+ "id": "COQYTDDFkBCq"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "We are going to reuse `text_to_semantic` from Bark API, but to reconstruct audio waveform with a custom vocoder, we need to slightly redefine the API to return `fine_tokens`."
+ ],
+ "metadata": {
+ "id": "--RjqW0rk5JQ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "OiUsuN2DNl5S"
+ },
+ "outputs": [],
+ "source": [
+ "from typing import Optional, Union, Dict\n",
+ "\n",
+ "import numpy as np\n",
+ "from bark.generation import generate_coarse, generate_fine\n",
+ "\n",
+ "\n",
+ "def semantic_to_audio_tokens(\n",
+ " semantic_tokens: np.ndarray,\n",
+ " history_prompt: Optional[Union[Dict, str]] = None,\n",
+ " temp: float = 0.7,\n",
+ " silent: bool = False,\n",
+ " output_full: bool = False,\n",
+ "):\n",
+ " coarse_tokens = generate_coarse(\n",
+ " semantic_tokens, history_prompt=history_prompt, temp=temp, silent=silent, use_kv_caching=True\n",
+ " )\n",
+ " fine_tokens = generate_fine(coarse_tokens, history_prompt=history_prompt, temp=0.5)\n",
+ "\n",
+ " if output_full:\n",
+ " full_generation = {\n",
+ " \"semantic_prompt\": semantic_tokens,\n",
+ " \"coarse_prompt\": coarse_tokens,\n",
+ " \"fine_prompt\": fine_tokens,\n",
+ " }\n",
+ " return full_generation\n",
+ " return fine_tokens"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Let's create a text prompt and generate audio tokens:"
+ ],
+ "metadata": {
+ "id": "Cv8KCzXlmoF9"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from bark import text_to_semantic\n",
+ "\n",
+ "history_prompt = None\n",
+ "text_prompt = \"So, you've heard about neural vocoding? [laughs] We've been messing around with this new model called Vocos.\"\n",
+ "semantic_tokens = text_to_semantic(text_prompt, history_prompt=history_prompt, temp=0.7, silent=False,)\n",
+ "audio_tokens = semantic_to_audio_tokens(\n",
+ " semantic_tokens, history_prompt=history_prompt, temp=0.7, silent=False, output_full=False,\n",
+ ")"
+ ],
+ "metadata": {
+ "id": "pDmSTutoOH_G"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Reconstruct audio waveform with EnCodec:"
+ ],
+ "metadata": {
+ "id": "UYMzI8svTNqI"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from bark.generation import codec_decode\n",
+ "from IPython.display import Audio\n",
+ "\n",
+ "encodec_output = codec_decode(audio_tokens)\n",
+ "\n",
+ "import torchaudio\n",
+ "# Upsample to 44100 Hz for better reproduction on audio hardware\n",
+ "encodec_output = torchaudio.functional.resample(torch.from_numpy(encodec_output), orig_freq=24000, new_freq=44100)\n",
+ "Audio(encodec_output, rate=44100)"
+ ],
+ "metadata": {
+ "id": "PzdytlXFTNQ2"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Reconstruct with Vocos:"
+ ],
+ "metadata": {
+ "id": "BhUxBuP9TTTw"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "audio_tokens_torch = torch.from_numpy(audio_tokens).to(device)\n",
+ "features = vocos.codes_to_features(audio_tokens_torch)\n",
+ "vocos_output = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device)) # 6 kbps\n",
+ "# Upsample to 44100 Hz for better reproduction on audio hardware\n",
+ "vocos_output = torchaudio.functional.resample(vocos_output, orig_freq=24000, new_freq=44100).cpu()\n",
+ "Audio(vocos_output.numpy(), rate=44100)"
+ ],
+ "metadata": {
+ "id": "8hzSWQ5-nBlV"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Optionally save to mp3 files:"
+ ],
+ "metadata": {
+ "id": "RjVXQIZRb1Re"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "torchaudio.save(\"encodec.mp3\", encodec_output[None, :], 44100, compression=128)\n",
+ "torchaudio.save(\"vocos.mp3\", vocos_output, 44100, compression=128)"
+ ],
+ "metadata": {
+ "id": "PLFXpjUKb3WX"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file
diff --git a/vocos/requirements-train.txt b/vocos/requirements-train.txt
new file mode 100644
index 0000000000000000000000000000000000000000..acaba87b559ef0a6ee4a094be04819db3d3ea16c
--- /dev/null
+++ b/vocos/requirements-train.txt
@@ -0,0 +1,8 @@
+pytorch_lightning==1.8.6
+jsonargparse[signatures]
+transformers
+torchcrepe
+pesq
+matplotlib==3.7.0
+# sox is not a pip package; install it separately with: conda install conda-forge::sox -y
+git+https://github.com/One-sixth/fairseq.git
\ No newline at end of file
diff --git a/vocos/requirements.txt b/vocos/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..410d868c9fa517b3f939197a8eed410a51c60d44
--- /dev/null
+++ b/vocos/requirements.txt
@@ -0,0 +1,8 @@
+torch
+torchaudio
+numpy
+scipy
+einops
+pyyaml
+huggingface_hub
+encodec==0.1.1
\ No newline at end of file
diff --git a/vocos/setup.py b/vocos/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f72868564c97272e7204f09dd653ab3550a32d2
--- /dev/null
+++ b/vocos/setup.py
@@ -0,0 +1,39 @@
+import io
+import os
+
+from setuptools import find_packages, setup
+
+# Read __version__ by executing only its assignment line from the package's
+# __init__.py, rather than importing the package.
+VERSION = None
+with open("vocos/__init__.py") as init_file:
+    for line in init_file:
+        line = line.strip()
+        # Match the assignment itself, not any line that merely mentions the name.
+        if line.startswith("__version__"):
+            context = {}
+            exec(line, context)
+            VERSION = context["__version__"]
+            break
+if VERSION is None:
+    raise RuntimeError("Unable to find __version__ in vocos/__init__.py")
+
+
+def read(*paths, **kwargs):
+    """Returns the stripped text of a file located relative to this script.
+
+    Args:
+        *paths: Path components joined onto this file's directory.
+        **kwargs: ``encoding`` selects the text encoding (default ``"utf8"``).
+    """
+    target = os.path.join(os.path.dirname(__file__), *paths)
+    encoding = kwargs.get("encoding", "utf8")
+    with io.open(target, encoding=encoding) as handle:
+        return handle.read().strip()
+
+
+def read_requirements(path):
+    """Returns the requirement specifiers listed in a requirements file.
+
+    Blank lines are dropped, as are lines that (after stripping) start with a
+    quote, ``#`` (comments), ``-`` (pip options) or ``git+`` (VCS URLs).
+
+    Args:
+        path (str): Requirements file, relative to this script's directory.
+
+    Returns:
+        list[str]: One stripped requirement string per remaining line.
+    """
+    requirements = []
+    for raw_line in read(path).splitlines():
+        line = raw_line.strip()
+        # Filter on the stripped line so indented comments/options are caught too.
+        if not line or line.startswith(('"', "#", "-", "git+")):
+            continue
+        requirements.append(line)
+    return requirements
+
+
+setup(
+    # Package identity
+    name="vocos",
+    version=VERSION,
+    description="Fourier-based neural vocoder for high-quality audio synthesis",
+    long_description=read("README.md"),
+    long_description_content_type="text/markdown",
+    url="https://github.com/charactr-platform/vocos",
+    # Maintainer
+    author="Hubert Siuzdak",
+    author_email="huberts@charactr.com",
+    # Contents and dependencies; the "train" extra comes from requirements-train.txt
+    packages=find_packages(),
+    install_requires=read_requirements("requirements.txt"),
+    extras_require={"train": read_requirements("requirements-train.txt")},
+)
diff --git a/vocos/train.py b/vocos/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b502883fe91cf16ecab5350e98651f865723642
--- /dev/null
+++ b/vocos/train.py
@@ -0,0 +1,6 @@
+from pytorch_lightning.cli import LightningCLI
+
+
+def main():
+    """Builds trainer, model and datamodule through LightningCLI and starts fitting."""
+    # The CLI is created with run=False and fit is invoked explicitly below.
+    cli = LightningCLI(run=False)
+    cli.trainer.fit(model=cli.model, datamodule=cli.datamodule)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/vocos/vocos/__init__.py b/vocos/vocos/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..928cc7120ac304638cf07650e7d088e83a83d948
--- /dev/null
+++ b/vocos/vocos/__init__.py
@@ -0,0 +1,4 @@
+from vocos.pretrained import Vocos
+
+
+__version__ = "0.1.0"
diff --git a/vocos/vocos/__pycache__/__init__.cpython-311.pyc b/vocos/vocos/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4516ec8f6f6ca717316995101f6a16841be01f32
Binary files /dev/null and b/vocos/vocos/__pycache__/__init__.cpython-311.pyc differ
diff --git a/vocos/vocos/__pycache__/dataset.cpython-311.pyc b/vocos/vocos/__pycache__/dataset.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3174c398ca9a0156cafb583a74e3a55af3485de7
Binary files /dev/null and b/vocos/vocos/__pycache__/dataset.cpython-311.pyc differ
diff --git a/vocos/vocos/__pycache__/discriminators.cpython-311.pyc b/vocos/vocos/__pycache__/discriminators.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ad85200c398f073c553b3111c8c65e1a91184d3
Binary files /dev/null and b/vocos/vocos/__pycache__/discriminators.cpython-311.pyc differ
diff --git a/vocos/vocos/__pycache__/experiment.cpython-311.pyc b/vocos/vocos/__pycache__/experiment.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a644a909bcd65a7b321358f7f14e6afab4105a1
Binary files /dev/null and b/vocos/vocos/__pycache__/experiment.cpython-311.pyc differ
diff --git a/vocos/vocos/__pycache__/feature_extractors.cpython-311.pyc b/vocos/vocos/__pycache__/feature_extractors.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f00b6835e64fe037bd65ee067331b83dee79c1f2
Binary files /dev/null and b/vocos/vocos/__pycache__/feature_extractors.cpython-311.pyc differ
diff --git a/vocos/vocos/__pycache__/heads.cpython-311.pyc b/vocos/vocos/__pycache__/heads.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc87b2dc60000f12dc13cfd8a3e9e9852a12c7db
Binary files /dev/null and b/vocos/vocos/__pycache__/heads.cpython-311.pyc differ
diff --git a/vocos/vocos/__pycache__/helpers.cpython-311.pyc b/vocos/vocos/__pycache__/helpers.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86ee7c6aac3d5dcb98be42f0ff95dc376707173f
Binary files /dev/null and b/vocos/vocos/__pycache__/helpers.cpython-311.pyc differ
diff --git a/vocos/vocos/__pycache__/loss.cpython-311.pyc b/vocos/vocos/__pycache__/loss.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f33c461fa9f86dba220f4b6a3e45708adeaf17b
Binary files /dev/null and b/vocos/vocos/__pycache__/loss.cpython-311.pyc differ
diff --git a/vocos/vocos/__pycache__/models.cpython-311.pyc b/vocos/vocos/__pycache__/models.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7194431416e89ef98f8aa6278b1b090aa7ba6638
Binary files /dev/null and b/vocos/vocos/__pycache__/models.cpython-311.pyc differ
diff --git a/vocos/vocos/__pycache__/modules.cpython-311.pyc b/vocos/vocos/__pycache__/modules.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0fb162e7bbe30526b6013cb688c71e30e9314cfc
Binary files /dev/null and b/vocos/vocos/__pycache__/modules.cpython-311.pyc differ
diff --git a/vocos/vocos/__pycache__/pretrained.cpython-311.pyc b/vocos/vocos/__pycache__/pretrained.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9a4f7f6ed3e4082dad5847e20a4bfad58cdf9cb3
Binary files /dev/null and b/vocos/vocos/__pycache__/pretrained.cpython-311.pyc differ
diff --git a/vocos/vocos/__pycache__/spectral_ops.cpython-311.pyc b/vocos/vocos/__pycache__/spectral_ops.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a8c2b46d11247dc8302d05f8c1a8c1ee11b1635
Binary files /dev/null and b/vocos/vocos/__pycache__/spectral_ops.cpython-311.pyc differ
diff --git a/vocos/vocos/dataset.py b/vocos/vocos/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..25b3bc169531d8984056d4074dc4519d79343d24
--- /dev/null
+++ b/vocos/vocos/dataset.py
@@ -0,0 +1,73 @@
+from dataclasses import dataclass
+
+import numpy as np
+import torch
+import torchaudio
+from pytorch_lightning import LightningDataModule
+from torch.utils.data import Dataset, DataLoader
+
+torch.set_num_threads(1)
+
+
+@dataclass
+class DataConfig:
+    """Settings for one dataset split and the dataloader built on top of it."""
+
+    # Text file read line by line; each line is loaded as an audio path.
+    filelist_path: str
+    # Rate the loaded audio is resampled to when the file's rate differs.
+    sampling_rate: int
+    # Length, in samples, of every returned audio segment.
+    num_samples: int
+    # Forwarded to the DataLoader.
+    batch_size: int
+    # Forwarded to the DataLoader.
+    num_workers: int
+
+
+class VocosDataModule(LightningDataModule):
+    """Provides the training and validation dataloaders for Vocos.
+
+    Args:
+        train_params (DataConfig): Settings for the training split.
+        val_params (DataConfig): Settings for the validation split.
+    """
+
+    def __init__(self, train_params: DataConfig, val_params: DataConfig):
+        super().__init__()
+        self.train_config = train_params
+        self.val_config = val_params
+
+    def _get_dataloder(self, cfg: DataConfig, train: bool):
+        """Builds a DataLoader over a VocosDataset; shuffles only when ``train`` is True."""
+        return DataLoader(
+            VocosDataset(cfg, train=train),
+            batch_size=cfg.batch_size,
+            num_workers=cfg.num_workers,
+            shuffle=train,
+            pin_memory=True,
+        )
+
+    def train_dataloader(self) -> DataLoader:
+        """Returns the shuffled training dataloader."""
+        return self._get_dataloder(self.train_config, train=True)
+
+    def val_dataloader(self) -> DataLoader:
+        """Returns the unshuffled validation dataloader."""
+        return self._get_dataloder(self.val_config, train=False)
+
+
+class VocosDataset(Dataset):
+    """Dataset of fixed-length mono audio segments listed in a filelist.
+
+    Args:
+        cfg (DataConfig): Provides ``filelist_path``, ``sampling_rate`` and
+            ``num_samples``.
+        train (bool): Enables random gain and random cropping; otherwise a
+            fixed gain and the first segment are used.
+    """
+
+    def __init__(self, cfg: DataConfig, train: bool):
+        with open(cfg.filelist_path) as f:
+            # Blank lines would otherwise become empty audio paths.
+            self.filelist = [line for line in f.read().splitlines() if line.strip()]
+        self.sampling_rate = cfg.sampling_rate
+        self.num_samples = cfg.num_samples
+        self.train = train
+
+    def __len__(self) -> int:
+        return len(self.filelist)
+
+    def __getitem__(self, index: int) -> torch.Tensor:
+        """Loads, normalizes, resamples and crops/pads the clip at ``index``.
+
+        Returns:
+            Tensor: 1-D waveform with exactly ``num_samples`` samples.
+
+        Raises:
+            ValueError: If the audio file contains no samples.
+        """
+        audio_path = self.filelist[index]
+        y, sr = torchaudio.load(audio_path)
+        if y.size(0) > 1:
+            # mix to mono
+            y = y.mean(dim=0, keepdim=True)
+        # Gain argument for sox "norm": random in [-6, -1] when training, else -3.
+        # Bounds are passed as (low, high); numpy leaves low > high undefined.
+        gain = np.random.uniform(-6, -1) if self.train else -3
+        y, _ = torchaudio.sox_effects.apply_effects_tensor(y, sr, [["norm", f"{gain:.2f}"]])
+        if sr != self.sampling_rate:
+            y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=self.sampling_rate)
+        if y.size(-1) < self.num_samples:
+            if y.size(-1) == 0:
+                raise ValueError(f"Audio file {audio_path} contains no samples")
+            # Too short: pad by repeating the clip until num_samples is reached.
+            pad_length = self.num_samples - y.size(-1)
+            padding_tensor = y.repeat(1, 1 + pad_length // y.size(-1))
+            y = torch.cat((y, padding_tensor[:, :pad_length]), dim=1)
+        elif self.train:
+            start = np.random.randint(low=0, high=y.size(-1) - self.num_samples + 1)
+            y = y[:, start : start + self.num_samples]
+        else:
+            # During validation, take always the first segment for determinism
+            y = y[:, : self.num_samples]
+
+        return y[0]
diff --git a/vocos/vocos/discriminators.py b/vocos/vocos/discriminators.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c62574a78bc963eb35dedffd6ef13a96f4e4193
--- /dev/null
+++ b/vocos/vocos/discriminators.py
@@ -0,0 +1,211 @@
+from typing import List, Optional, Tuple
+
+import torch
+from einops import rearrange
+from torch import nn
+from torch.nn import Conv2d
+from torch.nn.utils import weight_norm
+from torchaudio.transforms import Spectrogram
+
+
+class MultiPeriodDiscriminator(nn.Module):
+    """
+    Bank of period-based sub-discriminators, adapted from https://github.com/jik876/hifi-gan.
+    A learned embeddings table can optionally condition every sub-discriminator.
+
+    Args:
+        periods (tuple[int]): Tuple of periods for each discriminator.
+        num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
+            Defaults to None.
+    """
+
+    def __init__(self, periods: Tuple[int, ...] = (2, 3, 5, 7, 11), num_embeddings: Optional[int] = None):
+        super().__init__()
+        sub_discriminators = [DiscriminatorP(period=p, num_embeddings=num_embeddings) for p in periods]
+        self.discriminators = nn.ModuleList(sub_discriminators)
+
+    def forward(
+        self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: Optional[torch.Tensor] = None
+    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
+        """Score real `y` and generated `y_hat` with every sub-discriminator; returns scores, then feature maps."""
+        real_scores, fake_scores, real_fmaps, fake_fmaps = [], [], [], []
+        for disc in self.discriminators:
+            # The real signal goes through each sub-discriminator before the generated one.
+            score_r, feats_r = disc(x=y, cond_embedding_id=bandwidth_id)
+            score_g, feats_g = disc(x=y_hat, cond_embedding_id=bandwidth_id)
+            real_scores.append(score_r)
+            real_fmaps.append(feats_r)
+            fake_scores.append(score_g)
+            fake_fmaps.append(feats_g)
+
+        return real_scores, fake_scores, real_fmaps, fake_fmaps
+
+
+class DiscriminatorP(nn.Module):
+    """Sub-discriminator that folds the waveform into rows of `period` samples and applies 2D convolutions."""
+    def __init__(
+        self,
+        period: int,
+        in_channels: int = 1,
+        kernel_size: int = 5,
+        stride: int = 3,
+        lrelu_slope: float = 0.1,
+        num_embeddings: Optional[int] = None,
+    ):
+        super().__init__()
+        self.period = period
+        self.convs = nn.ModuleList(
+            [
+                weight_norm(Conv2d(in_channels, 32, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
+                weight_norm(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
+                weight_norm(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
+                weight_norm(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
+                weight_norm(Conv2d(1024, 1024, (kernel_size, 1), (1, 1), padding=(kernel_size // 2, 0))),
+            ]
+        )
+        if num_embeddings is not None:
+            self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=1024)
+            torch.nn.init.zeros_(self.emb.weight)  # zero init: the conditional term starts out as 0
+
+        self.conv_post = weight_norm(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+        self.lrelu_slope = lrelu_slope
+
+    def forward(
+        self, x: torch.Tensor, cond_embedding_id: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+        x = x.unsqueeze(1)  # insert a channel axis at position 1
+        fmap = []
+        # 1d to 2d
+        b, c, t = x.shape
+        if t % self.period != 0: # pad first
+            n_pad = self.period - (t % self.period)  # samples missing to reach a multiple of the period
+            x = torch.nn.functional.pad(x, (0, n_pad), "reflect")
+            t = t + n_pad
+        x = x.view(b, c, t // self.period, self.period)
+
+        for i, l in enumerate(self.convs):
+            x = l(x)
+            x = torch.nn.functional.leaky_relu(x, self.lrelu_slope)
+            if i > 0:  # the first layer's activation is not collected
+                fmap.append(x)
+        if cond_embedding_id is not None:
+            emb = self.emb(cond_embedding_id)  # self.emb exists only when num_embeddings was given
+            h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)  # embedding-weighted sum over channels
+        else:
+            h = 0
+        x = self.conv_post(x)
+        fmap.append(x)
+        x += h  # NOTE(review): in-place, so the entry just appended to fmap also receives h - confirm intended
+        x = torch.flatten(x, 1, -1)
+        return x, fmap
+
+
+class MultiResolutionDiscriminator(nn.Module):
+    def __init__(
+        self,
+        fft_sizes: Tuple[int, ...] = (2048, 1024, 512),
+        num_embeddings: Optional[int] = None,
+    ):
+        """
+        Bank of spectrogram sub-discriminators, adapted from https://github.com/descriptinc/descript-audio-codec.
+        A learned embeddings table can optionally condition every sub-discriminator.
+
+        Args:
+            fft_sizes (tuple[int]): Tuple of window lengths for FFT. Defaults to (2048, 1024, 512).
+            num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
+                Defaults to None.
+        """
+
+        super().__init__()
+        sub_discriminators = [DiscriminatorR(window_length=w, num_embeddings=num_embeddings) for w in fft_sizes]
+        self.discriminators = nn.ModuleList(sub_discriminators)
+
+    def forward(
+        self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
+    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
+        """
+        Score real `y` and generated `y_hat` with every sub-discriminator.
+        Returns real scores, generated scores, real feature maps and generated feature maps, one entry each.
+        """
+        real_scores, fake_scores, real_fmaps, fake_fmaps = [], [], [], []
+        for disc in self.discriminators:
+            # The real signal goes through each sub-discriminator before the generated one.
+            score_r, feats_r = disc(x=y, cond_embedding_id=bandwidth_id)
+            score_g, feats_g = disc(x=y_hat, cond_embedding_id=bandwidth_id)
+            real_scores.append(score_r)
+            real_fmaps.append(feats_r)
+            fake_scores.append(score_g)
+            fake_fmaps.append(feats_g)
+
+        return real_scores, fake_scores, real_fmaps, fake_fmaps
+
+
+class DiscriminatorR(nn.Module):
+    """Spectrogram sub-discriminator: splits a complex STFT into frequency bands, each with its own conv stack."""
+    def __init__(
+        self,
+        window_length: int,
+        num_embeddings: Optional[int] = None,
+        channels: int = 32,
+        hop_factor: float = 0.25,
+        bands: Tuple[Tuple[float, float], ...] = ((0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)),
+    ):
+        super().__init__()
+        self.window_length = window_length
+        self.hop_factor = hop_factor
+        self.spec_fn = Spectrogram(
+            n_fft=window_length, hop_length=int(window_length * hop_factor), win_length=window_length, power=None
+        )
+        n_fft = window_length // 2 + 1  # used below as the bin count that the fractional band edges scale
+        bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands]  # fractional edges -> bin indices
+        self.bands = bands
+        # Factory, so that every band gets its own, separately parameterised stack of convolutions.
+        convs = lambda: nn.ModuleList(
+            [
+                weight_norm(nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4))),
+                weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
+                weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
+                weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
+                weight_norm(nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))),
+            ]
+        )
+        self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))])
+
+        if num_embeddings is not None:
+            self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=channels)
+            torch.nn.init.zeros_(self.emb.weight)  # zero init: the conditional term starts out as 0
+
+        self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1)))
+
+    def spectrogram(self, x):
+        # Remove DC offset
+        x = x - x.mean(dim=-1, keepdims=True)
+        # Peak normalize the volume of input audio
+        x = 0.8 * x / (x.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
+        x = self.spec_fn(x)
+        x = torch.view_as_real(x)  # complex values -> trailing axis of size 2 (real, imaginary)
+        x = rearrange(x, "b f t c -> b c t f")
+        # Split into bands
+        x_bands = [x[..., b[0] : b[1]] for b in self.bands]
+        return x_bands
+
+    def forward(self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None):
+        x_bands = self.spectrogram(x)
+        fmap = []
+        x = []
+        for band, stack in zip(x_bands, self.band_convs):
+            for i, layer in enumerate(stack):
+                band = layer(band)
+                band = torch.nn.functional.leaky_relu(band, 0.1)
+                if i > 0:  # the first layer's activation is not collected
+                    fmap.append(band)
+            x.append(band)
+        x = torch.cat(x, dim=-1)  # rejoin the bands along the last axis (f in the rearrange pattern above)
+        if cond_embedding_id is not None:
+            emb = self.emb(cond_embedding_id)  # self.emb exists only when num_embeddings was given
+            h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)  # embedding-weighted sum over channels
+        else:
+            h = 0
+        x = self.conv_post(x)
+        fmap.append(x)
+        x += h  # NOTE(review): in-place, so the entry just appended to fmap also receives h - confirm intended
+        return x, fmap
diff --git a/vocos/vocos/experiment.py b/vocos/vocos/experiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..191c2fb59bcc74d244cd2a54c796a0994e0c4ab3
--- /dev/null
+++ b/vocos/vocos/experiment.py
@@ -0,0 +1,371 @@
+import math
+
+import numpy as np
+import pytorch_lightning as pl
+import torch
+import torchaudio
+import transformers
+
+from vocos.discriminators import MultiPeriodDiscriminator, MultiResolutionDiscriminator
+from vocos.feature_extractors import FeatureExtractor
+from vocos.heads import FourierHead
+from vocos.helpers import plot_spectrogram_to_numpy
+from vocos.loss import DiscriminatorLoss, GeneratorLoss, FeatureMatchingLoss, MelSpecReconstructionLoss
+from vocos.models import Backbone
+from vocos.modules import safe_log
+
+
+class VocosExp(pl.LightningModule):
+    """GAN training experiment: feature extractor + backbone + head against MPD and MRD discriminators."""
+    # noinspection PyUnusedLocal
+    def __init__(
+        self,
+        feature_extractor: FeatureExtractor,
+        backbone: Backbone,
+        head: FourierHead,
+        sample_rate: int,
+        initial_learning_rate: float,
+        num_warmup_steps: int = 0,
+        mel_loss_coeff: float = 45,
+        mrd_loss_coeff: float = 1.0,
+        pretrain_mel_steps: int = 0,
+        decay_mel_coeff: bool = False,
+        evaluate_utmos: bool = False,
+        evaluate_pesq: bool = False,
+        evaluate_periodicty: bool = False,
+    ):
+        """
+        Args:
+            feature_extractor (FeatureExtractor): An instance of FeatureExtractor to extract features from audio signals.
+            backbone (Backbone): An instance of Backbone model.
+            head (FourierHead): An instance of Fourier head to generate spectral coefficients and reconstruct a waveform.
+            sample_rate (int): Sampling rate of the audio signals.
+            initial_learning_rate (float): Initial learning rate for the optimizer.
+            num_warmup_steps (int): Number of steps for the warmup phase of learning rate scheduler. Default is 0.
+            mel_loss_coeff (float, optional): Coefficient for Mel-spectrogram loss in the loss function. Default is 45.
+            mrd_loss_coeff (float, optional): Coefficient for Multi Resolution Discriminator loss. Default is 1.0.
+            pretrain_mel_steps (int, optional): Number of steps to pre-train the model without the GAN objective. Default is 0.
+            decay_mel_coeff (bool, optional): If True, the Mel-spectrogram loss coefficient is decayed during training. Default is False.
+            evaluate_utmos (bool, optional): If True, UTMOS scores are computed for each validation run.
+            evaluate_pesq (bool, optional): If True, PESQ scores are computed for each validation run.
+            evaluate_periodicty (bool, optional): If True, periodicity scores are computed for each validation run.
+        """
+        super().__init__()
+        self.save_hyperparameters(ignore=["feature_extractor", "backbone", "head"])
+
+        self.feature_extractor = feature_extractor
+        self.backbone = backbone
+        self.head = head
+
+        self.multiperioddisc = MultiPeriodDiscriminator()
+        self.multiresddisc = MultiResolutionDiscriminator()
+
+        self.disc_loss = DiscriminatorLoss()
+        self.gen_loss = GeneratorLoss()
+        self.feat_matching_loss = FeatureMatchingLoss()
+        self.melspec_loss = MelSpecReconstructionLoss(sample_rate=sample_rate)
+        self.train_discriminator = False  # recomputed in on_train_batch_start from pretrain_mel_steps
+        self.base_mel_coeff = self.mel_loss_coeff = mel_loss_coeff  # base value is what the decay rescales
+
+    def configure_optimizers(self):
+        disc_params = [
+            {"params": self.multiperioddisc.parameters()},
+            {"params": self.multiresddisc.parameters()},
+        ]
+        gen_params = [
+            {"params": self.feature_extractor.parameters()},
+            {"params": self.backbone.parameters()},
+            {"params": self.head.parameters()},
+        ]
+
+        opt_disc = torch.optim.AdamW(disc_params, lr=self.hparams.initial_learning_rate, betas=(0.8, 0.9))
+        opt_gen = torch.optim.AdamW(gen_params, lr=self.hparams.initial_learning_rate, betas=(0.8, 0.9))
+
+        max_steps = self.trainer.max_steps // 2 # Max steps per optimizer
+        scheduler_disc = transformers.get_cosine_schedule_with_warmup(
+            opt_disc, num_warmup_steps=self.hparams.num_warmup_steps, num_training_steps=max_steps,
+        )
+        scheduler_gen = transformers.get_cosine_schedule_with_warmup(
+            opt_gen, num_warmup_steps=self.hparams.num_warmup_steps, num_training_steps=max_steps,
+        )
+
+        return (
+            [opt_disc, opt_gen],
+            [{"scheduler": scheduler_disc, "interval": "step"}, {"scheduler": scheduler_gen, "interval": "step"}],
+        )
+
+    def forward(self, audio_input, **kwargs):
+        features = self.feature_extractor(audio_input, **kwargs)
+        x = self.backbone(features, **kwargs)
+        audio_output = self.head(x)
+        return audio_output
+
+    def training_step(self, batch, batch_idx, optimizer_idx, **kwargs):
+        audio_input = batch
+
+        # train discriminator
+        if optimizer_idx == 0 and self.train_discriminator:
+            with torch.no_grad():  # no generator gradients are needed for the discriminator update
+                audio_hat = self(audio_input, **kwargs)
+
+            real_score_mp, gen_score_mp, _, _ = self.multiperioddisc(y=audio_input, y_hat=audio_hat, **kwargs,)
+            real_score_mrd, gen_score_mrd, _, _ = self.multiresddisc(y=audio_input, y_hat=audio_hat, **kwargs,)
+            loss_mp, loss_mp_real, _ = self.disc_loss(
+                disc_real_outputs=real_score_mp, disc_generated_outputs=gen_score_mp
+            )
+            loss_mrd, loss_mrd_real, _ = self.disc_loss(
+                disc_real_outputs=real_score_mrd, disc_generated_outputs=gen_score_mrd
+            )
+            loss_mp /= len(loss_mp_real)
+            loss_mrd /= len(loss_mrd_real)
+            loss = loss_mp + self.hparams.mrd_loss_coeff * loss_mrd
+
+            self.log("discriminator/total", loss, prog_bar=True)
+            self.log("discriminator/multi_period_loss", loss_mp)
+            self.log("discriminator/multi_res_loss", loss_mrd)
+            return loss
+
+        # train generator
+        if optimizer_idx == 1:
+            audio_hat = self(audio_input, **kwargs)
+            if self.train_discriminator:
+                _, gen_score_mp, fmap_rs_mp, fmap_gs_mp = self.multiperioddisc(
+                    y=audio_input, y_hat=audio_hat, **kwargs,
+                )
+                _, gen_score_mrd, fmap_rs_mrd, fmap_gs_mrd = self.multiresddisc(
+                    y=audio_input, y_hat=audio_hat, **kwargs,
+                )
+                loss_gen_mp, list_loss_gen_mp = self.gen_loss(disc_outputs=gen_score_mp)
+                loss_gen_mrd, list_loss_gen_mrd = self.gen_loss(disc_outputs=gen_score_mrd)
+                loss_gen_mp = loss_gen_mp / len(list_loss_gen_mp)
+                loss_gen_mrd = loss_gen_mrd / len(list_loss_gen_mrd)
+                loss_fm_mp = self.feat_matching_loss(fmap_r=fmap_rs_mp, fmap_g=fmap_gs_mp) / len(fmap_rs_mp)
+                loss_fm_mrd = self.feat_matching_loss(fmap_r=fmap_rs_mrd, fmap_g=fmap_gs_mrd) / len(fmap_rs_mrd)
+
+                self.log("generator/multi_period_loss", loss_gen_mp)
+                self.log("generator/multi_res_loss", loss_gen_mrd)
+                self.log("generator/feature_matching_mp", loss_fm_mp)
+                self.log("generator/feature_matching_mrd", loss_fm_mrd)
+            else:
+                loss_gen_mp = loss_gen_mrd = loss_fm_mp = loss_fm_mrd = 0
+
+            mel_loss = self.melspec_loss(audio_hat, audio_input)
+            loss = (
+                loss_gen_mp
+                + self.hparams.mrd_loss_coeff * loss_gen_mrd
+                + loss_fm_mp
+                + self.hparams.mrd_loss_coeff * loss_fm_mrd
+                + self.mel_loss_coeff * mel_loss
+            )
+
+            self.log("generator/total_loss", loss, prog_bar=True)
+            self.log("mel_loss_coeff", self.mel_loss_coeff)
+            self.log("generator/mel_loss", mel_loss)
+
+            if self.global_step % 1000 == 0 and self.global_rank == 0:
+                self.logger.experiment.add_audio(
+                    "train/audio_in", audio_input[0].data.cpu(), self.global_step, self.hparams.sample_rate
+                )
+                self.logger.experiment.add_audio(
+                    "train/audio_pred", audio_hat[0].data.cpu(), self.global_step, self.hparams.sample_rate
+                )
+                with torch.no_grad():
+                    mel = safe_log(self.melspec_loss.mel_spec(audio_input[0]))
+                    mel_hat = safe_log(self.melspec_loss.mel_spec(audio_hat[0]))
+                self.logger.experiment.add_image(
+                    "train/mel_target",
+                    plot_spectrogram_to_numpy(mel.data.cpu().numpy()),
+                    self.global_step,
+                    dataformats="HWC",
+                )
+                self.logger.experiment.add_image(
+                    "train/mel_pred",
+                    plot_spectrogram_to_numpy(mel_hat.data.cpu().numpy()),
+                    self.global_step,
+                    dataformats="HWC",
+                )
+
+            return loss
+
+    def on_validation_epoch_start(self):
+        if self.hparams.evaluate_utmos:
+            from metrics.UTMOS import UTMOSScore
+
+            if not hasattr(self, "utmos_model"):  # build the scorer once and reuse it across epochs
+                self.utmos_model = UTMOSScore(device=self.device)
+
+    def validation_step(self, batch, batch_idx, **kwargs):
+        audio_input = batch
+        audio_hat = self(audio_input, **kwargs)
+
+        audio_16_khz = torchaudio.functional.resample(audio_input, orig_freq=self.hparams.sample_rate, new_freq=16000)
+        audio_hat_16khz = torchaudio.functional.resample(audio_hat, orig_freq=self.hparams.sample_rate, new_freq=16000)
+
+        if self.hparams.evaluate_periodicty:
+            from metrics.periodicity import calculate_periodicity_metrics
+
+            periodicity_loss, pitch_loss, f1_score = calculate_periodicity_metrics(audio_16_khz, audio_hat_16khz)
+        else:
+            periodicity_loss = pitch_loss = f1_score = 0
+
+        if self.hparams.evaluate_utmos:
+            utmos_score = self.utmos_model.score(audio_hat_16khz.unsqueeze(1)).mean()
+        else:
+            utmos_score = torch.zeros(1, device=self.device)
+
+        if self.hparams.evaluate_pesq:
+            from pesq import pesq
+
+            pesq_score = 0
+            for ref, deg in zip(audio_16_khz.cpu().numpy(), audio_hat_16khz.cpu().numpy()):
+                pesq_score += pesq(16000, ref, deg, "wb", on_error=1)
+            pesq_score /= len(audio_16_khz)
+            pesq_score = torch.tensor(pesq_score, device=self.device)  # same device as the placeholder below
+        else:
+            pesq_score = torch.zeros(1, device=self.device)
+
+        mel_loss = self.melspec_loss(audio_hat.unsqueeze(1), audio_input.unsqueeze(1))
+        total_loss = mel_loss + (5 - utmos_score) + (5 - pesq_score)
+
+        return {
+            "val_loss": total_loss,
+            "mel_loss": mel_loss,
+            "utmos_score": utmos_score,
+            "pesq_score": pesq_score,
+            "periodicity_loss": periodicity_loss,
+            "pitch_loss": pitch_loss,
+            "f1_score": f1_score,
+            "audio_input": audio_input[0],
+            "audio_pred": audio_hat[0],
+        }
+
+    def validation_epoch_end(self, outputs):
+        if self.global_rank == 0:
+            *_, audio_in, audio_pred = outputs[0].values()  # relies on the key order of the validation_step dict
+            self.logger.experiment.add_audio(
+                "val_in", audio_in.data.cpu().numpy(), self.global_step, self.hparams.sample_rate
+            )
+            self.logger.experiment.add_audio(
+                "val_pred", audio_pred.data.cpu().numpy(), self.global_step, self.hparams.sample_rate
+            )
+            mel_target = safe_log(self.melspec_loss.mel_spec(audio_in))
+            mel_hat = safe_log(self.melspec_loss.mel_spec(audio_pred))
+            self.logger.experiment.add_image(
+                "val_mel_target",
+                plot_spectrogram_to_numpy(mel_target.data.cpu().numpy()),
+                self.global_step,
+                dataformats="HWC",
+            )
+            self.logger.experiment.add_image(
+                "val_mel_hat",
+                plot_spectrogram_to_numpy(mel_hat.data.cpu().numpy()),
+                self.global_step,
+                dataformats="HWC",
+            )
+        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
+        mel_loss = torch.stack([x["mel_loss"] for x in outputs]).mean()
+        utmos_score = torch.stack([x["utmos_score"] for x in outputs]).mean()
+        pesq_score = torch.stack([x["pesq_score"] for x in outputs]).mean()
+        periodicity_loss = np.array([x["periodicity_loss"] for x in outputs]).mean()
+        pitch_loss = np.array([x["pitch_loss"] for x in outputs]).mean()
+        f1_score = np.array([x["f1_score"] for x in outputs]).mean()
+
+        self.log("val_loss", avg_loss, sync_dist=True)
+        self.log("val/mel_loss", mel_loss, sync_dist=True)
+        self.log("val/utmos_score", utmos_score, sync_dist=True)
+        self.log("val/pesq_score", pesq_score, sync_dist=True)
+        self.log("val/periodicity_loss", periodicity_loss, sync_dist=True)
+        self.log("val/pitch_loss", pitch_loss, sync_dist=True)
+        self.log("val/f1_score", f1_score, sync_dist=True)
+
+    @property
+    def global_step(self):
+        """
+        Override global_step so that it returns the total number of batches processed
+        """
+        return self.trainer.fit_loop.epoch_loop.total_batch_idx
+
+    def on_train_batch_start(self, *args):
+        if self.global_step >= self.hparams.pretrain_mel_steps:
+            self.train_discriminator = True
+        else:
+            self.train_discriminator = False
+
+    def on_train_batch_end(self, *args):
+        def mel_loss_coeff_decay(current_step, num_cycles=0.5):
+            max_steps = self.trainer.max_steps // 2  # same per-optimizer budget as in configure_optimizers
+            if current_step < self.hparams.num_warmup_steps:
+                return 1.0
+            progress = float(current_step - self.hparams.num_warmup_steps) / float(
+                max(1, max_steps - self.hparams.num_warmup_steps)
+            )
+            return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
+
+        if self.hparams.decay_mel_coeff:
+            self.mel_loss_coeff = self.base_mel_coeff * mel_loss_coeff_decay(self.global_step + 1)
+
+
+class VocosEncodecExp(VocosExp):
+    """
+    VocosEncodecExp is a subclass of VocosExp that overrides the parent experiment to function as a conditional GAN.
+    It manages an additional `bandwidth_id` attribute, which denotes a learnable embedding corresponding to
+    a specific bandwidth value of EnCodec. During training, a random bandwidth_id is generated for each step,
+    while during validation, a fixed bandwidth_id is used.
+    """
+
+    def __init__(
+        self,
+        feature_extractor: FeatureExtractor,
+        backbone: Backbone,
+        head: FourierHead,
+        sample_rate: int,
+        initial_learning_rate: float,
+        num_warmup_steps: int,
+        mel_loss_coeff: float = 45,
+        mrd_loss_coeff: float = 1.0,
+        pretrain_mel_steps: int = 0,
+        decay_mel_coeff: bool = False,
+        evaluate_utmos: bool = False,
+        evaluate_pesq: bool = False,
+        evaluate_periodicty: bool = False,
+    ):
+        super().__init__(
+            feature_extractor,
+            backbone,
+            head,
+            sample_rate,
+            initial_learning_rate,
+            num_warmup_steps,
+            mel_loss_coeff,
+            mrd_loss_coeff,
+            pretrain_mel_steps,
+            decay_mel_coeff,
+            evaluate_utmos,
+            evaluate_pesq,
+            evaluate_periodicty,
+        )
+        # Override with conditional discriminators
+        self.multiperioddisc = MultiPeriodDiscriminator(num_embeddings=len(self.feature_extractor.bandwidths))
+        self.multiresddisc = MultiResolutionDiscriminator(num_embeddings=len(self.feature_extractor.bandwidths))
+
+    def training_step(self, *args):
+        bandwidth_id = torch.randint(low=0, high=len(self.feature_extractor.bandwidths), size=(1,), device=self.device,)
+        output = super().training_step(*args, bandwidth_id=bandwidth_id)  # one random bandwidth index per call
+        return output
+
+    def validation_step(self, *args):
+        bandwidth_id = torch.tensor([0], device=self.device)  # validation always uses index 0
+        output = super().validation_step(*args, bandwidth_id=bandwidth_id)
+        return output
+
+    def validation_epoch_end(self, outputs):
+        if self.global_rank == 0:
+            *_, audio_in, _ = outputs[0].values()  # second-to-last value of the first step's output dict
+            # Resynthesis with encodec for reference
+            self.feature_extractor.encodec.set_target_bandwidth(self.feature_extractor.bandwidths[0])
+            encodec_audio = self.feature_extractor.encodec(audio_in[None, None, :])  # adds two leading axes
+            self.logger.experiment.add_audio(
+                "encodec", encodec_audio[0, 0].data.cpu().numpy(), self.global_step, self.hparams.sample_rate,
+            )
+
+        super().validation_epoch_end(outputs)
diff --git a/vocos/vocos/feature_extractors.py b/vocos/vocos/feature_extractors.py
new file mode 100644
index 0000000000000000000000000000000000000000..c22d97654a948d2e5c9a2c6f83e1663a2cec8b1c
--- /dev/null
+++ b/vocos/vocos/feature_extractors.py
@@ -0,0 +1,120 @@
+from typing import List
+
+import torch
+import torchaudio
+from encodec import EncodecModel
+from torch import nn
+
+from vocos.modules import safe_log
+
+
+class FeatureExtractor(nn.Module):
+    """Base class for feature extractors."""
+
+    def forward(self, audio: torch.Tensor, **kwargs) -> torch.Tensor:
+        """
+        Extract features from the given audio.
+
+        Args:
+            audio (Tensor): Input audio waveform.
+            **kwargs: Extra keyword arguments; each subclass decides which of them, if any, it reads.
+
+        Returns:
+            Tensor: Extracted features of shape (B, C, L): batch size, output features, sequence length.
+        """
+        raise NotImplementedError("Subclasses must implement the forward method.")
+
+
+class MelSpectrogramFeatures(FeatureExtractor):
+    """
+    Feature extractor returning `safe_log` of a mel spectrogram computed by torchaudio's MelSpectrogram.
+
+    Args:
+        sample_rate (int): Sample rate passed to the mel transform. Defaults to 44100.
+        n_fft (int): FFT size passed to the mel transform. Defaults to 2048.
+        hop_length (int): Hop length passed to the mel transform. Defaults to 512.
+        win_length (int): Window length passed to the mel transform. Defaults to 2048.
+        n_mels (int): Number of mel bands passed to the mel transform. Defaults to 128.
+        padding (str): "center" or "same"; "center" enables the transform's `center` option. Defaults to "center".
+    """
+
+    def __init__(self, sample_rate=44100, n_fft=2048, hop_length=512, win_length=2048, n_mels=128, padding="center"):
+        super().__init__()
+        if padding not in ["center", "same"]:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        self.padding = padding
+        self.mel_spec = torchaudio.transforms.MelSpectrogram(
+            sample_rate=sample_rate,
+            n_fft=n_fft,
+            win_length=win_length,
+            hop_length=hop_length,
+            n_mels=n_mels,
+            center=padding == "center",
+            # padding = "reflect",
+            power=2,
+        )
+
+    # An alternative 16 kHz configuration used to be kept here as commented-out code:
+    # sample_rate=16000, n_fft=2048, win_length=1200, hop_length=300, n_mels=80.
+
+    def forward(self, audio, **kwargs):
+        """
+        Return `safe_log` of the mel spectrogram of `audio`. With "same" padding the audio is first
+        reflect-padded by (win_length - hop_length) // 2 samples on each side.
+        """
+        if self.padding == "same":
+            pad = self.mel_spec.win_length - self.mel_spec.hop_length
+            audio = torch.nn.functional.pad(audio, (pad // 2, pad // 2), mode="reflect")
+        mel = self.mel_spec(audio)
+        features = safe_log(mel)
+        return features
+
+
+class EncodecFeatures(FeatureExtractor):
+    def __init__(
+        self,
+        encodec_model: str = "encodec_24khz",
+        bandwidths: List[float] = [1.5, 3.0, 6.0, 12.0],
+        train_codebooks: bool = False,
+    ):
+        super().__init__()
+        if encodec_model == "encodec_24khz":
+            encodec = EncodecModel.encodec_model_24khz
+        elif encodec_model == "encodec_48khz":
+            encodec = EncodecModel.encodec_model_48khz
+        else:
+            raise ValueError(
+                f"Unsupported encodec_model: {encodec_model}. Supported options are 'encodec_24khz' and 'encodec_48khz'."
+            )
+        self.encodec = encodec(pretrained=True)
+        for param in self.encodec.parameters():
+            param.requires_grad = False  # the pretrained EnCodec weights stay frozen
+        self.num_q = self.encodec.quantizer.get_num_quantizers_for_bandwidth(
+            self.encodec.frame_rate, bandwidth=max(bandwidths)
+        )
+        codebook_weights = torch.cat([vq.codebook for vq in self.encodec.quantizer.vq.layers[: self.num_q]], dim=0)
+        self.codebook_weights = torch.nn.Parameter(codebook_weights, requires_grad=train_codebooks)
+        self.bandwidths = list(bandwidths)  # copy, so the shared default list is never aliased by an instance
+
+    @torch.no_grad()
+    def get_encodec_codes(self, audio):
+        audio = audio.unsqueeze(1)  # insert a channel axis before the encoder
+        emb = self.encodec.encoder(audio)
+        codes = self.encodec.quantizer.encode(emb, self.encodec.frame_rate, self.encodec.bandwidth)
+        return codes
+
+    def forward(self, audio: torch.Tensor, **kwargs):
+        bandwidth_id = kwargs.get("bandwidth_id")
+        if bandwidth_id is None:
+            raise ValueError("The 'bandwidth_id' argument is required")
+        self.encodec.eval() # Force eval mode as Pytorch Lightning automatically sets child modules to training mode
+        self.encodec.set_target_bandwidth(self.bandwidths[bandwidth_id])
+        codes = self.get_encodec_codes(audio)
+        # Instead of summing in the loop, it stores subsequent VQ dictionaries in a single `self.codebook_weights`
+        # with offsets given by the number of bins, and finally summed in a vectorized operation.
+        offsets = torch.arange(
+            0, self.encodec.quantizer.bins * len(codes), self.encodec.quantizer.bins, device=audio.device
+        )
+        embeddings_idxs = codes + offsets.view(-1, 1, 1)
+        features = torch.nn.functional.embedding(embeddings_idxs, self.codebook_weights).sum(dim=0)
+        return features.transpose(1, 2)
diff --git a/vocos/vocos/heads.py b/vocos/vocos/heads.py
new file mode 100644
index 0000000000000000000000000000000000000000..24f5cfc5d728af1d3b3928d857bf75a4935ea195
--- /dev/null
+++ b/vocos/vocos/heads.py
@@ -0,0 +1,164 @@
+from typing import Optional
+
+import torch
+from torch import nn
+from torchaudio.functional.functional import _hz_to_mel, _mel_to_hz
+
+from vocos.spectral_ops import IMDCT, ISTFT
+from vocos.modules import symexp
+
+
+class FourierHead(nn.Module):
+    """Base class for inverse fourier modules; subclasses turn model outputs into a time-domain waveform."""
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
+                        L is the sequence length, and H denotes the model dimension.
+
+        Returns:
+            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
+        """
+        raise NotImplementedError("Subclasses must implement the forward method.")
+
+
+class ISTFTHead(FourierHead):
+    """
+    Head that predicts complex STFT coefficients (magnitude and phase) and inverts them with ISTFT.
+
+    Args:
+        dim (int): Hidden dimension of the model.
+        n_fft (int): Size of Fourier transform.
+        hop_length (int): The distance between neighboring sliding window frames, which should align with
+            the resolution of the input features.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+
+    def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "same"):
+        super().__init__()
+        # One half of the n_fft + 2 outputs is used as log-magnitudes, the other half as phases (see forward).
+        self.out = torch.nn.Linear(dim, n_fft + 2)
+        self.istft = ISTFT(n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Project the hidden states to STFT coefficients and reconstruct the waveform.
+
+        Args:
+            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
+                        L is the sequence length, and H denotes the model dimension.
+
+        Returns:
+            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
+        """
+        # Swap the last two axes so that the split below runs along the coefficient axis.
+        projected = self.out(x).transpose(1, 2)
+        log_mag, phase = projected.chunk(2, dim=1)
+        mag = torch.exp(log_mag)
+        # safeguard to prevent excessively large magnitudes
+        mag = torch.clip(mag, max=1e2)
+        # Phase wrapping happens implicitly here: cos and sin of the predicted phase are already the
+        # real and imaginary parts of a unit-magnitude complex number.
+        real = torch.cos(phase)
+        imag = torch.sin(phase)
+        # Recovering the angle with atan2 and exponentiating it again would add nothing but cost,
+        # so the complex spectrogram is assembled directly.
+        S = mag * (real + 1j * imag)
+        audio = self.istft(S)
+        return audio
+
+
+class IMDCTSymExpHead(FourierHead):
+    """
+    IMDCT Head module for predicting MDCT coefficients with symmetric exponential function
+
+    Args:
+        dim (int): Hidden dimension of the model.
+        mdct_frame_len (int): Length of the MDCT frame.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+        sample_rate (int, optional): The sample rate of the audio. If provided, the last layer will be initialized
+            based on perceptual scaling. Defaults to None.
+        clip_audio (bool, optional): Whether to clip the audio output within the range of [-1.0, 1.0]. Defaults to False.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        mdct_frame_len: int,
+        padding: str = "same",
+        sample_rate: Optional[int] = None,
+        clip_audio: bool = False,
+    ):
+        super().__init__()
+        out_dim = mdct_frame_len // 2
+        self.out = nn.Linear(dim, out_dim)
+        self.imdct = IMDCT(frame_len=mdct_frame_len, padding=padding)
+        self.clip_audio = clip_audio
+
+        if sample_rate is not None:
+            # optionally init the last layer following mel-scale
+            m_max = _hz_to_mel(sample_rate // 2)
+            m_pts = torch.linspace(0, m_max, out_dim)
+            f_pts = _mel_to_hz(m_pts)
+            scale = 1 - (f_pts / f_pts.max())  # 1 at the lowest output bin, falling to 0 at the highest
+
+            with torch.no_grad():
+                self.out.weight.mul_(scale.view(-1, 1))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass of the IMDCTSymExpHead module.
+
+        Args:
+            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
+                        L is the sequence length, and H denotes the model dimension.
+
+        Returns:
+            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
+        """
+        x = self.out(x)
+        x = symexp(x)
+        x = torch.clip(x, min=-1e2, max=1e2) # safeguard to prevent excessively large magnitudes
+        audio = self.imdct(x)
+        if self.clip_audio:
+            audio = torch.clip(audio, min=-1.0, max=1.0)  # clip the waveform itself, not the MDCT coefficients
+
+        return audio
+
+
+class IMDCTCosHead(FourierHead):
+    """
+    IMDCT Head module for predicting MDCT coefficients with parametrizing MDCT = exp(m) · cos(p)
+
+    Args:
+        dim (int): Hidden dimension of the model.
+        mdct_frame_len (int): Length of the MDCT frame.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+        clip_audio (bool, optional): Whether to clip the audio output within the range of [-1.0, 1.0]. Defaults to False.
+    """
+
+    def __init__(self, dim: int, mdct_frame_len: int, padding: str = "same", clip_audio: bool = False):
+        super().__init__()
+        self.clip_audio = clip_audio
+        self.out = nn.Linear(dim, mdct_frame_len)  # split in forward into two halves, m and p
+        self.imdct = IMDCT(frame_len=mdct_frame_len, padding=padding)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass of the IMDCTCosHead module.
+
+        Args:
+            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
+                        L is the sequence length, and H denotes the model dimension.
+
+        Returns:
+            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
+        """
+        x = self.out(x)
+        m, p = x.chunk(2, dim=2)
+        m = torch.exp(m).clip(max=1e2) # safeguard to prevent excessively large magnitudes
+        audio = self.imdct(m * torch.cos(p))
+        if self.clip_audio:
+            audio = torch.clip(audio, min=-1.0, max=1.0)  # clip the waveform itself, not the linear-layer output
+        return audio
diff --git a/vocos/vocos/helpers.py b/vocos/vocos/helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d303010352ad59dde2996605f124128ee17db36
--- /dev/null
+++ b/vocos/vocos/helpers.py
@@ -0,0 +1,71 @@
+import matplotlib
+import numpy as np
+import torch
+from matplotlib import pyplot as plt
+from pytorch_lightning import Callback
+
+matplotlib.use("Agg")
+
+
+def save_figure_to_numpy(fig: plt.Figure) -> np.ndarray:
+    """
+    Convert the RGB contents of a matplotlib figure canvas into a numpy array.
+
+    Args:
+        fig (Figure): Matplotlib figure object. Its canvas is read via `tostring_rgb()`, so the caller is
+                      expected to have drawn it already (presumably with `fig.canvas.draw()` - confirm at call sites).
+
+    Returns:
+        ndarray: uint8 array of shape (height, width, 3) holding the RGB pixels of the canvas.
+    """
+    width, height = fig.canvas.get_width_height()
+    # np.fromstring(..., sep="") is the deprecated binary mode; np.frombuffer is the documented replacement.
+    # frombuffer returns a read-only view over the bytes, so copy to keep the result writable as before.
+    pixels = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+    return pixels.reshape((height, width, 3)).copy()
+
+
+def plot_spectrogram_to_numpy(spectrogram: np.ndarray) -> np.ndarray:
+    """
+    Render a spectrogram as an image and return the rendered figure as a numpy array.
+
+    The data is cast to float32, drawn with `imshow` (origin at the bottom, no interpolation) next to a
+    colorbar, labelled "Frames" / "Channels", and the figure is closed after it has been captured.
+
+    Args:
+        spectrogram (ndarray): Spectrogram data.
+
+    Returns:
+        ndarray: Numpy array representing the plotted spectrogram.
+    """
+    figure, axes = plt.subplots(figsize=(12, 3))
+    image = axes.imshow(spectrogram.astype(np.float32), aspect="auto", origin="lower", interpolation="none")
+    plt.colorbar(image, ax=axes)
+    plt.xlabel("Frames")
+    plt.ylabel("Channels")
+    plt.tight_layout()
+
+    # The canvas must be drawn before its pixel buffer is read.
+    figure.canvas.draw()
+    rendered = save_figure_to_numpy(figure)
+    plt.close()
+    return rendered
+
+
+class GradNormCallback(Callback):
+    """
+    Callback that logs the model's gradient norm under the key "grad_norm" from the `on_after_backward` hook.
+    """
+
+    def on_after_backward(self, trainer, model):
+        # `trainer` belongs to the hook signature but is not used here.
+        current_norm = gradient_norm(model)
+        model.log("grad_norm", current_norm)
+
+
+def gradient_norm(model: torch.nn.Module, norm_type: float = 2.0) -> torch.Tensor:
+    """
+    Compute the norm of all gradients currently stored on the model's parameters.
+
+    The norm of each parameter's gradient is computed first, and the norm of those per-parameter norms
+    is returned. Parameters without a gradient are ignored.
+
+    Args:
+        model (Module): PyTorch model.
+        norm_type (float, optional): Type of the norm. Defaults to 2.0.
+
+    Returns:
+        Tensor: Gradient norm (a zero scalar when no parameter has a gradient).
+    """
+    grads = [p.grad for p in model.parameters() if p.grad is not None]
+    if not grads:
+        # torch.stack raises on an empty list; report a zero norm instead of crashing the caller.
+        first_param = next(iter(model.parameters()), None)
+        return torch.zeros((), device=first_param.device if first_param is not None else None)
+    per_param_norms = torch.stack([torch.norm(g.detach(), norm_type) for g in grads])
+    return torch.norm(per_param_norms, norm_type)
diff --git a/vocos/vocos/loss.py b/vocos/vocos/loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..095d3247df3b66ff9a74ca3258a9aa3ca22be5f1
--- /dev/null
+++ b/vocos/vocos/loss.py
@@ -0,0 +1,118 @@
+from typing import List, Tuple
+
+import torch
+import torchaudio
+from torch import nn
+
+from vocos.modules import safe_log
+
+
+class MelSpecReconstructionLoss(nn.Module):
+    """
+    L1 distance between the log-scaled mel spectrograms of the ground truth sample and the generated sample
+
+    Args:
+        sample_rate (int, optional): Sample rate of the audio. Defaults to 44100.
+        n_fft (int, optional): FFT size; also used as the window length. Defaults to 2048.
+        hop_length (int, optional): Hop between successive frames. Defaults to 512.
+        n_mels (int, optional): Number of mel bins. Defaults to 128.
+    """
+
+    def __init__(
+        self, sample_rate: int = 44100, n_fft: int = 2048, hop_length: int = 512, n_mels: int = 128,
+    ):
+        super().__init__()
+        # forward() reads `self.mel_spec`. With the construction commented out, the loss raised
+        # AttributeError on first use and every constructor argument was ignored.
+        # NOTE(review): `power=2` and `win_length=n_fft` follow the previously commented-out setup, but
+        # `sample_rate` now comes from the argument instead of a hard-coded 16_000 - confirm this matches
+        # the features used for training.
+        self.mel_spec = torchaudio.transforms.MelSpectrogram(
+            sample_rate=sample_rate, n_fft=n_fft, win_length=n_fft, hop_length=hop_length, n_mels=n_mels, center=True, power=2,
+        )
+
+        # Alternative configuration that was also left commented out:
+        # self.mel_spec = torchaudio.transforms.MelSpectrogram(
+        #     sample_rate=16_000, n_fft=2048, win_length=1200, hop_length=300, n_mels=80, center=True, power=2,
+        # )
+
+    def forward(self, y_hat, y) -> torch.Tensor:
+        """
+        Args:
+            y_hat (Tensor): Predicted audio waveform.
+            y (Tensor): Ground truth audio waveform.
+
+        Returns:
+            Tensor: L1 loss between the log-scaled mel spectrograms.
+        """
+        mel_hat = safe_log(self.mel_spec(y_hat))
+        mel = safe_log(self.mel_spec(y))
+
+        loss = torch.nn.functional.l1_loss(mel, mel_hat)
+
+        return loss
+
+
+class GeneratorLoss(nn.Module):
+    """
+    Generator loss: for every discriminator output `d`, the mean of `max(0, 1 - d)`.
+    """
+
+    def forward(self, disc_outputs: List[torch.Tensor]) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+        """
+        Args:
+            disc_outputs (List[Tensor]): List of discriminator outputs.
+
+        Returns:
+            Tuple[Tensor, List[Tensor]]: The summed loss (shape (1,)) and the individual loss of each
+                                         sub-discriminator, in input order.
+        """
+        reference = disc_outputs[0]
+        gen_losses = [torch.mean(torch.clamp(1 - output, min=0)) for output in disc_outputs]
+
+        # Accumulate in input order into a one-element tensor on the same device/dtype as the inputs.
+        total = torch.zeros(1, device=reference.device, dtype=reference.dtype)
+        for term in gen_losses:
+            total += term
+
+        return total, gen_losses
+
+
+class DiscriminatorLoss(nn.Module):
+    """
+    Discriminator loss: mean of `max(0, 1 - d_real)` plus mean of `max(0, 1 + d_generated)` for every
+    pair of sub-discriminator outputs.
+    """
+
+    def forward(
+        self, disc_real_outputs: List[torch.Tensor], disc_generated_outputs: List[torch.Tensor]
+    ) -> Tuple[torch.Tensor, List[torch.Tensor], List[torch.Tensor]]:
+        """
+        Args:
+            disc_real_outputs (List[Tensor]): List of discriminator outputs for real samples.
+            disc_generated_outputs (List[Tensor]): List of discriminator outputs for generated samples.
+
+        Returns:
+            Tuple[Tensor, List[Tensor], List[Tensor]]: The summed loss (shape (1,)), the per-sub-discriminator
+                                                       losses on real outputs, and the per-sub-discriminator
+                                                       losses on generated outputs.
+        """
+        reference = disc_real_outputs[0]
+        total = torch.zeros(1, device=reference.device, dtype=reference.dtype)
+        r_losses: List[torch.Tensor] = []
+        g_losses: List[torch.Tensor] = []
+
+        for real_out, fake_out in zip(disc_real_outputs, disc_generated_outputs):
+            real_term = torch.mean(torch.clamp(1 - real_out, min=0))
+            fake_term = torch.mean(torch.clamp(1 + fake_out, min=0))
+            r_losses.append(real_term)
+            g_losses.append(fake_term)
+            total += real_term + fake_term
+
+        return total, r_losses, g_losses
+
+
+class FeatureMatchingLoss(nn.Module):
+    """
+    Feature matching loss: the sum, over every pair of corresponding feature maps, of their mean absolute difference.
+    """
+
+    def forward(self, fmap_r: List[List[torch.Tensor]], fmap_g: List[List[torch.Tensor]]) -> torch.Tensor:
+        """
+        Args:
+            fmap_r (List[List[Tensor]]): List of feature maps from real samples.
+            fmap_g (List[List[Tensor]]): List of feature maps from generated samples.
+
+        Returns:
+            Tensor: The calculated feature matching loss (shape (1,)).
+        """
+        reference = fmap_r[0][0]
+        total = torch.zeros(1, device=reference.device, dtype=reference.dtype)
+
+        # Walk the sub-discriminators, then the layers inside each one, pairing real with generated maps.
+        layer_pairs = (
+            (real_map, gen_map)
+            for real_maps, gen_maps in zip(fmap_r, fmap_g)
+            for real_map, gen_map in zip(real_maps, gen_maps)
+        )
+        for real_map, gen_map in layer_pairs:
+            total += torch.mean(torch.abs(real_map - gen_map))
+
+        return total
diff --git a/vocos/vocos/models.py b/vocos/vocos/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..09ed55ddb111f47bb71f220834083a7b0643a4df
--- /dev/null
+++ b/vocos/vocos/models.py
@@ -0,0 +1,118 @@
+from typing import Optional
+
+import torch
+from torch import nn
+from torch.nn.utils import weight_norm
+
+from vocos.modules import ConvNeXtBlock, ResBlock1, AdaLayerNorm
+
+
+class Backbone(nn.Module):
+    """
+    Abstract base for generator backbones. Implementations keep the temporal resolution unchanged
+    from input to output.
+    """
+
+    def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
+        """
+        Map input features to hidden states; must be overridden by subclasses.
+
+        Args:
+            x (Tensor): Input tensor of shape (B, C, L), where B is the batch size,
+                        C denotes output features, and L is the sequence length.
+            **kwargs: Extra, backbone-specific inputs.
+
+        Returns:
+            Tensor: Output of shape (B, L, H), where B is the batch size, L is the sequence length,
+                    and H denotes the model dimension.
+
+        Raises:
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError("Subclasses must implement the forward method.")
+
+
+class VocosBackbone(Backbone):
+    """
+    ConvNeXt-based Vocos backbone, optionally conditioned through Adaptive Layer Normalization.
+
+    Args:
+        input_channels (int): Number of input features channels.
+        dim (int): Hidden dimension of the model.
+        intermediate_dim (int): Intermediate dimension used in ConvNeXtBlock.
+        num_layers (int): Number of ConvNeXtBlock layers.
+        layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to `1 / num_layers`.
+        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
+                                                None means non-conditional model. Defaults to None.
+    """
+
+    def __init__(
+        self,
+        input_channels: int,
+        dim: int,
+        intermediate_dim: int,
+        num_layers: int,
+        layer_scale_init_value: Optional[float] = None,
+        adanorm_num_embeddings: Optional[int] = None,
+    ):
+        super().__init__()
+        self.input_channels = input_channels
+        self.embed = nn.Conv1d(input_channels, dim, kernel_size=7, padding=3)
+        self.adanorm = adanorm_num_embeddings is not None
+        self.norm = (
+            AdaLayerNorm(adanorm_num_embeddings, dim, eps=1e-6)
+            if adanorm_num_embeddings
+            else nn.LayerNorm(dim, eps=1e-6)
+        )
+        # A falsy value (None or 0) falls back to 1 / num_layers.
+        scale_init = layer_scale_init_value or 1 / num_layers
+        blocks = []
+        for _ in range(num_layers):
+            blocks.append(
+                ConvNeXtBlock(
+                    dim=dim,
+                    intermediate_dim=intermediate_dim,
+                    layer_scale_init_value=scale_init,
+                    adanorm_num_embeddings=adanorm_num_embeddings,
+                )
+            )
+        self.convnext = nn.ModuleList(blocks)
+        self.final_layer_norm = nn.LayerNorm(dim, eps=1e-6)
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        """Re-initialize Conv1d/Linear layers: truncated-normal weights (std 0.02) and zero bias."""
+        if not isinstance(m, (nn.Conv1d, nn.Linear)):
+            return
+        nn.init.trunc_normal_(m.weight, std=0.02)
+        nn.init.constant_(m.bias, 0)
+
+    def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
+        """
+        Args:
+            x (Tensor): Input features, channels-first as consumed by the Conv1d embedding.
+            **kwargs: May carry `bandwidth_id`, which is required when the model uses AdaLayerNorm.
+
+        Returns:
+            Tensor: Layer-normalized hidden states with the channel axis moved to the last position.
+        """
+        bandwidth_id = kwargs.get("bandwidth_id", None)
+        hidden = self.embed(x).transpose(1, 2)
+        if self.adanorm:
+            assert bandwidth_id is not None
+            hidden = self.norm(hidden, cond_embedding_id=bandwidth_id)
+        else:
+            hidden = self.norm(hidden)
+        # The ConvNeXt blocks work channels-first, so undo the transpose used for the norm.
+        hidden = hidden.transpose(1, 2)
+        for block in self.convnext:
+            hidden = block(hidden, cond_embedding_id=bandwidth_id)
+        return self.final_layer_norm(hidden.transpose(1, 2))
+
+
+class VocosResNetBackbone(Backbone):
+    """
+    ResBlock-based Vocos backbone.
+
+    Args:
+        input_channels (int): Number of input features channels.
+        dim (int): Hidden dimension of the model.
+        num_blocks (int): Number of ResBlock1 blocks.
+        layer_scale_init_value (float, optional): Initial value for layer scaling. A falsy value
+            (None or 0) falls back to `1 / num_blocks / 3`. Defaults to None.
+    """
+
+    def __init__(
+        self, input_channels, dim, num_blocks, layer_scale_init_value=None,
+    ):
+        super().__init__()
+        self.input_channels = input_channels
+        self.embed = weight_norm(nn.Conv1d(input_channels, dim, kernel_size=3, padding=1))
+        scale_init = layer_scale_init_value or 1 / num_blocks / 3
+        stages = []
+        for _ in range(num_blocks):
+            stages.append(ResBlock1(dim=dim, layer_scale_init_value=scale_init))
+        self.resnet = nn.Sequential(*stages)
+
+    def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
+        """Embed the input, run the residual stack, and move the channel axis to the last position."""
+        hidden = self.resnet(self.embed(x))
+        return hidden.transpose(1, 2)
diff --git a/vocos/vocos/modules.py b/vocos/vocos/modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20422d3bf3e703a509d1524aa29b9146e7a244d
--- /dev/null
+++ b/vocos/vocos/modules.py
@@ -0,0 +1,232 @@
+from typing import Optional, Tuple
+
+import torch
+from torch import nn
+from torch.nn.utils import weight_norm, remove_weight_norm
+
+
+class ConvNeXtBlock(nn.Module):
+    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
+
+    Args:
+        dim (int): Number of input channels.
+        intermediate_dim (int): Dimensionality of the intermediate layer.
+        layer_scale_init_value (float, optional): Initial value for the layer scale. None (or a value <= 0)
+            means no scaling. Defaults to None.
+        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
+            None means non-conditional LayerNorm. Defaults to None.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        intermediate_dim: int,
+        layer_scale_init_value: Optional[float] = None,
+        adanorm_num_embeddings: Optional[int] = None,
+    ):
+        super().__init__()
+        self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv
+        self.adanorm = adanorm_num_embeddings is not None
+        if adanorm_num_embeddings:
+            self.norm = AdaLayerNorm(adanorm_num_embeddings, dim, eps=1e-6)
+        else:
+            self.norm = nn.LayerNorm(dim, eps=1e-6)
+        self.pwconv1 = nn.Linear(dim, intermediate_dim)  # pointwise/1x1 convs, implemented with linear layers
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(intermediate_dim, dim)
+        # Honour the documented "None means no scaling"; comparing None with 0 would raise TypeError.
+        use_layer_scale = layer_scale_init_value is not None and layer_scale_init_value > 0
+        self.gamma = (
+            nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) if use_layer_scale else None
+        )
+
+    def forward(self, x: torch.Tensor, cond_embedding_id: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """
+        Args:
+            x (Tensor): Input of shape (B, C, T).
+            cond_embedding_id (Tensor, optional): Conditioning ids; required when AdaLayerNorm is used.
+
+        Returns:
+            Tensor: `x` plus the block's residual branch, same shape as the input.
+        """
+        residual = x
+        x = self.dwconv(x)
+        x = x.transpose(1, 2)  # (B, C, T) -> (B, T, C)
+        if self.adanorm:
+            assert cond_embedding_id is not None
+            x = self.norm(x, cond_embedding_id)
+        else:
+            x = self.norm(x)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.pwconv2(x)
+        if self.gamma is not None:
+            x = self.gamma * x
+        x = x.transpose(1, 2)  # (B, T, C) -> (B, C, T)
+
+        x = residual + x
+        return x
+
+
+class AdaLayerNorm(nn.Module):
+    """
+    Layer normalization whose scale and shift are looked up from learnable embedding tables,
+    one row per conditioning class. Scales start at one and shifts at zero.
+
+    Args:
+        num_embeddings (int): Number of embeddings.
+        embedding_dim (int): Dimension of the embeddings.
+        eps (float, optional): Epsilon passed to layer normalization. Defaults to 1e-6.
+    """
+
+    def __init__(self, num_embeddings: int, embedding_dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.dim = embedding_dim
+        self.scale = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
+        self.shift = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
+        # Start as an identity affine transform.
+        nn.init.ones_(self.scale.weight)
+        nn.init.zeros_(self.shift.weight)
+
+    def forward(self, x: torch.Tensor, cond_embedding_id: torch.Tensor) -> torch.Tensor:
+        """Normalize `x` over its last dimension, then apply the scale and shift selected by `cond_embedding_id`."""
+        gain = self.scale(cond_embedding_id)
+        bias = self.shift(cond_embedding_id)
+        normalized = nn.functional.layer_norm(x, (self.dim,), eps=self.eps)
+        return normalized * gain + bias
+
+
+class ResBlock1(nn.Module):
+    """
+    Residual block adapted from HiFi-GAN V1 (https://github.com/jik876/hifi-gan): three stages, each a
+    dilated 1D convolution followed by a non-dilated one, without upsampling layers.
+
+    Args:
+        dim (int): Number of input channels.
+        kernel_size (int, optional): Size of the convolutional kernel. Defaults to 3.
+        dilation (tuple[int], optional): Dilation factors for the dilated convolutions.
+            Defaults to (1, 3, 5).
+        lrelu_slope (float, optional): Negative slope of the LeakyReLU activation function.
+            Defaults to 0.1.
+        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
+            Defaults to None.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        kernel_size: int = 3,
+        dilation: Tuple[int, int, int] = (1, 3, 5),
+        lrelu_slope: float = 0.1,
+        layer_scale_init_value: Optional[float] = None,
+    ):
+        super().__init__()
+        self.lrelu_slope = lrelu_slope
+
+        def make_conv(rate: int) -> nn.Module:
+            # Weight-normalized, stride-1 convolution padded so the sequence length is preserved.
+            return weight_norm(
+                nn.Conv1d(dim, dim, kernel_size, 1, dilation=rate, padding=self.get_padding(kernel_size, rate))
+            )
+
+        # Exactly three stages: only the first three dilation entries are used.
+        self.convs1 = nn.ModuleList([make_conv(dilation[stage]) for stage in range(3)])
+        self.convs2 = nn.ModuleList([make_conv(1) for _ in range(3)])
+
+        self.gamma = nn.ParameterList(
+            [
+                nn.Parameter(layer_scale_init_value * torch.ones(dim, 1), requires_grad=True)
+                if layer_scale_init_value is not None
+                else None
+                for _ in range(3)
+            ]
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the three residual stages in sequence and return a tensor shaped like `x`."""
+        leaky_relu = torch.nn.functional.leaky_relu
+        for dilated_conv, plain_conv, scale in zip(self.convs1, self.convs2, self.gamma):
+            branch = dilated_conv(leaky_relu(x, negative_slope=self.lrelu_slope))
+            branch = plain_conv(leaky_relu(branch, negative_slope=self.lrelu_slope))
+            if scale is not None:
+                branch = scale * branch
+            x = branch + x
+        return x
+
+    def remove_weight_norm(self):
+        """Strip weight normalization from every convolution of the block."""
+        for conv in [*self.convs1, *self.convs2]:
+            remove_weight_norm(conv)
+
+    @staticmethod
+    def get_padding(kernel_size: int, dilation: int = 1) -> int:
+        """Return the padding computed as `(kernel_size * dilation - dilation) / 2`, truncated to int."""
+        span = kernel_size * dilation - dilation
+        return int(span / 2)
+
+
+# def safe_log(x: torch.Tensor, clip_val: float = 1e-7) -> torch.Tensor:
+# """
+# Computes the element-wise logarithm of the input tensor with clipping to avoid near-zero values.
+
+# Args:
+# x (Tensor): Input tensor.
+# clip_val (float, optional): Minimum value to clip the input tensor. Defaults to 1e-7.
+
+# Returns:
+# Tensor: Element-wise logarithm of the input tensor with clipping applied.
+# """
+
+
+# return torch.log(torch.clip(x, min=clip_val))
+
+
+def safe_log(x: torch.Tensor) -> torch.Tensor:
+    """
+    Computes a normalized element-wise logarithm of the input tensor.
+
+    A constant of 1e-5 is added before the logarithm so zero inputs stay finite, and the result is
+    standardized as `(log(1e-5 + x) - mean) / std` with the fixed values mean = -4 and std = 4.
+
+    Args:
+        x (Tensor): Input tensor.
+
+    Returns:
+        Tensor: Normalized element-wise logarithm of the input tensor.
+    """
+    norm_mean, norm_std = -4, 4
+    log_x = torch.log(1e-5 + x)
+    return (log_x - norm_mean) / norm_std
+
+
+def symlog(x: torch.Tensor) -> torch.Tensor:
+    """Sign-preserving logarithm: `sign(x) * log(1 + |x|)`."""
+    magnitude = torch.log1p(torch.abs(x))
+    return x.sign() * magnitude
+
+
+def symexp(x: torch.Tensor) -> torch.Tensor:
+    """Sign-preserving exponential: `sign(x) * (exp(|x|) - 1)`."""
+    magnitude = torch.abs(x).exp() - 1
+    return x.sign() * magnitude
diff --git a/vocos/vocos/pretrained.py b/vocos/vocos/pretrained.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfa3f5dbb0fc6d2295f82496f129470a8de02cab
--- /dev/null
+++ b/vocos/vocos/pretrained.py
@@ -0,0 +1,146 @@
+from __future__ import annotations
+
+from typing import Any, Dict, Tuple, Union, Optional
+
+import torch
+import yaml
+from huggingface_hub import hf_hub_download
+from torch import nn
+from vocos.feature_extractors import FeatureExtractor, EncodecFeatures
+from vocos.heads import FourierHead
+from vocos.models import Backbone
+
+
+def instantiate_class(args: Union[Any, Tuple[Any, ...]], init: Dict[str, Any]) -> Any:
+    """Import the class named by `init["class_path"]` and construct it.
+
+    Args:
+        args: Positional arguments required for instantiation. A non-tuple value is passed as a
+            single positional argument.
+        init: Dict of the form {"class_path":...,"init_args":...}. "init_args" is optional and
+            supplies the keyword arguments.
+
+    Returns:
+        The instantiated class object.
+    """
+    keyword_args = init.get("init_args", {})
+    positional_args = args if isinstance(args, tuple) else (args,)
+    # "pkg.module.ClassName" -> ("pkg.module", "ClassName")
+    module_name, attr_name = init["class_path"].rsplit(".", 1)
+    # A non-empty fromlist makes __import__ return the leaf module rather than the top-level package.
+    target = getattr(__import__(module_name, fromlist=[attr_name]), attr_name)
+    return target(*positional_args, **keyword_args)
+
+
+class Vocos(nn.Module):
+    """
+    The Vocos class represents a Fourier-based neural vocoder for audio synthesis.
+    This class is primarily designed for inference, with support for loading from pretrained
+    model checkpoints. It consists of three main components: a feature extractor,
+    a backbone, and a head.
+    """
+
+    def __init__(
+        self, feature_extractor: FeatureExtractor, backbone: Backbone, head: FourierHead,
+    ):
+        super().__init__()
+        self.feature_extractor = feature_extractor
+        self.backbone = backbone
+        self.head = head
+
+    @classmethod
+    def from_hparams(cls, config_path):
+        """
+        Class method to create a new Vocos model instance from a YAML hyperparameters file.
+        No checkpoint weights are loaded here.
+
+        Args:
+            config_path: Path to a YAML file exposing `model.init_args.feature_extractor`, `.backbone`
+                and `.head`, each a {"class_path":...,"init_args":...} mapping.
+
+        Returns:
+            Vocos: Model assembled from the three instantiated components.
+        """
+        # Read the config with an explicit encoding so parsing does not depend on the platform locale.
+        with open(config_path, "r", encoding="utf-8") as f:
+            config = yaml.safe_load(f)
+
+        # Access the nested structure
+        model_config = config["model"]["init_args"]
+
+        feature_extractor = instantiate_class(args=(), init=model_config["feature_extractor"])
+        backbone = instantiate_class(args=(), init=model_config["backbone"])
+        head = instantiate_class(args=(), init=model_config["head"])
+
+        return cls(feature_extractor, backbone, head)
+
+    @classmethod
+    def from_pretrained(cls, repo_id: str, revision: Optional[str] = None) -> Vocos:
+        """
+        Class method to create a new Vocos model instance from a pre-trained model stored in the Hugging Face model hub.
+
+        Args:
+            repo_id (str): Hub repository providing `config.yaml` and `pytorch_model.bin`.
+            revision (str, optional): Revision of the repository to download. Defaults to None.
+
+        Returns:
+            Vocos: Model with the downloaded weights loaded, switched to eval mode.
+        """
+        config_path = hf_hub_download(repo_id=repo_id, filename="config.yaml", revision=revision)
+        model_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin", revision=revision)
+        model = cls.from_hparams(config_path)
+        # SECURITY: the checkpoint comes from a remote repository, and plain torch.load unpickles arbitrary
+        # objects. weights_only=True restricts loading to tensors and plain containers, which is all a
+        # state dict needs. NOTE(review): this argument needs a PyTorch release that supports it.
+        state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
+        if isinstance(model.feature_extractor, EncodecFeatures):
+            # Fill in the EnCodec weights from the freshly built feature extractor so that
+            # load_state_dict finds every key it expects.
+            encodec_parameters = {
+                "feature_extractor.encodec." + key: value
+                for key, value in model.feature_extractor.encodec.state_dict().items()
+            }
+            state_dict.update(encodec_parameters)
+        model.load_state_dict(state_dict)
+        model.eval()
+        return model
+
+    @torch.inference_mode()
+    def forward(self, audio_input: torch.Tensor, **kwargs: Any) -> torch.Tensor:
+        """
+        Method to run a copy-synthesis from audio waveform. The feature extractor first processes the audio input,
+        which is then passed through the backbone and the head to reconstruct the audio output.
+
+        Args:
+            audio_input (Tensor): The input tensor representing the audio waveform of shape (B, T),
+                                  where B is the batch size and T is the waveform length.
+            **kwargs: Forwarded to both the feature extractor and `decode`.
+
+        Returns:
+            Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T).
+        """
+        features = self.feature_extractor(audio_input, **kwargs)
+        audio_output = self.decode(features, **kwargs)
+        return audio_output
+
+    @torch.inference_mode()
+    def decode(self, features_input: torch.Tensor, **kwargs: Any) -> torch.Tensor:
+        """
+        Method to decode audio waveform from already calculated features. The features input is passed through
+        the backbone and the head to reconstruct the audio output.
+
+        Args:
+            features_input (Tensor): The input tensor of features of shape (B, C, L), where B is the batch size,
+                                     C denotes the feature dimension, and L is the sequence length.
+            **kwargs: Forwarded to the backbone.
+
+        Returns:
+            Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T).
+        """
+        x = self.backbone(features_input, **kwargs)
+        audio_output = self.head(x)
+        return audio_output
+
+    @torch.inference_mode()
+    def codes_to_features(self, codes: torch.Tensor) -> torch.Tensor:
+        """
+        Transforms an input sequence of discrete tokens (codes) into feature embeddings using the feature extractor's
+        codebook weights.
+
+        Args:
+            codes (Tensor): The input tensor. Expected shape is (K, L) or (K, B, L),
+                            where K is the number of codebooks, B is the batch size and L is the sequence length.
+
+        Returns:
+            Tensor: Features of shape (B, C, L), where B is the batch size, C denotes the feature dimension,
+                    and L is the sequence length.
+        """
+        assert isinstance(
+            self.feature_extractor, EncodecFeatures
+        ), "Feature extractor should be an instance of EncodecFeatures"
+
+        if codes.dim() == 2:
+            # (K, L) -> (K, 1, L): add the missing batch axis.
+            codes = codes.unsqueeze(1)
+
+        n_bins = self.feature_extractor.encodec.quantizer.bins
+        # Codebook k is shifted by k * n_bins so that its codes index its own slice of `codebook_weights`.
+        offsets = torch.arange(0, n_bins * len(codes), n_bins, device=codes.device)
+        embeddings_idxs = codes + offsets.view(-1, 1, 1)
+        # Sum the embeddings of all codebooks, then move the feature axis in front of the sequence axis.
+        features = torch.nn.functional.embedding(embeddings_idxs, self.feature_extractor.codebook_weights).sum(dim=0)
+        features = features.transpose(1, 2)
+
+        return features
diff --git a/vocos/vocos/spectral_ops.py b/vocos/vocos/spectral_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8eda1c8e18a32406aad40415f6a8bf60eb15fea
--- /dev/null
+++ b/vocos/vocos/spectral_ops.py
@@ -0,0 +1,192 @@
+import numpy as np
+import scipy
+import torch
+from torch import nn, view_as_real, view_as_complex
+
+
+class ISTFT(nn.Module):
+    """
+    Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
+    windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
+    See issue: https://github.com/pytorch/pytorch/issues/62323
+    Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
+    The NOLA constraint is met as we trim padded samples anyway.
+
+    Args:
+        n_fft (int): Size of Fourier transform.
+        hop_length (int): The distance between neighboring sliding window frames.
+        win_length (int): The size of window frame and STFT filter.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+
+    def __init__(self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"):
+        super().__init__()
+        if padding not in ["center", "same"]:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        self.padding = padding
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+        # Registered as a buffer so the Hann window follows the module across devices.
+        self.register_buffer("window", torch.hann_window(win_length))
+
+    def forward(self, spec: torch.Tensor) -> torch.Tensor:
+        """
+        Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.
+
+        Args:
+            spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
+                            N is the number of frequency bins, and T is the number of time frames.
+
+        Returns:
+            Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
+        """
+        if self.padding == "center":
+            # Fallback to pytorch native implementation
+            return torch.istft(spec, self.n_fft, self.hop_length, self.win_length, self.window, center=True)
+        if self.padding != "same":
+            raise ValueError("Padding must be 'center' or 'same'.")
+        trim = (self.win_length - self.hop_length) // 2
+
+        assert spec.dim() == 3, "Expected a 3D tensor as input"
+        _, _, num_frames = spec.shape
+
+        # Back to windowed time-domain frames.
+        frames = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
+        frames = frames * self.window[None, :, None]
+
+        # Overlap-add is expressed as a `fold`; the same geometry is reused for the window envelope.
+        total_len = (num_frames - 1) * self.hop_length + self.win_length
+        fold_args = dict(
+            output_size=(1, total_len), kernel_size=(1, self.win_length), stride=(1, self.hop_length),
+        )
+        signal = torch.nn.functional.fold(frames, **fold_args)[:, 0, 0, trim:-trim]
+
+        # Sum of squared windows at every output sample, trimmed like the signal.
+        squared_window = self.window.square().expand(1, num_frames, -1).transpose(1, 2)
+        envelope = torch.nn.functional.fold(squared_window, **fold_args).squeeze()[trim:-trim]
+
+        # Dividing by the envelope is only valid where it is non-zero.
+        assert (envelope > 1e-11).all()
+        return signal / envelope
+
+
+class MDCT(nn.Module):
+    """
+    Modified Discrete Cosine Transform (MDCT) module.
+
+    Args:
+        frame_len (int): Length of the MDCT frame.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+
+    def __init__(self, frame_len: int, padding: str = "same"):
+        super().__init__()
+        # `scipy.signal.cosine` is a deprecated alias that newer SciPy releases no longer provide, and a bare
+        # `import scipy` does not guarantee that `scipy.signal` is loaded. Import the window from its
+        # canonical location instead.
+        from scipy.signal.windows import cosine as cosine_window
+
+        if padding not in ["center", "same"]:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        self.padding = padding
+        self.frame_len = frame_len
+        N = frame_len // 2
+        n0 = (N + 1) / 2
+        window = torch.from_numpy(cosine_window(frame_len)).float()
+        self.register_buffer("window", window)
+
+        pre_twiddle = torch.exp(-1j * torch.pi * torch.arange(frame_len) / frame_len)
+        post_twiddle = torch.exp(-1j * torch.pi * n0 * (torch.arange(N) + 0.5) / N)
+        # view_as_real: NCCL Backend does not support ComplexFloat data type
+        # https://github.com/pytorch/pytorch/issues/71613
+        self.register_buffer("pre_twiddle", view_as_real(pre_twiddle))
+        self.register_buffer("post_twiddle", view_as_real(post_twiddle))
+
+    def forward(self, audio: torch.Tensor) -> torch.Tensor:
+        """
+        Apply the Modified Discrete Cosine Transform (MDCT) to the input audio.
+
+        Args:
+            audio (Tensor): Input audio waveform of shape (B, T), where B is the batch size
+                and T is the length of the audio.
+
+        Returns:
+            Tensor: MDCT coefficients of shape (B, L, N), where L is the number of output frames
+                and N is the number of frequency bins.
+        """
+        if self.padding == "center":
+            audio = torch.nn.functional.pad(audio, (self.frame_len // 2, self.frame_len // 2))
+        elif self.padding == "same":
+            # hop_length is 1/2 frame_len
+            audio = torch.nn.functional.pad(audio, (self.frame_len // 4, self.frame_len // 4))
+        else:
+            raise ValueError("Padding must be 'center' or 'same'.")
+
+        # Frames of frame_len samples taken every frame_len // 2 samples (50% overlap).
+        x = audio.unfold(-1, self.frame_len, self.frame_len // 2)
+        N = self.frame_len // 2
+        x = x * self.window.expand(x.shape)
+        # The transform is evaluated with one complex FFT: pre-twiddle, FFT, keep the first N bins,
+        # post-twiddle, then take the real part.
+        X = torch.fft.fft(x * view_as_complex(self.pre_twiddle).expand(x.shape), dim=-1)[..., :N]
+        res = X * view_as_complex(self.post_twiddle).expand(X.shape) * np.sqrt(1 / N)
+        return torch.real(res) * np.sqrt(2)
+
+
+class IMDCT(nn.Module):
+    """
+    Inverse Modified Discrete Cosine Transform (IMDCT) module.
+
+    Args:
+        frame_len (int): Length of the MDCT frame.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+
+    def __init__(self, frame_len: int, padding: str = "same"):
+        super().__init__()
+        # `scipy.signal.cosine` is a deprecated alias that newer SciPy releases no longer provide, and a bare
+        # `import scipy` does not guarantee that `scipy.signal` is loaded. Import the window from its
+        # canonical location instead.
+        from scipy.signal.windows import cosine as cosine_window
+
+        if padding not in ["center", "same"]:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        self.padding = padding
+        self.frame_len = frame_len
+        N = frame_len // 2
+        n0 = (N + 1) / 2
+        window = torch.from_numpy(cosine_window(frame_len)).float()
+        self.register_buffer("window", window)
+
+        pre_twiddle = torch.exp(1j * torch.pi * n0 * torch.arange(N * 2) / N)
+        post_twiddle = torch.exp(1j * torch.pi * (torch.arange(N * 2) + n0) / (N * 2))
+        # Stored as real (..., 2) tensors and converted back with view_as_complex in forward().
+        self.register_buffer("pre_twiddle", view_as_real(pre_twiddle))
+        self.register_buffer("post_twiddle", view_as_real(post_twiddle))
+
+    def forward(self, X: torch.Tensor) -> torch.Tensor:
+        """
+        Apply the Inverse Modified Discrete Cosine Transform (IMDCT) to the input MDCT coefficients.
+
+        Args:
+            X (Tensor): Input MDCT coefficients of shape (B, L, N), where B is the batch size,
+                L is the number of frames, and N is the number of frequency bins.
+
+        Returns:
+            Tensor: Reconstructed audio waveform of shape (B, T), where T is the length of the audio.
+        """
+        B, L, N = X.shape
+        # Extend the N coefficients to 2N values: the second half is the negated, reversed first half.
+        Y = torch.zeros((B, L, N * 2), dtype=X.dtype, device=X.device)
+        Y[..., :N] = X
+        Y[..., N:] = -1 * torch.conj(torch.flip(X, dims=(-1,)))
+        y = torch.fft.ifft(Y * view_as_complex(self.pre_twiddle).expand(Y.shape), dim=-1)
+        y = torch.real(y * view_as_complex(self.post_twiddle).expand(y.shape)) * np.sqrt(N) * np.sqrt(2)
+        result = y * self.window.expand(y.shape)
+        # Overlap-add the windowed frames with a hop of frame_len // 2, expressed as a `fold`.
+        output_size = (1, (L + 1) * N)
+        audio = torch.nn.functional.fold(
+            result.transpose(1, 2),
+            output_size=output_size,
+            kernel_size=(1, self.frame_len),
+            stride=(1, self.frame_len // 2),
+        )[:, 0, 0, :]
+
+        if self.padding == "center":
+            pad = self.frame_len // 2
+        elif self.padding == "same":
+            pad = self.frame_len // 4
+        else:
+            raise ValueError("Padding must be 'center' or 'same'.")
+
+        # Drop `pad` samples from each end; the amounts mirror the two padding modes.
+        audio = audio[:, pad:-pad]
+        return audio
diff --git a/vocos/vocos_inference.ipynb b/vocos/vocos_inference.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..dd8ef5acae97ba016cc4589a97880f60d0412d3e
--- /dev/null
+++ b/vocos/vocos_inference.ipynb
@@ -0,0 +1,367 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/ubuntu/vocos\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/ubuntu/miniconda3/envs/respair/lib/python3.11/site-packages/IPython/core/magics/osm.py:417: UserWarning: This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.\n",
+ " self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n"
+ ]
+ }
+ ],
+ "source": [
+ "%cd /home/ubuntu/vocos/"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_2705444/1667309830.py:12: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
+ " raw_model = torch.load(checkpoint_path, map_location=device)\n"
+ ]
+ }
+ ],
+ "source": [
+ "from IPython.display import Audio\n",
+ "import torch\n",
+ "import librosa\n",
+ "import torchaudio\n",
+ "from vocos.pretrained import Vocos\n",
+ "\n",
+ "\n",
+ "\n",
+ "def load_vocos(checkpoint_path, config_path, device):\n",
+ " model = Vocos.from_hparams(config_path).to(device)\n",
+ "\n",
+ " raw_model = torch.load(checkpoint_path, map_location=device)\n",
+ " raw_model = raw_model if 'state_dict' not in raw_model else raw_model['state_dict']\n",
+ " model.load_state_dict(raw_model, strict=False)\n",
+ " model.eval()\n",
+ " return model\n",
+ "\n",
+ "\n",
+ "checkpoint_path = \"/home/ubuntu/vocos/logs/lightning_logs/version_25/checkpoints/last.ckpt\"\n",
+ "config_path = \"/home/ubuntu/vocos/logs/lightning_logs/version_25/config.yaml\"\n",
+ "device = \"cpu\"\n",
+ "\n",
+ "vocos = load_vocos(checkpoint_path, config_path, device)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 201,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_1848695/874175190.py:15: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
+ " mel_tensor = torch.load(\"/home/ubuntu/respair/Darya_AuxiliaryASR/HiFTNet/test.pt\")\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 201,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def safe_log(x: torch.Tensor, clip_val: float = 1e-7) -> torch.Tensor:\n",
+ " \"\"\"\n",
+ " Computes the element-wise logarithm of the input tensor with clipping to avoid near-zero values.\n",
+ "\n",
+ " Args:\n",
+ " x (Tensor): Input tensor.\n",
+ " clip_val (float, optional): Minimum value to clip the input tensor. Defaults to 1e-7.\n",
+ "\n",
+ " Returns:\n",
+ " Tensor: Element-wise logarithm of the input tensor with clipping applied.\n",
+ " \"\"\"\n",
+ " return torch.log(torch.clip(x, min=clip_val))\n",
+ "\n",
+ "\n",
+ "mel_tensor = torch.load(\"/home/ubuntu/respair/Darya_AuxiliaryASR/HiFTNet/test.pt\")\n",
+ "# mel = safe_log(mel)\n",
+ "\n",
+ "mean = -4\n",
+ "std = 4\n",
+ "\n",
+ "# Reverse normalization and logarithmic transform\n",
+ "denormalized = mel_tensor * std + mean\n",
+ "mel_plus_epsilon = torch.exp(denormalized)\n",
+ "original_mel = mel_plus_epsilon - 1e-5\n",
+ "\n",
+ "# Ensure non-negative values (mel spectrograms can't be negative)\n",
+ "mel_tensor = torch.clamp(original_mel, min=0)\n",
+ "\n",
+ "mel_tensor = safe_log(mel_tensor)\n",
+ "\n",
+ "# Original mel: [n_mels, time]\n",
+ "mel_tensor = F.interpolate(\n",
+ " mel_tensor, # passed as-is; no batch/channel dims are added here\n",
+ " scale_factor=.9, # scale the interpolated axis by 0.9 (not a halving)\n",
+ " mode=\"area\" # area mode, i.e. adaptive average pooling (not linear)\n",
+ ")\n",
+ "\n",
+ "audio = vocos.decode(mel_tensor.to('cuda'))\n",
+ "Audio(audio.cpu().numpy(), rate=24000)\n",
+ "\n",
+ "# mel_tensor = (torch.log(1e-5 + original_mel) - -1) / 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_2705444/713572534.py:3: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
+ " x = torch.load(\"/home/ubuntu/ASR_128bin/bigvgan_v2_44khz_128band_512x/gt.pt\")[:1,:,:].to('cpu')\n"
+ ]
+ }
+ ],
+ "source": [
+ "import torch.nn.functional as F\n",
+ "\n",
+ "x = torch.load(\"/home/ubuntu/ASR_128bin/bigvgan_v2_44khz_128band_512x/gt.pt\")[:1,:,:].to('cpu')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "to_mel = torchaudio.transforms.MelSpectrogram(sample_rate=44_100,\n",
+ " n_mels=128, n_fft=2048, win_length=2048, hop_length=512)\n",
+ "mean, std = -4, 4\n",
+ "\n",
+ "def preprocess(wave):\n",
+ " \n",
+ " wave_tensor = torch.from_numpy(wave).float()\n",
+ " mel_tensor = to_mel(wave_tensor)\n",
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
+ " return mel_tensor.to('cpu')\n",
+ "\n",
+ "\n",
+ "wav = librosa.load(\"/home/ubuntu/respair/jpn/miside/voices_combined/LocationDialogue_Location1_25_28.wav\", sr=44_100)[0]\n",
+ "\n",
+ "\n",
+ "mel = preprocess(wav)\n",
+ "\n",
+ "\n",
+ "\n",
+ "audio = vocos.decode(x)\n",
+ "Audio(audio.cpu().numpy(), rate=44_100)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([18, 72192])"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "audio.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "LibsndfileError",
+ "evalue": "Error opening '/home/ubuntu/ASR_128bin/bigvgan_v2_44khz_128band_512x/decoded.wav': Format not recognised.",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mLibsndfileError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[11], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msoundfile\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01msf\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m sf\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/home/ubuntu/ASR_128bin/bigvgan_v2_44khz_128band_512x/decoded.wav\u001b[39m\u001b[38;5;124m\"\u001b[39m, audio, \u001b[38;5;241m44_100\u001b[39m)\n",
+ "File \u001b[0;32m~/miniconda3/envs/respair/lib/python3.11/site-packages/soundfile.py:363\u001b[0m, in \u001b[0;36mwrite\u001b[0;34m(file, data, samplerate, subtype, endian, format, closefd, compression_level, bitrate_mode)\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 362\u001b[0m channels \u001b[38;5;241m=\u001b[39m data\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m--> 363\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m SoundFile(file, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m'\u001b[39m, samplerate, channels,\n\u001b[1;32m 364\u001b[0m subtype, endian, \u001b[38;5;28mformat\u001b[39m, closefd,\n\u001b[1;32m 365\u001b[0m compression_level, bitrate_mode) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 366\u001b[0m f\u001b[38;5;241m.\u001b[39mwrite(data)\n",
+ "File \u001b[0;32m~/miniconda3/envs/respair/lib/python3.11/site-packages/soundfile.py:690\u001b[0m, in \u001b[0;36mSoundFile.__init__\u001b[0;34m(self, file, mode, samplerate, channels, subtype, endian, format, closefd, compression_level, bitrate_mode)\u001b[0m\n\u001b[1;32m 687\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_bitrate_mode \u001b[38;5;241m=\u001b[39m bitrate_mode\n\u001b[1;32m 688\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_info \u001b[38;5;241m=\u001b[39m _create_info_struct(file, mode, samplerate, channels,\n\u001b[1;32m 689\u001b[0m \u001b[38;5;28mformat\u001b[39m, subtype, endian)\n\u001b[0;32m--> 690\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_open(file, mode_int, closefd)\n\u001b[1;32m 691\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mset\u001b[39m(mode)\u001b[38;5;241m.\u001b[39missuperset(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr+\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mseekable():\n\u001b[1;32m 692\u001b[0m \u001b[38;5;66;03m# Move write position to 0 (like in Python file objects)\u001b[39;00m\n\u001b[1;32m 693\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mseek(\u001b[38;5;241m0\u001b[39m)\n",
+ "File \u001b[0;32m~/miniconda3/envs/respair/lib/python3.11/site-packages/soundfile.py:1265\u001b[0m, in \u001b[0;36mSoundFile._open\u001b[0;34m(self, file, mode_int, closefd)\u001b[0m\n\u001b[1;32m 1262\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file_ptr \u001b[38;5;241m==\u001b[39m _ffi\u001b[38;5;241m.\u001b[39mNULL:\n\u001b[1;32m 1263\u001b[0m \u001b[38;5;66;03m# get the actual error code\u001b[39;00m\n\u001b[1;32m 1264\u001b[0m err \u001b[38;5;241m=\u001b[39m _snd\u001b[38;5;241m.\u001b[39msf_error(file_ptr)\n\u001b[0;32m-> 1265\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m LibsndfileError(err, prefix\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError opening \u001b[39m\u001b[38;5;132;01m{0!r}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname))\n\u001b[1;32m 1266\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mode_int \u001b[38;5;241m==\u001b[39m _snd\u001b[38;5;241m.\u001b[39mSFM_WRITE:\n\u001b[1;32m 1267\u001b[0m \u001b[38;5;66;03m# Due to a bug in libsndfile version <= 1.0.25, frames != 0\u001b[39;00m\n\u001b[1;32m 1268\u001b[0m \u001b[38;5;66;03m# when opening a named pipe in SFM_WRITE mode.\u001b[39;00m\n\u001b[1;32m 1269\u001b[0m \u001b[38;5;66;03m# See http://github.com/erikd/libsndfile/issues/77.\u001b[39;00m\n\u001b[1;32m 1270\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_info\u001b[38;5;241m.\u001b[39mframes \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n",
+ "\u001b[0;31mLibsndfileError\u001b[0m: Error opening '/home/ubuntu/ASR_128bin/bigvgan_v2_44khz_128band_512x/decoded.wav': Format not recognised."
+ ]
+ }
+ ],
+ "source": [
+ "import soundfile as sf\n",
+ "\n",
+ "sf.write(\"/home/ubuntu/ASR_128bin/bigvgan_v2_44khz_128band_512x/decoded.wav\", audio, 44_100)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Audio(wav, rate=24_000)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 118,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[[-2.0122, -1.7062, -0.6355, ..., -1.2065, -0.7076, -1.2003],\n",
+ " [-2.0825, -1.8459, -0.6099, ..., -1.1121, -1.1247, -1.2791],\n",
+ " [-1.8935, -1.7773, -0.6214, ..., -1.2913, -0.8913, -0.9125],\n",
+ " ...,\n",
+ " [-1.8168, -1.8461, -1.7281, ..., -1.5361, -1.5047, -1.5924],\n",
+ " [-1.8209, -1.8349, -1.7215, ..., -1.6026, -1.5498, -1.5136],\n",
+ " [-1.7768, -1.8042, -1.7079, ..., -1.7335, -1.5213, -1.5378]]])"
+ ]
+ },
+ "execution_count": 118,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mel"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "respair",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}