aryo100 committed
Commit 2b7fad2 · 1 Parent(s): 152a06c

first commit

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.

Files changed (50)
  1. .gitignore +186 -0
  2. .vscode/settings.json +7 -0
  3. LICENSE +21 -0
  4. README-ja.md +54 -0
  5. README.md +52 -8
  6. bin/.gitignore +2 -0
  7. configs/32k-768.json +47 -0
  8. configs/32k.json +47 -0
  9. configs/40k-768.json +47 -0
  10. configs/40k.json +47 -0
  11. configs/48k-768.json +47 -0
  12. configs/48k.json +47 -0
  13. dev.py +3 -0
  14. launch.py +139 -0
  15. lib/rvc/attentions.py +415 -0
  16. lib/rvc/checkpoints.py +149 -0
  17. lib/rvc/commons.py +163 -0
  18. lib/rvc/config.py +71 -0
  19. lib/rvc/data_utils.py +515 -0
  20. lib/rvc/losses.py +58 -0
  21. lib/rvc/mel_processing.py +113 -0
  22. lib/rvc/models.py +853 -0
  23. lib/rvc/modules.py +518 -0
  24. lib/rvc/pipeline.py +453 -0
  25. lib/rvc/preprocessing/extract_f0.py +221 -0
  26. lib/rvc/preprocessing/extract_feature.py +217 -0
  27. lib/rvc/preprocessing/slicer.py +179 -0
  28. lib/rvc/preprocessing/split.py +195 -0
  29. lib/rvc/train.py +998 -0
  30. lib/rvc/transforms.py +207 -0
  31. lib/rvc/utils.py +225 -0
  32. models/checkpoints/.gitignore +2 -0
  33. models/embeddings/.gitignore +2 -0
  34. models/pretrained/.gitignore +2 -0
  35. models/training/.gitignore +6 -0
  36. models/training/models/.gitignore +2 -0
  37. models/training/mute/0_gt_wavs/mute32k.wav +3 -0
  38. models/training/mute/0_gt_wavs/mute40k.wav +3 -0
  39. models/training/mute/0_gt_wavs/mute48k.wav +3 -0
  40. models/training/mute/1_16k_wavs/mute.wav +3 -0
  41. models/training/mute/2a_f0/mute.wav.npy +3 -0
  42. models/training/mute/2b_f0nsf/mute.wav.npy +3 -0
  43. models/training/mute/3_feature256/mute.npy +3 -0
  44. modules/cmd_opts.py +22 -0
  45. modules/core.py +156 -0
  46. modules/merge.py +81 -0
  47. modules/models.py +266 -0
  48. modules/separate.py +82 -0
  49. modules/server/model.py +451 -0
  50. modules/shared.py +44 -0
.gitignore ADDED
@@ -0,0 +1,186 @@
1
+ .DS_Store
2
+
3
+ tmp/
4
+
5
+
6
+ ### Generated by gibo (https://github.com/simonwhitaker/gibo)
7
+ ### https://raw.github.com/github/gitignore/4488915eec0b3a45b5c63ead28f286819c0917de/Global/VisualStudioCode.gitignore
8
+
9
+ .vscode/*
10
+ !.vscode/settings.json
11
+ !.vscode/tasks.json
12
+ !.vscode/launch.json
13
+ !.vscode/extensions.json
14
+ !.vscode/*.code-snippets
15
+
16
+ # Local History for Visual Studio Code
17
+ .history/
18
+
19
+ # Built Visual Studio Code Extensions
20
+ *.vsix
21
+
22
+
23
+ ### https://raw.github.com/github/gitignore/4488915eec0b3a45b5c63ead28f286819c0917de/Python.gitignore
24
+
25
+ # Byte-compiled / optimized / DLL files
26
+ __pycache__/
27
+ *.py[cod]
28
+ *$py.class
29
+
30
+ # C extensions
31
+ *.so
32
+
33
+ # Distribution / packaging
34
+ .Python
35
+ build/
36
+ develop-eggs/
37
+ dist/
38
+ downloads/
39
+ eggs/
40
+ .eggs/
41
+ # lib/
42
+ lib64/
43
+ parts/
44
+ sdist/
45
+ var/
46
+ wheels/
47
+ share/python-wheels/
48
+ *.egg-info/
49
+ .installed.cfg
50
+ *.egg
51
+ MANIFEST
52
+
53
+ # PyInstaller
54
+ # Usually these files are written by a python script from a template
55
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
56
+ *.manifest
57
+ *.spec
58
+
59
+ # Installer logs
60
+ pip-log.txt
61
+ pip-delete-this-directory.txt
62
+
63
+ # Unit test / coverage reports
64
+ htmlcov/
65
+ .tox/
66
+ .nox/
67
+ .coverage
68
+ .coverage.*
69
+ .cache
70
+ nosetests.xml
71
+ coverage.xml
72
+ *.cover
73
+ *.py,cover
74
+ .hypothesis/
75
+ .pytest_cache/
76
+ cover/
77
+
78
+ # Translations
79
+ *.mo
80
+ *.pot
81
+
82
+ # Django stuff:
83
+ *.log
84
+ local_settings.py
85
+ db.sqlite3
86
+ db.sqlite3-journal
87
+
88
+ # Flask stuff:
89
+ instance/
90
+ .webassets-cache
91
+
92
+ # Scrapy stuff:
93
+ .scrapy
94
+
95
+ # Sphinx documentation
96
+ docs/_build/
97
+
98
+ # PyBuilder
99
+ .pybuilder/
100
+ target/
101
+
102
+ # Jupyter Notebook
103
+ .ipynb_checkpoints
104
+
105
+ # IPython
106
+ profile_default/
107
+ ipython_config.py
108
+
109
+ # pyenv
110
+ # For a library or package, you might want to ignore these files since the code is
111
+ # intended to run in multiple environments; otherwise, check them in:
112
+ # .python-version
113
+
114
+ # pipenv
115
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
116
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
117
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
118
+ # install all needed dependencies.
119
+ #Pipfile.lock
120
+
121
+ # poetry
122
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
123
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
124
+ # commonly ignored for libraries.
125
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
126
+ #poetry.lock
127
+
128
+ # pdm
129
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
130
+ #pdm.lock
131
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
132
+ # in version control.
133
+ # https://pdm.fming.dev/#use-with-ide
134
+ .pdm.toml
135
+
136
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
137
+ __pypackages__/
138
+
139
+ # Celery stuff
140
+ celerybeat-schedule
141
+ celerybeat.pid
142
+
143
+ # SageMath parsed files
144
+ *.sage.py
145
+
146
+ # Environments
147
+ .env
148
+ .venv
149
+ env/
150
+ venv/
151
+ ENV/
152
+ env.bak/
153
+ venv.bak/
154
+
155
+ # Spyder project settings
156
+ .spyderproject
157
+ .spyproject
158
+
159
+ # Rope project settings
160
+ .ropeproject
161
+
162
+ # mkdocs documentation
163
+ /site
164
+
165
+ # mypy
166
+ .mypy_cache/
167
+ .dmypy.json
168
+ dmypy.json
169
+
170
+ # Pyre type checker
171
+ .pyre/
172
+
173
+ # pytype static type analyzer
174
+ .pytype/
175
+
176
+ # Cython debug symbols
177
+ cython_debug/
178
+
179
+ # PyCharm
180
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
181
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
182
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
183
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
184
+ #.idea/
185
+
186
+
.vscode/settings.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "python.formatting.provider": "black",
3
+ "editor.codeActionsOnSave": {
4
+ "source.organizeImports": true
5
+ },
6
+ "editor.formatOnSave": true,
7
+ }
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 ddPn08
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README-ja.md ADDED
@@ -0,0 +1,54 @@
1
+ <h1 align="center">RVC-WebUI</h1>
2
+ <div align="center">
3
+ <p>
4
+
5
+ [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI) の再構築プロジェクト
6
+
7
+ </p>
8
+ </div>
9
+
10
+ ---
11
+
12
+ <div align="center">
13
+ <p>
14
+
15
+ [日本語](README-ja.md) | [English](README.md)
16
+
17
+ </p>
18
+ </div>
19
+
20
+ <br >
21
+
22
+ # 起動
23
+
24
+ ## Windows
25
+ `webui-user.bat` をダブルクリックして、webuiを起動します。
26
+
27
+ ## Linux or Mac
28
+ `webui.sh` を実行して、webuiを起動します。
29
+
30
+ <br >
31
+
32
+ ```
33
+ テスト環境: Windows 10, Python 3.10.9, torch 2.0.0+cu118
34
+ ```
35
+
36
+ <br >
37
+
38
+ # トラブルシューティング
39
+
40
+ ## `error: Microsoft Visual C++ 14.0 or greater is required.`
41
+
42
+ Microsoft C++ Build Tools がインストールされている必要があります。
43
+
44
+ ### Step 1: インストーラーをダウンロード
45
+ [Download](https://visualstudio.microsoft.com/ja/thank-you-downloading-visual-studio/?sku=BuildTools&rel=16)
46
+
47
+ ### Step 2: `C++ Build Tools` をインストール
48
+ インストーラーを実行し、`Workloads` タブで `C++ Build Tools` を選択します。
49
+
50
+ <br >
51
+
52
+ # クレジット
53
+ - [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI)
54
+ - [`teftef6220/Voice_Separation_and_Selection`](https://github.com/teftef6220/Voice_Separation_and_Selection)
README.md CHANGED
@@ -1,10 +1,54 @@
1
  ---
2
- title: Rvc Webui
3
- emoji: 😻
4
- colorFrom: purple
5
- colorTo: pink
6
- sdk: docker
7
- pinned: false
8
- ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ <h1 align="center">RVC-WebUI</h1>
2
+ <div align="center">
3
+ <p>
4
+
5
+ [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI) reconstruction project
6
+
7
+ </p>
8
+ </div>
9
+
10
  ---
11
 
12
+ <div align="center">
13
+ <p>
14
+
15
+ [日本語](README-ja.md) | [English](README.md)
16
+
17
+ </p>
18
+ </div>
19
+
20
+ <br >
21
+
22
+ # Launch
23
+
24
+ ## Windows
25
+ Double click `webui-user.bat` to start the webui.
26
+
27
+ ## Linux or Mac
28
+ Run `webui.sh` to start the webui.
29
+
30
+ <br >
31
+
32
+ ```
33
+ Tested environment: Windows 10, Python 3.10.9, torch 2.0.0+cu118
34
+ ```
35
+
36
+ <br >
37
+
38
+ # Troubleshooting
39
+
40
+ ## `error: Microsoft Visual C++ 14.0 or greater is required.`
41
+
42
+ Microsoft C++ Build Tools must be installed.
43
+
44
+ ### Step 1: Download the installer
45
+ [Download](https://visualstudio.microsoft.com/ja/thank-you-downloading-visual-studio/?sku=BuildTools&rel=16)
46
+
47
+ ### Step 2: Install `C++ Build Tools`
48
+ Run the installer and select `C++ Build Tools` in the `Workloads` tab.
49
+
50
+ <br >
51
+
52
+ # Credits
53
+ - [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI)
54
+ - [`teftef6220/Voice_Separation_and_Selection`](https://github.com/teftef6220/Voice_Separation_and_Selection)
bin/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ *
2
+ !.gitignore
configs/32k-768.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 32000,
21
+ "filter_length": 1024,
22
+ "hop_length": 320,
23
+ "win_length": 1024,
24
+ "n_mel_channels": 80,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,4,2,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "emb_channels": 768,
45
+ "spk_embed_dim": 109
46
+ }
47
+ }
configs/32k.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 32000,
21
+ "filter_length": 1024,
22
+ "hop_length": 320,
23
+ "win_length": 1024,
24
+ "n_mel_channels": 80,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,4,2,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "emb_channels": 256,
45
+ "spk_embed_dim": 109
46
+ }
47
+ }
configs/40k-768.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 40000,
21
+ "filter_length": 2048,
22
+ "hop_length": 400,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 125,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,10,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "emb_channels": 768,
45
+ "spk_embed_dim": 109
46
+ }
47
+ }
configs/40k.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 40000,
21
+ "filter_length": 2048,
22
+ "hop_length": 400,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 125,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,10,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "emb_channels": 256,
45
+ "spk_embed_dim": 109
46
+ }
47
+ }
configs/48k-768.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 11520,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 48000,
21
+ "filter_length": 2048,
22
+ "hop_length": 480,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 128,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,6,2,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "emb_channels": 768,
45
+ "spk_embed_dim": 109
46
+ }
47
+ }
configs/48k.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 11520,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 48000,
21
+ "filter_length": 2048,
22
+ "hop_length": 480,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 128,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,6,2,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "emb_channels": 256,
45
+ "spk_embed_dim": 109
46
+ }
47
+ }
dev.py ADDED
@@ -0,0 +1,3 @@
1
+ import modules.ui as ui
2
+
3
+ demo = ui.create_ui()
launch.py ADDED
@@ -0,0 +1,139 @@
1
+ import importlib.util
2
+ import os
3
+ import shlex
4
+ import subprocess
5
+ import sys
6
+
7
+ commandline_args = os.environ.get("COMMANDLINE_ARGS", "")
8
+ sys.argv += shlex.split(commandline_args)
9
+
10
+ python = sys.executable
11
+ git = os.environ.get("GIT", "git")
12
+ index_url = os.environ.get("INDEX_URL", "")
13
+ stored_commit_hash = None
14
+ skip_install = False
15
+
16
+
17
+ def run(command, desc=None, errdesc=None, custom_env=None):
18
+ if desc is not None:
19
+ print(desc)
20
+
21
+ result = subprocess.run(
22
+ command,
23
+ stdout=subprocess.PIPE,
24
+ stderr=subprocess.PIPE,
25
+ shell=True,
26
+ env=os.environ if custom_env is None else custom_env,
27
+ )
28
+
29
+ if result.returncode != 0:
30
+ message = f"""{errdesc or 'Error running command'}.
31
+ Command: {command}
32
+ Error code: {result.returncode}
33
+ stdout: {result.stdout.decode(encoding="utf8", errors="ignore") if len(result.stdout)>0 else '<empty>'}
34
+ stderr: {result.stderr.decode(encoding="utf8", errors="ignore") if len(result.stderr)>0 else '<empty>'}
35
+ """
36
+ raise RuntimeError(message)
37
+
38
+ return result.stdout.decode(encoding="utf8", errors="ignore")
39
+
40
+
41
+ def check_run(command):
42
+ result = subprocess.run(
43
+ command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
44
+ )
45
+ return result.returncode == 0
46
+
47
+
48
+ def is_installed(package):
49
+ try:
50
+ spec = importlib.util.find_spec(package)
51
+ except ModuleNotFoundError:
52
+ return False
53
+
54
+ return spec is not None
55
+
56
+
57
+ def commit_hash():
58
+ global stored_commit_hash
59
+
60
+ if stored_commit_hash is not None:
61
+ return stored_commit_hash
62
+
63
+ try:
64
+ stored_commit_hash = run(f"{git} rev-parse HEAD").strip()
65
+ except Exception:
66
+ stored_commit_hash = "<none>"
67
+
68
+ return stored_commit_hash
69
+
70
+
71
+ def run_pip(args, desc=None):
72
+ if skip_install:
73
+ return
74
+
75
+ index_url_line = f" --index-url {index_url}" if index_url != "" else ""
76
+ return run(
77
+ f'"{python}" -m pip {args} --prefer-binary{index_url_line}',
78
+ desc=f"Installing {desc}",
79
+ errdesc=f"Couldn't install {desc}",
80
+ )
81
+
82
+
83
+ def run_python(code, desc=None, errdesc=None):
84
+ return run(f'"{python}" -c "{code}"', desc, errdesc)
85
+
86
+
87
+ def extract_arg(args, name):
88
+ return [x for x in args if x != name], name in args
89
+
90
+
91
+ def prepare_environment():
92
+ commit = commit_hash()
93
+
94
+ print(f"Python {sys.version}")
95
+ print(f"Commit hash: {commit}")
96
+
97
+ torch_command = os.environ.get(
98
+ "TORCH_COMMAND",
99
+ "pip install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118",
100
+ )
101
+
102
+ sys.argv, skip_install = extract_arg(sys.argv, "--skip-install")
103
+ if skip_install:
104
+ return
105
+
106
+ sys.argv, reinstall_torch = extract_arg(sys.argv, "--reinstall-torch")
107
+ ngrok = "--ngrok" in sys.argv
108
+
109
+ if reinstall_torch or not is_installed("torch") or not is_installed("torchaudio"):
110
+ run(
111
+ f'"{python}" -m {torch_command}',
112
+ "Installing torch and torchaudio",
113
+ "Couldn't install torch",
114
+ )
115
+
116
+ if not is_installed("pyngrok") and ngrok:
117
+ run_pip("install pyngrok", "ngrok")
118
+
119
+ run(
120
+ f'"{python}" -m pip install -r requirements.txt',
121
+ desc=f"Installing requirements",
122
+ errdesc=f"Couldn't install requirements",
123
+ )
124
+
125
+
126
+ def start():
127
+ os.environ["PATH"] = (
128
+ os.path.join(os.path.dirname(__file__), "bin")
129
+ + os.pathsep
130
+ + os.environ.get("PATH", "")
131
+ )
132
+ subprocess.run(
133
+ [python, "webui.py", *sys.argv[1:]],
134
+ )
135
+
136
+
137
+ if __name__ == "__main__":
138
+ prepare_environment()
139
+ start()
lib/rvc/attentions.py ADDED
@@ -0,0 +1,415 @@
1
+ import math
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+ from . import commons
8
+ from .modules import LayerNorm
9
+
10
+
11
+ class Encoder(nn.Module):
12
+ def __init__(
13
+ self,
14
+ hidden_channels,
15
+ filter_channels,
16
+ n_heads,
17
+ n_layers,
18
+ kernel_size=1,
19
+ p_dropout=0.0,
20
+ window_size=10,
21
+ **kwargs
22
+ ):
23
+ super().__init__()
24
+ self.hidden_channels = hidden_channels
25
+ self.filter_channels = filter_channels
26
+ self.n_heads = n_heads
27
+ self.n_layers = n_layers
28
+ self.kernel_size = kernel_size
29
+ self.p_dropout = p_dropout
30
+ self.window_size = window_size
31
+
32
+ self.drop = nn.Dropout(p_dropout)
33
+ self.attn_layers = nn.ModuleList()
34
+ self.norm_layers_1 = nn.ModuleList()
35
+ self.ffn_layers = nn.ModuleList()
36
+ self.norm_layers_2 = nn.ModuleList()
37
+ for i in range(self.n_layers):
38
+ self.attn_layers.append(
39
+ MultiHeadAttention(
40
+ hidden_channels,
41
+ hidden_channels,
42
+ n_heads,
43
+ p_dropout=p_dropout,
44
+ window_size=window_size,
45
+ )
46
+ )
47
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
48
+ self.ffn_layers.append(
49
+ FFN(
50
+ hidden_channels,
51
+ hidden_channels,
52
+ filter_channels,
53
+ kernel_size,
54
+ p_dropout=p_dropout,
55
+ )
56
+ )
57
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
58
+
59
+ def forward(self, x, x_mask):
60
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
61
+ x = x * x_mask
62
+ for i in range(self.n_layers):
63
+ y = self.attn_layers[i](x, x, attn_mask)
64
+ y = self.drop(y)
65
+ x = self.norm_layers_1[i](x + y)
66
+
67
+ y = self.ffn_layers[i](x, x_mask)
68
+ y = self.drop(y)
69
+ x = self.norm_layers_2[i](x + y)
70
+ x = x * x_mask
71
+ return x
72
+
73
+
74
+ class Decoder(nn.Module):
75
+ def __init__(
76
+ self,
77
+ hidden_channels,
78
+ filter_channels,
79
+ n_heads,
80
+ n_layers,
81
+ kernel_size=1,
82
+ p_dropout=0.0,
83
+ proximal_bias=False,
84
+ proximal_init=True,
85
+ **kwargs
86
+ ):
87
+ super().__init__()
88
+ self.hidden_channels = hidden_channels
89
+ self.filter_channels = filter_channels
90
+ self.n_heads = n_heads
91
+ self.n_layers = n_layers
92
+ self.kernel_size = kernel_size
93
+ self.p_dropout = p_dropout
94
+ self.proximal_bias = proximal_bias
95
+ self.proximal_init = proximal_init
96
+
97
+ self.drop = nn.Dropout(p_dropout)
98
+ self.self_attn_layers = nn.ModuleList()
99
+ self.norm_layers_0 = nn.ModuleList()
100
+ self.encdec_attn_layers = nn.ModuleList()
101
+ self.norm_layers_1 = nn.ModuleList()
102
+ self.ffn_layers = nn.ModuleList()
103
+ self.norm_layers_2 = nn.ModuleList()
104
+ for i in range(self.n_layers):
105
+ self.self_attn_layers.append(
106
+ MultiHeadAttention(
107
+ hidden_channels,
108
+ hidden_channels,
109
+ n_heads,
110
+ p_dropout=p_dropout,
111
+ proximal_bias=proximal_bias,
112
+ proximal_init=proximal_init,
113
+ )
114
+ )
115
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
116
+ self.encdec_attn_layers.append(
117
+ MultiHeadAttention(
118
+ hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
119
+ )
120
+ )
121
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
122
+ self.ffn_layers.append(
123
+ FFN(
124
+ hidden_channels,
125
+ hidden_channels,
126
+ filter_channels,
127
+ kernel_size,
128
+ p_dropout=p_dropout,
129
+ causal=True,
130
+ )
131
+ )
132
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
133
+
134
+ def forward(self, x, x_mask, h, h_mask):
135
+ """
136
+ x: decoder input
137
+ h: encoder output
138
+ """
139
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
140
+ device=x.device, dtype=x.dtype
141
+ )
142
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
143
+ x = x * x_mask
144
+ for i in range(self.n_layers):
145
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
146
+ y = self.drop(y)
147
+ x = self.norm_layers_0[i](x + y)
148
+
149
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
150
+ y = self.drop(y)
151
+ x = self.norm_layers_1[i](x + y)
152
+
153
+ y = self.ffn_layers[i](x, x_mask)
154
+ y = self.drop(y)
155
+ x = self.norm_layers_2[i](x + y)
156
+ x = x * x_mask
157
+ return x
158
+
159
+
160
+ class MultiHeadAttention(nn.Module):
161
+ def __init__(
162
+ self,
163
+ channels,
164
+ out_channels,
165
+ n_heads,
166
+ p_dropout=0.0,
167
+ window_size=None,
168
+ heads_share=True,
169
+ block_length=None,
170
+ proximal_bias=False,
171
+ proximal_init=False,
172
+ ):
173
+ super().__init__()
174
+ assert channels % n_heads == 0
175
+
176
+ self.channels = channels
177
+ self.out_channels = out_channels
178
+ self.n_heads = n_heads
179
+ self.p_dropout = p_dropout
180
+ self.window_size = window_size
181
+ self.heads_share = heads_share
182
+ self.block_length = block_length
183
+ self.proximal_bias = proximal_bias
184
+ self.proximal_init = proximal_init
185
+ self.attn = None
186
+
187
+ self.k_channels = channels // n_heads
188
+ self.conv_q = nn.Conv1d(channels, channels, 1)
189
+ self.conv_k = nn.Conv1d(channels, channels, 1)
190
+ self.conv_v = nn.Conv1d(channels, channels, 1)
191
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
192
+ self.drop = nn.Dropout(p_dropout)
193
+
194
+ if window_size is not None:
195
+ n_heads_rel = 1 if heads_share else n_heads
196
+ rel_stddev = self.k_channels**-0.5
197
+ self.emb_rel_k = nn.Parameter(
198
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
199
+ * rel_stddev
200
+ )
201
+ self.emb_rel_v = nn.Parameter(
202
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
203
+ * rel_stddev
204
+ )
205
+
206
+ nn.init.xavier_uniform_(self.conv_q.weight)
207
+ nn.init.xavier_uniform_(self.conv_k.weight)
208
+ nn.init.xavier_uniform_(self.conv_v.weight)
209
+ if proximal_init:
210
+ with torch.no_grad():
211
+ self.conv_k.weight.copy_(self.conv_q.weight)
212
+ self.conv_k.bias.copy_(self.conv_q.bias)
213
+
214
+ def forward(self, x, c, attn_mask=None):
215
+ q = self.conv_q(x)
216
+ k = self.conv_k(c)
217
+ v = self.conv_v(c)
218
+
219
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
220
+
221
+ x = self.conv_o(x)
222
+ return x
223
+
224
+ def attention(self, query, key, value, mask=None):
225
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
226
+ b, d, t_s, t_t = (*key.size(), query.size(2))
227
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
228
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
229
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
230
+
231
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
232
+ if self.window_size is not None:
233
+ assert (
234
+ t_s == t_t
235
+ ), "Relative attention is only available for self-attention."
236
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
237
+ rel_logits = self._matmul_with_relative_keys(
238
+ query / math.sqrt(self.k_channels), key_relative_embeddings
239
+ )
240
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
241
+ scores = scores + scores_local
242
+ if self.proximal_bias:
243
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
244
+ scores = scores + self._attention_bias_proximal(t_s).to(
245
+ device=scores.device, dtype=scores.dtype
246
+ )
247
+ if mask is not None:
248
+ scores = scores.masked_fill(mask == 0, -1e4)
249
+ if self.block_length is not None:
250
+ assert (
251
+ t_s == t_t
252
+ ), "Local attention is only available for self-attention."
253
+ block_mask = (
254
+ torch.ones_like(scores)
255
+ .triu(-self.block_length)
256
+ .tril(self.block_length)
257
+ )
258
+ scores = scores.masked_fill(block_mask == 0, -1e4)
259
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
260
+ p_attn = self.drop(p_attn)
261
+ output = torch.matmul(p_attn, value)
262
+ if self.window_size is not None:
263
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
264
+ value_relative_embeddings = self._get_relative_embeddings(
265
+ self.emb_rel_v, t_s
266
+ )
267
+ output = output + self._matmul_with_relative_values(
268
+ relative_weights, value_relative_embeddings
269
+ )
270
+ output = (
271
+ output.transpose(2, 3).contiguous().view(b, d, t_t)
272
+ ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
273
+ return output, p_attn
274
+
275
+ def _matmul_with_relative_values(self, x, y):
276
+ """
277
+ x: [b, h, l, m]
278
+ y: [h or 1, m, d]
279
+ ret: [b, h, l, d]
280
+ """
281
+ ret = torch.matmul(x, y.unsqueeze(0))
282
+ return ret
283
+
284
+ def _matmul_with_relative_keys(self, x, y):
285
+ """
286
+ x: [b, h, l, d]
287
+ y: [h or 1, m, d]
288
+ ret: [b, h, l, m]
289
+ """
290
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
291
+ return ret
292
+
293
+ def _get_relative_embeddings(self, relative_embeddings, length):
294
+ max_relative_position = 2 * self.window_size + 1
295
+ # Pad first before slice to avoid using cond ops.
296
+ pad_length = max(length - (self.window_size + 1), 0)
297
+ slice_start_position = max((self.window_size + 1) - length, 0)
298
+ slice_end_position = slice_start_position + 2 * length - 1
299
+ if pad_length > 0:
300
+ padded_relative_embeddings = F.pad(
301
+ relative_embeddings,
302
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
303
+ )
304
+ else:
305
+ padded_relative_embeddings = relative_embeddings
306
+ used_relative_embeddings = padded_relative_embeddings[
307
+ :, slice_start_position:slice_end_position
308
+ ]
309
+ return used_relative_embeddings
310
+
311
+ def _relative_position_to_absolute_position(self, x):
312
+ """
313
+ x: [b, h, l, 2*l-1]
314
+ ret: [b, h, l, l]
315
+ """
316
+ batch, heads, length, _ = x.size()
317
+ # Concat columns of pad to shift from relative to absolute indexing.
318
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
319
+
320
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
321
+ x_flat = x.view([batch, heads, length * 2 * length])
322
+ x_flat = F.pad(
323
+ x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
324
+ )
325
+
326
+ # Reshape and slice out the padded elements.
327
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
328
+ :, :, :length, length - 1 :
329
+ ]
330
+ return x_final
331
+
332
+ def _absolute_position_to_relative_position(self, x):
333
+ """
334
+ x: [b, h, l, l]
335
+ ret: [b, h, l, 2*l-1]
336
+ """
337
+ batch, heads, length, _ = x.size()
338
+ # padd along column
339
+ x = F.pad(
340
+ x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
341
+ )
342
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
343
+ # add 0's in the beginning that will skew the elements after reshape
344
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
345
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
346
+ return x_final
347
+
348
+ def _attention_bias_proximal(self, length):
349
+ """Bias for self-attention to encourage attention to close positions.
350
+ Args:
351
+ length: an integer scalar.
352
+ Returns:
353
+ a Tensor with shape [1, 1, length, length]
354
+ """
355
+ r = torch.arange(length, dtype=torch.float32)
356
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
357
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
358
+
359
+
360
+ class FFN(nn.Module):
361
+ def __init__(
362
+ self,
363
+ in_channels,
364
+ out_channels,
365
+ filter_channels,
366
+ kernel_size,
367
+ p_dropout=0.0,
368
+ activation=None,
369
+ causal=False,
370
+ ):
371
+ super().__init__()
372
+ self.in_channels = in_channels
373
+ self.out_channels = out_channels
374
+ self.filter_channels = filter_channels
375
+ self.kernel_size = kernel_size
376
+ self.p_dropout = p_dropout
377
+ self.activation = activation
378
+ self.causal = causal
379
+
380
+ if causal:
381
+ self.padding = self._causal_padding
382
+ else:
383
+ self.padding = self._same_padding
384
+
385
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
386
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
387
+ self.drop = nn.Dropout(p_dropout)
388
+
389
+ def forward(self, x, x_mask):
390
+ x = self.conv_1(self.padding(x * x_mask))
391
+ if self.activation == "gelu":
392
+ x = x * torch.sigmoid(1.702 * x)
393
+ else:
394
+ x = torch.relu(x)
395
+ x = self.drop(x)
396
+ x = self.conv_2(self.padding(x * x_mask))
397
+ return x * x_mask
398
+
399
+ def _causal_padding(self, x):
400
+ if self.kernel_size == 1:
401
+ return x
402
+ pad_l = self.kernel_size - 1
403
+ pad_r = 0
404
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
405
+ x = F.pad(x, commons.convert_pad_shape(padding))
406
+ return x
407
+
408
+ def _same_padding(self, x):
409
+ if self.kernel_size == 1:
410
+ return x
411
+ pad_l = (self.kernel_size - 1) // 2
412
+ pad_r = self.kernel_size // 2
413
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
414
+ x = F.pad(x, commons.convert_pad_shape(padding))
415
+ return x
lib/rvc/checkpoints.py ADDED
@@ -0,0 +1,149 @@
1
+ import os
2
+ from collections import OrderedDict
3
+ from typing import *
4
+
5
+ import torch
6
+
7
+
8
+ def write_config(state_dict: Dict[str, Any], cfg: Dict[str, Any]):
9
+ state_dict["config"] = []
10
+ for key, x in cfg.items():
11
+ state_dict["config"].append(x)
12
+ state_dict["params"] = cfg
13
+
14
+
15
+ def create_trained_model(
16
+ weights: Dict[str, Any],
17
+ version: Literal["v1", "v2"],
18
+ sr: str,
19
+ f0: bool,
20
+ emb_name: str,
21
+ emb_ch: int,
22
+ emb_output_layer: int,
23
+ epoch: int,
24
+ speaker_info: Optional[dict[str, int]]
25
+ ):
26
+ state_dict = OrderedDict()
27
+ state_dict["weight"] = {}
28
+ for key in weights.keys():
29
+ if "enc_q" in key:
30
+ continue
31
+ state_dict["weight"][key] = weights[key].half()
32
+ if sr == "40k":
33
+ write_config(
34
+ state_dict,
35
+ {
36
+ "spec_channels": 1025,
37
+ "segment_size": 32,
38
+ "inter_channels": 192,
39
+ "hidden_channels": 192,
40
+ "filter_channels": 768,
41
+ "n_heads": 2,
42
+ "n_layers": 6,
43
+ "kernel_size": 3,
44
+ "p_dropout": 0,
45
+ "resblock": "1",
46
+ "resblock_kernel_sizes": [3, 7, 11],
47
+ "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
48
+ "upsample_rates": [10, 10, 2, 2],
49
+ "upsample_initial_channel": 512,
50
+ "upsample_kernel_sizes": [16, 16, 4, 4],
51
+ "spk_embed_dim": 109 if speaker_info is None else len(speaker_info),
52
+ "gin_channels": 256,
53
+ "emb_channels": emb_ch,
54
+ "sr": 40000,
55
+ },
56
+ )
57
+ elif sr == "48k":
58
+ write_config(
59
+ state_dict,
60
+ {
61
+ "spec_channels": 1025,
62
+ "segment_size": 32,
63
+ "inter_channels": 192,
64
+ "hidden_channels": 192,
65
+ "filter_channels": 768,
66
+ "n_heads": 2,
67
+ "n_layers": 6,
68
+ "kernel_size": 3,
69
+ "p_dropout": 0,
70
+ "resblock": "1",
71
+ "resblock_kernel_sizes": [3, 7, 11],
72
+ "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
73
+ "upsample_rates": [10, 6, 2, 2, 2],
74
+ "upsample_initial_channel": 512,
75
+ "upsample_kernel_sizes": [16, 16, 4, 4, 4],
76
+ "spk_embed_dim": 109 if speaker_info is None else len(speaker_info),
77
+ "gin_channels": 256,
78
+ "emb_channels": emb_ch,
79
+ "sr": 48000,
80
+ },
81
+ )
82
+ elif sr == "32k":
83
+ write_config(
84
+ state_dict,
85
+ {
86
+ "spec_channels": 513,
87
+ "segment_size": 32,
88
+ "inter_channels": 192,
89
+ "hidden_channels": 192,
90
+ "filter_channels": 768,
91
+ "n_heads": 2,
92
+ "n_layers": 6,
93
+ "kernel_size": 3,
94
+ "p_dropout": 0,
95
+ "resblock": "1",
96
+ "resblock_kernel_sizes": [3, 7, 11],
97
+ "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
98
+ "upsample_rates": [10, 4, 2, 2, 2],
99
+ "upsample_initial_channel": 512,
100
+ "upsample_kernel_sizes": [16, 16, 4, 4, 4],
101
+ "spk_embed_dim": 109 if speaker_info is None else len(speaker_info),
102
+ "gin_channels": 256,
103
+ "emb_channels": emb_ch,
104
+ "sr": 32000,
105
+ },
106
+ )
107
+ state_dict["version"] = version
108
+ state_dict["info"] = f"{epoch}epoch"
109
+ state_dict["sr"] = sr
110
+ state_dict["f0"] = 1 if f0 else 0
111
+ state_dict["embedder_name"] = emb_name
112
+ state_dict["embedder_output_layer"] = emb_output_layer
113
+ if not speaker_info is None:
114
+ state_dict["speaker_info"] = {str(v): str(k) for k, v in speaker_info.items()}
115
+ return state_dict
116
+
117
+
118
+ def save(
119
+ model,
120
+ version: Literal["v1", "v2"],
121
+ sr: str,
122
+ f0: bool,
123
+ emb_name: str,
124
+ emb_ch: int,
125
+ emb_output_layer: int,
126
+ filepath: str,
127
+ epoch: int,
128
+ speaker_info: Optional[dict[str, int]]
129
+ ):
130
+ if hasattr(model, "module"):
131
+ state_dict = model.module.state_dict()
132
+ else:
133
+ state_dict = model.state_dict()
134
+
135
+ print(f"save: emb_name: {emb_name} {emb_ch}")
136
+
137
+ state_dict = create_trained_model(
138
+ state_dict,
139
+ version,
140
+ sr,
141
+ f0,
142
+ emb_name,
143
+ emb_ch,
144
+ emb_output_layer,
145
+ epoch,
146
+ speaker_info
147
+ )
148
+ os.makedirs(os.path.dirname(filepath), exist_ok=True)
149
+ torch.save(state_dict, filepath)
lib/rvc/commons.py ADDED
@@ -0,0 +1,163 @@
1
+ import math
2
+
3
+ import torch
4
+ from torch.nn import functional as F
5
+
6
+
7
+ def init_weights(m, mean=0.0, std=0.01):
8
+ classname = m.__class__.__name__
9
+ if classname.find("Conv") != -1:
10
+ m.weight.data.normal_(mean, std)
11
+
12
+
13
+ def get_padding(kernel_size, dilation=1):
14
+ return int((kernel_size * dilation - dilation) / 2)
15
+
16
+
17
+ def convert_pad_shape(pad_shape):
18
+ l = pad_shape[::-1]
19
+ pad_shape = [item for sublist in l for item in sublist]
20
+ return pad_shape
21
+
22
+
23
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
24
+ """KL(P||Q)"""
25
+ kl = (logs_q - logs_p) - 0.5
26
+ kl += (
27
+ 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
28
+ )
29
+ return kl
30
+
31
+
32
+ def rand_gumbel(shape):
33
+ """Sample from the Gumbel distribution, protect from overflows."""
34
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
35
+ return -torch.log(-torch.log(uniform_samples))
36
+
37
+
38
+ def rand_gumbel_like(x):
39
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
40
+ return g
41
+
42
+
43
+ def slice_segments(x, ids_str, segment_size=4):
44
+ ret = torch.zeros_like(x[:, :, :segment_size])
45
+ for i in range(x.size(0)):
46
+ idx_str = ids_str[i]
47
+ idx_end = idx_str + segment_size
48
+ ret[i] = x[i, :, idx_str:idx_end]
49
+ return ret
50
+
51
+
52
+ def slice_segments2(x, ids_str, segment_size=4):
53
+ ret = torch.zeros_like(x[:, :segment_size])
54
+ for i in range(x.size(0)):
55
+ idx_str = ids_str[i]
56
+ idx_end = idx_str + segment_size
57
+ ret[i] = x[i, idx_str:idx_end]
58
+ return ret
59
+
60
+
61
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
62
+ b, d, t = x.size()
63
+ if x_lengths is None:
64
+ x_lengths = t
65
+ ids_str_max = x_lengths - segment_size + 1
66
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
67
+ ret = slice_segments(x, ids_str, segment_size)
68
+ return ret, ids_str
69
+
70
+
71
+ def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
72
+ position = torch.arange(length, dtype=torch.float)
73
+ num_timescales = channels // 2
74
+ log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
75
+ num_timescales - 1
76
+ )
77
+ inv_timescales = min_timescale * torch.exp(
78
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
79
+ )
80
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
81
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
82
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
83
+ signal = signal.view(1, channels, length)
84
+ return signal
85
+
86
+
87
+ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
88
+ b, channels, length = x.size()
89
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
90
+ return x + signal.to(dtype=x.dtype, device=x.device)
91
+
92
+
93
+ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
94
+ b, channels, length = x.size()
95
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
96
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
97
+
98
+
99
+ def subsequent_mask(length):
100
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
101
+ return mask
102
+
103
+
104
+ @torch.jit.script
105
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
106
+ n_channels_int = n_channels[0]
107
+ in_act = input_a + input_b
108
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
109
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
110
+ acts = t_act * s_act
111
+ return acts
112
+
113
+
114
+ def convert_pad_shape(pad_shape):
115
+ l = pad_shape[::-1]
116
+ pad_shape = [item for sublist in l for item in sublist]
117
+ return pad_shape
118
+
119
+
120
+ def shift_1d(x):
121
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
122
+ return x
123
+
124
+
125
+ def sequence_mask(length, max_length=None):
126
+ if max_length is None:
127
+ max_length = length.max()
128
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
129
+ return x.unsqueeze(0) < length.unsqueeze(1)
130
+
131
+
132
+ def generate_path(duration, mask):
133
+ """
134
+ duration: [b, 1, t_x]
135
+ mask: [b, 1, t_y, t_x]
136
+ """
137
+ b, _, t_y, t_x = mask.shape
138
+ cum_duration = torch.cumsum(duration, -1)
139
+
140
+ cum_duration_flat = cum_duration.view(b * t_x)
141
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
142
+ path = path.view(b, t_x, t_y)
143
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
144
+ path = path.unsqueeze(1).transpose(2, 3) * mask
145
+ return path
146
+
147
+
148
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
149
+ if isinstance(parameters, torch.Tensor):
150
+ parameters = [parameters]
151
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
152
+ norm_type = float(norm_type)
153
+ if clip_value is not None:
154
+ clip_value = float(clip_value)
155
+
156
+ total_norm = 0
157
+ for p in parameters:
158
+ param_norm = p.grad.data.norm(norm_type)
159
+ total_norm += param_norm.item() ** norm_type
160
+ if clip_value is not None:
161
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
162
+ total_norm = total_norm ** (1.0 / norm_type)
163
+ return total_norm
lib/rvc/config.py ADDED
@@ -0,0 +1,71 @@
1
+ from typing import *
2
+
3
+ from pydantic import BaseModel
4
+
5
+
6
+ class TrainConfigTrain(BaseModel):
7
+ log_interval: int
8
+ seed: int
9
+ epochs: int
10
+ learning_rate: float
11
+ betas: List[float]
12
+ eps: float
13
+ batch_size: int
14
+ fp16_run: bool
15
+ lr_decay: float
16
+ segment_size: int
17
+ init_lr_ratio: int
18
+ warmup_epochs: int
19
+ c_mel: int
20
+ c_kl: float
21
+
22
+
23
+ class TrainConfigData(BaseModel):
24
+ max_wav_value: float
25
+ sampling_rate: int
26
+ filter_length: int
27
+ hop_length: int
28
+ win_length: int
29
+ n_mel_channels: int
30
+ mel_fmin: float
31
+ mel_fmax: Any
32
+
33
+
34
+ class TrainConfigModel(BaseModel):
35
+ inter_channels: int
36
+ hidden_channels: int
37
+ filter_channels: int
38
+ n_heads: int
39
+ n_layers: int
40
+ kernel_size: int
41
+ p_dropout: int
42
+ resblock: str
43
+ resblock_kernel_sizes: List[int]
44
+ resblock_dilation_sizes: List[List[int]]
45
+ upsample_rates: List[int]
46
+ upsample_initial_channel: int
47
+ upsample_kernel_sizes: List[int]
48
+ use_spectral_norm: bool
49
+ gin_channels: int
50
+ emb_channels: int
51
+ spk_embed_dim: int
52
+
53
+
54
+ class TrainConfig(BaseModel):
55
+ version: Literal["v1", "v2"] = "v2"
56
+ train: TrainConfigTrain
57
+ data: TrainConfigData
58
+ model: TrainConfigModel
59
+
60
+
61
+ class DatasetMetaItem(BaseModel):
62
+ gt_wav: str
63
+ co256: str
64
+ f0: Optional[str]
65
+ f0nsf: Optional[str]
66
+ speaker_id: int
67
+
68
+
69
+ class DatasetMetadata(BaseModel):
70
+ files: Dict[str, DatasetMetaItem]
71
+ # mute: DatasetMetaItem
lib/rvc/data_utils.py ADDED
@@ -0,0 +1,515 @@
1
+ import os
2
+ import traceback
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.utils.data
7
+
8
+ from .config import DatasetMetadata, DatasetMetaItem, TrainConfigData
9
+ from .mel_processing import spectrogram_torch
10
+ from .utils import load_wav_to_torch
11
+
12
+
13
+ class TextAudioLoader(torch.utils.data.Dataset):
14
+ """
15
+ 1) loads audio, text pairs
16
+ 2) normalizes text and converts them to sequences of integers
17
+ 3) computes spectrograms from audio files.
18
+ """
19
+
20
+ def __init__(self, dataset_meta: DatasetMetadata, data: TrainConfigData):
21
+ self.dataset_meta = dataset_meta
22
+ self.max_wav_value = data.max_wav_value
23
+ self.sampling_rate = data.sampling_rate
24
+ self.filter_length = data.filter_length
25
+ self.hop_length = data.hop_length
26
+ self.win_length = data.win_length
27
+ self.sampling_rate = data.sampling_rate
28
+ self.min_text_len = getattr(data, "min_text_len", 1)
29
+ self.max_text_len = getattr(data, "max_text_len", 5000)
30
+ self._filter()
31
+
32
+ def _filter(self):
33
+ """
34
+ Filter text & store spec lengths
35
+ """
36
+ # Store spectrogram lengths for Bucketing
37
+ # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
38
+ # spec_length = wav_length // hop_length
39
+ lengths = []
40
+ for key, data in self.dataset_meta.files.items():
41
+ if (
42
+ self.min_text_len <= len(data.co256)
43
+ and len(data.co256) <= self.max_text_len
44
+ ):
45
+ lengths.append(os.path.getsize(data.gt_wav) // (2 * self.hop_length))
46
+ else:
47
+ del self.dataset_meta.files[key]
48
+ self.lengths = lengths
49
+
50
+ def get_sid(self, sid):
51
+ sid = torch.LongTensor([int(sid)])
52
+ return sid
53
+
54
+ def get_audio_text_pair(self, data: DatasetMetaItem):
55
+ # separate filename and text
56
+ file = data.gt_wav
57
+ phone = data.co256
58
+ dv = data.speaker_id
59
+
60
+ phone = self.get_labels(phone)
61
+ spec, wav = self.get_audio(file)
62
+ dv = self.get_sid(dv)
63
+
64
+ len_phone = phone.size()[0]
65
+ len_spec = spec.size()[-1]
66
+ if len_phone != len_spec:
67
+ len_min = min(len_phone, len_spec)
68
+ len_wav = len_min * self.hop_length
69
+ spec = spec[:, :len_min]
70
+ wav = wav[:, :len_wav]
71
+ phone = phone[:len_min, :]
72
+ return (spec, wav, phone, dv)
73
+
74
+ def get_labels(self, phone):
75
+ phone = np.load(phone)
76
+ phone = np.repeat(phone, 2, axis=0)
77
+ n_num = min(phone.shape[0], 900) # DistributedBucketSampler
78
+ phone = phone[:n_num, :]
79
+ phone = torch.FloatTensor(phone)
80
+ return phone
81
+
82
+ def get_audio(self, filename):
83
+ audio, sampling_rate = load_wav_to_torch(filename)
84
+ if sampling_rate != self.sampling_rate:
85
+ raise ValueError(
86
+ "{} SR doesn't match target {} SR".format(
87
+ sampling_rate, self.sampling_rate
88
+ )
89
+ )
90
+ # audio_norm = audio / self.max_wav_value
91
+ audio_norm = audio.unsqueeze(0)
92
+ spec_filename = filename.replace(".wav", ".spec.pt")
93
+ if os.path.exists(spec_filename):
94
+ try:
95
+ spec = torch.load(spec_filename)
96
+ except:
97
+ print(spec_filename, traceback.format_exc())
98
+ spec = spectrogram_torch(
99
+ audio_norm,
100
+ self.filter_length,
101
+ self.sampling_rate,
102
+ self.hop_length,
103
+ self.win_length,
104
+ center=False,
105
+ )
106
+ spec = torch.squeeze(spec, 0)
107
+ torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
108
+ else:
109
+ spec = spectrogram_torch(
110
+ audio_norm,
111
+ self.filter_length,
112
+ self.sampling_rate,
113
+ self.hop_length,
114
+ self.win_length,
115
+ center=False,
116
+ )
117
+ spec = torch.squeeze(spec, 0)
118
+ torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
119
+ return spec, audio_norm
120
+
121
+ def __getitem__(self, index):
122
+ _, data = list(self.dataset_meta.files.items())[index]
123
+ return self.get_audio_text_pair(data)
124
+
125
+ def __len__(self):
126
+ return len(self.dataset_meta.files)
127
+
128
+
129
+ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
130
+ """
131
+ 1) loads audio, text pairs
132
+ 2) normalizes text and converts them to sequences of integers
133
+ 3) computes spectrograms from audio files.
134
+ """
135
+
136
+ def __init__(self, dataset_meta: DatasetMetadata, data: TrainConfigData):
137
+ self.dataset_meta = dataset_meta
138
+ self.max_wav_value = data.max_wav_value
139
+ self.sampling_rate = data.sampling_rate
140
+ self.filter_length = data.filter_length
141
+ self.hop_length = data.hop_length
142
+ self.win_length = data.win_length
143
+ self.sampling_rate = data.sampling_rate
144
+ self.min_text_len = getattr(data, "min_text_len", 1)
145
+ self.max_text_len = getattr(data, "max_text_len", 5000)
146
+ self._filter()
147
+
148
+ def _filter(self):
149
+ """
150
+ Filter text & store spec lengths
151
+ """
152
+ # Store spectrogram lengths for Bucketing
153
+ # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
154
+ # spec_length = wav_length // hop_length
155
+ lengths = []
156
+ for key, data in self.dataset_meta.files.items():
157
+ if (
158
+ self.min_text_len <= len(data.co256)
159
+ and len(data.co256) <= self.max_text_len
160
+ ):
161
+ lengths.append(os.path.getsize(data.gt_wav) // (2 * self.hop_length))
162
+ else:
163
+ del self.dataset_meta.files[key]
164
+ self.lengths = lengths
165
+
166
+ def get_sid(self, sid):
167
+ sid = torch.LongTensor([int(sid)])
168
+ return sid
169
+
170
+ def get_audio_text_pair(self, data: DatasetMetaItem):
171
+ # separate filename and text
172
+ file = data.gt_wav
173
+ phone = data.co256
174
+ pitch = data.f0
175
+ pitchf = data.f0nsf
176
+ dv = data.speaker_id
177
+
178
+ phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf)
179
+ spec, wav = self.get_audio(file)
180
+ dv = self.get_sid(dv)
181
+
182
+ len_phone = phone.size()[0]
183
+ len_spec = spec.size()[-1]
184
+ # print(123,phone.shape,pitch.shape,spec.shape)
185
+ if len_phone != len_spec:
186
+ len_min = min(len_phone, len_spec)
187
+ # amor
188
+ len_wav = len_min * self.hop_length
189
+
190
+ spec = spec[:, :len_min]
191
+ wav = wav[:, :len_wav]
192
+
193
+ phone = phone[:len_min, :]
194
+ pitch = pitch[:len_min]
195
+ pitchf = pitchf[:len_min]
196
+
197
+ return (spec, wav, phone, pitch, pitchf, dv)
198
+
199
+ def get_labels(self, phone, pitch, pitchf):
200
+ phone = np.load(phone)
201
+ phone = np.repeat(phone, 2, axis=0)
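+ # the 2x repeat aligns the content-feature frame rate with the f0 / spectrogram hop grid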
202
+ pitch = np.load(pitch)
203
+ pitchf = np.load(pitchf)
204
+ n_num = min(phone.shape[0], 900) # DistributedBucketSampler
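+ # the 900-frame cap keeps items within the bucket range handled by DistributedBucketSampler below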
205
+ # print(234,phone.shape,pitch.shape)
206
+ phone = phone[:n_num, :]
207
+ pitch = pitch[:n_num]
208
+ pitchf = pitchf[:n_num]
209
+ phone = torch.FloatTensor(phone)
210
+ pitch = torch.LongTensor(pitch)
211
+ pitchf = torch.FloatTensor(pitchf)
212
+ return phone, pitch, pitchf
213
+
214
+ def get_audio(self, filename):
215
+ audio, sampling_rate = load_wav_to_torch(filename)
216
+ if sampling_rate != self.sampling_rate:
217
+ raise ValueError(
218
+ "{} SR doesn't match target {} SR".format(
219
+ sampling_rate, self.sampling_rate
220
+ )
221
+ )
222
+ # audio_norm = audio / self.max_wav_value
223
+ audio_norm = audio.unsqueeze(0)
224
+ spec_filename = filename.replace(".wav", ".spec.pt")
225
+ if os.path.exists(spec_filename):
226
+ try:
227
+ spec = torch.load(spec_filename)
228
+ except Exception:
229
+ print(spec_filename, traceback.format_exc())
230
+ spec = spectrogram_torch(
231
+ audio_norm,
232
+ self.filter_length,
233
+ self.sampling_rate,
234
+ self.hop_length,
235
+ self.win_length,
236
+ center=False,
237
+ )
238
+ spec = torch.squeeze(spec, 0)
239
+ torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
240
+ else:
241
+ spec = spectrogram_torch(
242
+ audio_norm,
243
+ self.filter_length,
244
+ self.sampling_rate,
245
+ self.hop_length,
246
+ self.win_length,
247
+ center=False,
248
+ )
249
+ spec = torch.squeeze(spec, 0)
250
+ torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
251
+ return spec, audio_norm
252
+
253
+ def __getitem__(self, index):
254
+ _, data = list(self.dataset_meta.files.items())[index]
255
+ return self.get_audio_text_pair(data)
256
+
257
+ def __len__(self):
258
+ return len(self.dataset_meta.files)
259
+
260
+
261
+ class TextAudioCollateMultiNSFsid:
262
+ """Zero-pads model inputs and targets"""
263
+
264
+ def __init__(self, return_ids=False):
265
+ self.return_ids = return_ids
266
+
267
+ def __call__(self, batch):
268
+ """Collate's training batch from normalized text and aduio
269
+ PARAMS
270
+ ------
271
+ batch: [text_normalized, spec_normalized, wav_normalized]
272
+ """
273
+ # Right zero-pad all one-hot text sequences to max input length
274
+ _, ids_sorted_decreasing = torch.sort(
275
+ torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True
276
+ )
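+ # the padded tensors below are filled longest-spectrogram-first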
277
+
278
+ max_spec_len = max([x[0].size(1) for x in batch])
279
+ max_wave_len = max([x[1].size(1) for x in batch])
280
+ spec_lengths = torch.LongTensor(len(batch))
281
+ wave_lengths = torch.LongTensor(len(batch))
282
+ spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len)
283
+ wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len)
284
+ spec_padded.zero_()
285
+ wave_padded.zero_()
286
+
287
+ max_phone_len = max([x[2].size(0) for x in batch])
288
+ phone_lengths = torch.LongTensor(len(batch))
289
+ phone_padded = torch.FloatTensor(
290
+ len(batch), max_phone_len, batch[0][2].shape[1]
291
+ ) # (spec, wav, phone, pitch)
292
+ pitch_padded = torch.LongTensor(len(batch), max_phone_len)
293
+ pitchf_padded = torch.FloatTensor(len(batch), max_phone_len)
294
+ phone_padded.zero_()
295
+ pitch_padded.zero_()
296
+ pitchf_padded.zero_()
297
+ # dv = torch.FloatTensor(len(batch), 256)#gin=256
298
+ sid = torch.LongTensor(len(batch))
299
+
300
+ for i in range(len(ids_sorted_decreasing)):
301
+ row = batch[ids_sorted_decreasing[i]]
302
+
303
+ spec = row[0]
304
+ spec_padded[i, :, : spec.size(1)] = spec
305
+ spec_lengths[i] = spec.size(1)
306
+
307
+ wave = row[1]
308
+ wave_padded[i, :, : wave.size(1)] = wave
309
+ wave_lengths[i] = wave.size(1)
310
+
311
+ phone = row[2]
312
+ phone_padded[i, : phone.size(0), :] = phone
313
+ phone_lengths[i] = phone.size(0)
314
+
315
+ pitch = row[3]
316
+ pitch_padded[i, : pitch.size(0)] = pitch
317
+ pitchf = row[4]
318
+ pitchf_padded[i, : pitchf.size(0)] = pitchf
319
+
320
+ # dv[i] = row[5]
321
+ sid[i] = row[5]
322
+
323
+ return (
324
+ phone_padded,
325
+ phone_lengths,
326
+ pitch_padded,
327
+ pitchf_padded,
328
+ spec_padded,
329
+ spec_lengths,
330
+ wave_padded,
331
+ wave_lengths,
332
+ # dv
333
+ sid,
334
+ )
335
+
336
+
337
+ class TextAudioCollate:
338
+ """Zero-pads model inputs and targets"""
339
+
340
+ def __init__(self, return_ids=False):
341
+ self.return_ids = return_ids
342
+
343
+ def __call__(self, batch):
344
+ """Collate's training batch from normalized text and aduio
345
+ PARAMS
346
+ ------
347
+ batch: [text_normalized, spec_normalized, wav_normalized]
348
+ """
349
+ # Right zero-pad all one-hot text sequences to max input length
350
+ _, ids_sorted_decreasing = torch.sort(
351
+ torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True
352
+ )
353
+
354
+ max_spec_len = max([x[0].size(1) for x in batch])
355
+ max_wave_len = max([x[1].size(1) for x in batch])
356
+ spec_lengths = torch.LongTensor(len(batch))
357
+ wave_lengths = torch.LongTensor(len(batch))
358
+ spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len)
359
+ wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len)
360
+ spec_padded.zero_()
361
+ wave_padded.zero_()
362
+
363
+ max_phone_len = max([x[2].size(0) for x in batch])
364
+ phone_lengths = torch.LongTensor(len(batch))
365
+ phone_padded = torch.FloatTensor(
366
+ len(batch), max_phone_len, batch[0][2].shape[1]
367
+ )
368
+ phone_padded.zero_()
369
+ sid = torch.LongTensor(len(batch))
370
+
371
+ for i in range(len(ids_sorted_decreasing)):
372
+ row = batch[ids_sorted_decreasing[i]]
373
+
374
+ spec = row[0]
375
+ spec_padded[i, :, : spec.size(1)] = spec
376
+ spec_lengths[i] = spec.size(1)
377
+
378
+ wave = row[1]
379
+ wave_padded[i, :, : wave.size(1)] = wave
380
+ wave_lengths[i] = wave.size(1)
381
+
382
+ phone = row[2]
383
+ phone_padded[i, : phone.size(0), :] = phone
384
+ phone_lengths[i] = phone.size(0)
385
+
386
+ sid[i] = row[3]
387
+
388
+ return (
389
+ phone_padded,
390
+ phone_lengths,
391
+ spec_padded,
392
+ spec_lengths,
393
+ wave_padded,
394
+ wave_lengths,
395
+ sid,
396
+ )
397
+
398
+
399
+ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
400
+ """
401
+ Maintain similar input lengths in a batch.
402
+ Length groups are specified by boundaries.
403
+ Ex) boundaries = [b1, b2, b3] -> any batch is included in either {x | b1 < length(x) <= b2} or {x | b2 < length(x) <= b3}.
404
+
405
+ It removes samples which are not included in the boundaries.
406
+ Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
407
+ """
408
+
409
+ def __init__(
410
+ self,
411
+ dataset,
412
+ batch_size,
413
+ boundaries,
414
+ num_replicas=None,
415
+ rank=None,
416
+ shuffle=True,
417
+ ):
418
+ super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
419
+ self.lengths = dataset.lengths
420
+ self.batch_size = batch_size
421
+ self.boundaries = boundaries
422
+
423
+ self.buckets, self.num_samples_per_bucket = self._create_buckets()
424
+ self.total_size = sum(self.num_samples_per_bucket)
425
+ self.num_samples = self.total_size // self.num_replicas
426
+
427
+ def _create_buckets(self):
428
+ buckets = [[] for _ in range(len(self.boundaries) - 1)]
429
+ for i in range(len(self.lengths)):
430
+ length = self.lengths[i]
431
+ idx_bucket = self._bisect(length)
432
+ if idx_bucket != -1:
433
+ buckets[idx_bucket].append(i)
434
+
435
+ for i in range(len(buckets) - 1, -1, -1): #
436
+ if len(buckets[i]) == 0:
437
+ buckets.pop(i)
438
+ self.boundaries.pop(i + 1)
439
+
440
+ num_samples_per_bucket = []
441
+ for i in range(len(buckets)):
442
+ len_bucket = len(buckets[i])
443
+ total_batch_size = self.num_replicas * self.batch_size
444
+ rem = (
445
+ total_batch_size - (len_bucket % total_batch_size)
446
+ ) % total_batch_size
447
+ num_samples_per_bucket.append(len_bucket + rem)
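+ # each bucket is padded up to a multiple of num_replicas * batch_size so every rank draws full batches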
448
+ return buckets, num_samples_per_bucket
449
+
450
+ def __iter__(self):
451
+ # deterministically shuffle based on epoch
452
+ g = torch.Generator()
453
+ g.manual_seed(self.epoch)
454
+
455
+ indices = []
456
+ if self.shuffle:
457
+ for bucket in self.buckets:
458
+ indices.append(torch.randperm(len(bucket), generator=g).tolist())
459
+ else:
460
+ for bucket in self.buckets:
461
+ indices.append(list(range(len(bucket))))
462
+
463
+ batches = []
464
+ for i in range(len(self.buckets)):
465
+ bucket = self.buckets[i]
466
+ len_bucket = len(bucket)
467
+ ids_bucket = indices[i]
468
+ num_samples_bucket = self.num_samples_per_bucket[i]
469
+
470
+ # add extra samples to make it evenly divisible
471
+ rem = num_samples_bucket - len_bucket
472
+ ids_bucket = (
473
+ ids_bucket
474
+ + ids_bucket * (rem // len_bucket)
475
+ + ids_bucket[: (rem % len_bucket)]
476
+ )
477
+
478
+ # subsample
479
+ ids_bucket = ids_bucket[self.rank :: self.num_replicas]
480
+
481
+ # batching
482
+ for j in range(len(ids_bucket) // self.batch_size):
483
+ batch = [
484
+ bucket[idx]
485
+ for idx in ids_bucket[
486
+ j * self.batch_size : (j + 1) * self.batch_size
487
+ ]
488
+ ]
489
+ batches.append(batch)
490
+
491
+ if self.shuffle:
492
+ batch_ids = torch.randperm(len(batches), generator=g).tolist()
493
+ batches = [batches[i] for i in batch_ids]
494
+ self.batches = batches
495
+
496
+ assert len(self.batches) * self.batch_size == self.num_samples
497
+ return iter(self.batches)
498
+
499
+ def _bisect(self, x, lo=0, hi=None):
500
+ if hi is None:
501
+ hi = len(self.boundaries) - 1
502
+
503
+ if hi > lo:
504
+ mid = (hi + lo) // 2
505
+ if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
506
+ return mid
507
+ elif x <= self.boundaries[mid]:
508
+ return self._bisect(x, lo, mid)
509
+ else:
510
+ return self._bisect(x, mid + 1, hi)
511
+ else:
512
+ return -1
513
+
514
+ def __len__(self):
515
+ return self.num_samples // self.batch_size
lib/rvc/losses.py ADDED
@@ -0,0 +1,58 @@
1
+ import torch
2
+
3
+
4
+ def feature_loss(fmap_r, fmap_g):
5
+ loss = 0
6
+ for dr, dg in zip(fmap_r, fmap_g):
7
+ for rl, gl in zip(dr, dg):
8
+ rl = rl.float().detach()
9
+ gl = gl.float()
10
+ loss += torch.mean(torch.abs(rl - gl))
11
+
12
+ return loss * 2
13
+
14
+
15
+ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
16
+ loss = 0
17
+ r_losses = []
18
+ g_losses = []
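+ # least-squares GAN loss: real outputs are pushed toward 1, generated outputs toward 0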
19
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
20
+ dr = dr.float()
21
+ dg = dg.float()
22
+ r_loss = torch.mean((1 - dr) ** 2)
23
+ g_loss = torch.mean(dg**2)
24
+ loss += r_loss + g_loss
25
+ r_losses.append(r_loss.item())
26
+ g_losses.append(g_loss.item())
27
+
28
+ return loss, r_losses, g_losses
29
+
30
+
31
+ def generator_loss(disc_outputs):
32
+ loss = 0
33
+ gen_losses = []
34
+ for dg in disc_outputs:
35
+ dg = dg.float()
36
+ l = torch.mean((1 - dg) ** 2)
37
+ gen_losses.append(l)
38
+ loss += l
39
+
40
+ return loss, gen_losses
41
+
42
+
43
+ def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
44
+ """
45
+ z_p, logs_q: [b, h, t_t]
46
+ m_p, logs_p: [b, h, t_t]
47
+ """
48
+ z_p = z_p.float()
49
+ logs_q = logs_q.float()
50
+ m_p = m_p.float()
51
+ logs_p = logs_p.float()
52
+ z_mask = z_mask.float()
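+ # KL(q(z|x) || p(z|c)) between diagonal Gaussians, estimated with the sampled z_p and averaged over unmasked frames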
53
+
54
+ kl = logs_p - logs_q - 0.5
55
+ kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
56
+ kl = torch.sum(kl * z_mask)
57
+ l = kl / torch.sum(z_mask)
58
+ return l
lib/rvc/mel_processing.py ADDED
@@ -0,0 +1,113 @@
1
+ import torch
2
+ import torch.utils.data
3
+ from librosa.filters import mel as librosa_mel_fn
4
+
5
+ MAX_WAV_VALUE = 32768.0
6
+
7
+
8
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
9
+ """
10
+ PARAMS
11
+ ------
12
+ C: compression factor
13
+ """
14
+ return torch.log(torch.clamp(x, min=clip_val) * C)
15
+
16
+
17
+ def dynamic_range_decompression_torch(x, C=1):
18
+ """
19
+ PARAMS
20
+ ------
21
+ C: compression factor used to compress
22
+ """
23
+ return torch.exp(x) / C
24
+
25
+
26
+ def spectral_normalize_torch(magnitudes):
27
+ return dynamic_range_compression_torch(magnitudes)
28
+
29
+
30
+ def spectral_de_normalize_torch(magnitudes):
31
+ return dynamic_range_decompression_torch(magnitudes)
32
+
33
+
34
+ mel_basis = {}
35
+ hann_window = {}
36
+
37
+
38
+ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
39
+ if torch.min(y) < -1.07:
40
+ print("min value is ", torch.min(y))
41
+ if torch.max(y) > 1.07:
42
+ print("max value is ", torch.max(y))
43
+
44
+ global hann_window
45
+ dtype_device = str(y.dtype) + "_" + str(y.device)
46
+ wnsize_dtype_device = str(win_size) + "_" + dtype_device
47
+ if wnsize_dtype_device not in hann_window:
48
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
49
+ dtype=y.dtype, device=y.device
50
+ )
51
+
52
+ y = torch.nn.functional.pad(
53
+ y.unsqueeze(1),
54
+ (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
55
+ mode="reflect",
56
+ )
57
+ y = y.squeeze(1)
58
+
59
+ # mps does not support torch.stft.
60
+ if y.device.type == "mps":
61
+ i = y.cpu()
62
+ win = hann_window[wnsize_dtype_device].cpu()
63
+ else:
64
+ i = y
65
+ win = hann_window[wnsize_dtype_device]
66
+ spec = torch.stft(
67
+ i,
68
+ n_fft,
69
+ hop_length=hop_size,
70
+ win_length=win_size,
71
+ window=win,
72
+ center=center,
73
+ pad_mode="reflect",
74
+ normalized=False,
75
+ onesided=True,
76
+ return_complex=False,
77
+ ).to(device=y.device)
78
+
79
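+ # magnitude from the stacked (real, imag) STFT output; the small epsilon keeps the sqrt gradient finite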
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
80
+ return spec
81
+
82
+
83
+ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
84
+ global mel_basis
85
+ dtype_device = str(spec.dtype) + "_" + str(spec.device)
86
+ fmax_dtype_device = str(fmax) + "_" + dtype_device
87
+ if fmax_dtype_device not in mel_basis:
88
+ mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
89
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
90
+ dtype=spec.dtype, device=spec.device
91
+ )
92
+ melspec = torch.matmul(mel_basis[fmax_dtype_device], spec)
93
+ melspec = spectral_normalize_torch(melspec)
94
+ return melspec
95
+
96
+
97
+ def mel_spectrogram_torch(
98
+ y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
99
+ ):
100
+ """Convert waveform into Mel-frequency Log-amplitude spectrogram.
101
+
102
+ Args:
103
+ y :: (B, T) - Waveforms
104
+ Returns:
105
+ melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram
106
+ """
107
+ # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame)
108
+ spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center)
109
+
110
+ # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame)
111
+ melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax)
112
+
113
+ return melspec
lib/rvc/models.py ADDED
@@ -0,0 +1,853 @@
1
+ import math
2
+
3
+ import numpy as np
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import Conv1d, Conv2d, ConvTranspose1d
7
+ from torch.nn import functional as F
8
+ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
9
+
10
+ from . import attentions, commons, modules
11
+ from .commons import get_padding, init_weights
12
+
13
+
14
+ class TextEncoder(nn.Module):
15
+ def __init__(
16
+ self,
17
+ out_channels: int,
18
+ hidden_channels: int,
19
+ filter_channels: int,
20
+ emb_channels: int,
21
+ n_heads: int,
22
+ n_layers: int,
23
+ kernel_size: int,
24
+ p_dropout: int,
25
+ f0: bool = True,
26
+ ):
27
+ super().__init__()
28
+ self.out_channels = out_channels
29
+ self.hidden_channels = hidden_channels
30
+ self.filter_channels = filter_channels
31
+ self.emb_channels = emb_channels
32
+ self.n_heads = n_heads
33
+ self.n_layers = n_layers
34
+ self.kernel_size = kernel_size
35
+ self.p_dropout = p_dropout
36
+ self.emb_phone = nn.Linear(emb_channels, hidden_channels)
37
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
38
+ if f0:
39
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
40
+ self.encoder = attentions.Encoder(
41
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
42
+ )
43
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
44
+
45
+ def forward(self, phone, pitch, lengths):
46
+ if pitch is None:
47
+ x = self.emb_phone(phone)
48
+ else:
49
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
50
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
51
+ x = self.lrelu(x)
52
+ x = torch.transpose(x, 1, -1) # [b, h, t]
53
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
54
+ x.dtype
55
+ )
56
+ x = self.encoder(x * x_mask, x_mask)
57
+ stats = self.proj(x) * x_mask
58
+
59
+ m, logs = torch.split(stats, self.out_channels, dim=1)
60
+ return m, logs, x_mask
61
+
62
+
63
+ class ResidualCouplingBlock(nn.Module):
64
+ def __init__(
65
+ self,
66
+ channels,
67
+ hidden_channels,
68
+ kernel_size,
69
+ dilation_rate,
70
+ n_layers,
71
+ n_flows=4,
72
+ gin_channels=0,
73
+ ):
74
+ super().__init__()
75
+ self.channels = channels
76
+ self.hidden_channels = hidden_channels
77
+ self.kernel_size = kernel_size
78
+ self.dilation_rate = dilation_rate
79
+ self.n_layers = n_layers
80
+ self.n_flows = n_flows
81
+ self.gin_channels = gin_channels
82
+
83
+ self.flows = nn.ModuleList()
84
+ for i in range(n_flows):
85
+ self.flows.append(
86
+ modules.ResidualCouplingLayer(
87
+ channels,
88
+ hidden_channels,
89
+ kernel_size,
90
+ dilation_rate,
91
+ n_layers,
92
+ gin_channels=gin_channels,
93
+ mean_only=True,
94
+ )
95
+ )
96
+ self.flows.append(modules.Flip())
97
+
98
+ def forward(self, x, x_mask, g=None, reverse=False):
99
+ if not reverse:
100
+ for flow in self.flows:
101
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
102
+ else:
103
+ for flow in reversed(self.flows):
104
+ x = flow(x, x_mask, g=g, reverse=reverse)
105
+ return x
106
+
107
+ def remove_weight_norm(self):
108
+ for i in range(self.n_flows):
109
+ self.flows[i * 2].remove_weight_norm()
110
+
111
+
112
+ class PosteriorEncoder(nn.Module):
113
+ def __init__(
114
+ self,
115
+ in_channels,
116
+ out_channels,
117
+ hidden_channels,
118
+ kernel_size,
119
+ dilation_rate,
120
+ n_layers,
121
+ gin_channels=0,
122
+ ):
123
+ super().__init__()
124
+ self.in_channels = in_channels
125
+ self.out_channels = out_channels
126
+ self.hidden_channels = hidden_channels
127
+ self.kernel_size = kernel_size
128
+ self.dilation_rate = dilation_rate
129
+ self.n_layers = n_layers
130
+ self.gin_channels = gin_channels
131
+
132
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
133
+ self.enc = modules.WN(
134
+ hidden_channels,
135
+ kernel_size,
136
+ dilation_rate,
137
+ n_layers,
138
+ gin_channels=gin_channels,
139
+ )
140
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
141
+
142
+ def forward(self, x, x_lengths, g=None):
143
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
144
+ x.dtype
145
+ )
146
+ x = self.pre(x) * x_mask
147
+ x = self.enc(x, x_mask, g=g)
148
+ stats = self.proj(x) * x_mask
149
+ m, logs = torch.split(stats, self.out_channels, dim=1)
150
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
151
+ return z, m, logs, x_mask
152
+
153
+ def remove_weight_norm(self):
154
+ self.enc.remove_weight_norm()
155
+
156
+
157
+ class Generator(torch.nn.Module):
158
+ def __init__(
159
+ self,
160
+ initial_channel,
161
+ resblock,
162
+ resblock_kernel_sizes,
163
+ resblock_dilation_sizes,
164
+ upsample_rates,
165
+ upsample_initial_channel,
166
+ upsample_kernel_sizes,
167
+ gin_channels=0,
168
+ ):
169
+ super(Generator, self).__init__()
170
+ self.num_kernels = len(resblock_kernel_sizes)
171
+ self.num_upsamples = len(upsample_rates)
172
+ self.conv_pre = Conv1d(
173
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
174
+ )
175
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
176
+
177
+ self.ups = nn.ModuleList()
178
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
179
+ self.ups.append(
180
+ weight_norm(
181
+ ConvTranspose1d(
182
+ upsample_initial_channel // (2**i),
183
+ upsample_initial_channel // (2 ** (i + 1)),
184
+ k,
185
+ u,
186
+ padding=(k - u) // 2,
187
+ )
188
+ )
189
+ )
190
+
191
+ self.resblocks = nn.ModuleList()
192
+ for i in range(len(self.ups)):
193
+ ch = upsample_initial_channel // (2 ** (i + 1))
194
+ for j, (k, d) in enumerate(
195
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
196
+ ):
197
+ self.resblocks.append(resblock(ch, k, d))
198
+
199
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
200
+ self.ups.apply(init_weights)
201
+
202
+ if gin_channels != 0:
203
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
204
+
205
+ def forward(self, x, g=None):
206
+ x = self.conv_pre(x)
207
+ if g is not None:
208
+ x = x + self.cond(g)
209
+
210
+ for i in range(self.num_upsamples):
211
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
212
+ x = self.ups[i](x)
213
+ xs = None
214
+ for j in range(self.num_kernels):
215
+ if xs is None:
216
+ xs = self.resblocks[i * self.num_kernels + j](x)
217
+ else:
218
+ xs += self.resblocks[i * self.num_kernels + j](x)
219
+ x = xs / self.num_kernels
220
+ x = F.leaky_relu(x)
221
+ x = self.conv_post(x)
222
+ x = torch.tanh(x)
223
+
224
+ return x
225
+
226
+ def remove_weight_norm(self):
227
+ for l in self.ups:
228
+ remove_weight_norm(l)
229
+ for l in self.resblocks:
230
+ l.remove_weight_norm()
231
+
232
+
233
+ class SineGen(torch.nn.Module):
234
+ """Definition of sine generator
235
+ SineGen(samp_rate, harmonic_num = 0,
236
+ sine_amp = 0.1, noise_std = 0.003,
237
+ voiced_threshold = 0,
238
+ flag_for_pulse=False)
239
+ samp_rate: sampling rate in Hz
240
+ harmonic_num: number of harmonic overtones (default 0)
241
+ sine_amp: amplitude of sine waveform (default 0.1)
242
+ noise_std: std of Gaussian noise (default 0.003)
243
+ voiced_threshold: F0 threshold for U/V classification (default 0)
244
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
245
+ Note: when flag_for_pulse is True, the first time step of a voiced
246
+ segment is always sin(np.pi) or cos(0)
247
+ """
248
+
249
+ def __init__(
250
+ self,
251
+ samp_rate,
252
+ harmonic_num=0,
253
+ sine_amp=0.1,
254
+ noise_std=0.003,
255
+ voiced_threshold=0,
256
+ flag_for_pulse=False,
257
+ ):
258
+ super(SineGen, self).__init__()
259
+ self.sine_amp = sine_amp
260
+ self.noise_std = noise_std
261
+ self.harmonic_num = harmonic_num
262
+ self.dim = self.harmonic_num + 1
263
+ self.sampling_rate = samp_rate
264
+ self.voiced_threshold = voiced_threshold
265
+
266
+ def _f02uv(self, f0):
267
+ # generate uv signal
268
+ uv = torch.ones_like(f0)
269
+ uv = uv * (f0 > self.voiced_threshold)
270
+ return uv
271
+
272
+ def forward(self, f0, upp):
273
+ """sine_tensor, uv = forward(f0)
274
+ input F0: tensor(batchsize=1, length, dim=1)
275
+ f0 for unvoiced steps should be 0
276
+ output sine_tensor: tensor(batchsize=1, length, dim)
277
+ output uv: tensor(batchsize=1, length, 1)
278
+ """
279
+ with torch.no_grad():
280
+ f0 = f0[:, None].transpose(1, 2)
281
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
282
+ # fundamental component
283
+ f0_buf[:, :, 0] = f0[:, :, 0]
284
+ for idx in np.arange(self.harmonic_num):
285
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
286
+ idx + 2
287
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
288
+ rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 means the harmonic products cannot be optimized away in post-processing
289
+ rand_ini = torch.rand(
290
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
291
+ )
292
+ rand_ini[:, 0] = 0
293
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
294
+ tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  # taking % 1 here would keep the later cumsum from being optimized
295
+ tmp_over_one *= upp
296
+ tmp_over_one = F.interpolate(
297
+ tmp_over_one.transpose(2, 1),
298
+ scale_factor=upp,
299
+ mode="linear",
300
+ align_corners=True,
301
+ ).transpose(2, 1)
302
+ rad_values = F.interpolate(
303
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
304
+ ).transpose(
305
+ 2, 1
306
+ ) #######
307
+ tmp_over_one %= 1
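+ # a decrease in the fractional cumulative phase marks a wrap-around; the -1 shift below keeps the upsampled phase continuous for the sin()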
308
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
309
+ cumsum_shift = torch.zeros_like(rad_values)
310
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
311
+ sine_waves = torch.sin(
312
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
313
+ )
314
+ sine_waves = sine_waves * self.sine_amp
315
+ uv = self._f02uv(f0)
316
+ uv = F.interpolate(
317
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
318
+ ).transpose(2, 1)
319
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
320
+ noise = noise_amp * torch.randn_like(sine_waves)
321
+ sine_waves = sine_waves * uv + noise
322
+ return sine_waves, uv, noise
323
+
324
+
325
+ class SourceModuleHnNSF(torch.nn.Module):
326
+ """SourceModule for hn-nsf
327
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
328
+ add_noise_std=0.003, voiced_threshod=0)
329
+ sampling_rate: sampling_rate in Hz
330
+ harmonic_num: number of harmonic above F0 (default: 0)
331
+ sine_amp: amplitude of sine source signal (default: 0.1)
332
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
333
+ note that amplitude of noise in unvoiced is decided
334
+ by sine_amp
335
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
336
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
337
+ F0_sampled (batchsize, length, 1)
338
+ Sine_source (batchsize, length, 1)
339
+ noise_source (batchsize, length 1)
340
+ uv (batchsize, length, 1)
341
+ """
342
+
343
+ def __init__(
344
+ self,
345
+ sampling_rate,
346
+ harmonic_num=0,
347
+ sine_amp=0.1,
348
+ add_noise_std=0.003,
349
+ voiced_threshod=0,
350
+ is_half=True,
351
+ ):
352
+ super(SourceModuleHnNSF, self).__init__()
353
+
354
+ self.sine_amp = sine_amp
355
+ self.noise_std = add_noise_std
356
+ self.is_half = is_half
357
+ # to produce sine waveforms
358
+ self.l_sin_gen = SineGen(
359
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
360
+ )
361
+
362
+ # to merge source harmonics into a single excitation
363
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
364
+ self.l_tanh = torch.nn.Tanh()
365
+
366
+ def forward(self, x, upp=None):
367
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
368
+ if self.is_half:
369
+ sine_wavs = sine_wavs.half()
370
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
371
+ return sine_merge, None, None # noise, uv
372
+
373
+
374
+ class GeneratorNSF(torch.nn.Module):
375
+ def __init__(
376
+ self,
377
+ initial_channel,
378
+ resblock,
379
+ resblock_kernel_sizes,
380
+ resblock_dilation_sizes,
381
+ upsample_rates,
382
+ upsample_initial_channel,
383
+ upsample_kernel_sizes,
384
+ gin_channels,
385
+ sr,
386
+ is_half=False,
387
+ ):
388
+ super(GeneratorNSF, self).__init__()
389
+ self.num_kernels = len(resblock_kernel_sizes)
390
+ self.num_upsamples = len(upsample_rates)
391
+
392
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
393
+ self.m_source = SourceModuleHnNSF(
394
+ sampling_rate=sr, harmonic_num=0, is_half=is_half
395
+ )
396
+ self.noise_convs = nn.ModuleList()
397
+ self.conv_pre = Conv1d(
398
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
399
+ )
400
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
401
+
402
+ self.ups = nn.ModuleList()
403
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
404
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
405
+ self.ups.append(
406
+ weight_norm(
407
+ ConvTranspose1d(
408
+ upsample_initial_channel // (2**i),
409
+ upsample_initial_channel // (2 ** (i + 1)),
410
+ k,
411
+ u,
412
+ padding=(k - u) // 2,
413
+ )
414
+ )
415
+ )
416
+ if i + 1 < len(upsample_rates):
417
+ stride_f0 = np.prod(upsample_rates[i + 1 :])
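+ # this noise_conv downsamples the sample-rate harmonic source to this stage's intermediate frame rate before it is added in forward()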
418
+ self.noise_convs.append(
419
+ Conv1d(
420
+ 1,
421
+ c_cur,
422
+ kernel_size=stride_f0 * 2,
423
+ stride=stride_f0,
424
+ padding=stride_f0 // 2,
425
+ )
426
+ )
427
+ else:
428
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
429
+
430
+ self.resblocks = nn.ModuleList()
431
+ for i in range(len(self.ups)):
432
+ ch = upsample_initial_channel // (2 ** (i + 1))
433
+ for j, (k, d) in enumerate(
434
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
435
+ ):
436
+ self.resblocks.append(resblock(ch, k, d))
437
+
438
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
439
+ self.ups.apply(init_weights)
440
+
441
+ if gin_channels != 0:
442
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
443
+
444
+ self.upp = np.prod(upsample_rates)
445
+
446
+ def forward(self, x, f0, g=None):
447
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
448
+ har_source = har_source.transpose(1, 2)
449
+ x = self.conv_pre(x)
450
+ if g is not None:
451
+ x = x + self.cond(g)
452
+
453
+ for i in range(self.num_upsamples):
454
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
455
+ x = self.ups[i](x)
456
+ x_source = self.noise_convs[i](har_source)
457
+ x = x + x_source
458
+ xs = None
459
+ for j in range(self.num_kernels):
460
+ if xs is None:
461
+ xs = self.resblocks[i * self.num_kernels + j](x)
462
+ else:
463
+ xs += self.resblocks[i * self.num_kernels + j](x)
464
+ x = xs / self.num_kernels
465
+ x = F.leaky_relu(x)
466
+ x = self.conv_post(x)
467
+ x = torch.tanh(x)
468
+ return x
469
+
470
+ def remove_weight_norm(self):
471
+ for l in self.ups:
472
+ remove_weight_norm(l)
473
+ for l in self.resblocks:
474
+ l.remove_weight_norm()
475
+
476
+
477
+ sr2sr = {
478
+ "32k": 32000,
479
+ "40k": 40000,
480
+ "48k": 48000,
481
+ }
482
+
483
+
484
+ class SynthesizerTrnMs256NSFSid(nn.Module):
485
+ def __init__(
486
+ self,
487
+ spec_channels,
488
+ segment_size,
489
+ inter_channels,
490
+ hidden_channels,
491
+ filter_channels,
492
+ n_heads,
493
+ n_layers,
494
+ kernel_size,
495
+ p_dropout,
496
+ resblock,
497
+ resblock_kernel_sizes,
498
+ resblock_dilation_sizes,
499
+ upsample_rates,
500
+ upsample_initial_channel,
501
+ upsample_kernel_sizes,
502
+ spk_embed_dim,
503
+ gin_channels,
504
+ emb_channels,
505
+ sr,
506
+ **kwargs
507
+ ):
508
+ super().__init__()
509
+ if isinstance(sr, str):
510
+ sr = sr2sr[sr]
511
+ self.spec_channels = spec_channels
512
+ self.inter_channels = inter_channels
513
+ self.hidden_channels = hidden_channels
514
+ self.filter_channels = filter_channels
515
+ self.n_heads = n_heads
516
+ self.n_layers = n_layers
517
+ self.kernel_size = kernel_size
518
+ self.p_dropout = p_dropout
519
+ self.resblock = resblock
520
+ self.resblock_kernel_sizes = resblock_kernel_sizes
521
+ self.resblock_dilation_sizes = resblock_dilation_sizes
522
+ self.upsample_rates = upsample_rates
523
+ self.upsample_initial_channel = upsample_initial_channel
524
+ self.upsample_kernel_sizes = upsample_kernel_sizes
525
+ self.segment_size = segment_size
526
+ self.gin_channels = gin_channels
527
+ self.emb_channels = emb_channels
528
+ self.sr = sr
529
+ # self.hop_length = hop_length#
530
+ self.spk_embed_dim = spk_embed_dim
531
+ self.enc_p = TextEncoder(
532
+ inter_channels,
533
+ hidden_channels,
534
+ filter_channels,
535
+ emb_channels,
536
+ n_heads,
537
+ n_layers,
538
+ kernel_size,
539
+ p_dropout,
540
+ )
541
+ self.dec = GeneratorNSF(
542
+ inter_channels,
543
+ resblock,
544
+ resblock_kernel_sizes,
545
+ resblock_dilation_sizes,
546
+ upsample_rates,
547
+ upsample_initial_channel,
548
+ upsample_kernel_sizes,
549
+ gin_channels=gin_channels,
550
+ sr=sr,
551
+ is_half=kwargs["is_half"],
552
+ )
553
+ self.enc_q = PosteriorEncoder(
554
+ spec_channels,
555
+ inter_channels,
556
+ hidden_channels,
557
+ 5,
558
+ 1,
559
+ 16,
560
+ gin_channels=gin_channels,
561
+ )
562
+ self.flow = ResidualCouplingBlock(
563
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
564
+ )
565
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
566
+ print(
567
+ "gin_channels:",
568
+ gin_channels,
569
+ "self.spk_embed_dim:",
570
+ self.spk_embed_dim,
571
+ "emb_channels:",
572
+ emb_channels,
573
+ )
574
+
575
+ def remove_weight_norm(self):
576
+ self.dec.remove_weight_norm()
577
+ self.flow.remove_weight_norm()
578
+ self.enc_q.remove_weight_norm()
579
+
580
+ def forward(
581
+ self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
582
+ ): # ds is the speaker id, shape [bs, 1]
583
+ # print(1,pitch.shape)#[bs,t]
584
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis, broadcast later
585
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
586
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
587
+ z_p = self.flow(z, y_mask, g=g)
588
+ z_slice, ids_slice = commons.rand_slice_segments(
589
+ z, y_lengths, self.segment_size
590
+ )
591
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
592
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
593
+ # print(-2,pitchf.shape,z_slice.shape)
594
+ o = self.dec(z_slice, pitchf, g=g)
595
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
596
+
597
+ def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
598
+ g = self.emb_g(sid).unsqueeze(-1)
599
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
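+ # sample the prior with a 0.66666 temperature on its standard deviation, then run the flow in reverse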
600
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
601
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
602
+ o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
603
+ return o, x_mask, (z, z_p, m_p, logs_p)
604
+
605
+
606
+ class SynthesizerTrnMs256NSFSidNono(nn.Module):
607
+ def __init__(
608
+ self,
609
+ spec_channels,
610
+ segment_size,
611
+ inter_channels,
612
+ hidden_channels,
613
+ filter_channels,
614
+ n_heads,
615
+ n_layers,
616
+ kernel_size,
617
+ p_dropout,
618
+ resblock,
619
+ resblock_kernel_sizes,
620
+ resblock_dilation_sizes,
621
+ upsample_rates,
622
+ upsample_initial_channel,
623
+ upsample_kernel_sizes,
624
+ spk_embed_dim,
625
+ gin_channels,
626
+ emb_channels,
627
+ sr=None,
628
+ **kwargs
629
+ ):
630
+ super().__init__()
631
+ self.spec_channels = spec_channels
632
+ self.inter_channels = inter_channels
633
+ self.hidden_channels = hidden_channels
634
+ self.filter_channels = filter_channels
635
+ self.n_heads = n_heads
636
+ self.n_layers = n_layers
637
+ self.kernel_size = kernel_size
638
+ self.p_dropout = p_dropout
639
+ self.resblock = resblock
640
+ self.resblock_kernel_sizes = resblock_kernel_sizes
641
+ self.resblock_dilation_sizes = resblock_dilation_sizes
642
+ self.upsample_rates = upsample_rates
643
+ self.upsample_initial_channel = upsample_initial_channel
644
+ self.upsample_kernel_sizes = upsample_kernel_sizes
645
+ self.segment_size = segment_size
646
+ self.gin_channels = gin_channels
647
+ self.emb_channels = emb_channels
648
+ self.sr = sr
649
+ # self.hop_length = hop_length#
650
+ self.spk_embed_dim = spk_embed_dim
651
+ self.enc_p = TextEncoder(
652
+ inter_channels,
653
+ hidden_channels,
654
+ filter_channels,
655
+ emb_channels,
656
+ n_heads,
657
+ n_layers,
658
+ kernel_size,
659
+ p_dropout,
660
+ f0=False,
661
+ )
662
+ self.dec = Generator(
663
+ inter_channels,
664
+ resblock,
665
+ resblock_kernel_sizes,
666
+ resblock_dilation_sizes,
667
+ upsample_rates,
668
+ upsample_initial_channel,
669
+ upsample_kernel_sizes,
670
+ gin_channels=gin_channels,
671
+ )
672
+ self.enc_q = PosteriorEncoder(
673
+ spec_channels,
674
+ inter_channels,
675
+ hidden_channels,
676
+ 5,
677
+ 1,
678
+ 16,
679
+ gin_channels=gin_channels,
680
+ )
681
+ self.flow = ResidualCouplingBlock(
682
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
683
+ )
684
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
685
+ print(
686
+ "gin_channels:",
687
+ gin_channels,
688
+ "self.spk_embed_dim:",
689
+ self.spk_embed_dim,
690
+ "emb_channels:",
691
+ emb_channels,
692
+ )
693
+
694
+ def remove_weight_norm(self):
695
+ self.dec.remove_weight_norm()
696
+ self.flow.remove_weight_norm()
697
+ self.enc_q.remove_weight_norm()
698
+
699
+ def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, shape [bs, 1]
700
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis, broadcast later
701
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
702
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
703
+ z_p = self.flow(z, y_mask, g=g)
704
+ z_slice, ids_slice = commons.rand_slice_segments(
705
+ z, y_lengths, self.segment_size
706
+ )
707
+ o = self.dec(z_slice, g=g)
708
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
709
+
710
+ def infer(self, phone, phone_lengths, sid, max_len=None):
711
+ g = self.emb_g(sid).unsqueeze(-1)
712
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
713
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
714
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
715
+ o = self.dec((z * x_mask)[:, :, :max_len], g=g)
716
+ return o, x_mask, (z, z_p, m_p, logs_p)
717
+
718
+
719
+ class DiscriminatorS(torch.nn.Module):
720
+ def __init__(self, use_spectral_norm=False):
721
+ super(DiscriminatorS, self).__init__()
722
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
723
+ self.convs = nn.ModuleList(
724
+ [
725
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
726
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
727
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
728
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
729
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
730
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
731
+ ]
732
+ )
733
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
734
+
735
+ def forward(self, x):
736
+ fmap = []
737
+
738
+ for l in self.convs:
739
+ x = l(x)
740
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
741
+ fmap.append(x)
742
+ x = self.conv_post(x)
743
+ fmap.append(x)
744
+ x = torch.flatten(x, 1, -1)
745
+
746
+ return x, fmap
747
+
748
+
749
+ class DiscriminatorP(torch.nn.Module):
750
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
751
+ super(DiscriminatorP, self).__init__()
752
+ self.period = period
753
+ self.use_spectral_norm = use_spectral_norm
754
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
755
+ self.convs = nn.ModuleList(
756
+ [
757
+ norm_f(
758
+ Conv2d(
759
+ 1,
760
+ 32,
761
+ (kernel_size, 1),
762
+ (stride, 1),
763
+ padding=(get_padding(kernel_size, 1), 0),
764
+ )
765
+ ),
766
+ norm_f(
767
+ Conv2d(
768
+ 32,
769
+ 128,
770
+ (kernel_size, 1),
771
+ (stride, 1),
772
+ padding=(get_padding(kernel_size, 1), 0),
773
+ )
774
+ ),
775
+ norm_f(
776
+ Conv2d(
777
+ 128,
778
+ 512,
779
+ (kernel_size, 1),
780
+ (stride, 1),
781
+ padding=(get_padding(kernel_size, 1), 0),
782
+ )
783
+ ),
784
+ norm_f(
785
+ Conv2d(
786
+ 512,
787
+ 1024,
788
+ (kernel_size, 1),
789
+ (stride, 1),
790
+ padding=(get_padding(kernel_size, 1), 0),
791
+ )
792
+ ),
793
+ norm_f(
794
+ Conv2d(
795
+ 1024,
796
+ 1024,
797
+ (kernel_size, 1),
798
+ 1,
799
+ padding=(get_padding(kernel_size, 1), 0),
800
+ )
801
+ ),
802
+ ]
803
+ )
804
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
805
+
806
+ def forward(self, x):
807
+ fmap = []
808
+
809
+ # 1d to 2d
810
+ b, c, t = x.shape
811
+ if t % self.period != 0: # pad first
812
+ n_pad = self.period - (t % self.period)
813
+ x = F.pad(x, (0, n_pad), "reflect")
814
+ t = t + n_pad
815
+ x = x.view(b, c, t // self.period, self.period)
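+ # the waveform is now folded into (frames, period) so the 2-D convolutions see period-aligned structure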
816
+
817
+ for l in self.convs:
818
+ x = l(x)
819
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
820
+ fmap.append(x)
821
+ x = self.conv_post(x)
822
+ fmap.append(x)
823
+ x = torch.flatten(x, 1, -1)
824
+
825
+ return x, fmap
826
+
827
+
828
+ class MultiPeriodDiscriminator(torch.nn.Module):
829
+ def __init__(self, use_spectral_norm=False, periods=[2, 3, 5, 7, 11, 17]):
830
+ super(MultiPeriodDiscriminator, self).__init__()
831
+
832
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
833
+ discs = discs + [
834
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
835
+ ]
836
+ self.discriminators = nn.ModuleList(discs)
837
+
838
+ def forward(self, y, y_hat):
839
+ y_d_rs = [] #
840
+ y_d_gs = []
841
+ fmap_rs = []
842
+ fmap_gs = []
843
+ for i, d in enumerate(self.discriminators):
844
+ y_d_r, fmap_r = d(y)
845
+ y_d_g, fmap_g = d(y_hat)
846
+ # for j in range(len(fmap_r)):
847
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
848
+ y_d_rs.append(y_d_r)
849
+ y_d_gs.append(y_d_g)
850
+ fmap_rs.append(fmap_r)
851
+ fmap_gs.append(fmap_g)
852
+
853
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
lib/rvc/modules.py ADDED
@@ -0,0 +1,518 @@
1
+ import math
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import Conv1d
6
+ from torch.nn import functional as F
7
+ from torch.nn.utils import remove_weight_norm, weight_norm
8
+
9
+ from . import commons
10
+ from .commons import get_padding, init_weights
11
+ from .transforms import piecewise_rational_quadratic_transform
12
+
13
+ LRELU_SLOPE = 0.1
14
+
15
+
16
+ class LayerNorm(nn.Module):
17
+ def __init__(self, channels, eps=1e-5):
18
+ super().__init__()
19
+ self.channels = channels
20
+ self.eps = eps
21
+
22
+ self.gamma = nn.Parameter(torch.ones(channels))
23
+ self.beta = nn.Parameter(torch.zeros(channels))
24
+
25
+ def forward(self, x):
26
+ x = x.transpose(1, -1)
27
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
28
+ return x.transpose(1, -1)
29
+
30
+
31
+ class ConvReluNorm(nn.Module):
32
+ def __init__(
33
+ self,
34
+ in_channels,
35
+ hidden_channels,
36
+ out_channels,
37
+ kernel_size,
38
+ n_layers,
39
+ p_dropout,
40
+ ):
41
+ super().__init__()
42
+ self.in_channels = in_channels
43
+ self.hidden_channels = hidden_channels
44
+ self.out_channels = out_channels
45
+ self.kernel_size = kernel_size
46
+ self.n_layers = n_layers
47
+ self.p_dropout = p_dropout
48
+ assert n_layers > 1, "Number of layers should be larger than 1."
49
+
50
+ self.conv_layers = nn.ModuleList()
51
+ self.norm_layers = nn.ModuleList()
52
+ self.conv_layers.append(
53
+ nn.Conv1d(
54
+ in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
55
+ )
56
+ )
57
+ self.norm_layers.append(LayerNorm(hidden_channels))
58
+ self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
59
+ for _ in range(n_layers - 1):
60
+ self.conv_layers.append(
61
+ nn.Conv1d(
62
+ hidden_channels,
63
+ hidden_channels,
64
+ kernel_size,
65
+ padding=kernel_size // 2,
66
+ )
67
+ )
68
+ self.norm_layers.append(LayerNorm(hidden_channels))
69
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
70
+ self.proj.weight.data.zero_()
71
+ self.proj.bias.data.zero_()
72
+
73
+ def forward(self, x, x_mask):
74
+ x_org = x
75
+ for i in range(self.n_layers):
76
+ x = self.conv_layers[i](x * x_mask)
77
+ x = self.norm_layers[i](x)
78
+ x = self.relu_drop(x)
79
+ x = x_org + self.proj(x)
80
+ return x * x_mask
81
+
82
+
83
+ class DDSConv(nn.Module):
84
+ """
85
+ Dilated and Depth-Separable Convolution
86
+ """
87
+
88
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
89
+ super().__init__()
90
+ self.channels = channels
91
+ self.kernel_size = kernel_size
92
+ self.n_layers = n_layers
93
+ self.p_dropout = p_dropout
94
+
95
+ self.drop = nn.Dropout(p_dropout)
96
+ self.convs_sep = nn.ModuleList()
97
+ self.convs_1x1 = nn.ModuleList()
98
+ self.norms_1 = nn.ModuleList()
99
+ self.norms_2 = nn.ModuleList()
100
+ for i in range(n_layers):
101
+ dilation = kernel_size**i
102
+ padding = (kernel_size * dilation - dilation) // 2
103
+ self.convs_sep.append(
104
+ nn.Conv1d(
105
+ channels,
106
+ channels,
107
+ kernel_size,
108
+ groups=channels,
109
+ dilation=dilation,
110
+ padding=padding,
111
+ )
112
+ )
113
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
114
+ self.norms_1.append(LayerNorm(channels))
115
+ self.norms_2.append(LayerNorm(channels))
116
+
117
+ def forward(self, x, x_mask, g=None):
118
+ if g is not None:
119
+ x = x + g
120
+ for i in range(self.n_layers):
121
+ y = self.convs_sep[i](x * x_mask)
122
+ y = self.norms_1[i](y)
123
+ y = F.gelu(y)
124
+ y = self.convs_1x1[i](y)
125
+ y = self.norms_2[i](y)
126
+ y = F.gelu(y)
127
+ y = self.drop(y)
128
+ x = x + y
129
+ return x * x_mask
130
+
131
+
132
+ class WN(torch.nn.Module):
133
+ def __init__(
134
+ self,
135
+ hidden_channels,
136
+ kernel_size,
137
+ dilation_rate,
138
+ n_layers,
139
+ gin_channels=0,
140
+ p_dropout=0,
141
+ ):
142
+ super(WN, self).__init__()
143
+ assert kernel_size % 2 == 1
144
+ self.hidden_channels = hidden_channels
145
+ self.kernel_size = (kernel_size,)
146
+ self.dilation_rate = dilation_rate
147
+ self.n_layers = n_layers
148
+ self.gin_channels = gin_channels
149
+ self.p_dropout = p_dropout
150
+
151
+ self.in_layers = torch.nn.ModuleList()
152
+ self.res_skip_layers = torch.nn.ModuleList()
153
+ self.drop = nn.Dropout(p_dropout)
154
+
155
+ if gin_channels != 0:
156
+ cond_layer = torch.nn.Conv1d(
157
+ gin_channels, 2 * hidden_channels * n_layers, 1
158
+ )
159
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
160
+
161
+ for i in range(n_layers):
162
+ dilation = dilation_rate**i
163
+ padding = int((kernel_size * dilation - dilation) / 2)
164
+ in_layer = torch.nn.Conv1d(
165
+ hidden_channels,
166
+ 2 * hidden_channels,
167
+ kernel_size,
168
+ dilation=dilation,
169
+ padding=padding,
170
+ )
171
+ in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
172
+ self.in_layers.append(in_layer)
173
+
174
+ # last one is not necessary
175
+ if i < n_layers - 1:
176
+ res_skip_channels = 2 * hidden_channels
177
+ else:
178
+ res_skip_channels = hidden_channels
179
+
180
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
181
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
182
+ self.res_skip_layers.append(res_skip_layer)
183
+
184
+ def forward(self, x, x_mask, g=None, **kwargs):
185
+ output = torch.zeros_like(x)
186
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
187
+
188
+ if g is not None:
189
+ g = self.cond_layer(g)
190
+
191
+ for i in range(self.n_layers):
192
+ x_in = self.in_layers[i](x)
193
+ if g is not None:
194
+ cond_offset = i * 2 * self.hidden_channels
195
+ g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
196
+ else:
197
+ g_l = torch.zeros_like(x_in)
198
+
199
+ acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
200
+ acts = self.drop(acts)
201
+
202
+ res_skip_acts = self.res_skip_layers[i](acts)
203
+ if i < self.n_layers - 1:
204
+ res_acts = res_skip_acts[:, : self.hidden_channels, :]
205
+ x = (x + res_acts) * x_mask
206
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
207
+ else:
208
+ output = output + res_skip_acts
209
+ return output * x_mask
210
+
211
+ def remove_weight_norm(self):
212
+ if self.gin_channels != 0:
213
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
214
+ for l in self.in_layers:
215
+ torch.nn.utils.remove_weight_norm(l)
216
+ for l in self.res_skip_layers:
217
+ torch.nn.utils.remove_weight_norm(l)
218
+
219
+
220
+ class ResBlock1(torch.nn.Module):
221
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
222
+ super(ResBlock1, self).__init__()
223
+ self.convs1 = nn.ModuleList(
224
+ [
225
+ weight_norm(
226
+ Conv1d(
227
+ channels,
228
+ channels,
229
+ kernel_size,
230
+ 1,
231
+ dilation=dilation[0],
232
+ padding=get_padding(kernel_size, dilation[0]),
233
+ )
234
+ ),
235
+ weight_norm(
236
+ Conv1d(
237
+ channels,
238
+ channels,
239
+ kernel_size,
240
+ 1,
241
+ dilation=dilation[1],
242
+ padding=get_padding(kernel_size, dilation[1]),
243
+ )
244
+ ),
245
+ weight_norm(
246
+ Conv1d(
247
+ channels,
248
+ channels,
249
+ kernel_size,
250
+ 1,
251
+ dilation=dilation[2],
252
+ padding=get_padding(kernel_size, dilation[2]),
253
+ )
254
+ ),
255
+ ]
256
+ )
257
+ self.convs1.apply(init_weights)
258
+
259
+ self.convs2 = nn.ModuleList(
260
+ [
261
+ weight_norm(
262
+ Conv1d(
263
+ channels,
264
+ channels,
265
+ kernel_size,
266
+ 1,
267
+ dilation=1,
268
+ padding=get_padding(kernel_size, 1),
269
+ )
270
+ ),
271
+ weight_norm(
272
+ Conv1d(
273
+ channels,
274
+ channels,
275
+ kernel_size,
276
+ 1,
277
+ dilation=1,
278
+ padding=get_padding(kernel_size, 1),
279
+ )
280
+ ),
281
+ weight_norm(
282
+ Conv1d(
283
+ channels,
284
+ channels,
285
+ kernel_size,
286
+ 1,
287
+ dilation=1,
288
+ padding=get_padding(kernel_size, 1),
289
+ )
290
+ ),
291
+ ]
292
+ )
293
+ self.convs2.apply(init_weights)
294
+
295
+ def forward(self, x, x_mask=None):
296
+ for c1, c2 in zip(self.convs1, self.convs2):
297
+ xt = F.leaky_relu(x, LRELU_SLOPE)
298
+ if x_mask is not None:
299
+ xt = xt * x_mask
300
+ xt = c1(xt)
301
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
302
+ if x_mask is not None:
303
+ xt = xt * x_mask
304
+ xt = c2(xt)
305
+ x = xt + x
306
+ if x_mask is not None:
307
+ x = x * x_mask
308
+ return x
309
+
310
+ def remove_weight_norm(self):
311
+ for l in self.convs1:
312
+ remove_weight_norm(l)
313
+ for l in self.convs2:
314
+ remove_weight_norm(l)
315
+
316
+
317
+ class ResBlock2(torch.nn.Module):
318
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
319
+ super(ResBlock2, self).__init__()
320
+ self.convs = nn.ModuleList(
321
+ [
322
+ weight_norm(
323
+ Conv1d(
324
+ channels,
325
+ channels,
326
+ kernel_size,
327
+ 1,
328
+ dilation=dilation[0],
329
+ padding=get_padding(kernel_size, dilation[0]),
330
+ )
331
+ ),
332
+ weight_norm(
333
+ Conv1d(
334
+ channels,
335
+ channels,
336
+ kernel_size,
337
+ 1,
338
+ dilation=dilation[1],
339
+ padding=get_padding(kernel_size, dilation[1]),
340
+ )
341
+ ),
342
+ ]
343
+ )
344
+ self.convs.apply(init_weights)
345
+
346
+ def forward(self, x, x_mask=None):
347
+ for c in self.convs:
348
+ xt = F.leaky_relu(x, LRELU_SLOPE)
349
+ if x_mask is not None:
350
+ xt = xt * x_mask
351
+ xt = c(xt)
352
+ x = xt + x
353
+ if x_mask is not None:
354
+ x = x * x_mask
355
+ return x
356
+
357
+ def remove_weight_norm(self):
358
+ for l in self.convs:
359
+ remove_weight_norm(l)
360
+
361
+
362
+ class Log(nn.Module):
363
+ def forward(self, x, x_mask, reverse=False, **kwargs):
364
+ if not reverse:
365
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
366
+ logdet = torch.sum(-y, [1, 2])
367
+ return y, logdet
368
+ else:
369
+ x = torch.exp(x) * x_mask
370
+ return x
371
+
372
+
373
+ class Flip(nn.Module):
374
+ def forward(self, x, *args, reverse=False, **kwargs):
375
+ x = torch.flip(x, [1])
376
+ if not reverse:
377
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
378
+ return x, logdet
379
+ else:
380
+ return x
381
+
382
+
383
+ class ElementwiseAffine(nn.Module):
384
+ def __init__(self, channels):
385
+ super().__init__()
386
+ self.channels = channels
387
+ self.m = nn.Parameter(torch.zeros(channels, 1))
388
+ self.logs = nn.Parameter(torch.zeros(channels, 1))
389
+
390
+ def forward(self, x, x_mask, reverse=False, **kwargs):
391
+ if not reverse:
392
+ y = self.m + torch.exp(self.logs) * x
393
+ y = y * x_mask
394
+ logdet = torch.sum(self.logs * x_mask, [1, 2])
395
+ return y, logdet
396
+ else:
397
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
398
+ return x
399
+
400
+
401
+ class ResidualCouplingLayer(nn.Module):
402
+ def __init__(
403
+ self,
404
+ channels,
405
+ hidden_channels,
406
+ kernel_size,
407
+ dilation_rate,
408
+ n_layers,
409
+ p_dropout=0,
410
+ gin_channels=0,
411
+ mean_only=False,
412
+ ):
413
+ assert channels % 2 == 0, "channels should be divisible by 2"
414
+ super().__init__()
415
+ self.channels = channels
416
+ self.hidden_channels = hidden_channels
417
+ self.kernel_size = kernel_size
418
+ self.dilation_rate = dilation_rate
419
+ self.n_layers = n_layers
420
+ self.half_channels = channels // 2
421
+ self.mean_only = mean_only
422
+
423
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
424
+ self.enc = WN(
425
+ hidden_channels,
426
+ kernel_size,
427
+ dilation_rate,
428
+ n_layers,
429
+ p_dropout=p_dropout,
430
+ gin_channels=gin_channels,
431
+ )
432
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
433
+ self.post.weight.data.zero_()
434
+ self.post.bias.data.zero_()
435
+
436
+ def forward(self, x, x_mask, g=None, reverse=False):
437
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
438
+ h = self.pre(x0) * x_mask
439
+ h = self.enc(h, x_mask, g=g)
440
+ stats = self.post(h) * x_mask
441
+ if not self.mean_only:
442
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
443
+ else:
444
+ m = stats
445
+ logs = torch.zeros_like(m)
446
+
447
+ if not reverse:
448
+ x1 = m + x1 * torch.exp(logs) * x_mask
449
+ x = torch.cat([x0, x1], 1)
450
+ logdet = torch.sum(logs, [1, 2])
451
+ return x, logdet
452
+ else:
453
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
454
+ x = torch.cat([x0, x1], 1)
455
+ return x
456
+
457
+ def remove_weight_norm(self):
458
+ self.enc.remove_weight_norm()
459
+
460
+
461
+ class ConvFlow(nn.Module):
462
+ def __init__(
463
+ self,
464
+ in_channels,
465
+ filter_channels,
466
+ kernel_size,
467
+ n_layers,
468
+ num_bins=10,
469
+ tail_bound=5.0,
470
+ ):
471
+ super().__init__()
472
+ self.in_channels = in_channels
473
+ self.filter_channels = filter_channels
474
+ self.kernel_size = kernel_size
475
+ self.n_layers = n_layers
476
+ self.num_bins = num_bins
477
+ self.tail_bound = tail_bound
478
+ self.half_channels = in_channels // 2
479
+
480
+ self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
481
+ self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
482
+ self.proj = nn.Conv1d(
483
+ filter_channels, self.half_channels * (num_bins * 3 - 1), 1
484
+ )
485
+ self.proj.weight.data.zero_()
486
+ self.proj.bias.data.zero_()
487
+
488
+ def forward(self, x, x_mask, g=None, reverse=False):
489
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
490
+ h = self.pre(x0)
491
+ h = self.convs(h, x_mask, g=g)
492
+ h = self.proj(h) * x_mask
493
+
494
+ b, c, t = x0.shape
495
+ h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
496
+
497
+ unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
498
+ unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
499
+ self.filter_channels
500
+ )
501
+ unnormalized_derivatives = h[..., 2 * self.num_bins :]
502
+
503
+ x1, logabsdet = piecewise_rational_quadratic_transform(
504
+ x1,
505
+ unnormalized_widths,
506
+ unnormalized_heights,
507
+ unnormalized_derivatives,
508
+ inverse=reverse,
509
+ tails="linear",
510
+ tail_bound=self.tail_bound,
511
+ )
512
+
513
+ x = torch.cat([x0, x1], 1) * x_mask
514
+ logdet = torch.sum(logabsdet * x_mask, [1, 2])
515
+ if not reverse:
516
+ return x, logdet
517
+ else:
518
+ return x
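The flow blocks above (Log, Flip, ElementwiseAffine, ResidualCouplingLayer, ConvFlow) share one contract: called with reverse=False they return the transformed tensor plus a log-determinant term, and with reverse=True they invert the mapping. A minimal round-trip sketch, assuming the module is importable as lib.rvc.modules (shapes are illustrative only):

import torch
from lib.rvc.modules import ElementwiseAffine

layer = ElementwiseAffine(channels=4)
x = torch.randn(2, 4, 10)      # [batch, channels, frames]
x_mask = torch.ones(2, 1, 10)  # all frames valid in this toy example
y, logdet = layer(x, x_mask)             # forward: transformed tensor + log-determinant
x_back = layer(y, x_mask, reverse=True)  # reverse: undoes the affine map
assert torch.allclose(x, x_back, atol=1e-6)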
lib/rvc/pipeline.py ADDED
@@ -0,0 +1,453 @@
1
+ import os
2
+ import traceback
3
+ from typing import *
4
+
5
+ import faiss
6
+ import numpy as np
7
+ import pyworld
8
+ import scipy.signal as signal
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import torchcrepe
12
+ from torch import Tensor
13
+ # from faiss.swigfaiss_avx2 import IndexIVFFlat # causes a crash with the faiss-cpu pip wheel on Windows
14
+ from fairseq.models.hubert import HubertModel
15
+
16
+ from .models import SynthesizerTrnMs256NSFSid
17
+
18
+
19
+ class VocalConvertPipeline(object):
20
+ def __init__(self, tgt_sr: int, device: Union[str, torch.device], is_half: bool):
21
+ if isinstance(device, str):
22
+ device = torch.device(device)
23
+ if device.type == "cuda":
24
+ vram = torch.cuda.get_device_properties(device).total_memory / 1024**3
25
+ else:
26
+ vram = None
27
+
28
+ if vram is not None and vram <= 4:
29
+ self.x_pad = 1
30
+ self.x_query = 5
31
+ self.x_center = 30
32
+ self.x_max = 32
33
+ elif vram is not None and vram <= 5:
34
+ self.x_pad = 1
35
+ self.x_query = 6
36
+ self.x_center = 38
37
+ self.x_max = 41
38
+ else:
39
+ self.x_pad = 3
40
+ self.x_query = 10
41
+ self.x_center = 60
42
+ self.x_max = 65
43
+
44
+ self.sr = 16000 # hubert input sample rate
45
+ self.window = 160 # hubert input window
46
+ self.t_pad = self.sr * self.x_pad # padding time for each utterance
47
+ self.t_pad_tgt = tgt_sr * self.x_pad
48
+ self.t_pad2 = self.t_pad * 2
49
+ self.t_query = self.sr * self.x_query # query time before and after query point
50
+ self.t_center = self.sr * self.x_center # query cut point position
51
+ self.t_max = self.sr * self.x_max # max time for no query
52
+ self.device = device
53
+ self.is_half = is_half
54
+
55
+ def get_optimal_torch_device(self, index: int = 0) -> torch.device:
56
+ # Get cuda device
57
+ if torch.cuda.is_available():
58
+ return torch.device(f"cuda:{index % torch.cuda.device_count()}") # Very fast
59
+ elif torch.backends.mps.is_available():
60
+ return torch.device("mps")
61
+ # TODO: add an "xla" branch here when available (requires torch_xla.core.xla_model).
62
+ # Otherwise fall back to the CPU as the torch device.
63
+ return torch.device("cpu")
64
+
65
+ def get_f0_crepe_computation(
66
+ self,
67
+ x,
68
+ f0_min,
69
+ f0_max,
70
+ p_len,
71
+ hop_length=64, # 512 before; a lower hop length gives finer pitch resolution (the pitch can change more often) but longer inference time.
72
+ model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full
73
+ ):
74
+ x = x.astype(np.float32) # convert double to float to avoid the F.conv2d dtype exception
75
+ x /= np.quantile(np.abs(x), 0.999)
76
+ torch_device = self.get_optimal_torch_device()
77
+ audio = torch.from_numpy(x).to(torch_device, copy=True)
78
+ audio = torch.unsqueeze(audio, dim=0)
79
+ if audio.ndim == 2 and audio.shape[0] > 1:
80
+ audio = torch.mean(audio, dim=0, keepdim=True).detach()
81
+ audio = audio.detach()
82
+ print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
83
+ pitch: Tensor = torchcrepe.predict(
84
+ audio,
85
+ self.sr,
86
+ hop_length,
87
+ f0_min,
88
+ f0_max,
89
+ model,
90
+ batch_size=hop_length * 2,
91
+ device=torch_device,
92
+ pad=True
93
+ )
94
+ p_len = p_len or x.shape[0] // hop_length
95
+ # Resize the pitch for final f0
96
+ source = np.array(pitch.squeeze(0).cpu().float().numpy())
97
+ source[source < 0.001] = np.nan
98
+ target = np.interp(
99
+ np.arange(0, len(source) * p_len, len(source)) / p_len,
100
+ np.arange(0, len(source)),
101
+ source
102
+ )
103
+ f0 = np.nan_to_num(target)
104
+ return f0 # Resized f0
105
+
106
+ def get_f0_official_crepe_computation(
107
+ self,
108
+ x,
109
+ f0_min,
110
+ f0_max,
111
+ model="full",
112
+ ):
113
+ # Pick a batch size that doesn't cause memory errors on your gpu
114
+ batch_size = 512
115
+ # Compute pitch using first gpu
116
+ audio = torch.tensor(np.copy(x))[None].float()
117
+ f0, pd = torchcrepe.predict(
118
+ audio,
119
+ self.sr,
120
+ self.window,
121
+ f0_min,
122
+ f0_max,
123
+ model,
124
+ batch_size=batch_size,
125
+ device=self.device,
126
+ return_periodicity=True,
127
+ )
128
+ pd = torchcrepe.filter.median(pd, 3)
129
+ f0 = torchcrepe.filter.mean(f0, 3)
130
+ f0[pd < 0.1] = 0
131
+ f0 = f0[0].cpu().numpy()
132
+ return f0
133
+
134
+ def get_f0(
135
+ self,
136
+ x: np.ndarray,
137
+ p_len: int,
138
+ f0_up_key: int,
139
+ f0_method: str,
140
+ inp_f0: np.ndarray = None,
141
+ ):
142
+ f0_min = 50
143
+ f0_max = 1100
144
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
145
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
146
+
147
+ if f0_method == "harvest":
148
+ f0, t = pyworld.harvest(
149
+ x.astype(np.double),
150
+ fs=self.sr,
151
+ f0_ceil=f0_max,
152
+ f0_floor=f0_min,
153
+ frame_period=10,
154
+ )
155
+ f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
156
+ f0 = signal.medfilt(f0, 3)
157
+ elif f0_method == "dio":
158
+ f0, t = pyworld.dio(
159
+ x.astype(np.double),
160
+ fs=self.sr,
161
+ f0_ceil=f0_max,
162
+ f0_floor=f0_min,
163
+ frame_period=10,
164
+ )
165
+ f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
166
+ f0 = signal.medfilt(f0, 3)
167
+ elif f0_method == "mangio-crepe":
168
+ f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, 160, "full")
169
+ elif f0_method == "crepe":
170
+ f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "full")
171
+
172
+ f0 *= pow(2, f0_up_key / 12)
173
+ tf0 = self.sr // self.window # f0 points per second
174
+ if inp_f0 is not None:
175
+ delta_t = np.round(
176
+ (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
177
+ ).astype("int16")
178
+ replace_f0 = np.interp(
179
+ list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
180
+ )
181
+ shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
182
+ f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
183
+ :shape
184
+ ]
185
+
186
+ f0bak = f0.copy()
187
+ f0_mel = 1127 * np.log(1 + f0 / 700)
188
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
189
+ f0_mel_max - f0_mel_min
190
+ ) + 1
191
+ f0_mel[f0_mel <= 1] = 1
192
+ f0_mel[f0_mel > 255] = 255
193
+ f0_coarse = np.rint(f0_mel).astype(int) # np.int is removed in recent NumPy
194
+ return f0_coarse, f0bak # 1-0
195
+
196
+ def _convert(
197
+ self,
198
+ model: HubertModel,
199
+ embedding_output_layer: int,
200
+ net_g: SynthesizerTrnMs256NSFSid,
201
+ sid: int,
202
+ audio: np.ndarray,
203
+ pitch: np.ndarray,
204
+ pitchf: np.ndarray,
205
+ index: faiss.IndexIVFFlat,
206
+ big_npy: np.ndarray,
207
+ index_rate: float,
208
+ ):
209
+ feats = torch.from_numpy(audio)
210
+ if self.is_half:
211
+ feats = feats.half()
212
+ else:
213
+ feats = feats.float()
214
+ if feats.dim() == 2: # double channels
215
+ feats = feats.mean(-1)
216
+ assert feats.dim() == 1, feats.dim()
217
+ feats = feats.view(1, -1)
218
+ padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
219
+
220
+ half_support = (
221
+ self.device.type == "cuda"
222
+ and torch.cuda.get_device_capability(self.device)[0] >= 5.3
223
+ )
224
+ is_feats_dim_768 = net_g.emb_channels == 768
225
+
226
+ if isinstance(model, tuple):
227
+ feats = model[0](
228
+ feats.squeeze(0).squeeze(0).to(self.device),
229
+ return_tensors="pt",
230
+ sampling_rate=16000,
231
+ )
232
+ if self.is_half:
233
+ feats = feats.input_values.to(self.device).half()
234
+ else:
235
+ feats = feats.input_values.to(self.device)
236
+ with torch.no_grad():
237
+ if is_feats_dim_768:
238
+ feats = model[1](feats).last_hidden_state
239
+ else:
240
+ feats = model[1](feats).extract_features
241
+ else:
242
+ inputs = {
243
+ "source": feats.half().to(self.device)
244
+ if half_support
245
+ else feats.to(self.device),
246
+ "padding_mask": padding_mask.to(self.device),
247
+ "output_layer": embedding_output_layer,
248
+ }
249
+
250
+ if not half_support:
251
+ model = model.float()
252
+ inputs["source"] = inputs["source"].float()
253
+
254
+ with torch.no_grad():
255
+ logits = model.extract_features(**inputs)
256
+ if is_feats_dim_768:
257
+ feats = logits[0]
258
+ else:
259
+ feats = model.final_proj(logits[0])
260
+
261
+ if (
262
+ index is not None
263
+ and big_npy is not None
264
+ and index_rate != 0
265
+ ):
266
+ npy = feats[0].cpu().numpy()
267
+ if self.is_half:
268
+ npy = npy.astype("float32")
269
+
270
+ score, ix = index.search(npy, k=8)
271
+ weight = np.square(1 / score)
272
+ weight /= weight.sum(axis=1, keepdims=True)
273
+ npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
274
+
275
+ if self.is_half:
276
+ npy = npy.astype("float16")
277
+ feats = (
278
+ torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
279
+ + (1 - index_rate) * feats
280
+ )
281
+
282
+ feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
283
+
284
+ p_len = audio.shape[0] // self.window
285
+ if feats.shape[1] < p_len:
286
+ p_len = feats.shape[1]
287
+ if pitch is not None and pitchf is not None:
288
+ pitch = pitch[:, :p_len]
289
+ pitchf = pitchf[:, :p_len]
290
+ p_len = torch.tensor([p_len], device=self.device).long()
291
+ with torch.no_grad():
292
+ if pitch is not None and pitchf is not None:
293
+ audio1 = (
294
+ (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768)
295
+ .data.cpu()
296
+ .float()
297
+ .numpy()
298
+ .astype(np.int16)
299
+ )
300
+ else:
301
+ audio1 = (
302
+ (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768)
303
+ .data.cpu()
304
+ .float()
305
+ .numpy()
306
+ .astype(np.int16)
307
+ )
308
+ del feats, p_len, padding_mask
309
+ if torch.cuda.is_available():
310
+ torch.cuda.empty_cache()
311
+ return audio1
312
+
313
+ def __call__(
314
+ self,
315
+ model: HubertModel,
316
+ embedding_output_layer: int,
317
+ net_g: SynthesizerTrnMs256NSFSid,
318
+ sid: int,
319
+ audio: np.ndarray,
320
+ transpose: int,
321
+ f0_method: str,
322
+ file_index: str,
323
+ index_rate: float,
324
+ if_f0: bool,
325
+ f0_file: str = None,
326
+ ):
327
+ if file_index != "" and os.path.exists(file_index) and index_rate != 0:
328
+ try:
329
+ index = faiss.read_index(file_index)
330
+ # big_npy = np.load(file_big_npy)
331
+ big_npy = index.reconstruct_n(0, index.ntotal)
332
+ except:
333
+ traceback.print_exc()
334
+ index = big_npy = None
335
+ else:
336
+ index = big_npy = None
337
+
338
+ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
339
+ audio = signal.filtfilt(bh, ah, audio)
340
+
341
+ audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
342
+ opt_ts = []
343
+ if audio_pad.shape[0] > self.t_max:
344
+ audio_sum = np.zeros_like(audio)
345
+ for i in range(self.window):
346
+ audio_sum += audio_pad[i : i - self.window]
347
+ for t in range(self.t_center, audio.shape[0], self.t_center):
348
+ opt_ts.append(
349
+ t
350
+ - self.t_query
351
+ + np.where(
352
+ np.abs(audio_sum[t - self.t_query : t + self.t_query])
353
+ == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
354
+ )[0][0]
355
+ )
356
+
357
+ audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
358
+ p_len = audio_pad.shape[0] // self.window
359
+ inp_f0 = None
360
+ if hasattr(f0_file, "name"):
361
+ try:
362
+ with open(f0_file.name, "r") as f:
363
+ lines = f.read().strip("\n").split("\n")
364
+ inp_f0 = []
365
+ for line in lines:
366
+ inp_f0.append([float(i) for i in line.split(",")])
367
+ inp_f0 = np.array(inp_f0, dtype="float32")
368
+ except:
369
+ traceback.print_exc()
370
+ sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
371
+ pitch, pitchf = None, None
372
+ if if_f0 == 1:
373
+ pitch, pitchf = self.get_f0(audio_pad, p_len, transpose, f0_method, inp_f0)
374
+ pitch = pitch[:p_len]
375
+ pitchf = pitchf[:p_len]
376
+ if self.device.type == "mps":
377
+ pitchf = pitchf.astype(np.float32)
378
+ pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
379
+ pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
380
+
381
+ audio_opt = []
382
+
383
+ s = 0
384
+ t = None
385
+
386
+ for t in opt_ts:
387
+ t = t // self.window * self.window
388
+ if if_f0 == 1:
389
+ audio_opt.append(
390
+ self._convert(
391
+ model,
392
+ embedding_output_layer,
393
+ net_g,
394
+ sid,
395
+ audio_pad[s : t + self.t_pad2 + self.window],
396
+ pitch[:, s // self.window : (t + self.t_pad2) // self.window],
397
+ pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
398
+ index,
399
+ big_npy,
400
+ index_rate,
401
+ )[self.t_pad_tgt : -self.t_pad_tgt]
402
+ )
403
+ else:
404
+ audio_opt.append(
405
+ self._convert(
406
+ model,
407
+ embedding_output_layer,
408
+ net_g,
409
+ sid,
410
+ audio_pad[s : t + self.t_pad2 + self.window],
411
+ None,
412
+ None,
413
+ index,
414
+ big_npy,
415
+ index_rate,
416
+ )[self.t_pad_tgt : -self.t_pad_tgt]
417
+ )
418
+ s = t
419
+ if if_f0 == 1:
420
+ audio_opt.append(
421
+ self._convert(
422
+ model,
423
+ embedding_output_layer,
424
+ net_g,
425
+ sid,
426
+ audio_pad[t:],
427
+ pitch[:, t // self.window :] if t is not None else pitch,
428
+ pitchf[:, t // self.window :] if t is not None else pitchf,
429
+ index,
430
+ big_npy,
431
+ index_rate,
432
+ )[self.t_pad_tgt : -self.t_pad_tgt]
433
+ )
434
+ else:
435
+ audio_opt.append(
436
+ self._convert(
437
+ model,
438
+ embedding_output_layer,
439
+ net_g,
440
+ sid,
441
+ audio_pad[t:],
442
+ None,
443
+ None,
444
+ index,
445
+ big_npy,
446
+ index_rate,
447
+ )[self.t_pad_tgt : -self.t_pad_tgt]
448
+ )
449
+ audio_opt = np.concatenate(audio_opt)
450
+ del pitch, pitchf, sid
451
+ if torch.cuda.is_available():
452
+ torch.cuda.empty_cache()
453
+ return audio_opt
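For orientation, the conversion entry point above is typically driven roughly as follows. This is a sketch only: the embedder/generator loading step is a hypothetical placeholder (those objects come from the project's model loaders elsewhere), and the input is expected to be 16 kHz mono float audio.

import soundfile as sf
from lib.rvc.pipeline import VocalConvertPipeline

# hypothetical placeholders: a loaded HuBERT embedder, a SynthesizerTrnMs256NSFSid
# generator and its target sample rate, obtained from the project's model loaders
# hubert_model, net_g, tgt_sr = ...

vc = VocalConvertPipeline(tgt_sr=tgt_sr, device="cuda:0", is_half=True)
audio, _ = sf.read("input_16k.wav", dtype="float32")  # 16 kHz mono input
out = vc(
    hubert_model,  # embedder
    12,            # embedding_output_layer
    net_g, 0, audio,
    transpose=0, f0_method="harvest",
    file_index="", index_rate=0.0, if_f0=True,
)
sf.write("converted.wav", out, tgt_sr)  # out is int16 audio at the generator's sample rate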
lib/rvc/preprocessing/extract_f0.py ADDED
@@ -0,0 +1,221 @@
1
+ import os
2
+ import traceback
3
+ from concurrent.futures import ProcessPoolExecutor
4
+ from typing import *
5
+ import multiprocessing as mp
6
+
7
+ import numpy as np
8
+ import pyworld
9
+ import torch
10
+ import torchcrepe
11
+ from torch import Tensor
12
+ from tqdm import tqdm
13
+
14
+ from lib.rvc.utils import load_audio
15
+
16
+ def get_optimal_torch_device(index: int = 0) -> torch.device:
17
+ # Get cuda device
18
+ if torch.cuda.is_available():
19
+ return torch.device(f"cuda:{index % torch.cuda.device_count()}") # Very fast
20
+ elif torch.backends.mps.is_available():
21
+ return torch.device("mps")
22
+ # TODO: add an "xla" branch here when available (requires torch_xla.core.xla_model).
23
+ # Otherwise fall back to the CPU as the torch device.
24
+ return torch.device("cpu")
25
+
26
+ def get_f0_official_crepe_computation(
27
+ x,
28
+ sr,
29
+ f0_min,
30
+ f0_max,
31
+ model="full",
32
+ ):
33
+ batch_size = 512
34
+ torch_device = get_optimal_torch_device()
35
+ audio = torch.tensor(np.copy(x))[None].float()
36
+ f0, pd = torchcrepe.predict(
37
+ audio,
38
+ sr,
39
+ 160,
40
+ f0_min,
41
+ f0_max,
42
+ model,
43
+ batch_size=batch_size,
44
+ device=torch_device,
45
+ return_periodicity=True,
46
+ )
47
+ pd = torchcrepe.filter.median(pd, 3)
48
+ f0 = torchcrepe.filter.mean(f0, 3)
49
+ f0[pd < 0.1] = 0
50
+ f0 = f0[0].cpu().numpy()
51
+ f0 = f0[1:] # Get rid of extra first frame
52
+ return f0
53
+
54
+ def get_f0_crepe_computation(
55
+ x,
56
+ sr,
57
+ f0_min,
58
+ f0_max,
59
+ hop_length=160, # 512 before; a lower hop length gives finer pitch resolution (the pitch can change more often) but longer inference time.
60
+ model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full
61
+ ):
62
+ x = x.astype(np.float32) # convert double to float to avoid the F.conv2d dtype exception
63
+ x /= np.quantile(np.abs(x), 0.999)
64
+ torch_device = get_optimal_torch_device()
65
+ audio = torch.from_numpy(x).to(torch_device, copy=True)
66
+ audio = torch.unsqueeze(audio, dim=0)
67
+ if audio.ndim == 2 and audio.shape[0] > 1:
68
+ audio = torch.mean(audio, dim=0, keepdim=True).detach()
69
+ audio = audio.detach()
70
+ print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
71
+ pitch: Tensor = torchcrepe.predict(
72
+ audio,
73
+ sr,
74
+ hop_length,
75
+ f0_min,
76
+ f0_max,
77
+ model,
78
+ batch_size=hop_length * 2,
79
+ device=torch_device,
80
+ pad=True
81
+ )
82
+ p_len = x.shape[0] // hop_length
83
+ # Resize the pitch for final f0
84
+ source = np.array(pitch.squeeze(0).cpu().float().numpy())
85
+ source[source < 0.001] = np.nan
86
+ target = np.interp(
87
+ np.arange(0, len(source) * p_len, len(source)) / p_len,
88
+ np.arange(0, len(source)),
89
+ source
90
+ )
91
+ f0 = np.nan_to_num(target)
92
+ f0 = f0[1:] # Get rid of extra first frame
93
+ return f0 # Resized f0
94
+
95
+
96
+ def compute_f0(
97
+ path: str,
98
+ f0_method: str,
99
+ fs: int,
100
+ hop: int,
101
+ f0_max: float,
102
+ f0_min: float,
103
+ ):
104
+ x = load_audio(path, fs)
105
+ if f0_method == "harvest":
106
+ f0, t = pyworld.harvest(
107
+ x.astype(np.double),
108
+ fs=fs,
109
+ f0_ceil=f0_max,
110
+ f0_floor=f0_min,
111
+ frame_period=1000 * hop / fs,
112
+ )
113
+ f0 = pyworld.stonemask(x.astype(np.double), f0, t, fs)
114
+ elif f0_method == "dio":
115
+ f0, t = pyworld.dio(
116
+ x.astype(np.double),
117
+ fs=fs,
118
+ f0_ceil=f0_max,
119
+ f0_floor=f0_min,
120
+ frame_period=1000 * hop / fs,
121
+ )
122
+ f0 = pyworld.stonemask(x.astype(np.double), f0, t, fs)
123
+ elif f0_method == "mangio-crepe":
124
+ f0 = get_f0_crepe_computation(x, fs, f0_min, f0_max, 160, "full")
125
+ elif f0_method == "crepe":
126
+ f0 = get_f0_official_crepe_computation(x.astype(np.double), fs, f0_min, f0_max, "full")
127
+ return f0
128
+
129
+
130
+ def coarse_f0(f0, f0_bin, f0_mel_min, f0_mel_max):
131
+ f0_mel = 1127 * np.log(1 + f0 / 700)
132
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (
133
+ f0_mel_max - f0_mel_min
134
+ ) + 1
135
+
136
+ # use 0 or 1
137
+ f0_mel[f0_mel <= 1] = 1
138
+ f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
139
+ f0_coarse = np.rint(f0_mel).astype(int) # np.int is removed in recent NumPy
140
+ assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
141
+ f0_coarse.max(),
142
+ f0_coarse.min(),
143
+ )
144
+ return f0_coarse
145
+
146
+
147
+ def processor(paths, f0_method, samplerate=16000, hop_size=160, process_id=0):
148
+ fs = samplerate
149
+ hop = hop_size
150
+
151
+ f0_bin = 256
152
+ f0_max = 1100.0
153
+ f0_min = 50.0
154
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
155
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
156
+ if len(paths) != 0:
157
+ for idx, (inp_path, opt_path1, opt_path2) in enumerate(
158
+ tqdm(paths, position=1 + process_id)
159
+ ):
160
+ try:
161
+ if (
162
+ os.path.exists(opt_path1 + ".npy")
163
+ and os.path.exists(opt_path2 + ".npy")
164
+ ):
165
+ continue
166
+ featur_pit = compute_f0(inp_path, f0_method, fs, hop, f0_max, f0_min)
167
+ np.save(
168
+ opt_path2,
169
+ featur_pit,
170
+ allow_pickle=False,
171
+ ) # nsf
172
+ coarse_pit = coarse_f0(featur_pit, f0_bin, f0_mel_min, f0_mel_max)
173
+ np.save(
174
+ opt_path1,
175
+ coarse_pit,
176
+ allow_pickle=False,
177
+ ) # ori
178
+ except:
179
+ print(f"f0 failed {idx}: {inp_path} {traceback.format_exc()}")
180
+
181
+
182
+ def run(training_dir: str, num_processes: int, f0_method: str):
183
+ paths = []
184
+ dataset_dir = os.path.join(training_dir, "1_16k_wavs")
185
+ opt_dir_f0 = os.path.join(training_dir, "2a_f0")
186
+ opt_dir_f0_nsf = os.path.join(training_dir, "2b_f0nsf")
187
+
188
+ if os.path.exists(opt_dir_f0) and os.path.exists(opt_dir_f0_nsf):
189
+ return
190
+
191
+ os.makedirs(opt_dir_f0, exist_ok=True)
192
+ os.makedirs(opt_dir_f0_nsf, exist_ok=True)
193
+
194
+ names = []
195
+
196
+ for pathname in sorted(list(os.listdir(dataset_dir))):
197
+ if os.path.isdir(os.path.join(dataset_dir, pathname)):
198
+ for f in sorted(list(os.listdir(os.path.join(dataset_dir, pathname)))):
199
+ if "spec" in f:
200
+ continue
201
+ names.append(os.path.join(pathname, f))
202
+ else:
203
+ names.append(pathname)
204
+
205
+ for name in names: # dataset_dir/{05d}/file.ext
206
+ filepath = os.path.join(dataset_dir, name)
207
+ if "spec" in filepath:
208
+ continue
209
+ opt_filepath_f0 = os.path.join(opt_dir_f0, name)
210
+ opt_filepath_f0_nsf = os.path.join(opt_dir_f0_nsf, name)
211
+ paths.append([filepath, opt_filepath_f0, opt_filepath_f0_nsf])
212
+
213
+ for dir in set([(os.path.dirname(p[1]), os.path.dirname(p[2])) for p in paths]):
214
+ os.makedirs(dir[0], exist_ok=True)
215
+ os.makedirs(dir[1], exist_ok=True)
216
+
217
+ with ProcessPoolExecutor(mp_context=mp.get_context("spawn")) as executer:
218
+ for i in range(num_processes):
219
+ executer.submit(processor, paths[i::num_processes], f0_method, process_id=i)
220
+
221
+ processor(paths, f0_method)
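For reference, coarse_f0 quantizes an f0 contour in Hz onto 255 mel-spaced bins, with bin 1 effectively covering unvoiced (f0 = 0) frames. A small self-contained check of that mapping, reusing the constants from processor():

import numpy as np

f0_bin = 256
f0_min, f0_max = 50.0, 1100.0
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

f0 = np.array([0.0, 50.0, 220.0, 1100.0])  # Hz; 0 means unvoiced
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (
    f0_mel_max - f0_mel_min
) + 1
f0_coarse = np.rint(np.clip(f0_mel, 1, f0_bin - 1)).astype(int)
print(f0_coarse)  # unvoiced and f0_min map to bin 1, 220 Hz lands near bin 60, f0_max maps to bin 255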
lib/rvc/preprocessing/extract_feature.py ADDED
@@ -0,0 +1,217 @@
1
+ import multiprocessing as mp
2
+ import os
3
+ import traceback
4
+ from concurrent.futures import ProcessPoolExecutor
5
+ from typing import *
6
+
7
+ import numpy as np
8
+ import soundfile as sf
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from fairseq import checkpoint_utils
12
+ from tqdm import tqdm
13
+
14
+ ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
15
+ MODELS_DIR = os.path.join(ROOT_DIR, "models")
16
+ EMBEDDINGS_LIST = {
17
+ "hubert-base-japanese": (
18
+ "rinna_hubert_base_jp.pt",
19
+ "hubert-base-japanese",
20
+ "local",
21
+ ),
22
+ "contentvec": ("checkpoint_best_legacy_500.pt", "contentvec", "local"),
23
+ }
24
+
25
+ def get_embedder(embedder_name):
26
+ if embedder_name in EMBEDDINGS_LIST:
27
+ return EMBEDDINGS_LIST[embedder_name]
28
+ return None
29
+
30
+
31
+ def load_embedder(embedder_path: str, device):
32
+ try:
33
+ models, cfg, _ = checkpoint_utils.load_model_ensemble_and_task(
34
+ [embedder_path],
35
+ suffix="",
36
+ )
37
+ embedder_model = models[0]
38
+ embedder_model = embedder_model.to(device)
39
+ if device != "cpu":
40
+ embedder_model = embedder_model.half()
41
+ else:
42
+ embedder_model = embedder_model.float()
43
+ embedder_model.eval()
44
+ except Exception as e:
45
+ print(f"Error: {e} {embedder_path}")
46
+ traceback.print_exc()
47
+
48
+ return embedder_model, cfg
49
+
50
+
51
+ # wave must be 16k, hop_size=320
52
+ def readwave(wav_path, normalize=False):
53
+ wav, sr = sf.read(wav_path)
54
+ assert sr == 16000
55
+ feats = torch.from_numpy(wav).float()
56
+ if feats.dim() == 2: # double channels
57
+ feats = feats.mean(-1)
58
+ assert feats.dim() == 1, feats.dim()
59
+ if normalize:
60
+ with torch.no_grad():
61
+ feats = F.layer_norm(feats, feats.shape)
62
+ feats = feats.view(1, -1)
63
+ return feats
64
+
65
+
66
+ def processor(
67
+ todo: List[str],
68
+ device: torch.device,
69
+ embedder_path: str,
70
+ embedder_load_from: str,
71
+ embedding_channel: bool,
72
+ embedding_output_layer: int,
73
+ wav_dir: str,
74
+ out_dir: str,
75
+ process_id: int,
76
+ ):
77
+ half_support = (
78
+ device.type == "cuda" and torch.cuda.get_device_capability(device)[0] >= 5.3
79
+ )
80
+ is_feats_dim_768 = embedding_channel == 768
81
+
82
+ if embedder_load_from == "local" and not os.path.exists(embedder_path):
83
+ return f"Embedder not found: {embedder_path}"
84
+
85
+ model, cfg = load_embedder(embedder_path, device)
86
+
87
+ for file in tqdm(todo, position=1 + process_id):
88
+ try:
89
+ if file.endswith(".wav"):
90
+ wav_filepath = os.path.join(wav_dir, file)
91
+ out_filepath = os.path.join(out_dir, file.replace("wav", "npy"))
92
+
93
+ if os.path.exists(out_filepath):
94
+ continue
95
+
96
+ os.makedirs(os.path.dirname(out_filepath), exist_ok=True)
97
+
98
+ is_normalize = False if cfg is None else cfg.task.normalize
99
+ feats = readwave(wav_filepath, normalize=is_normalize)
100
+ padding_mask = torch.BoolTensor(feats.shape).fill_(False)
101
+ if isinstance(model, tuple):
102
+ feats = model[0](
103
+ feats.squeeze(0).squeeze(0).to(device),
104
+ return_tensors="pt",
105
+ sampling_rate=16000,
106
+ )
107
+ if half_support:
108
+ feats = feats.input_values.to(device).half()
109
+ else:
110
+ feats = feats.input_values.to(device).float()
111
+
112
+ with torch.no_grad():
113
+ if half_support:
114
+ if is_feats_dim_768:
115
+ feats = model[1](feats).last_hidden_state
116
+ else:
117
+ feats = model[1](feats).extract_features
118
+ else:
119
+ if is_feats_dim_768:
120
+ feats = model[1].float()(feats).last_hidden_state
121
+ else:
122
+ feats = model[1].float()(feats).extract_features
123
+ else:
124
+ inputs = {
125
+ "source": feats.half().to(device)
126
+ if half_support
127
+ else feats.to(device),
128
+ "padding_mask": padding_mask.to(device),
129
+ "output_layer": embedding_output_layer,
130
+ }
131
+
132
+ # the features may still be float16 at this point, so convert them again
133
+ if not half_support:
134
+ model = model.float()
135
+ inputs["source"] = inputs["source"].float()
136
+
137
+ with torch.no_grad():
138
+ logits = model.extract_features(**inputs)
139
+ if is_feats_dim_768:
140
+ feats = logits[0]
141
+ else:
142
+ feats = model.final_proj(logits[0])
143
+
144
+ feats = feats.squeeze(0).float().cpu().numpy()
145
+ if np.isnan(feats).sum() == 0:
146
+ np.save(out_filepath, feats, allow_pickle=False)
147
+ else:
148
+ print(f"{file} contains nan")
149
+ except Exception as e:
150
+ print(f"Error: {e} {file}")
151
+ traceback.print_exc()
152
+
153
+
154
+ def run(
155
+ training_dir: str,
156
+ embedder_path: str,
157
+ embedder_load_from: str,
158
+ embedding_channel: int,
159
+ embedding_output_layer: int,
160
+ gpu_ids: List[int],
161
+ device: Optional[Union[torch.device, str]] = None,
162
+ ):
163
+ wav_dir = os.path.join(training_dir, "1_16k_wavs")
164
+ out_dir = os.path.join(training_dir, "3_feature256")
165
+
166
+ num_gpus = len(gpu_ids)
167
+
168
+ for gpu_id in gpu_ids:
169
+ if num_gpus < gpu_id + 1:
170
+ print(f"GPU {gpu_id} is not available")
171
+ return
172
+
173
+ if os.path.exists(out_dir):
174
+ return
175
+
176
+ os.makedirs(out_dir, exist_ok=True)
177
+
178
+ todo = [
179
+ os.path.join(dir, f)
180
+ for dir in sorted(list(os.listdir(wav_dir)))
181
+ if os.path.isdir(os.path.join(wav_dir, dir))
182
+ for f in sorted(list(os.listdir(os.path.join(wav_dir, dir))))
183
+ ]
184
+
185
+ if device is not None:
186
+ if type(device) == str:
187
+ device = torch.device(device)
188
+ if device.type == "mps":
189
+ device = torch.device(
190
+ "cpu"
191
+ ) # Mac(MPS) crashes when multiprocess, so change to CPU.
192
+ processor(
193
+ todo,
194
+ device,
195
+ embedder_path,
196
+ embedder_load_from,
197
+ embedding_channel,
198
+ embedding_output_layer,
199
+ wav_dir,
200
+ out_dir,
201
+ process_id=0,
202
+ )
203
+ else:
204
+ with ProcessPoolExecutor(mp_context=mp.get_context("spawn")) as executor:
205
+ for i, id in enumerate(gpu_ids):
206
+ executor.submit(
207
+ processor,
208
+ todo[i::num_gpus],
209
+ torch.device(f"cuda:{id}"),
210
+ embedder_path,
211
+ embedder_load_from,
212
+ embedding_channel,
213
+ embedding_output_layer,
214
+ wav_dir,
215
+ out_dir,
216
+ process_id=i,
217
+ )
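A rough sketch of how this feature-extraction step is driven. The training-directory path and the embeddings sub-directory under MODELS_DIR are assumptions based on the repository layout, not guarantees made by this module:

import os
from lib.rvc.preprocessing import extract_feature

training_dir = "models/training/my_model"  # assumed to contain 1_16k_wavs/
filename, _name, load_from = extract_feature.get_embedder("contentvec")
extract_feature.run(
    training_dir,
    embedder_path=os.path.join(extract_feature.MODELS_DIR, "embeddings", filename),
    embedder_load_from=load_from,
    embedding_channel=256,       # 256-dim features (768 for last_hidden_state models)
    embedding_output_layer=12,
    gpu_ids=[0],
)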
lib/rvc/preprocessing/slicer.py ADDED
@@ -0,0 +1,179 @@
1
+ import numpy as np
2
+
3
+
4
+ # This function is obtained from librosa.
5
+ def get_rms(
6
+ y,
7
+ frame_length=2048,
8
+ hop_length=512,
9
+ pad_mode="constant",
10
+ ):
11
+ padding = (int(frame_length // 2), int(frame_length // 2))
12
+ y = np.pad(y, padding, mode=pad_mode)
13
+
14
+ axis = -1
15
+ # put our new within-frame axis at the end for now
16
+ out_strides = y.strides + tuple([y.strides[axis]])
17
+ # Reduce the shape on the framing axis
18
+ x_shape_trimmed = list(y.shape)
19
+ x_shape_trimmed[axis] -= frame_length - 1
20
+ out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
21
+ xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
22
+ if axis < 0:
23
+ target_axis = axis - 1
24
+ else:
25
+ target_axis = axis + 1
26
+ xw = np.moveaxis(xw, -1, target_axis)
27
+ # Downsample along the target axis
28
+ slices = [slice(None)] * xw.ndim
29
+ slices[axis] = slice(0, None, hop_length)
30
+ x = xw[tuple(slices)]
31
+
32
+ # Calculate power
33
+ power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
34
+
35
+ return np.sqrt(power)
36
+
37
+
38
+ class Slicer:
39
+ def __init__(
40
+ self,
41
+ sr: int,
42
+ threshold: float = -40.0,
43
+ min_length: int = 5000,
44
+ min_interval: int = 300,
45
+ hop_size: int = 20,
46
+ max_sil_kept: int = 5000,
47
+ ):
48
+ if not min_length >= min_interval >= hop_size:
49
+ raise ValueError(
50
+ "The following condition must be satisfied: min_length >= min_interval >= hop_size"
51
+ )
52
+ if not max_sil_kept >= hop_size:
53
+ raise ValueError(
54
+ "The following condition must be satisfied: max_sil_kept >= hop_size"
55
+ )
56
+ min_interval = sr * min_interval / 1000
57
+ self.threshold = 10 ** (threshold / 20.0)
58
+ self.hop_size = round(sr * hop_size / 1000)
59
+ self.win_size = min(round(min_interval), 4 * self.hop_size)
60
+ self.min_length = round(sr * min_length / 1000 / self.hop_size)
61
+ self.min_interval = round(min_interval / self.hop_size)
62
+ self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
63
+
64
+ def _apply_slice(self, waveform, begin, end):
65
+ if len(waveform.shape) > 1:
66
+ return waveform[
67
+ :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
68
+ ]
69
+ else:
70
+ return waveform[
71
+ begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
72
+ ]
73
+
74
+ # @timeit
75
+ def slice(self, waveform):
76
+ if len(waveform.shape) > 1:
77
+ samples = waveform.mean(axis=0)
78
+ else:
79
+ samples = waveform
80
+ if samples.shape[0] <= self.min_length:
81
+ return [waveform]
82
+ rms_list = get_rms(
83
+ y=samples, frame_length=self.win_size, hop_length=self.hop_size
84
+ ).squeeze(0)
85
+ sil_tags = []
86
+ silence_start = None
87
+ clip_start = 0
88
+ for i, rms in enumerate(rms_list):
89
+ # Keep looping while frame is silent.
90
+ if rms < self.threshold:
91
+ # Record start of silent frames.
92
+ if silence_start is None:
93
+ silence_start = i
94
+ continue
95
+ # Keep looping while frame is not silent and silence start has not been recorded.
96
+ if silence_start is None:
97
+ continue
98
+ # Clear recorded silence start if interval is not enough or clip is too short
99
+ is_leading_silence = silence_start == 0 and i > self.max_sil_kept
100
+ need_slice_middle = (
101
+ i - silence_start >= self.min_interval
102
+ and i - clip_start >= self.min_length
103
+ )
104
+ if not is_leading_silence and not need_slice_middle:
105
+ silence_start = None
106
+ continue
107
+ # Need slicing. Record the range of silent frames to be removed.
108
+ if i - silence_start <= self.max_sil_kept:
109
+ pos = rms_list[silence_start : i + 1].argmin() + silence_start
110
+ if silence_start == 0:
111
+ sil_tags.append((0, pos))
112
+ else:
113
+ sil_tags.append((pos, pos))
114
+ clip_start = pos
115
+ elif i - silence_start <= self.max_sil_kept * 2:
116
+ pos = rms_list[
117
+ i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
118
+ ].argmin()
119
+ pos += i - self.max_sil_kept
120
+ pos_l = (
121
+ rms_list[
122
+ silence_start : silence_start + self.max_sil_kept + 1
123
+ ].argmin()
124
+ + silence_start
125
+ )
126
+ pos_r = (
127
+ rms_list[i - self.max_sil_kept : i + 1].argmin()
128
+ + i
129
+ - self.max_sil_kept
130
+ )
131
+ if silence_start == 0:
132
+ sil_tags.append((0, pos_r))
133
+ clip_start = pos_r
134
+ else:
135
+ sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
136
+ clip_start = max(pos_r, pos)
137
+ else:
138
+ pos_l = (
139
+ rms_list[
140
+ silence_start : silence_start + self.max_sil_kept + 1
141
+ ].argmin()
142
+ + silence_start
143
+ )
144
+ pos_r = (
145
+ rms_list[i - self.max_sil_kept : i + 1].argmin()
146
+ + i
147
+ - self.max_sil_kept
148
+ )
149
+ if silence_start == 0:
150
+ sil_tags.append((0, pos_r))
151
+ else:
152
+ sil_tags.append((pos_l, pos_r))
153
+ clip_start = pos_r
154
+ silence_start = None
155
+ # Deal with trailing silence.
156
+ total_frames = rms_list.shape[0]
157
+ if (
158
+ silence_start is not None
159
+ and total_frames - silence_start >= self.min_interval
160
+ ):
161
+ silence_end = min(total_frames, silence_start + self.max_sil_kept)
162
+ pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
163
+ sil_tags.append((pos, total_frames + 1))
164
+ # Apply and return slices.
165
+ if len(sil_tags) == 0:
166
+ return [waveform]
167
+ else:
168
+ chunks = []
169
+ if sil_tags[0][0] > 0:
170
+ chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
171
+ for i in range(len(sil_tags) - 1):
172
+ chunks.append(
173
+ self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
174
+ )
175
+ if sil_tags[-1][1] < total_frames:
176
+ chunks.append(
177
+ self._apply_slice(waveform, sil_tags[-1][1], total_frames)
178
+ )
179
+ return chunks
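A short usage sketch for the silence-based slicer; the parameter values mirror the ones preprocess_audio (in split.py below) passes in, and the input filename is illustrative:

import librosa
from lib.rvc.preprocessing.slicer import Slicer

audio, sr = librosa.load("recording.wav", sr=40000, mono=True)
slicer = Slicer(sr=sr, threshold=-42, min_length=1500,
                min_interval=400, hop_size=15, max_sil_kept=500)
chunks = slicer.slice(audio)  # list of numpy arrays with long silences cut out
print(f"{len(chunks)} segments")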
lib/rvc/preprocessing/split.py ADDED
@@ -0,0 +1,195 @@
1
+ import operator
2
+ import os
3
+ from concurrent.futures import ProcessPoolExecutor
4
+ from typing import *
5
+
6
+ import librosa
7
+ import numpy as np
8
+ import scipy.signal as signal
9
+ from scipy.io import wavfile
10
+ from tqdm import tqdm
11
+
12
+ from lib.rvc.utils import load_audio
13
+
14
+ from .slicer import Slicer
15
+
16
+
17
+ def norm_write(
18
+ tmp_audio: np.ndarray,
19
+ idx0: int,
20
+ idx1: int,
21
+ speaker_id: int,
22
+ outdir: str,
23
+ outdir_16k: str,
24
+ sampling_rate: int,
25
+ max: float,
26
+ alpha: float,
27
+ is_normalize: bool,
28
+ ):
29
+ if is_normalize:
30
+ tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (max * alpha)) + (
31
+ 1 - alpha
32
+ ) * tmp_audio
33
+ else:
34
+ # clip level to max (cause sometimes when floating point decoding)
35
+ audio_min = np.min(tmp_audio)
36
+ if audio_min < -max:
37
+ tmp_audio = tmp_audio / -audio_min * max
38
+ audio_max = np.max(tmp_audio)
39
+ if audio_max > max:
40
+ tmp_audio = tmp_audio / audio_max * max
41
+
42
+ wavfile.write(
43
+ os.path.join(outdir, f"{speaker_id:05}", f"{idx0}_{idx1}.wav"),
44
+ sampling_rate,
45
+ tmp_audio.astype(np.float32),
46
+ )
47
+
48
+ tmp_audio = librosa.resample(
49
+ tmp_audio, orig_sr=sampling_rate, target_sr=16000, res_type="soxr_vhq"
50
+ )
51
+ wavfile.write(
52
+ os.path.join(outdir_16k, f"{speaker_id:05}", f"{idx0}_{idx1}.wav"),
53
+ 16000,
54
+ tmp_audio.astype(np.float32),
55
+ )
56
+
57
+
58
+ def write_mute(
59
+ mute_wave_filename: str,
60
+ speaker_id: int,
61
+ outdir: str,
62
+ outdir_16k: str,
63
+ sampling_rate: int,
64
+ ):
65
+ tmp_audio = load_audio(mute_wave_filename, sampling_rate)
66
+ wavfile.write(
67
+ os.path.join(outdir, f"{speaker_id:05}", "mute.wav"),
68
+ sampling_rate,
69
+ tmp_audio.astype(np.float32),
70
+ )
71
+ tmp_audio = librosa.resample(
72
+ tmp_audio, orig_sr=sampling_rate, target_sr=16000, res_type="soxr_vhq"
73
+ )
74
+ wavfile.write(
75
+ os.path.join(outdir_16k, f"{speaker_id:05}", "mute.wav"),
76
+ 16000,
77
+ tmp_audio.astype(np.float32),
78
+ )
79
+
80
+
81
+ def pipeline(
82
+ slicer: Slicer,
83
+ datasets: List[Tuple[str, int]], # List[(path, speaker_id)]
84
+ outdir: str,
85
+ outdir_16k: str,
86
+ sampling_rate: int,
87
+ is_normalize: bool,
88
+ process_id: int = 0,
89
+ ):
90
+ per = 3.7
91
+ overlap = 0.3
92
+ tail = per + overlap
93
+ max = 0.95
94
+ alpha = 0.8
95
+
96
+ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=sampling_rate)
97
+
98
+ for index, (wave_filename, speaker_id) in tqdm(datasets, position=1 + process_id):
99
+ audio = load_audio(wave_filename, sampling_rate)
100
+ audio = signal.lfilter(bh, ah, audio)
101
+
102
+ idx1 = 0
103
+ for audio in slicer.slice(audio):
104
+ i = 0
105
+ while 1:
106
+ start = int(sampling_rate * (per - overlap) * i)
107
+ i += 1
108
+ if len(audio[start:]) > tail * sampling_rate:
109
+ tmp_audio = audio[start : start + int(per * sampling_rate)]
110
+ norm_write(
111
+ tmp_audio,
112
+ index,
113
+ idx1,
114
+ speaker_id,
115
+ outdir,
116
+ outdir_16k,
117
+ sampling_rate,
118
+ max,
119
+ alpha,
120
+ is_normalize,
121
+ )
122
+ idx1 += 1
123
+ else:
124
+ tmp_audio = audio[start:]
125
+ break
126
+ norm_write(
127
+ tmp_audio,
128
+ index,
129
+ idx1,
130
+ speaker_id,
131
+ outdir,
132
+ outdir_16k,
133
+ sampling_rate,
134
+ max,
135
+ alpha,
136
+ is_normalize,
137
+ )
138
+ idx1 += 1
139
+
140
+
141
+ def preprocess_audio(
142
+ datasets: List[Tuple[str, int]], # List[(path, speaker_id)]
143
+ sampling_rate: int,
144
+ num_processes: int,
145
+ training_dir: str,
146
+ is_normalize: bool,
147
+ mute_wav_path: str,
148
+ ):
149
+ waves_dir = os.path.join(training_dir, "0_gt_wavs")
150
+ waves16k_dir = os.path.join(training_dir, "1_16k_wavs")
151
+ if os.path.exists(waves_dir) and os.path.exists(waves16k_dir):
152
+ return
153
+
154
+ for speaker_id in set([spk for _, spk in datasets]):
155
+ os.makedirs(os.path.join(waves_dir, f"{speaker_id:05}"), exist_ok=True)
156
+ os.makedirs(os.path.join(waves16k_dir, f"{speaker_id:05}"), exist_ok=True)
157
+
158
+ all = [(i, x) for i, x in enumerate(sorted(datasets, key=operator.itemgetter(0)))]
159
+
160
+ # n of datasets per process
161
+ process_all_nums = [len(all) // num_processes] * num_processes
162
+ # add residual datasets
163
+ for i in range(len(all) % num_processes):
164
+ process_all_nums[i] += 1
165
+
166
+ assert len(all) == sum(process_all_nums), print(
167
+ f"len(all): {len(all)}, sum(process_all_nums): {sum(process_all_nums)}"
168
+ )
169
+
170
+ with ProcessPoolExecutor(max_workers=num_processes) as executor:
171
+ all_index = 0
172
+ for i in range(num_processes):
173
+ data = all[all_index : all_index + process_all_nums[i]]
174
+ slicer = Slicer(
175
+ sr=sampling_rate,
176
+ threshold=-42,
177
+ min_length=1500,
178
+ min_interval=400,
179
+ hop_size=15,
180
+ max_sil_kept=500,
181
+ )
182
+ executor.submit(
183
+ pipeline,
184
+ slicer,
185
+ data,
186
+ waves_dir,
187
+ waves16k_dir,
188
+ sampling_rate,
189
+ is_normalize,
190
+ process_id=i,
191
+ )
192
+ all_index += process_all_nums[i]
193
+
194
+ for speaker_id in set([spk for _, spk in datasets]):
195
+ write_mute(mute_wav_path, speaker_id, waves_dir, waves16k_dir, sampling_rate)
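Putting the splitting step together, a sketch of the call. The dataset paths are illustrative; the mute wav path points at the file bundled under models/training/mute in this commit:

from lib.rvc.preprocessing.split import preprocess_audio

datasets = [
    ("datasets/speaker00/001.wav", 0),  # (audio path, speaker id)
    ("datasets/speaker00/002.wav", 0),
]
preprocess_audio(
    datasets,
    sampling_rate=40000,
    num_processes=2,
    training_dir="models/training/my_model",
    is_normalize=True,
    mute_wav_path="models/training/mute/0_gt_wavs/mute40k.wav",
)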
lib/rvc/train.py ADDED
@@ -0,0 +1,998 @@
1
+ import glob
2
+ import json
3
+ import operator
4
+ import os
5
+ import shutil
6
+ import time
7
+ from random import shuffle
8
+ from typing import *
9
+
10
+ import faiss
11
+ import numpy as np
12
+ import torch
13
+ import torch.distributed as dist
14
+ import torch.multiprocessing as mp
15
+ import torchaudio
16
+ import tqdm
17
+ from sklearn.cluster import MiniBatchKMeans
18
+ from torch.cuda.amp import GradScaler, autocast
19
+ from torch.nn import functional as F
20
+ from torch.nn.parallel import DistributedDataParallel as DDP
21
+ from torch.utils.data import DataLoader
22
+ from torch.utils.tensorboard import SummaryWriter
23
+
24
+ from . import commons, utils
25
+ from .checkpoints import save
26
+ from .config import DatasetMetadata, TrainConfig
27
+ from .data_utils import (DistributedBucketSampler, TextAudioCollate,
28
+ TextAudioCollateMultiNSFsid, TextAudioLoader,
29
+ TextAudioLoaderMultiNSFsid)
30
+ from .losses import discriminator_loss, feature_loss, generator_loss, kl_loss
31
+ from .mel_processing import mel_spectrogram_torch, spec_to_mel_torch
32
+ from .models import (MultiPeriodDiscriminator, SynthesizerTrnMs256NSFSid,
33
+ SynthesizerTrnMs256NSFSidNono)
34
+ from .preprocessing.extract_feature import (MODELS_DIR, get_embedder,
35
+ load_embedder)
36
+
37
+
38
+ def is_audio_file(file: str):
39
+ if "." not in file:
40
+ return False
41
+ ext = os.path.splitext(file)[1]
42
+ return ext.lower() in [
43
+ ".wav",
44
+ ".flac",
45
+ ".ogg",
46
+ ".mp3",
47
+ ".m4a",
48
+ ".wma",
49
+ ".aiff",
50
+ ]
51
+
52
+
53
+ def glob_dataset(
54
+ glob_str: str,
55
+ speaker_id: int,
56
+ multiple_speakers: bool = False,
57
+ recursive: bool = True,
58
+ training_dir: str = ".",
59
+ ):
60
+ globs = glob_str.split(",")
61
+ speaker_count = 0
62
+ datasets_speakers = []
63
+ speaker_to_id_mapping = {}
64
+ for glob_str in globs:
65
+ if os.path.isdir(glob_str):
66
+ if multiple_speakers:
67
+ # Multispeaker format:
68
+ # dataset_path/
69
+ # - speakername/
70
+ # - {wav name here}.wav
71
+ # - ...
72
+ # - next_speakername/
73
+ # - {wav name here}.wav
74
+ # - ...
75
+ # - ...
76
+ print("Multispeaker dataset enabled; Processing speakers.")
77
+ for dir in tqdm.tqdm(os.listdir(glob_str)):
78
+ print("Speaker ID " + str(speaker_count) + ": " + dir)
79
+ speaker_to_id_mapping[dir] = speaker_count
80
+ speaker_path = glob_str + "/" + dir
81
+ for audio in tqdm.tqdm(os.listdir(speaker_path)):
82
+ if is_audio_file(glob_str + "/" + dir + "/" + audio):
83
+ datasets_speakers.append((glob_str + "/" + dir + "/" + audio, speaker_count))
84
+ speaker_count += 1
85
+ with open(os.path.join(training_dir, "speaker_info.json"), "w") as outfile:
86
+ print("Dumped speaker info to {}".format(os.path.join(training_dir, "speaker_info.json")))
87
+ json.dump(speaker_to_id_mapping, outfile)
88
+ continue # Skip the normal speaker extend
89
+
90
+ glob_str = os.path.join(glob_str, "**", "*")
91
+ print("Single speaker dataset enabled; Processing speaker as ID " + str(speaker_id) + ".")
92
+ datasets_speakers.extend(
93
+ [
94
+ (file, speaker_id)
95
+ for file in glob.iglob(glob_str, recursive=recursive)
96
+ if is_audio_file(file)
97
+ ]
98
+ )
99
+
100
+ return sorted(datasets_speakers)
101
+
102
+
103
+ def create_dataset_meta(training_dir: str, f0: bool):
104
+ gt_wavs_dir = os.path.join(training_dir, "0_gt_wavs")
105
+ co256_dir = os.path.join(training_dir, "3_feature256")
106
+
107
+ def list_data(dir: str):
108
+ files = []
109
+ for subdir in os.listdir(dir):
110
+ speaker_dir = os.path.join(dir, subdir)
111
+ for name in os.listdir(speaker_dir):
112
+ files.append(os.path.join(subdir, name.split(".")[0]))
113
+ return files
114
+
115
+ names = set(list_data(gt_wavs_dir)) & set(list_data(co256_dir))
116
+
117
+ if f0:
118
+ f0_dir = os.path.join(training_dir, "2a_f0")
119
+ f0nsf_dir = os.path.join(training_dir, "2b_f0nsf")
120
+ names = names & set(list_data(f0_dir)) & set(list_data(f0nsf_dir))
121
+
122
+ meta = {
123
+ "files": {},
124
+ }
125
+
126
+ for name in names:
127
+ speaker_id = os.path.dirname(name).split("_")[0]
128
+ speaker_id = int(speaker_id) if speaker_id.isdecimal() else 0
129
+ if f0:
130
+ gt_wav_path = os.path.join(gt_wavs_dir, f"{name}.wav")
131
+ co256_path = os.path.join(co256_dir, f"{name}.npy")
132
+ f0_path = os.path.join(f0_dir, f"{name}.wav.npy")
133
+ f0nsf_path = os.path.join(f0nsf_dir, f"{name}.wav.npy")
134
+ meta["files"][name] = {
135
+ "gt_wav": gt_wav_path,
136
+ "co256": co256_path,
137
+ "f0": f0_path,
138
+ "f0nsf": f0nsf_path,
139
+ "speaker_id": speaker_id,
140
+ }
141
+ else:
142
+ gt_wav_path = os.path.join(gt_wavs_dir, f"{name}.wav")
143
+ co256_path = os.path.join(co256_dir, f"{name}.npy")
144
+ meta["files"][name] = {
145
+ "gt_wav": gt_wav_path,
146
+ "co256": co256_path,
147
+ "speaker_id": speaker_id,
148
+ }
149
+
150
+ with open(os.path.join(training_dir, "meta.json"), "w") as f:
151
+ json.dump(meta, f, indent=2)
152
+
153
+
154
+ def change_speaker(net_g, speaker_info, embedder, embedding_output_layer, phone, phone_lengths, pitch, pitchf, spec_lengths):
155
+ """
156
+ random change formant
157
+ inspired by https://github.com/auspicious3000/contentvec/blob/d746688a32940f4bee410ed7c87ec9cf8ff04f74/contentvec/data/audio/audio_utils_1.py#L179
158
+ """
159
+ N = phone.shape[0]
160
+ device = phone.device
161
+ dtype = phone.dtype
162
+
163
+ f0_bin = 256
164
+ f0_max = 1100.0
165
+ f0_min = 50.0
166
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
167
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
168
+
169
+ pitch_median = torch.median(pitchf, 1).values
170
+ lo = 75. + 25. * (pitch_median >= 200).to(dtype=dtype)
171
+ hi = 250. + 150. * (pitch_median >= 200).to(dtype=dtype)
172
+ pitch_median = torch.clip(pitch_median, lo, hi).unsqueeze(1)
173
+
174
+ shift_pitch = torch.exp2((1. - 2. * torch.rand(N)) / 4).unsqueeze(1).to(device, dtype) # shift the pitch randomly within a half-octave range
175
+
176
+ new_sid = np.random.choice(np.arange(len(speaker_info))[speaker_info > 0], size=N)
177
+ rel_pitch = pitchf / pitch_median
178
+ new_pitch_median = torch.from_numpy(speaker_info[new_sid]).to(device, dtype).unsqueeze(1) * shift_pitch
179
+ new_pitchf = new_pitch_median * rel_pitch
180
+ new_sid = torch.from_numpy(new_sid).to(device)
181
+
182
+ new_pitch = 1127. * torch.log(1. + new_pitchf / 700.)
183
+ new_pitch = (new_pitch - f0_mel_min) * (f0_bin - 2.) / (f0_mel_max - f0_mel_min) + 1.
184
+ new_pitch = torch.clip(new_pitch, 1, f0_bin - 1).to(dtype=torch.int)
185
+
186
+ aug_wave = net_g.infer(phone, phone_lengths, new_pitch, new_pitchf, new_sid)[0]
187
+ aug_wave_16k = torchaudio.functional.resample(aug_wave, net_g.sr, 16000, rolloff=0.99).squeeze(1)
188
+ padding_mask = torch.arange(aug_wave_16k.shape[1]).unsqueeze(0).to(device) > (spec_lengths.unsqueeze(1) * 160).to(device)
189
+
190
+ inputs = {
191
+ "source": aug_wave_16k.to(device, dtype),
192
+ "padding_mask": padding_mask.to(device),
193
+ "output_layer": embedding_output_layer
194
+ }
195
+ logits = embedder.extract_features(**inputs)
196
+ if phone.shape[-1] == 768:
197
+ feats = logits[0]
198
+ else:
199
+ feats = embedder.final_proj(logits[0])
200
+ feats = torch.repeat_interleave(feats, 2, 1)
201
+ new_phone = torch.zeros(phone.shape).to(device, dtype)
202
+ new_phone[:, :feats.shape[1]] = feats[:, :phone.shape[1]]
203
+ return new_phone.to(device), aug_wave
204
+
205
+
206
+ def change_speaker_nono(net_g, embedder, embedding_output_layer, phone, phone_lengths, spec_lengths):
207
+ """
208
+ random change formant
209
+ inspired by https://github.com/auspicious3000/contentvec/blob/d746688a32940f4bee410ed7c87ec9cf8ff04f74/contentvec/data/audio/audio_utils_1.py#L179
210
+ """
211
+ N = phone.shape[0]
212
+ device = phone.device
213
+ dtype = phone.dtype
214
+
215
+ new_sid = np.random.randint(net_g.spk_embed_dim, size=N)
216
+ new_sid = torch.from_numpy(new_sid).to(device)
217
+
218
+ aug_wave = net_g.infer(phone, phone_lengths, new_sid)[0]
219
+ aug_wave_16k = torchaudio.functional.resample(aug_wave, net_g.sr, 16000, rolloff=0.99).squeeze(1)
220
+ padding_mask = torch.arange(aug_wave_16k.shape[1]).unsqueeze(0).to(device) > (spec_lengths.unsqueeze(1) * 160).to(device)
221
+
222
+ inputs = {
223
+ "source": aug_wave_16k.to(device, dtype),
224
+ "padding_mask": padding_mask.to(device),
225
+ "output_layer": embedding_output_layer
226
+ }
227
+
228
+ logits = embedder.extract_features(**inputs)
229
+ if phone.shape[-1] == 768:
230
+ feats = logits[0]
231
+ else:
232
+ feats = embedder.final_proj(logits[0])
233
+ feats = torch.repeat_interleave(feats, 2, 1)
234
+ new_phone = torch.zeros(phone.shape).to(device, dtype)
235
+ new_phone[:, :feats.shape[1]] = feats[:, :phone.shape[1]]
236
+ return new_phone.to(device), aug_wave
237
+
238
+
239
+ def train_index(
240
+ training_dir: str,
241
+ model_name: str,
242
+ out_dir: str,
243
+ emb_ch: int,
244
+ num_cpu_process: int,
245
+ maximum_index_size: Optional[int],
246
+ ):
247
+ checkpoint_path = os.path.join(out_dir, model_name)
248
+ feature_256_dir = os.path.join(training_dir, "3_feature256")
249
+ index_dir = os.path.join(os.path.dirname(checkpoint_path), f"{model_name}_index")
250
+ os.makedirs(index_dir, exist_ok=True)
251
+ for speaker_id in tqdm.tqdm(
252
+ sorted([dir for dir in os.listdir(feature_256_dir) if dir.isdecimal()])
253
+ ):
254
+ feature_256_spk_dir = os.path.join(feature_256_dir, speaker_id)
255
+ speaker_id = int(speaker_id)
256
+ npys = []
257
+ for name in [
258
+ os.path.join(feature_256_spk_dir, file)
259
+ for file in os.listdir(feature_256_spk_dir)
260
+ if file.endswith(".npy")
261
+ ]:
262
+ phone = np.load(os.path.join(feature_256_spk_dir, name))
263
+ npys.append(phone)
264
+
265
+ # shuffle big_npy to prevent reproducing the sound source
266
+ big_npy = np.concatenate(npys, 0)
267
+ big_npy_idx = np.arange(big_npy.shape[0])
268
+ np.random.shuffle(big_npy_idx)
269
+ big_npy = big_npy[big_npy_idx]
270
+
271
+ if not maximum_index_size is None and big_npy.shape[0] > maximum_index_size:
272
+ kmeans = MiniBatchKMeans(
273
+ n_clusters=maximum_index_size,
274
+ batch_size=256 * num_cpu_process,
275
+ init="random",
276
+ compute_labels=False,
277
+ )
278
+ kmeans.fit(big_npy)
279
+ big_npy = kmeans.cluster_centers_
280
+
281
+ # recommend parameter in https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
282
+ emb_ch = big_npy.shape[1]
283
+ emb_ch_half = emb_ch // 2
284
+ n_ivf = int(8 * np.sqrt(big_npy.shape[0]))
285
+ if big_npy.shape[0] >= 1_000_000:
286
+ index = faiss.index_factory(
287
+ emb_ch, f"IVF{n_ivf},PQ{emb_ch_half}x4fsr,RFlat"
288
+ )
289
+ else:
290
+ index = faiss.index_factory(emb_ch, f"IVF{n_ivf},Flat")
291
+
292
+ index.train(big_npy)
293
+ batch_size_add = 8192
294
+ for i in range(0, big_npy.shape[0], batch_size_add):
295
+ index.add(big_npy[i : i + batch_size_add])
296
+ np.save(
297
+ os.path.join(index_dir, f"{model_name}.{speaker_id}.big.npy"),
298
+ big_npy,
299
+ )
300
+ faiss.write_index(
301
+ index,
302
+ os.path.join(index_dir, f"{model_name}.{speaker_id}.index"),
303
+ )
304
+
305
+
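The index files written by train_index are read back at inference time the same way pipeline.py does it: faiss.read_index for the IVF index and reconstruct_n to recover the stored feature matrix. A minimal sketch with an illustrative path:

import faiss
import numpy as np

index = faiss.read_index("models/checkpoints/my_model_index/my_model.0.index")
big_npy = index.reconstruct_n(0, index.ntotal)  # all stored embedding vectors
query = np.random.rand(1, big_npy.shape[1]).astype("float32")
score, ix = index.search(query, k=8)            # distances and neighbour ids
weight = np.square(1 / score)
weight /= weight.sum(axis=1, keepdims=True)
blended = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)  # same blend as _convert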
306
+ def train_model(
307
+ gpus: List[int],
308
+ config: TrainConfig,
309
+ training_dir: str,
310
+ model_name: str,
311
+ out_dir: str,
312
+ sample_rate: int,
313
+ f0: bool,
314
+ batch_size: int,
315
+ augment: bool,
316
+ augment_path: Optional[str],
317
+ speaker_info_path: Optional[str],
318
+ cache_batch: bool,
319
+ total_epoch: int,
320
+ save_every_epoch: int,
321
+ save_wav_with_checkpoint: bool,
322
+ pretrain_g: str,
323
+ pretrain_d: str,
324
+ embedder_name: str,
325
+ embedding_output_layer: int,
326
+ save_only_last: bool = False,
327
+ device: Optional[Union[str, torch.device]] = None,
328
+ ):
329
+ os.environ["MASTER_ADDR"] = "localhost"
330
+ os.environ["MASTER_PORT"] = str(utils.find_empty_port())
331
+
332
+ deterministic = torch.backends.cudnn.deterministic
333
+ benchmark = torch.backends.cudnn.benchmark
334
+ PREV_CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None)
335
+
336
+ torch.backends.cudnn.deterministic = False
337
+ torch.backends.cudnn.benchmark = False
338
+
339
+ os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(gpu) for gpu in gpus])
340
+
341
+ start = time.perf_counter()
342
+
343
+ # On Mac (MPS), mp.spawn runs into trouble, so call training_runner directly.
344
+ if device is not None:
345
+ training_runner(
346
+ 0, # rank
347
+ 1, # world size
348
+ config,
349
+ training_dir,
350
+ model_name,
351
+ out_dir,
352
+ sample_rate,
353
+ f0,
354
+ batch_size,
355
+ augment,
356
+ augment_path,
357
+ speaker_info_path,
358
+ cache_batch,
359
+ total_epoch,
360
+ save_every_epoch,
361
+ save_wav_with_checkpoint,
362
+ pretrain_g,
363
+ pretrain_d,
364
+ embedder_name,
365
+ embedding_output_layer,
366
+ save_only_last,
367
+ device,
368
+ )
369
+ else:
370
+ mp.spawn(
371
+ training_runner,
372
+ nprocs=len(gpus),
373
+ args=(
374
+ len(gpus),
375
+ config,
376
+ training_dir,
377
+ model_name,
378
+ out_dir,
379
+ sample_rate,
380
+ f0,
381
+ batch_size,
382
+ augment,
383
+ augment_path,
384
+ speaker_info_path,
385
+ cache_batch,
386
+ total_epoch,
387
+ save_every_epoch,
388
+ save_wav_with_checkpoint,
389
+ pretrain_g,
390
+ pretrain_d,
391
+ embedder_name,
392
+ embedding_output_layer,
393
+ save_only_last,
394
+ device,
395
+ ),
396
+ )
397
+
398
+ end = time.perf_counter()
399
+
400
+ print(f"Time: {end - start}")
401
+
402
+ if PREV_CUDA_VISIBLE_DEVICES is None:
403
+ del os.environ["CUDA_VISIBLE_DEVICES"]
404
+ else:
405
+ os.environ["CUDA_VISIBLE_DEVICES"] = PREV_CUDA_VISIBLE_DEVICES
406
+
407
+ torch.backends.cudnn.deterministic = deterministic
408
+ torch.backends.cudnn.benchmark = benchmark
409
+
410
+
411
+ def training_runner(
412
+ rank: int,
413
+ world_size: int,
414
+ config: TrainConfig,
415
+ training_dir: str,
416
+ model_name: str,
417
+ out_dir: str,
418
+ sample_rate: str,
419
+ f0: bool,
420
+ batch_size: int,
421
+ augment: bool,
422
+ augment_path: Optional[str],
423
+ speaker_info_path: Optional[str],
424
+ cache_in_gpu: bool,
425
+ total_epoch: int,
426
+ save_every_epoch: int,
427
+ save_wav_with_checkpoint: bool,
428
+ pretrain_g: str,
429
+ pretrain_d: str,
430
+ embedder_name: str,
431
+ embedding_output_layer: int,
432
+ save_only_last: bool = False,
433
+ device: Optional[Union[str, torch.device]] = None,
434
+ ):
435
+ config.train.batch_size = batch_size
436
+ log_dir = os.path.join(training_dir, "logs")
437
+ state_dir = os.path.join(training_dir, "state")
438
+ training_files_path = os.path.join(training_dir, "meta.json")
439
+ training_meta = DatasetMetadata.parse_file(training_files_path)
440
+ embedder_out_channels = config.model.emb_channels
441
+
442
+ is_multi_process = world_size > 1
443
+
444
+ if device is not None:
445
+ if isinstance(device, str):
446
+ device = torch.device(device)
447
+
448
+ global_step = 0
449
+ is_main_process = rank == 0
450
+
451
+ if is_main_process:
452
+ os.makedirs(log_dir, exist_ok=True)
453
+ os.makedirs(state_dir, exist_ok=True)
454
+ writer = SummaryWriter(log_dir=log_dir)
455
+
456
+ if torch.cuda.is_available():
457
+ torch.cuda.empty_cache()
458
+
459
+ if not dist.is_initialized():
460
+ dist.init_process_group(
461
+ backend="gloo", init_method="env://", rank=rank, world_size=world_size
462
+ )
463
+
464
+ if is_multi_process:
465
+ torch.cuda.set_device(rank)
466
+
467
+ torch.manual_seed(config.train.seed)
468
+
469
+ if f0:
470
+ train_dataset = TextAudioLoaderMultiNSFsid(training_meta, config.data)
471
+ else:
472
+ train_dataset = TextAudioLoader(training_meta, config.data)
473
+
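+ # DistributedBucketSampler groups samples of similar length into the same batch so padding inside each batch is minimized.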
474
+ train_sampler = DistributedBucketSampler(
475
+ train_dataset,
476
+ config.train.batch_size * world_size,
477
+ [100, 200, 300, 400, 500, 600, 700, 800, 900],
478
+ num_replicas=world_size,
479
+ rank=rank,
480
+ shuffle=True,
481
+ )
482
+
483
+ if f0:
484
+ collate_fn = TextAudioCollateMultiNSFsid()
485
+ else:
486
+ collate_fn = TextAudioCollate()
487
+
488
+ train_loader = DataLoader(
489
+ train_dataset,
490
+ num_workers=4,
491
+ shuffle=False,
492
+ pin_memory=True,
493
+ collate_fn=collate_fn,
494
+ batch_sampler=train_sampler,
495
+ persistent_workers=True,
496
+ prefetch_factor=8,
497
+ )
498
+ speaker_info = None
499
+ if os.path.exists(os.path.join(training_dir, "speaker_info.json")):
500
+ with open(os.path.join(training_dir, "speaker_info.json"), "r") as f:
501
+ speaker_info = json.load(f)
502
+ config.model.spk_embed_dim = len(speaker_info)
503
+ if f0:
504
+ net_g = SynthesizerTrnMs256NSFSid(
505
+ config.data.filter_length // 2 + 1,
506
+ config.train.segment_size // config.data.hop_length,
507
+ **config.model.dict(),
508
+ is_half=False, # config.train.fp16_run,
509
+ sr=int(sample_rate[:-1] + "000"),
510
+ )
511
+ else:
512
+ net_g = SynthesizerTrnMs256NSFSidNono(
513
+ config.data.filter_length // 2 + 1,
514
+ config.train.segment_size // config.data.hop_length,
515
+ **config.model.dict(),
516
+ is_half=False, # config.train.fp16_run,
517
+ sr=int(sample_rate[:-1] + "000"),
518
+ )
519
+
520
+ if is_multi_process:
521
+ net_g = net_g.cuda(rank)
522
+ else:
523
+ net_g = net_g.to(device=device)
524
+
525
+ if config.version == "v1":
526
+ periods = [2, 3, 5, 7, 11, 17]
527
+ elif config.version == "v2":
528
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
529
+ net_d = MultiPeriodDiscriminator(config.model.use_spectral_norm, periods=periods)
530
+ if is_multi_process:
531
+ net_d = net_d.cuda(rank)
532
+ else:
533
+ net_d = net_d.to(device=device)
534
+
535
+ optim_g = torch.optim.AdamW(
536
+ net_g.parameters(),
537
+ config.train.learning_rate,
538
+ betas=config.train.betas,
539
+ eps=config.train.eps,
540
+ )
541
+ optim_d = torch.optim.AdamW(
542
+ net_d.parameters(),
543
+ config.train.learning_rate,
544
+ betas=config.train.betas,
545
+ eps=config.train.eps,
546
+ )
547
+
548
+ last_d_state = utils.latest_checkpoint_path(state_dir, "D_*.pth")
549
+ last_g_state = utils.latest_checkpoint_path(state_dir, "G_*.pth")
550
+
551
+ if last_d_state is None or last_g_state is None:
552
+ epoch = 1
553
+ global_step = 0
554
+ if os.path.exists(pretrain_g) and os.path.exists(pretrain_d):
555
+ net_g_state = torch.load(pretrain_g, map_location="cpu")["model"]
556
+ emb_spk_size = (config.model.spk_embed_dim, config.model.gin_channels)
557
+ emb_phone_size = (config.model.hidden_channels, config.model.emb_channels)
558
+ if emb_spk_size != net_g_state["emb_g.weight"].size():
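+ # The speaker count differs from the pretrained checkpoint, so every row of the new speaker-embedding table is initialized with the mean of the pretrained rows.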
559
+ original_weight = net_g_state["emb_g.weight"]
560
+ net_g_state["emb_g.weight"] = original_weight.mean(dim=0, keepdims=True) * torch.ones(emb_spk_size, device=original_weight.device, dtype=original_weight.dtype)
561
+ if emb_phone_size != net_g_state["enc_p.emb_phone.weight"].size():
562
+ # interpolate
563
+ orig_shape = net_g_state["enc_p.emb_phone.weight"].size()
564
+ if net_g_state["enc_p.emb_phone.weight"].dtype == torch.half:
565
+ net_g_state["enc_p.emb_phone.weight"] = (
566
+ F.interpolate(
567
+ net_g_state["enc_p.emb_phone.weight"]
568
+ .float()
569
+ .unsqueeze(0)
570
+ .unsqueeze(0),
571
+ size=emb_phone_size,
572
+ mode="bilinear",
573
+ )
574
+ .half()
575
+ .squeeze(0)
576
+ .squeeze(0)
577
+ )
578
+ else:
579
+ net_g_state["enc_p.emb_phone.weight"] = (
580
+ F.interpolate(
581
+ net_g_state["enc_p.emb_phone.weight"]
582
+ .unsqueeze(0)
583
+ .unsqueeze(0),
584
+ size=emb_phone_size,
585
+ mode="bilinear",
586
+ )
587
+ .squeeze(0)
588
+ .squeeze(0)
589
+ )
590
+ print(
591
+ "interpolated pretrained state enc_p.emb_phone from",
592
+ orig_shape,
593
+ "to",
594
+ emb_phone_size,
595
+ )
596
+ if is_multi_process:
597
+ net_g.module.load_state_dict(net_g_state)
598
+ else:
599
+ net_g.load_state_dict(net_g_state)
600
+
601
+ del net_g_state
602
+
603
+ if is_multi_process:
604
+ net_d.module.load_state_dict(
605
+ torch.load(pretrain_d, map_location="cpu")["model"]
606
+ )
607
+ else:
608
+ net_d.load_state_dict(
609
+ torch.load(pretrain_d, map_location="cpu")["model"]
610
+ )
611
+ if is_main_process:
612
+ print(f"loaded pretrained {pretrain_g} {pretrain_d}")
613
+
614
+ else:
615
+ _, _, _, epoch = utils.load_checkpoint(last_d_state, net_d, optim_d)
616
+ _, _, _, epoch = utils.load_checkpoint(last_g_state, net_g, optim_g)
617
+ if is_main_process:
618
+ print(f"loaded last state {last_d_state} {last_g_state}")
619
+
620
+ epoch += 1
621
+ global_step = (epoch - 1) * len(train_loader)
622
+
623
+ if augment:
624
+ # load embedder
625
+ embedder_filepath, _, embedder_load_from = get_embedder(embedder_name)
626
+
627
+ if embedder_load_from == "local":
628
+ embedder_filepath = os.path.join(
629
+ MODELS_DIR, "embeddings", embedder_filepath
630
+ )
631
+ embedder, _ = load_embedder(embedder_filepath, device)
632
+ if not config.train.fp16_run:
633
+ embedder = embedder.float()
634
+
635
+ if augment_path is not None:
636
+ state_dict = torch.load(augment_path, map_location="cpu")
637
+ if state_dict["f0"] == 1:
638
+ augment_net_g = SynthesizerTrnMs256NSFSid(
639
+ **state_dict["params"], is_half=config.train.fp16_run
640
+ )
641
+ augment_speaker_info = np.load(speaker_info_path)
642
+ else:
643
+ augment_net_g = SynthesizerTrnMs256NSFSidNono(
644
+ **state_dict["params"], is_half=config.train.fp16_run
645
+ )
646
+
647
+ augment_net_g.load_state_dict(state_dict["weight"], strict=False)
648
+ augment_net_g.eval().to(device)
649
+
650
+ else:
651
+ augment_net_g = net_g
652
+ if f0:
653
+ medians = [[] for _ in range(augment_net_g.spk_embed_dim)]
654
+ for file in training_meta.files.values():
655
+ f0f = np.load(file.f0nsf)
656
+ if np.any(f0f > 0):
657
+ medians[file.speaker_id].append(np.median(f0f[f0f > 0]))
658
+ augment_speaker_info = np.array([np.median(x) if len(x) else 0. for x in medians])
659
+ np.save(os.path.join(training_dir, "speaker_info.npy"), augment_speaker_info)
660
+
661
+ if is_multi_process:
662
+ net_g = DDP(net_g, device_ids=[rank])
663
+ net_d = DDP(net_d, device_ids=[rank])
664
+
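+ # ExponentialLR's last_epoch is 0-based (-1 means "not started"), while epoch here is 1-based, hence last_epoch=epoch - 2 when resuming.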
665
+ scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
666
+ optim_g, gamma=config.train.lr_decay, last_epoch=epoch - 2
667
+ )
668
+ scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
669
+ optim_d, gamma=config.train.lr_decay, last_epoch=epoch - 2
670
+ )
671
+
672
+ scaler = GradScaler(enabled=config.train.fp16_run)
673
+
674
+ cache = []
675
+ progress_bar = tqdm.tqdm(range((total_epoch - epoch + 1) * len(train_loader)))
676
+ progress_bar.set_postfix(epoch=epoch)
677
+ step = -1 + len(train_loader) * (epoch - 1)
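+ # step counts training iterations across resumed runs; it starts at -1 because it is incremented at the top of the batch loop.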
678
+ for epoch in range(epoch, total_epoch + 1):
679
+ train_loader.batch_sampler.set_epoch(epoch)
680
+
681
+ net_g.train()
682
+ net_d.train()
683
+
684
+ use_cache = len(cache) == len(train_loader)
685
+ data = cache if use_cache else enumerate(train_loader)
686
+
687
+ if is_main_process:
688
+ lr = optim_g.param_groups[0]["lr"]
689
+
690
+ if use_cache:
691
+ shuffle(cache)
692
+
693
+ for batch_idx, batch in data:
694
+ step += 1
695
+ progress_bar.update(1)
696
+ if f0:
697
+ (
698
+ phone,
699
+ phone_lengths,
700
+ pitch,
701
+ pitchf,
702
+ spec,
703
+ spec_lengths,
704
+ wave,
705
+ wave_lengths,
706
+ sid,
707
+ ) = batch
708
+ else:
709
+ (
710
+ phone,
711
+ phone_lengths,
712
+ spec,
713
+ spec_lengths,
714
+ wave,
715
+ wave_lengths,
716
+ sid,
717
+ ) = batch
718
+
719
+ if not use_cache:
720
+ phone, phone_lengths = (
721
+ phone.to(device=device, non_blocking=True),
722
+ phone_lengths.to(device=device, non_blocking=True),
723
+ )
724
+ if f0:
725
+ pitch, pitchf = (
726
+ pitch.to(device=device, non_blocking=True),
727
+ pitchf.to(device=device, non_blocking=True),
728
+ )
729
+ sid = sid.to(device=device, non_blocking=True)
730
+ spec, spec_lengths = (
731
+ spec.to(device=device, non_blocking=True),
732
+ spec_lengths.to(device=device, non_blocking=True),
733
+ )
734
+ wave, wave_lengths = (
735
+ wave.to(device=device, non_blocking=True),
736
+ wave_lengths.to(device=device, non_blocking=True),
737
+ )
738
+ if cache_in_gpu:
739
+ if f0:
740
+ cache.append(
741
+ (
742
+ batch_idx,
743
+ (
744
+ phone,
745
+ phone_lengths,
746
+ pitch,
747
+ pitchf,
748
+ spec,
749
+ spec_lengths,
750
+ wave,
751
+ wave_lengths,
752
+ sid,
753
+ ),
754
+ )
755
+ )
756
+ else:
757
+ cache.append(
758
+ (
759
+ batch_idx,
760
+ (
761
+ phone,
762
+ phone_lengths,
763
+ spec,
764
+ spec_lengths,
765
+ wave,
766
+ wave_lengths,
767
+ sid,
768
+ ),
769
+ )
770
+ )
771
+
772
+ with autocast(enabled=config.train.fp16_run):
773
+ if augment:
774
+ with torch.no_grad():
775
+ if type(augment_net_g) == SynthesizerTrnMs256NSFSid:
776
+ new_phone, aug_wave = change_speaker(augment_net_g, augment_speaker_info, embedder, embedding_output_layer, phone, phone_lengths, pitch, pitchf, spec_lengths)
777
+ else:
778
+ new_phone, aug_wave = change_speaker_nono(augment_net_g, embedder, embedding_output_layer, phone, phone_lengths, spec_lengths)
779
+ weight = np.power(0.5, step / len(train_loader))  # early in training, use the original phone embedding as-is, shifting toward the converted one as steps increase
780
+ phone = phone * weight + new_phone * (1. - weight)
781
+
782
+ if f0:
783
+ (
784
+ y_hat,
785
+ ids_slice,
786
+ x_mask,
787
+ z_mask,
788
+ (z, z_p, m_p, logs_p, m_q, logs_q),
789
+ ) = net_g(
790
+ phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid
791
+ )
792
+ else:
793
+ (
794
+ y_hat,
795
+ ids_slice,
796
+ x_mask,
797
+ z_mask,
798
+ (z, z_p, m_p, logs_p, m_q, logs_q),
799
+ ) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
800
+ mel = spec_to_mel_torch(
801
+ spec,
802
+ config.data.filter_length,
803
+ config.data.n_mel_channels,
804
+ config.data.sampling_rate,
805
+ config.data.mel_fmin,
806
+ config.data.mel_fmax,
807
+ )
808
+ y_mel = commons.slice_segments(
809
+ mel, ids_slice, config.train.segment_size // config.data.hop_length
810
+ )
811
+ with autocast(enabled=False):
812
+ y_hat_mel = mel_spectrogram_torch(
813
+ y_hat.float().squeeze(1),
814
+ config.data.filter_length,
815
+ config.data.n_mel_channels,
816
+ config.data.sampling_rate,
817
+ config.data.hop_length,
818
+ config.data.win_length,
819
+ config.data.mel_fmin,
820
+ config.data.mel_fmax,
821
+ )
822
+ if config.train.fp16_run and device != torch.device("mps"):
823
+ y_hat_mel = y_hat_mel.half()
824
+ wave_slice = commons.slice_segments(
825
+ wave, ids_slice * config.data.hop_length, config.train.segment_size
826
+ ) # slice
827
+
828
+ # Discriminator
829
+ y_d_hat_r, y_d_hat_g, _, _ = net_d(wave_slice, y_hat.detach())
830
+ with autocast(enabled=False):
831
+ loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
832
+ y_d_hat_r, y_d_hat_g
833
+ )
834
+ optim_d.zero_grad()
835
+ scaler.scale(loss_disc).backward()
836
+ scaler.unscale_(optim_d)
837
+ grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
838
+ scaler.step(optim_d)
839
+
840
+ with autocast(enabled=config.train.fp16_run):
841
+ # Generator
842
+ y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave_slice, y_hat)
843
+ with autocast(enabled=False):
844
+ loss_mel = F.l1_loss(y_mel, y_hat_mel) * config.train.c_mel
845
+ loss_kl = (
846
+ kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl
847
+ )
848
+ loss_fm = feature_loss(fmap_r, fmap_g)
849
+ loss_gen, losses_gen = generator_loss(y_d_hat_g)
850
+ loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
851
+ optim_g.zero_grad()
852
+ scaler.scale(loss_gen_all).backward()
853
+ scaler.unscale_(optim_g)
854
+ grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
855
+ scaler.step(optim_g)
856
+ scaler.update()
857
+
858
+ if is_main_process:
859
+ progress_bar.set_postfix(
860
+ epoch=epoch,
861
+ loss_g=float(loss_gen_all) if loss_gen_all is not None else 0.0,
862
+ loss_d=float(loss_disc) if loss_disc is not None else 0.0,
863
+ lr=float(lr) if lr is not None else 0.0,
864
+ use_cache=use_cache,
865
+ )
866
+ if global_step % config.train.log_interval == 0:
867
+ lr = optim_g.param_groups[0]["lr"]
868
+ # clamp outlier losses so the TensorBoard plots stay readable
869
+ if loss_mel > 50:
870
+ loss_mel = 50
871
+ if loss_kl > 5:
872
+ loss_kl = 5
873
+
874
+ scalar_dict = {
875
+ "loss/g/total": loss_gen_all,
876
+ "loss/d/total": loss_disc,
877
+ "learning_rate": lr,
878
+ "grad_norm_d": grad_norm_d,
879
+ "grad_norm_g": grad_norm_g,
880
+ }
881
+ scalar_dict.update(
882
+ {
883
+ "loss/g/fm": loss_fm,
884
+ "loss/g/mel": loss_mel,
885
+ "loss/g/kl": loss_kl,
886
+ }
887
+ )
888
+
889
+ scalar_dict.update(
890
+ {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
891
+ )
892
+ scalar_dict.update(
893
+ {
894
+ "loss/d_r/{}".format(i): v
895
+ for i, v in enumerate(losses_disc_r)
896
+ }
897
+ )
898
+ scalar_dict.update(
899
+ {
900
+ "loss/d_g/{}".format(i): v
901
+ for i, v in enumerate(losses_disc_g)
902
+ }
903
+ )
904
+ image_dict = {
905
+ "slice/mel_org": utils.plot_spectrogram_to_numpy(
906
+ y_mel[0].data.cpu().numpy()
907
+ ),
908
+ "slice/mel_gen": utils.plot_spectrogram_to_numpy(
909
+ y_hat_mel[0].data.cpu().numpy()
910
+ ),
911
+ "all/mel": utils.plot_spectrogram_to_numpy(
912
+ mel[0].data.cpu().numpy()
913
+ ),
914
+ }
915
+ utils.summarize(
916
+ writer=writer,
917
+ global_step=global_step,
918
+ images=image_dict,
919
+ scalars=scalar_dict,
920
+ )
921
+ global_step += 1
922
+ if is_main_process and save_every_epoch != 0 and epoch % save_every_epoch == 0:
923
+ if save_only_last:
924
+ old_g_path = os.path.join(
925
+ state_dir, f"G_{epoch - save_every_epoch}.pth"
926
+ )
927
+ old_d_path = os.path.join(
928
+ state_dir, f"D_{epoch - save_every_epoch}.pth"
929
+ )
930
+ old_wav_path = os.path.join(
931
+ state_dir, f"wav_sample_{epoch - save_every_epoch}"
932
+ )
933
+ if os.path.exists(old_g_path):
934
+ os.remove(old_g_path)
935
+ if os.path.exists(old_d_path):
936
+ os.remove(old_d_path)
937
+ if os.path.exists(old_wav_path):
938
+ shutil.rmtree(old_wav_path)
939
+
940
+ if save_wav_with_checkpoint:
941
+ with autocast(enabled=config.train.fp16_run):
942
+ with torch.no_grad():
943
+ if f0:
944
+ pred_wave = net_g.infer(phone, phone_lengths, pitch, pitchf, sid)[0]
945
+ else:
946
+ pred_wave = net_g.infer(phone, phone_lengths, sid)[0]
947
+ os.makedirs(os.path.join(state_dir, f"wav_sample_{epoch}"), exist_ok=True)
948
+ for i in range(pred_wave.shape[0]):
949
+ torchaudio.save(filepath=os.path.join(state_dir, f"wav_sample_{epoch}", f"{i:02}_y_true.wav"), src=wave[i].detach().cpu().float(), sample_rate=int(sample_rate[:-1] + "000"))
950
+ torchaudio.save(filepath=os.path.join(state_dir, f"wav_sample_{epoch}", f"{i:02}_y_pred.wav"), src=pred_wave[i].detach().cpu().float(), sample_rate=int(sample_rate[:-1] + "000"))
951
+ if augment:
952
+ torchaudio.save(filepath=os.path.join(state_dir, f"wav_sample_{epoch}", f"{i:02}_y_aug.wav"), src=aug_wave[i].detach().cpu().float(), sample_rate=int(sample_rate[:-1] + "000"))
953
+
954
+ utils.save_state(
955
+ net_g,
956
+ optim_g,
957
+ config.train.learning_rate,
958
+ epoch,
959
+ os.path.join(state_dir, f"G_{epoch}.pth"),
960
+ )
961
+ utils.save_state(
962
+ net_d,
963
+ optim_d,
964
+ config.train.learning_rate,
965
+ epoch,
966
+ os.path.join(state_dir, f"D_{epoch}.pth"),
967
+ )
968
+
969
+ save(
970
+ net_g,
971
+ config.version,
972
+ sample_rate,
973
+ f0,
974
+ embedder_name,
975
+ embedder_out_channels,
976
+ embedding_output_layer,
977
+ os.path.join(training_dir, "checkpoints", f"{model_name}-{epoch}.pth"),
978
+ epoch,
979
+ speaker_info
980
+ )
981
+
982
+ scheduler_g.step()
983
+ scheduler_d.step()
984
+
985
+ if is_main_process:
986
+ print("Training is done. The program is closed.")
987
+ save(
988
+ net_g,
989
+ config.version,
990
+ sample_rate,
991
+ f0,
992
+ embedder_name,
993
+ embedder_out_channels,
994
+ embedding_output_layer,
995
+ os.path.join(out_dir, f"{model_name}.pth"),
996
+ epoch,
997
+ speaker_info
998
+ )
lib/rvc/transforms.py ADDED
@@ -0,0 +1,207 @@
1
+ import numpy as np
2
+ import torch
3
+ from torch.nn import functional as F
4
+
5
+ DEFAULT_MIN_BIN_WIDTH = 1e-3
6
+ DEFAULT_MIN_BIN_HEIGHT = 1e-3
7
+ DEFAULT_MIN_DERIVATIVE = 1e-3
8
+
9
+
10
+ def piecewise_rational_quadratic_transform(
11
+ inputs,
12
+ unnormalized_widths,
13
+ unnormalized_heights,
14
+ unnormalized_derivatives,
15
+ inverse=False,
16
+ tails=None,
17
+ tail_bound=1.0,
18
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
19
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
20
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
21
+ ):
22
+ if tails is None:
23
+ spline_fn = rational_quadratic_spline
24
+ spline_kwargs = {}
25
+ else:
26
+ spline_fn = unconstrained_rational_quadratic_spline
27
+ spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
28
+
29
+ outputs, logabsdet = spline_fn(
30
+ inputs=inputs,
31
+ unnormalized_widths=unnormalized_widths,
32
+ unnormalized_heights=unnormalized_heights,
33
+ unnormalized_derivatives=unnormalized_derivatives,
34
+ inverse=inverse,
35
+ min_bin_width=min_bin_width,
36
+ min_bin_height=min_bin_height,
37
+ min_derivative=min_derivative,
38
+ **spline_kwargs
39
+ )
40
+ return outputs, logabsdet
41
+
42
+
43
+ def searchsorted(bin_locations, inputs, eps=1e-6):
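+ # Returns the index of the bin each input falls into; eps nudges the last edge so inputs sitting exactly on it stay inside the final bin.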
44
+ bin_locations[..., -1] += eps
45
+ return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
46
+
47
+
48
+ def unconstrained_rational_quadratic_spline(
49
+ inputs,
50
+ unnormalized_widths,
51
+ unnormalized_heights,
52
+ unnormalized_derivatives,
53
+ inverse=False,
54
+ tails="linear",
55
+ tail_bound=1.0,
56
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
57
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
58
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
59
+ ):
60
+ inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
61
+ outside_interval_mask = ~inside_interval_mask
62
+
63
+ outputs = torch.zeros_like(inputs)
64
+ logabsdet = torch.zeros_like(inputs)
65
+
66
+ if tails == "linear":
67
+ unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
68
+ constant = np.log(np.exp(1 - min_derivative) - 1)
69
+ unnormalized_derivatives[..., 0] = constant
70
+ unnormalized_derivatives[..., -1] = constant
71
+
72
+ outputs[outside_interval_mask] = inputs[outside_interval_mask]
73
+ logabsdet[outside_interval_mask] = 0
74
+ else:
75
+ raise RuntimeError("{} tails are not implemented.".format(tails))
76
+
77
+ (
78
+ outputs[inside_interval_mask],
79
+ logabsdet[inside_interval_mask],
80
+ ) = rational_quadratic_spline(
81
+ inputs=inputs[inside_interval_mask],
82
+ unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
83
+ unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
84
+ unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
85
+ inverse=inverse,
86
+ left=-tail_bound,
87
+ right=tail_bound,
88
+ bottom=-tail_bound,
89
+ top=tail_bound,
90
+ min_bin_width=min_bin_width,
91
+ min_bin_height=min_bin_height,
92
+ min_derivative=min_derivative,
93
+ )
94
+
95
+ return outputs, logabsdet
96
+
97
+
98
+ def rational_quadratic_spline(
99
+ inputs,
100
+ unnormalized_widths,
101
+ unnormalized_heights,
102
+ unnormalized_derivatives,
103
+ inverse=False,
104
+ left=0.0,
105
+ right=1.0,
106
+ bottom=0.0,
107
+ top=1.0,
108
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
109
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
110
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
111
+ ):
112
+ if torch.min(inputs) < left or torch.max(inputs) > right:
113
+ raise ValueError("Input to a transform is not within its domain")
114
+
115
+ num_bins = unnormalized_widths.shape[-1]
116
+
117
+ if min_bin_width * num_bins > 1.0:
118
+ raise ValueError("Minimal bin width too large for the number of bins")
119
+ if min_bin_height * num_bins > 1.0:
120
+ raise ValueError("Minimal bin height too large for the number of bins")
121
+
122
+ widths = F.softmax(unnormalized_widths, dim=-1)
123
+ widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
124
+ cumwidths = torch.cumsum(widths, dim=-1)
125
+ cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
126
+ cumwidths = (right - left) * cumwidths + left
127
+ cumwidths[..., 0] = left
128
+ cumwidths[..., -1] = right
129
+ widths = cumwidths[..., 1:] - cumwidths[..., :-1]
130
+
131
+ derivatives = min_derivative + F.softplus(unnormalized_derivatives)
132
+
133
+ heights = F.softmax(unnormalized_heights, dim=-1)
134
+ heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
135
+ cumheights = torch.cumsum(heights, dim=-1)
136
+ cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
137
+ cumheights = (top - bottom) * cumheights + bottom
138
+ cumheights[..., 0] = bottom
139
+ cumheights[..., -1] = top
140
+ heights = cumheights[..., 1:] - cumheights[..., :-1]
141
+
142
+ if inverse:
143
+ bin_idx = searchsorted(cumheights, inputs)[..., None]
144
+ else:
145
+ bin_idx = searchsorted(cumwidths, inputs)[..., None]
146
+
147
+ input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
148
+ input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
149
+
150
+ input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
151
+ delta = heights / widths
152
+ input_delta = delta.gather(-1, bin_idx)[..., 0]
153
+
154
+ input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
155
+ input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
156
+
157
+ input_heights = heights.gather(-1, bin_idx)[..., 0]
158
+
159
+ if inverse:
160
+ a = (inputs - input_cumheights) * (
161
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
162
+ ) + input_heights * (input_delta - input_derivatives)
163
+ b = input_heights * input_derivatives - (inputs - input_cumheights) * (
164
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
165
+ )
166
+ c = -input_delta * (inputs - input_cumheights)
167
+
168
+ discriminant = b.pow(2) - 4 * a * c
169
+ assert (discriminant >= 0).all()
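+ # Solve the quadratic for the bin-local coordinate using the numerically stable form of the quadratic formula.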
170
+
171
+ root = (2 * c) / (-b - torch.sqrt(discriminant))
172
+ outputs = root * input_bin_widths + input_cumwidths
173
+
174
+ theta_one_minus_theta = root * (1 - root)
175
+ denominator = input_delta + (
176
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
177
+ * theta_one_minus_theta
178
+ )
179
+ derivative_numerator = input_delta.pow(2) * (
180
+ input_derivatives_plus_one * root.pow(2)
181
+ + 2 * input_delta * theta_one_minus_theta
182
+ + input_derivatives * (1 - root).pow(2)
183
+ )
184
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
185
+
186
+ return outputs, -logabsdet
187
+ else:
188
+ theta = (inputs - input_cumwidths) / input_bin_widths
189
+ theta_one_minus_theta = theta * (1 - theta)
190
+
191
+ numerator = input_heights * (
192
+ input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
193
+ )
194
+ denominator = input_delta + (
195
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
196
+ * theta_one_minus_theta
197
+ )
198
+ outputs = input_cumheights + numerator / denominator
199
+
200
+ derivative_numerator = input_delta.pow(2) * (
201
+ input_derivatives_plus_one * theta.pow(2)
202
+ + 2 * input_delta * theta_one_minus_theta
203
+ + input_derivatives * (1 - theta).pow(2)
204
+ )
205
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
206
+
207
+ return outputs, logabsdet
lib/rvc/utils.py ADDED
@@ -0,0 +1,225 @@
1
+ import glob
2
+ import logging
3
+ import os
4
+ import shutil
5
+ import socket
6
+ import sys
7
+
8
+ import ffmpeg
9
+ import matplotlib
10
+ import matplotlib.pylab as plt
11
+ import numpy as np
12
+ import torch
13
+ from scipy.io.wavfile import read
14
+ from torch.nn import functional as F
15
+
16
+ from modules.shared import ROOT_DIR
17
+
18
+ from .config import TrainConfig
19
+
20
+ matplotlib.use("Agg")
21
+ logging.getLogger("matplotlib").setLevel(logging.WARNING)
22
+
23
+ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
24
+ logger = logging
25
+
26
+
27
+ def load_audio(file: str, sr):
28
+ try:
29
+ # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
30
+ # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
31
+ # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
32
+ file = (
33
+ file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
34
+ )  # strip stray spaces, quotes, and newlines that get copied along with the path
35
+ out, _ = (
36
+ ffmpeg.input(file, threads=0)
37
+ .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
38
+ .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
39
+ )
40
+ except Exception as e:
41
+ raise RuntimeError(f"Failed to load audio: {e}")
42
+
43
+ return np.frombuffer(out, np.float32).flatten()
44
+
45
+
46
+ def find_empty_port():
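+ # Bind to port 0 so the OS assigns a free ephemeral port, then release it and return the number (used as MASTER_PORT for distributed training).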
47
+ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
48
+ s.bind(("", 0))
49
+ s.listen(1)
50
+ port = s.getsockname()[1]
51
+ s.close()
52
+ return port
53
+
54
+
55
+ def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
56
+ assert os.path.isfile(checkpoint_path)
57
+ checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
58
+
59
+ saved_state_dict = checkpoint_dict["model"]
60
+ if hasattr(model, "module"):
61
+ state_dict = model.module.state_dict()
62
+ else:
63
+ state_dict = model.state_dict()
64
+ new_state_dict = {}
65
+ for k, v in state_dict.items():  # iterate over the shapes the current model expects
66
+ try:
67
+ new_state_dict[k] = saved_state_dict[k]
68
+ if saved_state_dict[k].shape != state_dict[k].shape:
69
+ print(
70
+ f"shape-{k}-mismatch|need-{state_dict[k].shape}|get-{saved_state_dict[k].shape}"
71
+ )
72
+ if saved_state_dict[k].dim() == 2: # NOTE: check is this ok?
73
+ # for embedded input 256 <==> 768
74
+ # this lets training resume from the original pretrained checkpoints even when the embedder outputs 768-dim features instead of 256-dim
75
+ if saved_state_dict[k].dtype == torch.half:
76
+ new_state_dict[k] = (
77
+ F.interpolate(
78
+ saved_state_dict[k].float().unsqueeze(0).unsqueeze(0),
79
+ size=state_dict[k].shape,
80
+ mode="bilinear",
81
+ )
82
+ .half()
83
+ .squeeze(0)
84
+ .squeeze(0)
85
+ )
86
+ else:
87
+ new_state_dict[k] = (
88
+ F.interpolate(
89
+ saved_state_dict[k].unsqueeze(0).unsqueeze(0),
90
+ size=state_dict[k].shape,
91
+ mode="bilinear",
92
+ )
93
+ .squeeze(0)
94
+ .squeeze(0)
95
+ )
96
+ print(
97
+ "interpolated new_state_dict",
98
+ k,
99
+ "from",
100
+ saved_state_dict[k].shape,
101
+ "to",
102
+ new_state_dict[k].shape,
103
+ )
104
+ else:
105
+ raise KeyError
106
+ except Exception as e:
107
+ # print(traceback.format_exc())
108
+ print(f"{k} is not in the checkpoint")
109
+ print("error: %s" % e)
110
+ new_state_dict[k] = v  # keep the model's own randomly initialized value
111
+ if hasattr(model, "module"):
112
+ model.module.load_state_dict(new_state_dict, strict=False)
113
+ else:
114
+ model.load_state_dict(new_state_dict, strict=False)
115
+ print("Loaded model weights")
116
+
117
+ epoch = checkpoint_dict["epoch"]
118
+ learning_rate = checkpoint_dict["learning_rate"]
119
+ if optimizer is not None and load_opt == 1:
120
+ optimizer.load_state_dict(checkpoint_dict["optimizer"])
121
+ print("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, epoch))
122
+ return model, optimizer, learning_rate, epoch
123
+
124
+
125
+ def save_state(model, optimizer, learning_rate, epoch, checkpoint_path):
126
+ print(
127
+ "Saving model and optimizer state at epoch {} to {}".format(
128
+ epoch, checkpoint_path
129
+ )
130
+ )
131
+ if hasattr(model, "module"):
132
+ state_dict = model.module.state_dict()
133
+ else:
134
+ state_dict = model.state_dict()
135
+ torch.save(
136
+ {
137
+ "model": state_dict,
138
+ "epoch": epoch,
139
+ "optimizer": optimizer.state_dict(),
140
+ "learning_rate": learning_rate,
141
+ },
142
+ checkpoint_path,
143
+ )
144
+
145
+
146
+ def summarize(
147
+ writer,
148
+ global_step,
149
+ scalars={},
150
+ histograms={},
151
+ images={},
152
+ audios={},
153
+ audio_sampling_rate=22050,
154
+ ):
155
+ for k, v in scalars.items():
156
+ writer.add_scalar(k, v, global_step)
157
+ for k, v in histograms.items():
158
+ writer.add_histogram(k, v, global_step)
159
+ for k, v in images.items():
160
+ writer.add_image(k, v, global_step, dataformats="HWC")
161
+ for k, v in audios.items():
162
+ writer.add_audio(k, v, global_step, audio_sampling_rate)
163
+
164
+
165
+ def latest_checkpoint_path(dir_path, regex="G_*.pth"):
166
+ filelist = glob.glob(os.path.join(dir_path, regex))
167
+ if len(filelist) == 0:
168
+ return None
169
+ filelist.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
170
+ filepath = filelist[-1]
171
+ return filepath
172
+
173
+
174
+ def plot_spectrogram_to_numpy(spectrogram):
175
+ fig, ax = plt.subplots(figsize=(10, 2))
176
+ im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
177
+ plt.colorbar(im, ax=ax)
178
+ plt.xlabel("Frames")
179
+ plt.ylabel("Channels")
180
+ plt.tight_layout()
181
+
182
+ fig.canvas.draw()
183
+ data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
184
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
185
+ plt.close()
186
+ return data
187
+
188
+
189
+ def plot_alignment_to_numpy(alignment, info=None):
190
+ fig, ax = plt.subplots(figsize=(6, 4))
191
+ im = ax.imshow(
192
+ alignment.transpose(), aspect="auto", origin="lower", interpolation="none"
193
+ )
194
+ fig.colorbar(im, ax=ax)
195
+ xlabel = "Decoder timestep"
196
+ if info is not None:
197
+ xlabel += "\n\n" + info
198
+ plt.xlabel(xlabel)
199
+ plt.ylabel("Encoder timestep")
200
+ plt.tight_layout()
201
+
202
+ fig.canvas.draw()
203
+ data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
204
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
205
+ plt.close()
206
+ return data
207
+
208
+
209
+ def load_wav_to_torch(full_path):
210
+ sampling_rate, data = read(full_path)
211
+ return torch.FloatTensor(data.astype(np.float32)), sampling_rate
212
+
213
+
214
+ def load_config(training_dir: str, sample_rate: int, emb_channels: int):
215
+ if emb_channels == 256:
216
+ config_path = os.path.join(ROOT_DIR, "configs", f"{sample_rate}.json")
217
+ else:
218
+ config_path = os.path.join(
219
+ ROOT_DIR, "configs", f"{sample_rate}-{emb_channels}.json"
220
+ )
221
+ config_save_path = os.path.join(training_dir, "config.json")
222
+
223
+ shutil.copyfile(config_path, config_save_path)
224
+
225
+ return TrainConfig.parse_file(config_save_path)
models/checkpoints/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ *
2
+ !.gitignore
models/embeddings/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ *
2
+ !.gitignore
models/pretrained/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ *
2
+ !.gitignore
models/training/.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ */**
2
+
3
+ !mute/**/*
4
+ !.gitignore
5
+
6
+ mute/**/*.pt
models/training/models/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ *
2
+ !.gitignore
models/training/mute/0_gt_wavs/mute32k.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9edcf85ec77e88bd01edf3d887bdc418d3596d573f7ad2694da546f41dae6baf
3
+ size 192078
models/training/mute/0_gt_wavs/mute40k.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67a816e77b50cb9f016e49e5c01f07e080c4e3b82b7a8ac3e64bcb143f90f31b
3
+ size 240078
models/training/mute/0_gt_wavs/mute48k.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f2bb4daaa106e351aebb001e5a25de985c0b472f22e8d60676bc924a79056ee
3
+ size 288078
models/training/mute/1_16k_wavs/mute.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e233e86ba1be365e1133f157d56b61110086b89650ecfbdfc013c759e466250
3
+ size 96078
models/training/mute/2a_f0/mute.wav.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b9acf9ab7facdb032e1d687fe35182670b0b94566c4b209ae48c239d19956a6
3
+ size 1332
models/training/mute/2b_f0nsf/mute.wav.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30792849c8e72d67e6691754077f2888b101cb741e9c7f193c91dd9692870c87
3
+ size 2536
models/training/mute/3_feature256/mute.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64d5abbac078e19a3f649c0d78a02cb33a71407ded3ddf2db78e6b803d0c0126
3
+ size 152704
modules/cmd_opts.py ADDED
@@ -0,0 +1,22 @@
1
+ import argparse
2
+
3
+ parser = argparse.ArgumentParser()
4
+
5
+ parser.add_argument("--host", help="Host to connect to", type=str, default="127.0.0.1")
6
+ parser.add_argument("--port", help="Port to connect to", type=int)
7
+ parser.add_argument("--share", help="Enable gradio share", action="store_true")
8
+ parser.add_argument(
9
+ "--models-dir", help="Path to models directory", type=str, default=None
10
+ )
11
+ parser.add_argument(
12
+ "--output-dir", help="Path to output directory", type=str, default=None
13
+ )
14
+ parser.add_argument(
15
+ "--precision",
16
+ help="Precision to use",
17
+ type=str,
18
+ default="fp16",
19
+ choices=["fp32", "fp16"],
20
+ )
21
+
22
+ opts, _ = parser.parse_known_args()
modules/core.py ADDED
@@ -0,0 +1,156 @@
1
+ import hashlib
2
+ import os
3
+ import shutil
4
+ import sys
5
+ from concurrent.futures import ThreadPoolExecutor
6
+
7
+ import requests
8
+
9
+ from modules.models import MODELS_DIR
10
+ from modules.shared import ROOT_DIR
11
+ from modules.utils import download_file
12
+
13
+
14
+ def get_hf_etag(url: str):
15
+ r = requests.head(url)
16
+
17
+ etag = r.headers["X-Linked-ETag"] if "X-Linked-ETag" in r.headers else ""
18
+
19
+ if etag.startswith('"') and etag.endswith('"'):
20
+ etag = etag[1:-1]
21
+
22
+ return etag
23
+
24
+
25
+ def calc_sha256(filepath: str):
26
+ sha256 = hashlib.sha256()
27
+ with open(filepath, "rb") as f:
28
+ for chunk in iter(lambda: f.read(4096), b""):
29
+ sha256.update(chunk)
30
+ return sha256.hexdigest()
31
+
32
+
33
+ def download_models():
34
+ def hash_check(url: str, out: str):
35
+ if not os.path.exists(out):
36
+ return False
37
+ etag = get_hf_etag(url)
38
+ hash = calc_sha256(out)
39
+ return etag == hash
40
+
41
+ os.makedirs(os.path.join(MODELS_DIR, "pretrained", "v2"), exist_ok=True)
42
+
43
+ tasks = []
44
+ for template in [
45
+ "D{}k",
46
+ "G{}k",
47
+ "f0D{}k",
48
+ "f0G{}k",
49
+ ]:
50
+ basename = template.format("40")
51
+ url = f"https://huggingface.co/ddPn08/rvc-webui-models/resolve/main/pretrained/v2/{basename}.pth"
52
+ out = os.path.join(MODELS_DIR, "pretrained", "v2", f"{basename}.pth")
53
+
54
+ if hash_check(url, out):
55
+ continue
56
+
57
+ tasks.append((url, out))
58
+
59
+ for filename in [
60
+ "checkpoint_best_legacy_500.pt",
61
+ ]:
62
+ out = os.path.join(MODELS_DIR, "embeddings", filename)
63
+ url = f"https://huggingface.co/ddPn08/rvc-webui-models/resolve/main/embeddings/{filename}"
64
+
65
+ if hash_check(url, out):
66
+ continue
67
+
68
+ tasks.append(
69
+ (
70
+ f"https://huggingface.co/ddPn08/rvc-webui-models/resolve/main/embeddings/{filename}",
71
+ out,
72
+ )
73
+ )
74
+
75
+ # japanese-hubert-base (Fairseq)
76
+ # from official repo
77
+ # NOTE: change filename?
78
+ hubert_jp_url = f"https://huggingface.co/rinna/japanese-hubert-base/resolve/main/fairseq/model.pt"
79
+ out = os.path.join(MODELS_DIR, "embeddings", "rinna_hubert_base_jp.pt")
80
+ if not hash_check(hubert_jp_url, out):
81
+ tasks.append(
82
+ (
83
+ hubert_jp_url,
84
+ out,
85
+ )
86
+ )
87
+
88
+ if len(tasks) < 1:
89
+ return
90
+
91
+ with ThreadPoolExecutor() as pool:
92
+ pool.map(
93
+ download_file,
94
+ *zip(
95
+ *[(filename, out, i, True) for i, (filename, out) in enumerate(tasks)]
96
+ ),
97
+ )
98
+
99
+
100
+ def install_ffmpeg():
101
+ if os.path.exists(os.path.join(ROOT_DIR, "bin", "ffmpeg.exe")):
102
+ return
103
+ tmpdir = os.path.join(ROOT_DIR, "tmp")
104
+ url = (
105
+ "https://www.gyan.dev/ffmpeg/builds/packages/ffmpeg-5.1.2-essentials_build.zip"
106
+ )
107
+ out = os.path.join(tmpdir, "ffmpeg.zip")
108
+ os.makedirs(os.path.dirname(out), exist_ok=True)
109
+ download_file(url, out)
110
+ shutil.unpack_archive(out, os.path.join(tmpdir, "ffmpeg"))
111
+ shutil.copyfile(
112
+ os.path.join(
113
+ tmpdir, "ffmpeg", "ffmpeg-5.1.2-essentials_build", "bin", "ffmpeg.exe"
114
+ ),
115
+ os.path.join(ROOT_DIR, "bin", "ffmpeg.exe"),
116
+ )
117
+ os.remove(os.path.join(tmpdir, "ffmpeg.zip"))
118
+ shutil.rmtree(os.path.join(tmpdir, "ffmpeg"))
119
+
120
+
121
+ def update_modelnames():
122
+ for sr in ["32k", "40k", "48k"]:
123
+ files = [
124
+ f"f0G{sr}",
125
+ f"f0D{sr}",
126
+ f"G{sr}",
127
+ f"D{sr}",
128
+ ]
129
+ for file in files:
130
+ filepath = os.path.join(MODELS_DIR, "pretrained", f"{file}.pth")
131
+ if os.path.exists(filepath):
132
+ os.rename(
133
+ filepath,
134
+ os.path.join(MODELS_DIR, "pretrained", f"{file}256.pth"),
135
+ )
136
+
137
+ if not os.path.exists(os.path.join(MODELS_DIR, "embeddings")):
138
+ os.makedirs(os.path.join(MODELS_DIR, "embeddings"))
139
+
140
+ if os.path.exists(os.path.join(MODELS_DIR, "hubert_base.pt")):
141
+ os.rename(
142
+ os.path.join(MODELS_DIR, "hubert_base.pt"),
143
+ os.path.join(MODELS_DIR, "embeddings", "hubert_base.pt"),
144
+ )
145
+ if os.path.exists(os.path.join(MODELS_DIR, "checkpoint_best_legacy_500.pt")):
146
+ os.rename(
147
+ os.path.join(MODELS_DIR, "checkpoint_best_legacy_500.pt"),
148
+ os.path.join(MODELS_DIR, "embeddings", "checkpoint_best_legacy_500.pt"),
149
+ )
150
+
151
+
152
+ def preload():
153
+ update_modelnames()
154
+ download_models()
155
+ if sys.platform == "win32":
156
+ install_ffmpeg()
modules/merge.py ADDED
@@ -0,0 +1,81 @@
1
+ from collections import OrderedDict
2
+ from typing import *
3
+
4
+ import torch
5
+ import tqdm
6
+
7
+
8
+ def merge(
9
+ path_a: str,
10
+ path_b: str,
11
+ path_c: str,
12
+ alpha: float,
13
+ weights: Dict[str, float],
14
+ method: str,
15
+ ):
16
+ def extract(ckpt: Dict[str, Any]):
17
+ a = ckpt["model"]
18
+ opt = OrderedDict()
19
+ opt["weight"] = {}
20
+ for key in a.keys():
21
+ if "enc_q" in key:
22
+ continue
23
+ opt["weight"][key] = a[key]
24
+ return opt
25
+
26
+ def load_weight(path: str):
27
+ print(f"Loading {path}...")
28
+ state_dict = torch.load(path, map_location="cpu")
29
+ if "model" in state_dict:
30
+ weight = extract(state_dict)
31
+ else:
32
+ weight = state_dict["weight"]
33
+ return weight, state_dict
34
+
35
+ def get_alpha(key: str):
36
+ try:
37
+ filtered = sorted(
38
+ [x for x in weights.keys() if key.startswith(x)], key=len, reverse=True
39
+ )
40
+ if len(filtered) < 1:
41
+ return alpha
42
+ return weights[filtered[0]]
43
+ except:
44
+ return alpha
45
+
46
+ weight_a, state_dict = load_weight(path_a)
47
+ weight_b, _ = load_weight(path_b)
48
+ if path_c is not None:
49
+ weight_c, _ = load_weight(path_c)
50
+
51
+ if sorted(list(weight_a.keys())) != sorted(list(weight_b.keys())):
52
+ raise RuntimeError("Failed to merge models.")
53
+
54
+ merged = OrderedDict()
55
+ merged["weight"] = {}
56
+
57
+ def merge_weight(a, b, c, alpha):
58
+ if method == "weight_sum":
59
+ return (1 - alpha) * a + alpha * b
60
+ elif method == "add_diff":
61
+ return a + (b - c) * alpha
62
+
63
+ for key in tqdm.tqdm(weight_a.keys()):
64
+ a = get_alpha(key)
65
+ if path_c is not None:
66
+ merged["weight"][key] = merge_weight(
67
+ weight_a[key], weight_b[key], weight_c[key], a
68
+ )
69
+ else:
70
+ merged["weight"][key] = merge_weight(weight_a[key], weight_b[key], None, a)
71
+ merged["config"] = state_dict["config"]
72
+ merged["params"] = state_dict["params"] if "params" in state_dict else None
73
+ merged["version"] = state_dict.get("version", "v1")
74
+ merged["sr"] = state_dict["sr"]
75
+ merged["f0"] = state_dict["f0"]
76
+ merged["info"] = state_dict["info"]
77
+ merged["embedder_name"] = (
78
+ state_dict["embedder_name"] if "embedder_name" in state_dict else None
79
+ )
80
+ merged["embedder_output_layer"] = state_dict.get("embedder_output_layer", "12")
81
+ return merged
modules/models.py ADDED
@@ -0,0 +1,266 @@
1
+ import os
2
+ import re
3
+ from typing import *
4
+
5
+ import torch
6
+ from fairseq import checkpoint_utils
7
+ from fairseq.models.hubert.hubert import HubertModel
8
+ from pydub import AudioSegment
9
+
10
+ from lib.rvc.models import (SynthesizerTrnMs256NSFSid,
11
+ SynthesizerTrnMs256NSFSidNono)
12
+ from lib.rvc.pipeline import VocalConvertPipeline
13
+
14
+ from .cmd_opts import opts
15
+ from .shared import ROOT_DIR, device, is_half
16
+ from .utils import load_audio
17
+
18
+ AUDIO_OUT_DIR = opts.output_dir or os.path.join(ROOT_DIR, "outputs")
19
+
20
+
21
+ EMBEDDINGS_LIST = {
22
+ "hubert-base-japanese": (
23
+ "rinna_hubert_base_jp.pt",
24
+ "hubert-base-japanese",
25
+ "local",
26
+ ),
27
+ "contentvec": ("checkpoint_best_legacy_500.pt", "contentvec", "local"),
28
+ }
29
+
30
+
31
+ def update_state_dict(state_dict):
32
+ if "params" in state_dict and state_dict["params"] is not None:
33
+ return
34
+ keys = [
35
+ "spec_channels",
36
+ "segment_size",
37
+ "inter_channels",
38
+ "hidden_channels",
39
+ "filter_channels",
40
+ "n_heads",
41
+ "n_layers",
42
+ "kernel_size",
43
+ "p_dropout",
44
+ "resblock",
45
+ "resblock_kernel_sizes",
46
+ "resblock_dilation_sizes",
47
+ "upsample_rates",
48
+ "upsample_initial_channel",
49
+ "upsample_kernel_sizes",
50
+ "spk_embed_dim",
51
+ "gin_channels",
52
+ "emb_channels",
53
+ "sr",
54
+ ]
55
+ state_dict["params"] = {}
56
+ n = 0
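+ # Older checkpoints store hyperparameters as a positional list in state_dict["config"]; rebuild the named "params" dict, skipping emb_channels when the list has only 18 entries.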
57
+ for i, key in enumerate(keys):
58
+ i = i - n
59
+ if len(state_dict["config"]) != 19 and key == "emb_channels":
60
+ # backward compat.
61
+ n += 1
62
+ continue
63
+ state_dict["params"][key] = state_dict["config"][i]
64
+
65
+ if not "emb_channels" in state_dict["params"]:
66
+ if state_dict.get("version", "v1") == "v1":
67
+ state_dict["params"]["emb_channels"] = 256 # for backward compat.
68
+ state_dict["embedder_output_layer"] = 9
69
+ else:
70
+ state_dict["params"]["emb_channels"] = 768 # for backward compat.
71
+ state_dict["embedder_output_layer"] = 12
72
+
73
+
74
+ class VoiceConvertModel:
75
+ def __init__(self, model_name: str, state_dict: Dict[str, Any]) -> None:
76
+ update_state_dict(state_dict)
77
+ self.model_name = model_name
78
+ self.state_dict = state_dict
79
+ self.tgt_sr = state_dict["params"]["sr"]
80
+ f0 = state_dict.get("f0", 1)
81
+ state_dict["params"]["spk_embed_dim"] = state_dict["weight"][
82
+ "emb_g.weight"
83
+ ].shape[0]
84
+ if not "emb_channels" in state_dict["params"]:
85
+ state_dict["params"]["emb_channels"] = 256 # for backward compat.
86
+
87
+ if f0 == 1:
88
+ self.net_g = SynthesizerTrnMs256NSFSid(
89
+ **state_dict["params"], is_half=is_half
90
+ )
91
+ else:
92
+ self.net_g = SynthesizerTrnMs256NSFSidNono(**state_dict["params"])
93
+
94
+ del self.net_g.enc_q
95
+
96
+ self.net_g.load_state_dict(state_dict["weight"], strict=False)
97
+ self.net_g.eval().to(device)
98
+
99
+ if is_half:
100
+ self.net_g = self.net_g.half()
101
+ else:
102
+ self.net_g = self.net_g.float()
103
+
104
+ self.vc = VocalConvertPipeline(self.tgt_sr, device, is_half)
105
+ self.n_spk = state_dict["params"]["spk_embed_dim"]
106
+
107
+ def single(
108
+ self,
109
+ sid: int,
110
+ input_audio: str,
111
+ embedder_model_name: str,
112
+ embedding_output_layer: str,
113
+ f0_up_key: int,
114
+ f0_file: str,
115
+ f0_method: str,
116
+ auto_load_index: bool,
117
+ faiss_index_file: str,
118
+ index_rate: float,
119
+ output_dir: str = AUDIO_OUT_DIR,
120
+ ):
121
+ if not input_audio:
122
+ raise Exception("You need to set Source Audio")
123
+ f0_up_key = int(f0_up_key)
124
+ audio = load_audio(input_audio, 16000)
125
+
126
+ if embedder_model_name == "auto":
127
+ embedder_model_name = (
128
+ self.state_dict["embedder_name"]
129
+ if "embedder_name" in self.state_dict
130
+ else "hubert_base"
131
+ )
132
+ if embedder_model_name.endswith("768"):
133
+ embedder_model_name = embedder_model_name[:-3]
134
+
135
+ if embedder_model_name == "hubert_base":
136
+ embedder_model_name = "contentvec"
137
+
138
+ if embedder_model_name not in EMBEDDINGS_LIST:
139
+ raise Exception(f"Not supported embedder: {embedder_model_name}")
140
+
141
+ if (
142
+ embedder_model is None
143
+ or loaded_embedder_model != EMBEDDINGS_LIST[embedder_model_name][1]
144
+ ):
145
+ print(f"load {embedder_model_name} embedder")
146
+ embedder_filename, embedder_name, load_from = get_embedder(
147
+ embedder_model_name
148
+ )
149
+ load_embedder(embedder_filename, embedder_name)
150
+
151
+ if embedding_output_layer == "auto":
152
+ embedding_output_layer = (
153
+ self.state_dict["embedding_output_layer"]
154
+ if "embedding_output_layer" in self.state_dict
155
+ else 12
156
+ )
157
+ else:
158
+ embedding_output_layer = int(embedding_output_layer)
159
+
160
+ f0 = self.state_dict.get("f0", 1)
161
+
162
+ if not faiss_index_file and auto_load_index:
163
+ faiss_index_file = self.get_index_path(sid)
164
+
165
+ audio_opt = self.vc(
166
+ embedder_model,
167
+ embedding_output_layer,
168
+ self.net_g,
169
+ sid,
170
+ audio,
171
+ f0_up_key,
172
+ f0_method,
173
+ faiss_index_file,
174
+ index_rate,
175
+ f0,
176
+ f0_file=f0_file,
177
+ )
178
+
179
+ audio = AudioSegment(
180
+ audio_opt,
181
+ frame_rate=self.tgt_sr,
182
+ sample_width=2,
183
+ channels=1,
184
+ )
185
+ os.makedirs(output_dir, exist_ok=True)
186
+ input_audio_splitext = os.path.splitext(os.path.basename(input_audio))[0]
187
+ model_splitext = os.path.splitext(self.model_name)[0]
188
+ index = 0
189
+ existing_files = os.listdir(output_dir)
190
+ for existing_file in existing_files:
191
+ result = re.match(r"\d+", existing_file)
192
+ if result:
193
+ prefix_num = int(result.group(0))
194
+ if index < prefix_num:
195
+ index = prefix_num
196
+ audio.export(
197
+ os.path.join(
198
+ output_dir, f"{index+1}-{model_splitext}-{input_audio_splitext}.wav"
199
+ ),
200
+ format="wav",
201
+ )
202
+ return audio_opt
203
+
204
+ def get_index_path(self, speaker_id: int):
205
+ basename = os.path.splitext(self.model_name)[0]
206
+ speaker_index_path = os.path.join(
207
+ MODELS_DIR,
208
+ "checkpoints",
209
+ f"{basename}_index",
210
+ f"{basename}.{speaker_id}.index",
211
+ )
212
+ if os.path.exists(speaker_index_path):
213
+ return speaker_index_path
214
+ return os.path.join(MODELS_DIR, "checkpoints", f"{basename}.index")
215
+
216
+
217
+ MODELS_DIR = opts.models_dir or os.path.join(ROOT_DIR, "models")
218
+ vc_model: Optional[VoiceConvertModel] = None
219
+ embedder_model: Optional[HubertModel] = None
220
+ loaded_embedder_model = ""
221
+
222
+
223
+ def get_models():
224
+ dir = os.path.join(ROOT_DIR, "models", "checkpoints")
225
+ os.makedirs(dir, exist_ok=True)
226
+ return [
227
+ file
228
+ for file in os.listdir(dir)
229
+ if any([x for x in [".ckpt", ".pth"] if file.endswith(x)])
230
+ ]
231
+
232
+
233
+ def get_embedder(embedder_name):
234
+ if embedder_name in EMBEDDINGS_LIST:
235
+ return EMBEDDINGS_LIST[embedder_name]
236
+ return None
237
+
238
+
239
+ def load_embedder(emb_file: str, emb_name: str):
240
+ global embedder_model, loaded_embedder_model
241
+ emb_file = os.path.join(MODELS_DIR, "embeddings", emb_file)
242
+ models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
243
+ [emb_file],
244
+ suffix="",
245
+ )
246
+ embedder_model = models[0]
247
+ embedder_model = embedder_model.to(device)
248
+
249
+ if is_half:
250
+ embedder_model = embedder_model.half()
251
+ else:
252
+ embedder_model = embedder_model.float()
253
+ embedder_model.eval()
254
+
255
+ loaded_embedder_model = emb_name
256
+
257
+
258
+ def get_vc_model(model_name: str):
259
+ model_path = os.path.join(MODELS_DIR, "checkpoints", model_name)
260
+ weight = torch.load(model_path, map_location="cpu")
261
+ return VoiceConvertModel(model_name, weight)
262
+
263
+
264
+ def load_model(model_name: str):
265
+ global vc_model
266
+ vc_model = get_vc_model(model_name)
modules/separate.py ADDED
@@ -0,0 +1,82 @@
1
+ import os
2
+ from typing import *
3
+
4
+ import tqdm
5
+ from pydub import AudioSegment
6
+ from pydub.silence import split_on_silence
7
+
8
+
9
+ def separate_audio(
10
+ input: str,
11
+ output: str,
12
+ silence_thresh: int,
13
+ min_silence_len: int = 1000,
14
+ keep_silence: int = 100,
15
+ margin: int = 0,
16
+ padding: bool = False,
17
+ min: Optional[int] = None,
18
+ max: Optional[int] = None,
19
+ ):
20
+ if os.path.isfile(input):
21
+ input = [input]
22
+ elif os.path.isdir(input):
23
+ input = [os.path.join(input, f) for f in os.listdir(input)]
24
+ else:
25
+ raise ValueError("input must be a file or directory")
26
+
27
+ os.makedirs(output, exist_ok=True)
28
+
29
+ for file in input:
30
+ if os.path.splitext(file)[1] == ".mp3":
31
+ audio = AudioSegment.from_mp3(file)
32
+ elif os.path.splitext(file)[1] == ".wav":
33
+ audio = AudioSegment.from_wav(file)
34
+ elif os.path.splitext(file)[1] == ".flac":
35
+ audio = AudioSegment.from_file(file, "flac")
36
+ else:
37
+ raise ValueError(
38
+ "Invalid file format. Only MP3 and WAV files are supported."
39
+ )
40
+
41
+ chunks = split_on_silence(
42
+ audio,
43
+ min_silence_len=min_silence_len,
44
+ silence_thresh=silence_thresh,
45
+ keep_silence=keep_silence,
46
+ )
47
+
48
+ output_chunks: List[AudioSegment] = []
49
+
50
+ so_short = None
51
+
52
+ for chunk in tqdm.tqdm(chunks):
53
+ if so_short is not None:
54
+ chunk = so_short + chunk
55
+ so_short = None
56
+ if min is None or len(chunk) > min:
57
+ if max is not None and len(chunk) > max:
58
+ sub_chunks = [
59
+ chunk[i : i + max + margin]
60
+ for i in range(0, len(chunk) - margin, max)
61
+ ]
62
+
63
+ if len(sub_chunks[-1]) < min:
64
+ if padding and len(sub_chunks) > 2:
65
+ output_chunks.extend(sub_chunks[0:-2])
66
+ output_chunks.append(sub_chunks[-2] + sub_chunks[-1])
67
+ else:
68
+ output_chunks.extend(sub_chunks[0:-1])
69
+ else:
70
+ output_chunks.extend(sub_chunks)
71
+ else:
72
+ output_chunks.append(chunk)
73
+ else:
74
+ if so_short is None:
75
+ so_short = chunk
76
+ else:
77
+ so_short += chunk
78
+ basename = os.path.splitext(os.path.basename(file))[0]
79
+
80
+ for i, chunk in enumerate(output_chunks):
81
+ filepath = os.path.join(output, f"{basename}_{i}.wav")
82
+ chunk.export(filepath, format="wav")
modules/server/model.py ADDED
@@ -0,0 +1,451 @@
+ import os
+ import re
+ from typing import *
+
+ import faiss
+ import numpy as np
+ import pyworld
+ import scipy.signal as signal
+ import torch
+ import torch.nn.functional as F
+ import torchaudio
+ import torchcrepe
+ from fairseq import checkpoint_utils
+ from fairseq.models.hubert.hubert import HubertModel
+ from pydub import AudioSegment
+ from torch import Tensor
+
+ from lib.rvc.models import (SynthesizerTrnMs256NSFSid,
+                             SynthesizerTrnMs256NSFSidNono)
+ from lib.rvc.pipeline import VocalConvertPipeline
+ from modules.cmd_opts import opts
+ from modules.models import (EMBEDDINGS_LIST, MODELS_DIR, get_embedder,
+                             get_vc_model, update_state_dict)
+ from modules.shared import ROOT_DIR, device, is_half
+
+ MODELS_DIR = opts.models_dir or os.path.join(ROOT_DIR, "models")
+ vc_model: Optional["VoiceServerModel"] = None
+ embedder_model: Optional[HubertModel] = None
+ loaded_embedder_model = ""
+
+
+ class VoiceServerModel:
+     def __init__(self, rvc_model_file: str, faiss_index_file: str) -> None:
+         # setting vram
+         global device, is_half
+         if isinstance(device, str):
+             device = torch.device(device)
+         if device.type == "cuda":
+             vram = torch.cuda.get_device_properties(device).total_memory / 1024**3
+         else:
+             vram = None
+         if vram is not None and vram <= 4:
+             self.x_pad = 1
+             self.x_query = 5
+             self.x_center = 30
+             self.x_max = 32
+         elif vram is not None and vram <= 5:
+             self.x_pad = 1
+             self.x_query = 6
+             self.x_center = 38
+             self.x_max = 41
+         else:
+             self.x_pad = 3
+             self.x_query = 10
+             self.x_center = 60
+             self.x_max = 65
+
+         # load_model
+         state_dict = torch.load(rvc_model_file, map_location="cpu")
+         update_state_dict(state_dict)
+         self.state_dict = state_dict
+         self.tgt_sr = state_dict["params"]["sr"]
+         self.f0 = state_dict.get("f0", 1)
+         state_dict["params"]["spk_embed_dim"] = state_dict["weight"][
+             "emb_g.weight"
+         ].shape[0]
+         if "emb_channels" not in state_dict["params"]:
+             if state_dict.get("version", "v1") == "v1":
+                 state_dict["params"]["emb_channels"] = 256  # for backward compat.
+                 state_dict["embedder_output_layer"] = 9
+             else:
+                 state_dict["params"]["emb_channels"] = 768  # for backward compat.
+                 state_dict["embedder_output_layer"] = 12
+         if self.f0 == 1:
+             self.net_g = SynthesizerTrnMs256NSFSid(
+                 **state_dict["params"], is_half=is_half
+             )
+         else:
+             self.net_g = SynthesizerTrnMs256NSFSidNono(**state_dict["params"])
+         del self.net_g.enc_q
+         self.net_g.load_state_dict(state_dict["weight"], strict=False)
+         self.net_g.eval().to(device)
+         if is_half:
+             self.net_g = self.net_g.half()
+         else:
+             self.net_g = self.net_g.float()
+
+         emb_name = state_dict.get("embedder_name", "contentvec")
+         if emb_name == "hubert_base":
+             emb_name = "contentvec"
+         emb_file = os.path.join(MODELS_DIR, "embeddings", EMBEDDINGS_LIST[emb_name][0])
+         models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+             [emb_file],
+             suffix="",
+         )
+         embedder_model = models[0]
+         embedder_model = embedder_model.to(device)
+
+         if is_half:
+             embedder_model = embedder_model.half()
+         else:
+             embedder_model = embedder_model.float()
+         embedder_model.eval()
+         self.embedder_model = embedder_model
+
+         self.embedder_output_layer = state_dict["embedder_output_layer"]
+
+         self.index = None
+         if faiss_index_file != "" and os.path.exists(faiss_index_file):
+             self.index = faiss.read_index(faiss_index_file)
+             self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
+
+         self.n_spk = state_dict["params"]["spk_embed_dim"]
+
+         self.sr = 16000  # hubert input sample rate
+         self.window = 160  # hubert input window
+         self.t_pad = self.sr * self.x_pad  # padding time for each utterance
+         self.t_pad_tgt = self.tgt_sr * self.x_pad
+         self.t_pad2 = self.t_pad * 2
+         self.t_query = self.sr * self.x_query  # query time before and after query point
+         self.t_center = self.sr * self.x_center  # query cut point position
+         self.t_max = self.sr * self.x_max  # max time for no query
+         self.device = device
+         self.is_half = is_half
+
+     def __call__(
+         self,
+         audio: np.ndarray,
+         sr: int,
+         sid: int,
+         transpose: int,
+         f0_method: str,
+         index_rate: float,
+     ):
+         # bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
+         # audio = signal.filtfilt(bh, ah, audio)
+         if sr != self.sr:
+             audio = (
+                 torchaudio.functional.resample(
+                     torch.from_numpy(audio), sr, self.sr, rolloff=0.99
+                 )
+                 .detach()
+                 .cpu()
+                 .numpy()
+             )
+         audio_pad = np.pad(
+             audio,
+             (self.window // 2, self.window // 2),
+             mode="reflect" if audio.shape[0] > self.window // 2 else "constant",
+         )
+
+         # pick cut points at low-energy positions so long inputs can be converted in segments
+         opt_ts = []
+         if audio_pad.shape[0] > self.t_max:
+             audio_sum = np.zeros_like(audio)
+             for i in range(self.window):
+                 audio_sum += audio_pad[i : i - self.window]
+             for t in range(self.t_center, audio.shape[0], self.t_center):
+                 opt_ts.append(
+                     t
+                     - self.t_query
+                     + np.where(
+                         np.abs(audio_sum[t - self.t_query : t + self.t_query])
+                         == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
+                     )[0][0]
+                 )
+         audio_pad = np.pad(
+             audio,
+             (self.t_pad, self.t_pad),
+             mode="reflect" if audio.shape[0] > self.t_pad else "constant",
+         )
+         p_len = audio_pad.shape[0] // self.window
+
+         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
+         pitch, pitchf = None, None
+         if self.f0 == 1:
+             pitch, pitchf = get_f0(audio_pad, self.sr, p_len, transpose, f0_method)
+             pitch = pitch[:p_len]
+             pitchf = pitchf[:p_len]
+             if self.device.type == "mps":
+                 pitchf = pitchf.astype(np.float32)
+             pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
+             pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
+
+         audio_opt = []
+
+         s = 0
+         t = None
+
+         # convert each segment and trim the padding from both ends before concatenating
+         for t in opt_ts:
+             t = t // self.window * self.window
+             if self.f0 == 1:
+                 audio_opt.append(
+                     self._convert(
+                         sid,
+                         audio_pad[s : t + self.t_pad2 + self.window],
+                         pitch[:, s // self.window : (t + self.t_pad2) // self.window],
+                         pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                         index_rate,
+                     )[self.t_pad_tgt : -self.t_pad_tgt]
+                 )
+             else:
+                 audio_opt.append(
+                     self._convert(
+                         sid,
+                         audio_pad[s : t + self.t_pad2 + self.window],
+                         None,
+                         None,
+                         index_rate,
+                     )[self.t_pad_tgt : -self.t_pad_tgt]
+                 )
+             s = t
+         if self.f0 == 1:
+             audio_opt.append(
+                 self._convert(
+                     sid,
+                     audio_pad[t:],
+                     pitch[:, t // self.window :] if t is not None else pitch,
+                     pitchf[:, t // self.window :] if t is not None else pitchf,
+                     index_rate,
+                 )[self.t_pad_tgt : -self.t_pad_tgt]
+             )
+         else:
+             audio_opt.append(
+                 self._convert(
+                     sid,
+                     audio_pad[t:],
+                     None,
+                     None,
+                     index_rate,
+                 )[self.t_pad_tgt : -self.t_pad_tgt]
+             )
+         audio_opt = np.concatenate(audio_opt)
+         del pitch, pitchf, sid
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         return audio_opt
+
+     def _convert(
+         self,
+         sid: int,
+         audio: np.ndarray,
+         pitch: Optional[np.ndarray],
+         pitchf: Optional[np.ndarray],
+         index_rate: float,
+     ):
+         feats = torch.from_numpy(audio)
+         if self.is_half:
+             feats = feats.half()
+         else:
+             feats = feats.float()
+         if feats.dim() == 2:  # double channels
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats.view(1, -1)
+         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
+
+         half_support = (
+             self.device.type == "cuda"
+             and torch.cuda.get_device_capability(self.device)[0] >= 5.3
+         )
+         is_feats_dim_768 = self.net_g.emb_channels == 768
+
+         if isinstance(self.embedder_model, tuple):
+             feats = self.embedder_model[0](
+                 feats.squeeze(0).squeeze(0).to(self.device),
+                 return_tensors="pt",
+                 sampling_rate=16000,
+             )
+             if self.is_half:
+                 feats = feats.input_values.to(self.device).half()
+             else:
+                 feats = feats.input_values.to(self.device)
+             with torch.no_grad():
+                 if is_feats_dim_768:
+                     feats = self.embedder_model[1](feats).last_hidden_state
+                 else:
+                     feats = self.embedder_model[1](feats).extract_features
+         else:
+             inputs = {
+                 "source": feats.half().to(self.device)
+                 if half_support
+                 else feats.to(self.device),
+                 "padding_mask": padding_mask.to(self.device),
+                 "output_layer": self.embedder_output_layer,
+             }
+
+             if not half_support:
+                 self.embedder_model = self.embedder_model.float()
+                 inputs["source"] = inputs["source"].float()
+
+             with torch.no_grad():
+                 logits = self.embedder_model.extract_features(**inputs)
+                 if is_feats_dim_768:
+                     feats = logits[0]
+                 else:
+                     feats = self.embedder_model.final_proj(logits[0])
+
+         if (
+             self.index is not None
+             and self.big_npy is not None
+             and index_rate != 0
+         ):
+             npy = feats[0].cpu().numpy()
+             if self.is_half:
+                 npy = npy.astype("float32")
+
+             _, ix = self.index.search(npy, k=1)
+             npy = self.big_npy[ix[:, 0]]
+
+             if self.is_half:
+                 npy = npy.astype("float16")
+             feats = (
+                 torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+                 + (1 - index_rate) * feats
+             )
+
+         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+
+         p_len = audio.shape[0] // self.window
+         if feats.shape[1] < p_len:
+             p_len = feats.shape[1]
+         if pitch is not None and pitchf is not None:
+             pitch = pitch[:, :p_len]
+             pitchf = pitchf[:, :p_len]
+         p_len = torch.tensor([p_len], device=self.device).long()
+         with torch.no_grad():
+             if pitch is not None and pitchf is not None:
+                 audio1 = (
+                     (self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768)
+                     .data.cpu()
+                     .float()
+                     .numpy()
+                     .astype(np.int16)
+                 )
+             else:
+                 audio1 = (
+                     (self.net_g.infer(feats, p_len, sid)[0][0, 0] * 32768)
+                     .data.cpu()
+                     .float()
+                     .numpy()
+                     .astype(np.int16)
+                 )
+         del feats, p_len, padding_mask
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         return audio1
+
+
+ # F0 computation
+ def get_f0_crepe_computation(
+     x,
+     sr,
+     f0_min,
+     f0_max,
+     p_len,
+     model="full",  # Either use crepe-tiny "tiny" or crepe "full". Default is full
+ ):
+     hop_length = sr // 100
+     x = x.astype(np.float32)  # fixes the F.conv2D exception; double must be converted to float
+     x /= np.quantile(np.abs(x), 0.999)
+     torch_device = device  # shared inference device (the original called an undefined self.get_optimal_torch_device())
+     audio = torch.from_numpy(x).to(torch_device, copy=True)
+     audio = torch.unsqueeze(audio, dim=0)
+     if audio.ndim == 2 and audio.shape[0] > 1:
+         audio = torch.mean(audio, dim=0, keepdim=True).detach()
+     audio = audio.detach()
+     print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
+     pitch: Tensor = torchcrepe.predict(
+         audio,
+         sr,
+         sr // 100,
+         f0_min,
+         f0_max,
+         model,
+         batch_size=hop_length * 2,
+         device=torch_device,
+         pad=True,
+     )
+     p_len = p_len or x.shape[0] // hop_length
+     # Resize the pitch for final f0
+     source = np.array(pitch.squeeze(0).cpu().float().numpy())
+     source[source < 0.001] = np.nan
+     target = np.interp(
+         np.arange(0, len(source) * p_len, len(source)) / p_len,
+         np.arange(0, len(source)),
+         source,
+     )
+     f0 = np.nan_to_num(target)
+     return f0  # Resized f0
+
+
+ def get_f0_official_crepe_computation(
+     x,
+     sr,
+     f0_min,
+     f0_max,
+     model="full",
+ ):
+     # Pick a batch size that doesn't cause memory errors on your gpu
+     batch_size = 512
+     # Compute pitch on the shared device
+     audio = torch.tensor(np.copy(x))[None].float()
+     f0, pd = torchcrepe.predict(
+         audio,
+         sr,
+         sr // 100,
+         f0_min,
+         f0_max,
+         model,
+         batch_size=batch_size,
+         device=device,
+         return_periodicity=True,
+     )
+     pd = torchcrepe.filter.median(pd, 3)
+     f0 = torchcrepe.filter.mean(f0, 3)
+     f0[pd < 0.1] = 0
+     f0 = f0[0].cpu().numpy()
+     return f0
+
+
+ def get_f0(
+     x: np.ndarray,
+     sr: int,
+     p_len: int,
+     f0_up_key: int,
+     f0_method: str,
+ ):
+     f0_min = 50
+     f0_max = 1100
+     f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+     f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+     if f0_method == "harvest":
+         f0, t = pyworld.harvest(
+             x.astype(np.double),
+             fs=sr,
+             f0_ceil=f0_max,
+             f0_floor=f0_min,
+             frame_period=10,
+         )
+         f0 = pyworld.stonemask(x.astype(np.double), f0, t, sr)
+         f0 = signal.medfilt(f0, 3)
+     elif f0_method == "dio":
+         f0, t = pyworld.dio(
+             x.astype(np.double),
+             fs=sr,
+             f0_ceil=f0_max,
+             f0_floor=f0_min,
+             frame_period=10,
+         )
+         f0 = pyworld.stonemask(x.astype(np.double), f0, t, sr)
+         f0 = signal.medfilt(f0, 3)
+     elif f0_method == "mangio-crepe":
+         f0 = get_f0_crepe_computation(x, sr, f0_min, f0_max, p_len, "full")
+     elif f0_method == "crepe":
+         f0 = get_f0_official_crepe_computation(x, sr, f0_min, f0_max, "full")
+     else:
+         raise ValueError(f"unknown f0 extraction method: {f0_method}")
+
+     f0 *= pow(2, f0_up_key / 12)
+     f0bak = f0.copy()
+     f0_mel = 1127 * np.log(1 + f0 / 700)
+     f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+         f0_mel_max - f0_mel_min
+     ) + 1
+     f0_mel[f0_mel <= 1] = 1
+     f0_mel[f0_mel > 255] = 255
+     f0_coarse = np.rint(f0_mel).astype(np.int32)
+     return f0_coarse, f0bak  # quantized (coarse) f0 and the continuous f0
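
For orientation, a hypothetical end-to-end use of `VoiceServerModel` might look like the sketch below; `soundfile`, the file paths, and the parameter values are assumptions for illustration and are not part of this commit.

    # Hypothetical usage sketch of the VoiceServerModel defined above.
    import soundfile as sf

    model = VoiceServerModel(
        rvc_model_file="models/checkpoints/example.pth",  # assumed path
        faiss_index_file="",  # empty string skips the faiss index
    )
    audio, sr = sf.read("input.wav", dtype="float32")  # mono float32 waveform
    out = model(
        audio,
        sr,
        sid=0,               # speaker id stored in the checkpoint
        transpose=0,         # pitch shift in semitones
        f0_method="harvest",
        index_rate=0.0,
    )
    sf.write("output.wav", out, model.tgt_sr)  # int16 samples at the model's target rate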
modules/shared.py ADDED
@@ -0,0 +1,44 @@
+ import os
+ import sys
+
+ import torch
+
+ from modules.cmd_opts import opts
+
+ ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ MODELS_DIR = os.path.join(ROOT_DIR, "models")
+
+
+ def has_mps():
+     if sys.platform != "darwin":
+         return False
+     else:
+         if not getattr(torch, "has_mps", False):
+             return False
+         try:
+             torch.zeros(1).to(torch.device("mps"))
+             return True
+         except Exception:
+             return False
+
+
+ is_half = opts.precision == "fp16"
+ half_support = (
+     torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 5.3
+ )
+
+ if not half_support:
+     print("WARNING: FP16 is not supported on this GPU")
+     is_half = False
+
+ device = "cuda:0"
+
+ if not torch.cuda.is_available():
+     if has_mps():
+         print("Using MPS")
+         device = "mps"
+     else:
+         print("Using CPU")
+         device = "cpu"
+
+ device = torch.device(device)
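
For context, a minimal, hypothetical consumer of `modules.shared` could look like the following sketch (not part of this commit); it simply moves a tensor to the selected device and applies the fp16 setting.

    # Hypothetical consumer: respect the shared device and precision settings.
    import torch

    from modules.shared import device, is_half

    x = torch.zeros(1, 256).to(device)
    if is_half:
        x = x.half()  # fp16 only when the GPU supports it (see half_support above)
    print(x.device, x.dtype)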