Siratish committed on
Commit
9cf748a
·
1 Parent(s): 2ef1c4a
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. F5-TTS/LICENSE +21 -0
  2. F5-TTS/ckpts/README.md +3 -0
  3. F5-TTS/data/Emilia_ZH_EN_pinyin/vocab.txt +2545 -0
  4. F5-TTS/data/librispeech_pc_test_clean_cross_sentence.lst +0 -0
  5. F5-TTS/src/f5_tts/api.py +164 -0
  6. F5-TTS/src/f5_tts/configs/E2TTS_Base.yaml +49 -0
  7. F5-TTS/src/f5_tts/configs/E2TTS_Small.yaml +49 -0
  8. F5-TTS/src/f5_tts/configs/F5TTS_Base.yaml +54 -0
  9. F5-TTS/src/f5_tts/configs/F5TTS_Small.yaml +54 -0
  10. F5-TTS/src/f5_tts/configs/F5TTS_v1_Base.yaml +55 -0
  11. F5-TTS/src/f5_tts/eval/README.md +52 -0
  12. F5-TTS/src/f5_tts/eval/ecapa_tdnn.py +331 -0
  13. F5-TTS/src/f5_tts/eval/eval_infer_batch.py +210 -0
  14. F5-TTS/src/f5_tts/eval/eval_infer_batch.sh +18 -0
  15. F5-TTS/src/f5_tts/eval/eval_librispeech_test_clean.py +89 -0
  16. F5-TTS/src/f5_tts/eval/eval_seedtts_testset.py +88 -0
  17. F5-TTS/src/f5_tts/eval/eval_utmos.py +42 -0
  18. F5-TTS/src/f5_tts/eval/utils_eval.py +419 -0
  19. F5-TTS/src/f5_tts/infer/README.md +177 -0
  20. F5-TTS/src/f5_tts/infer/SHARED.md +193 -0
  21. F5-TTS/src/f5_tts/infer/__pycache__/utils_infer.cpython-311.pyc +0 -0
  22. F5-TTS/src/f5_tts/infer/examples/basic/basic.toml +11 -0
  23. F5-TTS/src/f5_tts/infer/examples/basic/basic_ref_en.wav +3 -0
  24. F5-TTS/src/f5_tts/infer/examples/basic/basic_ref_zh.wav +3 -0
  25. F5-TTS/src/f5_tts/infer/examples/multi/country.flac +3 -0
  26. F5-TTS/src/f5_tts/infer/examples/multi/main.flac +3 -0
  27. F5-TTS/src/f5_tts/infer/examples/multi/story.toml +20 -0
  28. F5-TTS/src/f5_tts/infer/examples/multi/story.txt +1 -0
  29. F5-TTS/src/f5_tts/infer/examples/multi/town.flac +3 -0
  30. F5-TTS/src/f5_tts/infer/examples/vocab.txt +2545 -0
  31. F5-TTS/src/f5_tts/infer/infer_cli.py +383 -0
  32. F5-TTS/src/f5_tts/infer/infer_gradio.py +1121 -0
  33. F5-TTS/src/f5_tts/infer/speech_edit.py +205 -0
  34. F5-TTS/src/f5_tts/infer/utils_infer.py +605 -0
  35. F5-TTS/src/f5_tts/model/__init__.py +8 -0
  36. F5-TTS/src/f5_tts/model/backbones/README.md +20 -0
  37. F5-TTS/src/f5_tts/model/backbones/dit.py +259 -0
  38. F5-TTS/src/f5_tts/model/backbones/mmdit.py +212 -0
  39. F5-TTS/src/f5_tts/model/backbones/unett.py +273 -0
  40. F5-TTS/src/f5_tts/model/cfm.py +302 -0
  41. F5-TTS/src/f5_tts/model/dataset.py +330 -0
  42. F5-TTS/src/f5_tts/model/modules.py +784 -0
  43. F5-TTS/src/f5_tts/model/trainer.py +439 -0
  44. F5-TTS/src/f5_tts/model/utils.py +220 -0
  45. F5-TTS/src/f5_tts/runtime/triton_trtllm/Dockerfile.server +3 -0
  46. F5-TTS/src/f5_tts/runtime/triton_trtllm/README.md +69 -0
  47. F5-TTS/src/f5_tts/runtime/triton_trtllm/benchmark.py +560 -0
  48. F5-TTS/src/f5_tts/runtime/triton_trtllm/client_grpc.py +470 -0
  49. F5-TTS/src/f5_tts/runtime/triton_trtllm/client_http.py +143 -0
  50. F5-TTS/src/f5_tts/runtime/triton_trtllm/docker-compose.yml +20 -0
F5-TTS/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Yushen CHEN
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
F5-TTS/ckpts/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ The pretrained model checkpoints are available at https://huggingface.co/SWivid/F5-TTS.
2
+
3
+ Scripts automatically download model checkpoints from Hugging Face, by default into `~/.cache/huggingface/hub/`.
F5-TTS/data/Emilia_ZH_EN_pinyin/vocab.txt ADDED
@@ -0,0 +1,2545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ !
3
+ "
4
+ #
5
+ $
6
+ %
7
+ &
8
+ '
9
+ (
10
+ )
11
+ *
12
+ +
13
+ ,
14
+ -
15
+ .
16
+ /
17
+ 0
18
+ 1
19
+ 2
20
+ 3
21
+ 4
22
+ 5
23
+ 6
24
+ 7
25
+ 8
26
+ 9
27
+ :
28
+ ;
29
+ =
30
+ >
31
+ ?
32
+ @
33
+ A
34
+ B
35
+ C
36
+ D
37
+ E
38
+ F
39
+ G
40
+ H
41
+ I
42
+ J
43
+ K
44
+ L
45
+ M
46
+ N
47
+ O
48
+ P
49
+ Q
50
+ R
51
+ S
52
+ T
53
+ U
54
+ V
55
+ W
56
+ X
57
+ Y
58
+ Z
59
+ [
60
+ \
61
+ ]
62
+ _
63
+ a
64
+ a1
65
+ ai1
66
+ ai2
67
+ ai3
68
+ ai4
69
+ an1
70
+ an3
71
+ an4
72
+ ang1
73
+ ang2
74
+ ang4
75
+ ao1
76
+ ao2
77
+ ao3
78
+ ao4
79
+ b
80
+ ba
81
+ ba1
82
+ ba2
83
+ ba3
84
+ ba4
85
+ bai1
86
+ bai2
87
+ bai3
88
+ bai4
89
+ ban1
90
+ ban2
91
+ ban3
92
+ ban4
93
+ bang1
94
+ bang2
95
+ bang3
96
+ bang4
97
+ bao1
98
+ bao2
99
+ bao3
100
+ bao4
101
+ bei
102
+ bei1
103
+ bei2
104
+ bei3
105
+ bei4
106
+ ben1
107
+ ben2
108
+ ben3
109
+ ben4
110
+ beng
111
+ beng1
112
+ beng2
113
+ beng3
114
+ beng4
115
+ bi1
116
+ bi2
117
+ bi3
118
+ bi4
119
+ bian1
120
+ bian2
121
+ bian3
122
+ bian4
123
+ biao1
124
+ biao2
125
+ biao3
126
+ bie1
127
+ bie2
128
+ bie3
129
+ bie4
130
+ bin1
131
+ bin4
132
+ bing1
133
+ bing2
134
+ bing3
135
+ bing4
136
+ bo
137
+ bo1
138
+ bo2
139
+ bo3
140
+ bo4
141
+ bu2
142
+ bu3
143
+ bu4
144
+ c
145
+ ca1
146
+ cai1
147
+ cai2
148
+ cai3
149
+ cai4
150
+ can1
151
+ can2
152
+ can3
153
+ can4
154
+ cang1
155
+ cang2
156
+ cao1
157
+ cao2
158
+ cao3
159
+ ce4
160
+ cen1
161
+ cen2
162
+ ceng1
163
+ ceng2
164
+ ceng4
165
+ cha1
166
+ cha2
167
+ cha3
168
+ cha4
169
+ chai1
170
+ chai2
171
+ chan1
172
+ chan2
173
+ chan3
174
+ chan4
175
+ chang1
176
+ chang2
177
+ chang3
178
+ chang4
179
+ chao1
180
+ chao2
181
+ chao3
182
+ che1
183
+ che2
184
+ che3
185
+ che4
186
+ chen1
187
+ chen2
188
+ chen3
189
+ chen4
190
+ cheng1
191
+ cheng2
192
+ cheng3
193
+ cheng4
194
+ chi1
195
+ chi2
196
+ chi3
197
+ chi4
198
+ chong1
199
+ chong2
200
+ chong3
201
+ chong4
202
+ chou1
203
+ chou2
204
+ chou3
205
+ chou4
206
+ chu1
207
+ chu2
208
+ chu3
209
+ chu4
210
+ chua1
211
+ chuai1
212
+ chuai2
213
+ chuai3
214
+ chuai4
215
+ chuan1
216
+ chuan2
217
+ chuan3
218
+ chuan4
219
+ chuang1
220
+ chuang2
221
+ chuang3
222
+ chuang4
223
+ chui1
224
+ chui2
225
+ chun1
226
+ chun2
227
+ chun3
228
+ chuo1
229
+ chuo4
230
+ ci1
231
+ ci2
232
+ ci3
233
+ ci4
234
+ cong1
235
+ cong2
236
+ cou4
237
+ cu1
238
+ cu4
239
+ cuan1
240
+ cuan2
241
+ cuan4
242
+ cui1
243
+ cui3
244
+ cui4
245
+ cun1
246
+ cun2
247
+ cun4
248
+ cuo1
249
+ cuo2
250
+ cuo4
251
+ d
252
+ da
253
+ da1
254
+ da2
255
+ da3
256
+ da4
257
+ dai1
258
+ dai2
259
+ dai3
260
+ dai4
261
+ dan1
262
+ dan2
263
+ dan3
264
+ dan4
265
+ dang1
266
+ dang2
267
+ dang3
268
+ dang4
269
+ dao1
270
+ dao2
271
+ dao3
272
+ dao4
273
+ de
274
+ de1
275
+ de2
276
+ dei3
277
+ den4
278
+ deng1
279
+ deng2
280
+ deng3
281
+ deng4
282
+ di1
283
+ di2
284
+ di3
285
+ di4
286
+ dia3
287
+ dian1
288
+ dian2
289
+ dian3
290
+ dian4
291
+ diao1
292
+ diao3
293
+ diao4
294
+ die1
295
+ die2
296
+ die4
297
+ ding1
298
+ ding2
299
+ ding3
300
+ ding4
301
+ diu1
302
+ dong1
303
+ dong3
304
+ dong4
305
+ dou1
306
+ dou2
307
+ dou3
308
+ dou4
309
+ du1
310
+ du2
311
+ du3
312
+ du4
313
+ duan1
314
+ duan2
315
+ duan3
316
+ duan4
317
+ dui1
318
+ dui4
319
+ dun1
320
+ dun3
321
+ dun4
322
+ duo1
323
+ duo2
324
+ duo3
325
+ duo4
326
+ e
327
+ e1
328
+ e2
329
+ e3
330
+ e4
331
+ ei2
332
+ en1
333
+ en4
334
+ er
335
+ er2
336
+ er3
337
+ er4
338
+ f
339
+ fa1
340
+ fa2
341
+ fa3
342
+ fa4
343
+ fan1
344
+ fan2
345
+ fan3
346
+ fan4
347
+ fang1
348
+ fang2
349
+ fang3
350
+ fang4
351
+ fei1
352
+ fei2
353
+ fei3
354
+ fei4
355
+ fen1
356
+ fen2
357
+ fen3
358
+ fen4
359
+ feng1
360
+ feng2
361
+ feng3
362
+ feng4
363
+ fo2
364
+ fou2
365
+ fou3
366
+ fu1
367
+ fu2
368
+ fu3
369
+ fu4
370
+ g
371
+ ga1
372
+ ga2
373
+ ga3
374
+ ga4
375
+ gai1
376
+ gai2
377
+ gai3
378
+ gai4
379
+ gan1
380
+ gan2
381
+ gan3
382
+ gan4
383
+ gang1
384
+ gang2
385
+ gang3
386
+ gang4
387
+ gao1
388
+ gao2
389
+ gao3
390
+ gao4
391
+ ge1
392
+ ge2
393
+ ge3
394
+ ge4
395
+ gei2
396
+ gei3
397
+ gen1
398
+ gen2
399
+ gen3
400
+ gen4
401
+ geng1
402
+ geng3
403
+ geng4
404
+ gong1
405
+ gong3
406
+ gong4
407
+ gou1
408
+ gou2
409
+ gou3
410
+ gou4
411
+ gu
412
+ gu1
413
+ gu2
414
+ gu3
415
+ gu4
416
+ gua1
417
+ gua2
418
+ gua3
419
+ gua4
420
+ guai1
421
+ guai2
422
+ guai3
423
+ guai4
424
+ guan1
425
+ guan2
426
+ guan3
427
+ guan4
428
+ guang1
429
+ guang2
430
+ guang3
431
+ guang4
432
+ gui1
433
+ gui2
434
+ gui3
435
+ gui4
436
+ gun3
437
+ gun4
438
+ guo1
439
+ guo2
440
+ guo3
441
+ guo4
442
+ h
443
+ ha1
444
+ ha2
445
+ ha3
446
+ hai1
447
+ hai2
448
+ hai3
449
+ hai4
450
+ han1
451
+ han2
452
+ han3
453
+ han4
454
+ hang1
455
+ hang2
456
+ hang4
457
+ hao1
458
+ hao2
459
+ hao3
460
+ hao4
461
+ he1
462
+ he2
463
+ he4
464
+ hei1
465
+ hen2
466
+ hen3
467
+ hen4
468
+ heng1
469
+ heng2
470
+ heng4
471
+ hong1
472
+ hong2
473
+ hong3
474
+ hong4
475
+ hou1
476
+ hou2
477
+ hou3
478
+ hou4
479
+ hu1
480
+ hu2
481
+ hu3
482
+ hu4
483
+ hua1
484
+ hua2
485
+ hua4
486
+ huai2
487
+ huai4
488
+ huan1
489
+ huan2
490
+ huan3
491
+ huan4
492
+ huang1
493
+ huang2
494
+ huang3
495
+ huang4
496
+ hui1
497
+ hui2
498
+ hui3
499
+ hui4
500
+ hun1
501
+ hun2
502
+ hun4
503
+ huo
504
+ huo1
505
+ huo2
506
+ huo3
507
+ huo4
508
+ i
509
+ j
510
+ ji1
511
+ ji2
512
+ ji3
513
+ ji4
514
+ jia
515
+ jia1
516
+ jia2
517
+ jia3
518
+ jia4
519
+ jian1
520
+ jian2
521
+ jian3
522
+ jian4
523
+ jiang1
524
+ jiang2
525
+ jiang3
526
+ jiang4
527
+ jiao1
528
+ jiao2
529
+ jiao3
530
+ jiao4
531
+ jie1
532
+ jie2
533
+ jie3
534
+ jie4
535
+ jin1
536
+ jin2
537
+ jin3
538
+ jin4
539
+ jing1
540
+ jing2
541
+ jing3
542
+ jing4
543
+ jiong3
544
+ jiu1
545
+ jiu2
546
+ jiu3
547
+ jiu4
548
+ ju1
549
+ ju2
550
+ ju3
551
+ ju4
552
+ juan1
553
+ juan2
554
+ juan3
555
+ juan4
556
+ jue1
557
+ jue2
558
+ jue4
559
+ jun1
560
+ jun4
561
+ k
562
+ ka1
563
+ ka2
564
+ ka3
565
+ kai1
566
+ kai2
567
+ kai3
568
+ kai4
569
+ kan1
570
+ kan2
571
+ kan3
572
+ kan4
573
+ kang1
574
+ kang2
575
+ kang4
576
+ kao1
577
+ kao2
578
+ kao3
579
+ kao4
580
+ ke1
581
+ ke2
582
+ ke3
583
+ ke4
584
+ ken3
585
+ keng1
586
+ kong1
587
+ kong3
588
+ kong4
589
+ kou1
590
+ kou2
591
+ kou3
592
+ kou4
593
+ ku1
594
+ ku2
595
+ ku3
596
+ ku4
597
+ kua1
598
+ kua3
599
+ kua4
600
+ kuai3
601
+ kuai4
602
+ kuan1
603
+ kuan2
604
+ kuan3
605
+ kuang1
606
+ kuang2
607
+ kuang4
608
+ kui1
609
+ kui2
610
+ kui3
611
+ kui4
612
+ kun1
613
+ kun3
614
+ kun4
615
+ kuo4
616
+ l
617
+ la
618
+ la1
619
+ la2
620
+ la3
621
+ la4
622
+ lai2
623
+ lai4
624
+ lan2
625
+ lan3
626
+ lan4
627
+ lang1
628
+ lang2
629
+ lang3
630
+ lang4
631
+ lao1
632
+ lao2
633
+ lao3
634
+ lao4
635
+ le
636
+ le1
637
+ le4
638
+ lei
639
+ lei1
640
+ lei2
641
+ lei3
642
+ lei4
643
+ leng1
644
+ leng2
645
+ leng3
646
+ leng4
647
+ li
648
+ li1
649
+ li2
650
+ li3
651
+ li4
652
+ lia3
653
+ lian2
654
+ lian3
655
+ lian4
656
+ liang2
657
+ liang3
658
+ liang4
659
+ liao1
660
+ liao2
661
+ liao3
662
+ liao4
663
+ lie1
664
+ lie2
665
+ lie3
666
+ lie4
667
+ lin1
668
+ lin2
669
+ lin3
670
+ lin4
671
+ ling2
672
+ ling3
673
+ ling4
674
+ liu1
675
+ liu2
676
+ liu3
677
+ liu4
678
+ long1
679
+ long2
680
+ long3
681
+ long4
682
+ lou1
683
+ lou2
684
+ lou3
685
+ lou4
686
+ lu1
687
+ lu2
688
+ lu3
689
+ lu4
690
+ luan2
691
+ luan3
692
+ luan4
693
+ lun1
694
+ lun2
695
+ lun4
696
+ luo1
697
+ luo2
698
+ luo3
699
+ luo4
700
+ lv2
701
+ lv3
702
+ lv4
703
+ lve3
704
+ lve4
705
+ m
706
+ ma
707
+ ma1
708
+ ma2
709
+ ma3
710
+ ma4
711
+ mai2
712
+ mai3
713
+ mai4
714
+ man1
715
+ man2
716
+ man3
717
+ man4
718
+ mang2
719
+ mang3
720
+ mao1
721
+ mao2
722
+ mao3
723
+ mao4
724
+ me
725
+ mei2
726
+ mei3
727
+ mei4
728
+ men
729
+ men1
730
+ men2
731
+ men4
732
+ meng
733
+ meng1
734
+ meng2
735
+ meng3
736
+ meng4
737
+ mi1
738
+ mi2
739
+ mi3
740
+ mi4
741
+ mian2
742
+ mian3
743
+ mian4
744
+ miao1
745
+ miao2
746
+ miao3
747
+ miao4
748
+ mie1
749
+ mie4
750
+ min2
751
+ min3
752
+ ming2
753
+ ming3
754
+ ming4
755
+ miu4
756
+ mo1
757
+ mo2
758
+ mo3
759
+ mo4
760
+ mou1
761
+ mou2
762
+ mou3
763
+ mu2
764
+ mu3
765
+ mu4
766
+ n
767
+ n2
768
+ na1
769
+ na2
770
+ na3
771
+ na4
772
+ nai2
773
+ nai3
774
+ nai4
775
+ nan1
776
+ nan2
777
+ nan3
778
+ nan4
779
+ nang1
780
+ nang2
781
+ nang3
782
+ nao1
783
+ nao2
784
+ nao3
785
+ nao4
786
+ ne
787
+ ne2
788
+ ne4
789
+ nei3
790
+ nei4
791
+ nen4
792
+ neng2
793
+ ni1
794
+ ni2
795
+ ni3
796
+ ni4
797
+ nian1
798
+ nian2
799
+ nian3
800
+ nian4
801
+ niang2
802
+ niang4
803
+ niao2
804
+ niao3
805
+ niao4
806
+ nie1
807
+ nie4
808
+ nin2
809
+ ning2
810
+ ning3
811
+ ning4
812
+ niu1
813
+ niu2
814
+ niu3
815
+ niu4
816
+ nong2
817
+ nong4
818
+ nou4
819
+ nu2
820
+ nu3
821
+ nu4
822
+ nuan3
823
+ nuo2
824
+ nuo4
825
+ nv2
826
+ nv3
827
+ nve4
828
+ o
829
+ o1
830
+ o2
831
+ ou1
832
+ ou2
833
+ ou3
834
+ ou4
835
+ p
836
+ pa1
837
+ pa2
838
+ pa4
839
+ pai1
840
+ pai2
841
+ pai3
842
+ pai4
843
+ pan1
844
+ pan2
845
+ pan4
846
+ pang1
847
+ pang2
848
+ pang4
849
+ pao1
850
+ pao2
851
+ pao3
852
+ pao4
853
+ pei1
854
+ pei2
855
+ pei4
856
+ pen1
857
+ pen2
858
+ pen4
859
+ peng1
860
+ peng2
861
+ peng3
862
+ peng4
863
+ pi1
864
+ pi2
865
+ pi3
866
+ pi4
867
+ pian1
868
+ pian2
869
+ pian4
870
+ piao1
871
+ piao2
872
+ piao3
873
+ piao4
874
+ pie1
875
+ pie2
876
+ pie3
877
+ pin1
878
+ pin2
879
+ pin3
880
+ pin4
881
+ ping1
882
+ ping2
883
+ po1
884
+ po2
885
+ po3
886
+ po4
887
+ pou1
888
+ pu1
889
+ pu2
890
+ pu3
891
+ pu4
892
+ q
893
+ qi1
894
+ qi2
895
+ qi3
896
+ qi4
897
+ qia1
898
+ qia3
899
+ qia4
900
+ qian1
901
+ qian2
902
+ qian3
903
+ qian4
904
+ qiang1
905
+ qiang2
906
+ qiang3
907
+ qiang4
908
+ qiao1
909
+ qiao2
910
+ qiao3
911
+ qiao4
912
+ qie1
913
+ qie2
914
+ qie3
915
+ qie4
916
+ qin1
917
+ qin2
918
+ qin3
919
+ qin4
920
+ qing1
921
+ qing2
922
+ qing3
923
+ qing4
924
+ qiong1
925
+ qiong2
926
+ qiu1
927
+ qiu2
928
+ qiu3
929
+ qu1
930
+ qu2
931
+ qu3
932
+ qu4
933
+ quan1
934
+ quan2
935
+ quan3
936
+ quan4
937
+ que1
938
+ que2
939
+ que4
940
+ qun2
941
+ r
942
+ ran2
943
+ ran3
944
+ rang1
945
+ rang2
946
+ rang3
947
+ rang4
948
+ rao2
949
+ rao3
950
+ rao4
951
+ re2
952
+ re3
953
+ re4
954
+ ren2
955
+ ren3
956
+ ren4
957
+ reng1
958
+ reng2
959
+ ri4
960
+ rong1
961
+ rong2
962
+ rong3
963
+ rou2
964
+ rou4
965
+ ru2
966
+ ru3
967
+ ru4
968
+ ruan2
969
+ ruan3
970
+ rui3
971
+ rui4
972
+ run4
973
+ ruo4
974
+ s
975
+ sa1
976
+ sa2
977
+ sa3
978
+ sa4
979
+ sai1
980
+ sai4
981
+ san1
982
+ san2
983
+ san3
984
+ san4
985
+ sang1
986
+ sang3
987
+ sang4
988
+ sao1
989
+ sao2
990
+ sao3
991
+ sao4
992
+ se4
993
+ sen1
994
+ seng1
995
+ sha1
996
+ sha2
997
+ sha3
998
+ sha4
999
+ shai1
1000
+ shai2
1001
+ shai3
1002
+ shai4
1003
+ shan1
1004
+ shan3
1005
+ shan4
1006
+ shang
1007
+ shang1
1008
+ shang3
1009
+ shang4
1010
+ shao1
1011
+ shao2
1012
+ shao3
1013
+ shao4
1014
+ she1
1015
+ she2
1016
+ she3
1017
+ she4
1018
+ shei2
1019
+ shen1
1020
+ shen2
1021
+ shen3
1022
+ shen4
1023
+ sheng1
1024
+ sheng2
1025
+ sheng3
1026
+ sheng4
1027
+ shi
1028
+ shi1
1029
+ shi2
1030
+ shi3
1031
+ shi4
1032
+ shou1
1033
+ shou2
1034
+ shou3
1035
+ shou4
1036
+ shu1
1037
+ shu2
1038
+ shu3
1039
+ shu4
1040
+ shua1
1041
+ shua2
1042
+ shua3
1043
+ shua4
1044
+ shuai1
1045
+ shuai3
1046
+ shuai4
1047
+ shuan1
1048
+ shuan4
1049
+ shuang1
1050
+ shuang3
1051
+ shui2
1052
+ shui3
1053
+ shui4
1054
+ shun3
1055
+ shun4
1056
+ shuo1
1057
+ shuo4
1058
+ si1
1059
+ si2
1060
+ si3
1061
+ si4
1062
+ song1
1063
+ song3
1064
+ song4
1065
+ sou1
1066
+ sou3
1067
+ sou4
1068
+ su1
1069
+ su2
1070
+ su4
1071
+ suan1
1072
+ suan4
1073
+ sui1
1074
+ sui2
1075
+ sui3
1076
+ sui4
1077
+ sun1
1078
+ sun3
1079
+ suo
1080
+ suo1
1081
+ suo2
1082
+ suo3
1083
+ t
1084
+ ta1
1085
+ ta2
1086
+ ta3
1087
+ ta4
1088
+ tai1
1089
+ tai2
1090
+ tai4
1091
+ tan1
1092
+ tan2
1093
+ tan3
1094
+ tan4
1095
+ tang1
1096
+ tang2
1097
+ tang3
1098
+ tang4
1099
+ tao1
1100
+ tao2
1101
+ tao3
1102
+ tao4
1103
+ te4
1104
+ teng2
1105
+ ti1
1106
+ ti2
1107
+ ti3
1108
+ ti4
1109
+ tian1
1110
+ tian2
1111
+ tian3
1112
+ tiao1
1113
+ tiao2
1114
+ tiao3
1115
+ tiao4
1116
+ tie1
1117
+ tie2
1118
+ tie3
1119
+ tie4
1120
+ ting1
1121
+ ting2
1122
+ ting3
1123
+ tong1
1124
+ tong2
1125
+ tong3
1126
+ tong4
1127
+ tou
1128
+ tou1
1129
+ tou2
1130
+ tou4
1131
+ tu1
1132
+ tu2
1133
+ tu3
1134
+ tu4
1135
+ tuan1
1136
+ tuan2
1137
+ tui1
1138
+ tui2
1139
+ tui3
1140
+ tui4
1141
+ tun1
1142
+ tun2
1143
+ tun4
1144
+ tuo1
1145
+ tuo2
1146
+ tuo3
1147
+ tuo4
1148
+ u
1149
+ v
1150
+ w
1151
+ wa
1152
+ wa1
1153
+ wa2
1154
+ wa3
1155
+ wa4
1156
+ wai1
1157
+ wai3
1158
+ wai4
1159
+ wan1
1160
+ wan2
1161
+ wan3
1162
+ wan4
1163
+ wang1
1164
+ wang2
1165
+ wang3
1166
+ wang4
1167
+ wei1
1168
+ wei2
1169
+ wei3
1170
+ wei4
1171
+ wen1
1172
+ wen2
1173
+ wen3
1174
+ wen4
1175
+ weng1
1176
+ weng4
1177
+ wo1
1178
+ wo2
1179
+ wo3
1180
+ wo4
1181
+ wu1
1182
+ wu2
1183
+ wu3
1184
+ wu4
1185
+ x
1186
+ xi1
1187
+ xi2
1188
+ xi3
1189
+ xi4
1190
+ xia1
1191
+ xia2
1192
+ xia4
1193
+ xian1
1194
+ xian2
1195
+ xian3
1196
+ xian4
1197
+ xiang1
1198
+ xiang2
1199
+ xiang3
1200
+ xiang4
1201
+ xiao1
1202
+ xiao2
1203
+ xiao3
1204
+ xiao4
1205
+ xie1
1206
+ xie2
1207
+ xie3
1208
+ xie4
1209
+ xin1
1210
+ xin2
1211
+ xin4
1212
+ xing1
1213
+ xing2
1214
+ xing3
1215
+ xing4
1216
+ xiong1
1217
+ xiong2
1218
+ xiu1
1219
+ xiu3
1220
+ xiu4
1221
+ xu
1222
+ xu1
1223
+ xu2
1224
+ xu3
1225
+ xu4
1226
+ xuan1
1227
+ xuan2
1228
+ xuan3
1229
+ xuan4
1230
+ xue1
1231
+ xue2
1232
+ xue3
1233
+ xue4
1234
+ xun1
1235
+ xun2
1236
+ xun4
1237
+ y
1238
+ ya
1239
+ ya1
1240
+ ya2
1241
+ ya3
1242
+ ya4
1243
+ yan1
1244
+ yan2
1245
+ yan3
1246
+ yan4
1247
+ yang1
1248
+ yang2
1249
+ yang3
1250
+ yang4
1251
+ yao1
1252
+ yao2
1253
+ yao3
1254
+ yao4
1255
+ ye1
1256
+ ye2
1257
+ ye3
1258
+ ye4
1259
+ yi
1260
+ yi1
1261
+ yi2
1262
+ yi3
1263
+ yi4
1264
+ yin1
1265
+ yin2
1266
+ yin3
1267
+ yin4
1268
+ ying1
1269
+ ying2
1270
+ ying3
1271
+ ying4
1272
+ yo1
1273
+ yong1
1274
+ yong2
1275
+ yong3
1276
+ yong4
1277
+ you1
1278
+ you2
1279
+ you3
1280
+ you4
1281
+ yu1
1282
+ yu2
1283
+ yu3
1284
+ yu4
1285
+ yuan1
1286
+ yuan2
1287
+ yuan3
1288
+ yuan4
1289
+ yue1
1290
+ yue4
1291
+ yun1
1292
+ yun2
1293
+ yun3
1294
+ yun4
1295
+ z
1296
+ za1
1297
+ za2
1298
+ za3
1299
+ zai1
1300
+ zai3
1301
+ zai4
1302
+ zan1
1303
+ zan2
1304
+ zan3
1305
+ zan4
1306
+ zang1
1307
+ zang4
1308
+ zao1
1309
+ zao2
1310
+ zao3
1311
+ zao4
1312
+ ze2
1313
+ ze4
1314
+ zei2
1315
+ zen3
1316
+ zeng1
1317
+ zeng4
1318
+ zha1
1319
+ zha2
1320
+ zha3
1321
+ zha4
1322
+ zhai1
1323
+ zhai2
1324
+ zhai3
1325
+ zhai4
1326
+ zhan1
1327
+ zhan2
1328
+ zhan3
1329
+ zhan4
1330
+ zhang1
1331
+ zhang2
1332
+ zhang3
1333
+ zhang4
1334
+ zhao1
1335
+ zhao2
1336
+ zhao3
1337
+ zhao4
1338
+ zhe
1339
+ zhe1
1340
+ zhe2
1341
+ zhe3
1342
+ zhe4
1343
+ zhen1
1344
+ zhen2
1345
+ zhen3
1346
+ zhen4
1347
+ zheng1
1348
+ zheng2
1349
+ zheng3
1350
+ zheng4
1351
+ zhi1
1352
+ zhi2
1353
+ zhi3
1354
+ zhi4
1355
+ zhong1
1356
+ zhong2
1357
+ zhong3
1358
+ zhong4
1359
+ zhou1
1360
+ zhou2
1361
+ zhou3
1362
+ zhou4
1363
+ zhu1
1364
+ zhu2
1365
+ zhu3
1366
+ zhu4
1367
+ zhua1
1368
+ zhua2
1369
+ zhua3
1370
+ zhuai1
1371
+ zhuai3
1372
+ zhuai4
1373
+ zhuan1
1374
+ zhuan2
1375
+ zhuan3
1376
+ zhuan4
1377
+ zhuang1
1378
+ zhuang4
1379
+ zhui1
1380
+ zhui4
1381
+ zhun1
1382
+ zhun2
1383
+ zhun3
1384
+ zhuo1
1385
+ zhuo2
1386
+ zi
1387
+ zi1
1388
+ zi2
1389
+ zi3
1390
+ zi4
1391
+ zong1
1392
+ zong2
1393
+ zong3
1394
+ zong4
1395
+ zou1
1396
+ zou2
1397
+ zou3
1398
+ zou4
1399
+ zu1
1400
+ zu2
1401
+ zu3
1402
+ zuan1
1403
+ zuan3
1404
+ zuan4
1405
+ zui2
1406
+ zui3
1407
+ zui4
1408
+ zun1
1409
+ zuo
1410
+ zuo1
1411
+ zuo2
1412
+ zuo3
1413
+ zuo4
1414
+ {
1415
+ ~
1416
+ ¡
1417
+ ¢
1418
+ £
1419
+ ¥
1420
+ §
1421
+ ¨
1422
+ ©
1423
+ «
1424
+ ®
1425
+ ¯
1426
+ °
1427
+ ±
1428
+ ²
1429
+ ³
1430
+ ´
1431
+ µ
1432
+ ·
1433
+ ¹
1434
+ º
1435
+ »
1436
+ ¼
1437
+ ½
1438
+ ¾
1439
+ ¿
1440
+ À
1441
+ Á
1442
+ Â
1443
+ Ã
1444
+ Ä
1445
+ Å
1446
+ Æ
1447
+ Ç
1448
+ È
1449
+ É
1450
+ Ê
1451
+ Í
1452
+ Î
1453
+ Ñ
1454
+ Ó
1455
+ Ö
1456
+ ×
1457
+ Ø
1458
+ Ú
1459
+ Ü
1460
+ Ý
1461
+ Þ
1462
+ ß
1463
+ à
1464
+ á
1465
+ â
1466
+ ã
1467
+ ä
1468
+ å
1469
+ æ
1470
+ ç
1471
+ è
1472
+ é
1473
+ ê
1474
+ ë
1475
+ ì
1476
+ í
1477
+ î
1478
+ ï
1479
+ ð
1480
+ ñ
1481
+ ò
1482
+ ó
1483
+ ô
1484
+ õ
1485
+ ö
1486
+ ø
1487
+ ù
1488
+ ú
1489
+ û
1490
+ ü
1491
+ ý
1492
+ Ā
1493
+ ā
1494
+ ă
1495
+ ą
1496
+ ć
1497
+ Č
1498
+ č
1499
+ Đ
1500
+ đ
1501
+ ē
1502
+ ė
1503
+ ę
1504
+ ě
1505
+ ĝ
1506
+ ğ
1507
+ ħ
1508
+ ī
1509
+ į
1510
+ İ
1511
+ ı
1512
+ Ł
1513
+ ł
1514
+ ń
1515
+ ņ
1516
+ ň
1517
+ ŋ
1518
+ Ō
1519
+ ō
1520
+ ő
1521
+ œ
1522
+ ř
1523
+ Ś
1524
+ ś
1525
+ Ş
1526
+ ş
1527
+ Š
1528
+ š
1529
+ Ť
1530
+ ť
1531
+ ũ
1532
+ ū
1533
+ ź
1534
+ Ż
1535
+ ż
1536
+ Ž
1537
+ ž
1538
+ ơ
1539
+ ư
1540
+ ǎ
1541
+ ǐ
1542
+ ǒ
1543
+ ǔ
1544
+ ǚ
1545
+ ș
1546
+ ț
1547
+ ɑ
1548
+ ɔ
1549
+ ɕ
1550
+ ə
1551
+ ɛ
1552
+ ɜ
1553
+ ɡ
1554
+ ɣ
1555
+ ɪ
1556
+ ɫ
1557
+ ɴ
1558
+ ɹ
1559
+ ɾ
1560
+ ʃ
1561
+ ʊ
1562
+ ʌ
1563
+ ʒ
1564
+ ʔ
1565
+ ʰ
1566
+ ʷ
1567
+ ʻ
1568
+ ʾ
1569
+ ʿ
1570
+ ˈ
1571
+ ː
1572
+ ˙
1573
+ ˜
1574
+ ˢ
1575
+ ́
1576
+ ̅
1577
+ Α
1578
+ Β
1579
+ Δ
1580
+ Ε
1581
+ Θ
1582
+ Κ
1583
+ Λ
1584
+ Μ
1585
+ Ξ
1586
+ Π
1587
+ Σ
1588
+ Τ
1589
+ Φ
1590
+ Χ
1591
+ Ψ
1592
+ Ω
1593
+ ά
1594
+ έ
1595
+ ή
1596
+ ί
1597
+ α
1598
+ β
1599
+ γ
1600
+ δ
1601
+ ε
1602
+ ζ
1603
+ η
1604
+ θ
1605
+ ι
1606
+ κ
1607
+ λ
1608
+ μ
1609
+ ν
1610
+ ξ
1611
+ ο
1612
+ π
1613
+ ρ
1614
+ ς
1615
+ σ
1616
+ τ
1617
+ υ
1618
+ φ
1619
+ χ
1620
+ ψ
1621
+ ω
1622
+ ϊ
1623
+ ό
1624
+ ύ
1625
+ ώ
1626
+ ϕ
1627
+ ϵ
1628
+ Ё
1629
+ А
1630
+ Б
1631
+ В
1632
+ Г
1633
+ Д
1634
+ Е
1635
+ Ж
1636
+ З
1637
+ И
1638
+ Й
1639
+ К
1640
+ Л
1641
+ М
1642
+ Н
1643
+ О
1644
+ П
1645
+ Р
1646
+ С
1647
+ Т
1648
+ У
1649
+ Ф
1650
+ Х
1651
+ Ц
1652
+ Ч
1653
+ Ш
1654
+ Щ
1655
+ Ы
1656
+ Ь
1657
+ Э
1658
+ Ю
1659
+ Я
1660
+ а
1661
+ б
1662
+ в
1663
+ г
1664
+ д
1665
+ е
1666
+ ж
1667
+ з
1668
+ и
1669
+ й
1670
+ к
1671
+ л
1672
+ м
1673
+ н
1674
+ о
1675
+ п
1676
+ р
1677
+ с
1678
+ т
1679
+ у
1680
+ ф
1681
+ х
1682
+ ц
1683
+ ч
1684
+ ш
1685
+ щ
1686
+ ъ
1687
+ ы
1688
+ ь
1689
+ э
1690
+ ю
1691
+ я
1692
+ ё
1693
+ і
1694
+ ְ
1695
+ ִ
1696
+ ֵ
1697
+ ֶ
1698
+ ַ
1699
+ ָ
1700
+ ֹ
1701
+ ּ
1702
+ ־
1703
+ ׁ
1704
+ א
1705
+ ב
1706
+ ג
1707
+ ד
1708
+ ה
1709
+ ו
1710
+ ז
1711
+ ח
1712
+ ט
1713
+ י
1714
+ כ
1715
+ ל
1716
+ ם
1717
+ מ
1718
+ ן
1719
+ נ
1720
+ ס
1721
+ ע
1722
+ פ
1723
+ ק
1724
+ ר
1725
+ ש
1726
+ ת
1727
+ أ
1728
+ ب
1729
+ ة
1730
+ ت
1731
+ ج
1732
+ ح
1733
+ د
1734
+ ر
1735
+ ز
1736
+ س
1737
+ ص
1738
+ ط
1739
+ ع
1740
+ ق
1741
+ ك
1742
+ ل
1743
+ م
1744
+ ن
1745
+ ه
1746
+ و
1747
+ ي
1748
+ َ
1749
+ ُ
1750
+ ِ
1751
+ ْ
1752
+
1753
+
1754
+
1755
+
1756
+
1757
+
1758
+
1759
+
1760
+
1761
+
1762
+
1763
+
1764
+
1765
+
1766
+
1767
+
1768
+
1769
+
1770
+
1771
+
1772
+
1773
+
1774
+
1775
+
1776
+
1777
+
1778
+
1779
+
1780
+
1781
+
1782
+
1783
+
1784
+
1785
+
1786
+
1787
+
1788
+
1789
+
1790
+
1791
+
1792
+
1793
+
1794
+
1795
+
1796
+
1797
+
1798
+
1799
+
1800
+ ế
1801
+
1802
+
1803
+
1804
+
1805
+
1806
+
1807
+
1808
+
1809
+
1810
+
1811
+
1812
+
1813
+
1814
+
1815
+
1816
+
1817
+
1818
+
1819
+
1820
+
1821
+
1822
+
1823
+
1824
+
1825
+
1826
+
1827
+
1828
+
1829
+
1830
+ ���
1831
+
1832
+
1833
+
1834
+
1835
+
1836
+
1837
+
1838
+
1839
+
1840
+
1841
+
1842
+
1843
+
1844
+
1845
+
1846
+
1847
+
1848
+
1849
+
1850
+
1851
+
1852
+
1853
+
1854
+
1855
+
1856
+
1857
+
1858
+
1859
+
1860
+
1861
+
1862
+
1863
+
1864
+
1865
+
1866
+
1867
+
1868
+
1869
+
1870
+
1871
+
1872
+
1873
+
1874
+
1875
+
1876
+
1877
+
1878
+
1879
+
1880
+
1881
+
1882
+
1883
+
1884
+
1885
+
1886
+
1887
+
1888
+
1889
+
1890
+
1891
+
1892
+
1893
+
1894
+
1895
+
1896
+
1897
+
1898
+
1899
+
1900
+
1901
+
1902
+
1903
+
1904
+
1905
+
1906
+
1907
+
1908
+
1909
+
1910
+
1911
+
1912
+
1913
+
1914
+
1915
+
1916
+
1917
+
1918
+
1919
+
1920
+
1921
+
1922
+
1923
+
1924
+
1925
+
1926
+
1927
+
1928
+
1929
+
1930
+
1931
+
1932
+
1933
+
1934
+
1935
+
1936
+
1937
+
1938
+
1939
+
1940
+
1941
+
1942
+
1943
+
1944
+
1945
+
1946
+
1947
+
1948
+
1949
+
1950
+
1951
+
1952
+
1953
+
1954
+
1955
+
1956
+
1957
+
1958
+
1959
+
1960
+
1961
+
1962
+
1963
+
1964
+
1965
+
1966
+
1967
+
1968
+
1969
+
1970
+
1971
+
1972
+
1973
+
1974
+
1975
+
1976
+
1977
+
1978
+
1979
+
1980
+
1981
+
1982
+
1983
+
1984
+
1985
+
1986
+
1987
+
1988
+
1989
+
1990
+
1991
+
1992
+
1993
+
1994
+
1995
+
1996
+
1997
+
1998
+
1999
+
2000
+
2001
+
2002
+
2003
+
2004
+
2005
+
2006
+
2007
+
2008
+
2009
+
2010
+
2011
+
2012
+
2013
+
2014
+
2015
+
2016
+
2017
+
2018
+
2019
+
2020
+
2021
+
2022
+
2023
+
2024
+
2025
+
2026
+
2027
+
2028
+
2029
+
2030
+
2031
+
2032
+
2033
+
2034
+
2035
+
2036
+
2037
+
2038
+
2039
+
2040
+
2041
+
2042
+
2043
+
2044
+
2045
+
2046
+
2047
+
2048
+
2049
+
2050
+
2051
+
2052
+
2053
+
2054
+
2055
+
2056
+
2057
+
2058
+
2059
+
2060
+
2061
+
2062
+
2063
+
2064
+
2065
+
2066
+
2067
+
2068
+
2069
+
2070
+
2071
+
2072
+
2073
+
2074
+
2075
+
2076
+
2077
+
2078
+
2079
+
2080
+
2081
+
2082
+
2083
+
2084
+
2085
+
2086
+
2087
+
2088
+
2089
+
2090
+
2091
+
2092
+
2093
+
2094
+
2095
+
2096
+
2097
+
2098
+
2099
+
2100
+
2101
+
2102
+
2103
+
2104
+
2105
+
2106
+
2107
+
2108
+
2109
+
2110
+
2111
+
2112
+
2113
+
2114
+
2115
+
2116
+
2117
+
2118
+
2119
+
2120
+
2121
+
2122
+
2123
+
2124
+
2125
+
2126
+
2127
+
2128
+
2129
+
2130
+
2131
+
2132
+
2133
+
2134
+
2135
+
2136
+
2137
+
2138
+
2139
+
2140
+
2141
+
2142
+
2143
+
2144
+
2145
+
2146
+
2147
+
2148
+
2149
+
2150
+
2151
+
2152
+
2153
+
2154
+
2155
+
2156
+
2157
+
2158
+
2159
+
2160
+
2161
+
2162
+
2163
+
2164
+
2165
+
2166
+
2167
+
2168
+
2169
+
2170
+
2171
+
2172
+
2173
+
2174
+
2175
+
2176
+
2177
+
2178
+
2179
+
2180
+
2181
+
2182
+
2183
+
2184
+
2185
+
2186
+
2187
+
2188
+
2189
+
2190
+
2191
+
2192
+
2193
+
2194
+
2195
+
2196
+
2197
+
2198
+
2199
+
2200
+
2201
+
2202
+
2203
+
2204
+
2205
+
2206
+
2207
+
2208
+
2209
+
2210
+
2211
+
2212
+
2213
+
2214
+
2215
+
2216
+
2217
+
2218
+
2219
+
2220
+
2221
+
2222
+
2223
+
2224
+
2225
+
2226
+
2227
+
2228
+
2229
+
2230
+
2231
+
2232
+
2233
+
2234
+
2235
+
2236
+
2237
+
2238
+
2239
+
2240
+
2241
+
2242
+
2243
+
2244
+
2245
+
2246
+
2247
+
2248
+
2249
+
2250
+
2251
+
2252
+
2253
+
2254
+
2255
+
2256
+
2257
+
2258
+
2259
+
2260
+
2261
+
2262
+
2263
+
2264
+
2265
+
2266
+
2267
+
2268
+
2269
+
2270
+
2271
+
2272
+
2273
+
2274
+
2275
+
2276
+
2277
+
2278
+
2279
+
2280
+
2281
+
2282
+
2283
+
2284
+
2285
+
2286
+
2287
+
2288
+
2289
+
2290
+
2291
+
2292
+
2293
+
2294
+
2295
+
2296
+
2297
+
2298
+
2299
+
2300
+
2301
+
2302
+
2303
+
2304
+
2305
+
2306
+
2307
+
2308
+
2309
+
2310
+
2311
+
2312
+
2313
+
2314
+
2315
+
2316
+
2317
+
2318
+
2319
+
2320
+
2321
+
2322
+
2323
+
2324
+
2325
+
2326
+
2327
+
2328
+
2329
+
2330
+
2331
+
2332
+
2333
+
2334
+
2335
+
2336
+
2337
+
2338
+
2339
+
2340
+
2341
+
2342
+
2343
+
2344
+
2345
+
2346
+
2347
+
2348
+
2349
+
2350
+
2351
+
2352
+
2353
+
2354
+
2355
+
2356
+
2357
+
2358
+
2359
+
2360
+
2361
+
2362
+
2363
+
2364
+
2365
+
2366
+
2367
+
2368
+
2369
+
2370
+
2371
+
2372
+
2373
+
2374
+
2375
+
2376
+
2377
+
2378
+
2379
+
2380
+
2381
+
2382
+
2383
+
2384
+
2385
+
2386
+
2387
+
2388
+
2389
+
2390
+
2391
+
2392
+
2393
+
2394
+
2395
+
2396
+
2397
+
2398
+
2399
+
2400
+
2401
+
2402
+
2403
+
2404
+
2405
+
2406
+
2407
+
2408
+
2409
+
2410
+
2411
+
2412
+
2413
+
2414
+
2415
+
2416
+
2417
+
2418
+
2419
+
2420
+
2421
+
2422
+
2423
+
2424
+
2425
+
2426
+
2427
+
2428
+
2429
+
2430
+
2431
+
2432
+
2433
+
2434
+
2435
+
2436
+
2437
+
2438
+
2439
+
2440
+
2441
+
2442
+
2443
+
2444
+
2445
+
2446
+
2447
+
2448
+
2449
+
2450
+
2451
+
2452
+
2453
+
2454
+
2455
+
2456
+
2457
+
2458
+
2459
+
2460
+
2461
+
2462
+
2463
+
2464
+
2465
+
2466
+
2467
+
2468
+
2469
+
2470
+
2471
+
2472
+
2473
+
2474
+
2475
+
2476
+
2477
+
2478
+
2479
+
2480
+
2481
+
2482
+
2483
+
2484
+
2485
+
2486
+
2487
+
2488
+
2489
+
2490
+
2491
+
2492
+
2493
+
2494
+
2495
+
2496
+
2497
+
2498
+
2499
+
2500
+
2501
+
2502
+
2503
+
2504
+
2505
+
2506
+
2507
+
2508
+
2509
+
2510
+
2511
+
2512
+
2513
+
2514
+
2515
+
2516
+
2517
+
2518
+
2519
+
2520
+
2521
+
2522
+
2523
+
2524
+
2525
+
2526
+
2527
+
2528
+
2529
+
2530
+
2531
+
2532
+
2533
+
2534
+
2535
+
2536
+
2537
+
2538
+
2539
+
2540
+
2541
+
2542
+
2543
+
2544
+
2545
+ 𠮶
F5-TTS/data/librispeech_pc_test_clean_cross_sentence.lst ADDED
The diff for this file is too large to render. See raw diff
 
F5-TTS/src/f5_tts/api.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import sys
3
+ from importlib.resources import files
4
+
5
+ import soundfile as sf
6
+ import tqdm
7
+ from cached_path import cached_path
8
+ from hydra.utils import get_class
9
+ from omegaconf import OmegaConf
10
+
11
+ from f5_tts.infer.utils_infer import (
12
+ infer_process,
13
+ load_model,
14
+ load_vocoder,
15
+ preprocess_ref_audio_text,
16
+ remove_silence_for_generated_wav,
17
+ save_spectrogram,
18
+ transcribe,
19
+ )
20
+ from f5_tts.model.utils import seed_everything
21
+
22
+
23
class F5TTS:
    """Convenience wrapper that loads a TTS model plus its vocoder and runs inference.

    The constructor reads ``configs/<model>.yaml`` from the ``f5_tts`` package,
    loads the vocoder and the model checkpoint, and remembers the device, the
    mel-spectrogram type and the target sample rate for later calls.
    """

    def __init__(
        self,
        model="F5TTS_v1_Base",
        ckpt_file="",
        vocab_file="",
        ode_method="euler",
        use_ema=True,
        vocoder_local_path=None,
        device=None,
        hf_cache_dir=None,
    ):
        """Load the model configuration, the vocoder and the model checkpoint.

        Args:
            model: Name of a config file under ``f5_tts/configs`` (without ``.yaml``).
            ckpt_file: Checkpoint path. When empty, the checkpoint is resolved with
                ``cached_path`` from the ``hf://SWivid/...`` location built below.
            vocab_file: Forwarded unchanged to ``load_model``.
            ode_method: Forwarded unchanged to ``load_model``.
            use_ema: Forwarded unchanged to ``load_model``.
            vocoder_local_path: Forwarded to ``load_vocoder``; when it is not ``None``
                the second positional argument of ``load_vocoder`` is ``True``.
            device: Device string. When ``None`` the first available of
                cuda, xpu, mps is used, falling back to cpu.
            hf_cache_dir: Cache directory handed to ``load_vocoder`` and ``cached_path``.
        """
        model_cfg = OmegaConf.load(str(files("f5_tts").joinpath(f"configs/{model}.yaml")))
        model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
        model_arc = model_cfg.model.arch

        self.mel_spec_type = model_cfg.model.mel_spec.mel_spec_type
        self.target_sample_rate = model_cfg.model.mel_spec.target_sample_rate

        self.ode_method = ode_method
        self.use_ema = use_ema

        if device is not None:
            self.device = device
        else:
            import torch

            # Not every PyTorch build exposes ``torch.xpu``; look it up defensively so
            # that auto-detection falls through to mps/cpu instead of raising.
            xpu = getattr(torch, "xpu", None)
            if torch.cuda.is_available():
                self.device = "cuda"
            elif xpu is not None and xpu.is_available():
                self.device = "xpu"
            elif torch.backends.mps.is_available():
                self.device = "mps"
            else:
                self.device = "cpu"

        # Load models
        self.vocoder = load_vocoder(
            self.mel_spec_type, vocoder_local_path is not None, vocoder_local_path, self.device, hf_cache_dir
        )

        repo_name, ckpt_step, ckpt_type = "F5-TTS", 1250000, "safetensors"

        # override for previous models
        if model == "F5TTS_Base":
            if self.mel_spec_type == "vocos":
                ckpt_step = 1200000
            elif self.mel_spec_type == "bigvgan":
                model = "F5TTS_Base_bigvgan"
                ckpt_type = "pt"
        elif model == "E2TTS_Base":
            repo_name = "E2-TTS"
            ckpt_step = 1200000

        if not ckpt_file:
            ckpt_file = str(
                cached_path(f"hf://SWivid/{repo_name}/{model}/model_{ckpt_step}.{ckpt_type}", cache_dir=hf_cache_dir)
            )
        self.ema_model = load_model(
            model_cls, model_arc, ckpt_file, self.mel_spec_type, vocab_file, self.ode_method, self.use_ema, self.device
        )

    def transcribe(self, ref_audio, language=None):
        """Return the result of the module-level ``transcribe`` helper for ``ref_audio``."""
        return transcribe(ref_audio, language)

    def export_wav(self, wav, file_wave, remove_silence=False):
        """Write ``wav`` to ``file_wave`` at the target sample rate.

        When ``remove_silence`` is true, ``remove_silence_for_generated_wav`` is run
        on the written file afterwards.
        """
        sf.write(file_wave, wav, self.target_sample_rate)

        if remove_silence:
            remove_silence_for_generated_wav(file_wave)

    def export_spectrogram(self, spec, file_spec):
        """Save ``spec`` to ``file_spec`` via ``save_spectrogram``."""
        save_spectrogram(spec, file_spec)

    def infer(
        self,
        ref_file,
        ref_text,
        gen_text,
        show_info=print,
        progress=tqdm,
        target_rms=0.1,
        cross_fade_duration=0.15,
        sway_sampling_coef=-1,
        cfg_strength=2,
        nfe_step=32,
        speed=1.0,
        fix_duration=None,
        remove_silence=False,
        file_wave=None,
        file_spec=None,
        seed=None,
    ):
        """Generate speech for ``gen_text`` using ``ref_file`` / ``ref_text`` as the prompt.

        Args:
            ref_file: Reference audio, preprocessed by ``preprocess_ref_audio_text``.
            ref_text: Reference text, preprocessed together with ``ref_file``.
            gen_text: Text to generate.
            show_info, progress, target_rms, cross_fade_duration, sway_sampling_coef,
            cfg_strength, nfe_step, speed, fix_duration: Forwarded to ``infer_process``.
            remove_silence: Passed to ``export_wav`` when ``file_wave`` is given.
            file_wave: If not ``None``, the generated audio is written to this path.
            file_spec: If not ``None``, the spectrogram is saved to this path.
            seed: Seed passed to ``seed_everything``; a random one is drawn when ``None``.

        Returns:
            The ``(wav, sr, spec)`` tuple returned by ``infer_process``.

        The seed actually used is stored on ``self.seed``.
        """
        if seed is None:
            seed = random.randint(0, sys.maxsize)
        seed_everything(seed)
        self.seed = seed

        ref_file, ref_text = preprocess_ref_audio_text(ref_file, ref_text)

        wav, sr, spec = infer_process(
            ref_file,
            ref_text,
            gen_text,
            self.ema_model,
            self.vocoder,
            self.mel_spec_type,
            show_info=show_info,
            progress=progress,
            target_rms=target_rms,
            cross_fade_duration=cross_fade_duration,
            nfe_step=nfe_step,
            cfg_strength=cfg_strength,
            sway_sampling_coef=sway_sampling_coef,
            speed=speed,
            fix_duration=fix_duration,
            device=self.device,
        )

        if file_wave is not None:
            self.export_wav(wav, file_wave, remove_silence)

        if file_spec is not None:
            self.export_spectrogram(spec, file_spec)

        return wav, sr, spec
150
+
151
+
152
if __name__ == "__main__":
    # Smoke run: synthesize one sentence with the default model and write the
    # audio and spectrogram next to the repository's tests directory.
    f5tts = F5TTS()
    package_root = files("f5_tts")

    reference_audio = str(package_root.joinpath("infer/examples/basic/basic_ref_en.wav"))
    wave_target = str(package_root.joinpath("../../tests/api_out.wav"))
    spec_target = str(package_root.joinpath("../../tests/api_out.png"))

    wav, sr, spec = f5tts.infer(
        ref_file=reference_audio,
        ref_text="some call me nature, others call me mother nature.",
        gen_text="""I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences.""",
        file_wave=wave_target,
        file_spec=spec_target,
        seed=None,
    )

    print("seed :", f5tts.seed)
F5-TTS/src/f5_tts/configs/E2TTS_Base.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+
5
+ datasets:
6
+ name: Emilia_ZH_EN # dataset name
7
+ batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
8
+ batch_size_type: frame # frame | sample
9
+ max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
10
+ num_workers: 16
11
+
12
+ optim:
13
+ epochs: 11
14
+ learning_rate: 7.5e-5
15
+ num_warmup_updates: 20000 # warmup updates
16
+ grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
17
+ max_grad_norm: 1.0 # gradient clipping
18
+ bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
19
+
20
+ model:
21
+ name: E2TTS_Base
22
+ tokenizer: pinyin
23
+ tokenizer_path: null # if using the 'custom' tokenizer, the path to the vocab file to use (should be a vocab.txt)
24
+ backbone: UNetT
25
+ arch:
26
+ dim: 1024
27
+ depth: 24
28
+ heads: 16
29
+ ff_mult: 4
30
+ text_mask_padding: False
31
+ pe_attn_head: 1
32
+ mel_spec:
33
+ target_sample_rate: 24000
34
+ n_mel_channels: 100
35
+ hop_length: 256
36
+ win_length: 1024
37
+ n_fft: 1024
38
+ mel_spec_type: vocos # vocos | bigvgan
39
+ vocoder:
40
+ is_local: False # use local offline ckpt or not
41
+ local_path: null # local vocoder path
42
+
43
+ ckpts:
44
+ logger: wandb # wandb | tensorboard | null
45
+ log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples
46
+ save_per_updates: 50000 # save checkpoint per updates
47
+ keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
48
+ last_per_updates: 5000 # save last checkpoint per updates
49
+ save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
F5-TTS/src/f5_tts/configs/E2TTS_Small.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+
5
+ datasets:
6
+ name: Emilia_ZH_EN
7
+ batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
8
+ batch_size_type: frame # frame | sample
9
+ max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
10
+ num_workers: 16
11
+
12
+ optim:
13
+ epochs: 11
14
+ learning_rate: 7.5e-5
15
+ num_warmup_updates: 20000 # warmup updates
16
+ grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
17
+ max_grad_norm: 1.0
18
+ bnb_optimizer: False
19
+
20
+ model:
21
+ name: E2TTS_Small
22
+ tokenizer: pinyin
23
+ tokenizer_path: null # if using the 'custom' tokenizer, the path to the vocab file to use (should be a vocab.txt)
24
+ backbone: UNetT
25
+ arch:
26
+ dim: 768
27
+ depth: 20
28
+ heads: 12
29
+ ff_mult: 4
30
+ text_mask_padding: False
31
+ pe_attn_head: 1
32
+ mel_spec:
33
+ target_sample_rate: 24000
34
+ n_mel_channels: 100
35
+ hop_length: 256
36
+ win_length: 1024
37
+ n_fft: 1024
38
+ mel_spec_type: vocos # vocos | bigvgan
39
+ vocoder:
40
+ is_local: False # use local offline ckpt or not
41
+ local_path: null # local vocoder path
42
+
43
+ ckpts:
44
+ logger: wandb # wandb | tensorboard | null
45
+ log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples
46
+ save_per_updates: 50000 # save checkpoint per updates
47
+ keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
48
+ last_per_updates: 5000 # save last checkpoint per updates
49
+ save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
F5-TTS/src/f5_tts/configs/F5TTS_Base.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+
5
+ datasets:
6
+ name: Emilia_ZH_EN # dataset name
7
+ batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
8
+ batch_size_type: frame # frame | sample
9
+ max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
10
+ num_workers: 16
11
+
12
+ optim:
13
+ epochs: 11
14
+ learning_rate: 7.5e-5
15
+ num_warmup_updates: 20000 # warmup updates
16
+ grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
17
+ max_grad_norm: 1.0 # gradient clipping
18
+ bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
19
+
20
+ model:
21
+ name: F5TTS_Base # model name
22
+ tokenizer: pinyin # tokenizer type
23
+ tokenizer_path: null # if using the 'custom' tokenizer, the path to the vocab file to use (should be a vocab.txt)
24
+ backbone: DiT
25
+ arch:
26
+ dim: 1024
27
+ depth: 22
28
+ heads: 16
29
+ ff_mult: 2
30
+ text_dim: 512
31
+ text_mask_padding: False
32
+ conv_layers: 4
33
+ pe_attn_head: 1
34
+ attn_backend: torch # torch | flash_attn
35
+ attn_mask_enabled: False
36
+ checkpoint_activations: False # recompute activations and save memory for extra compute
37
+ mel_spec:
38
+ target_sample_rate: 24000
39
+ n_mel_channels: 100
40
+ hop_length: 256
41
+ win_length: 1024
42
+ n_fft: 1024
43
+ mel_spec_type: vocos # vocos | bigvgan
44
+ vocoder:
45
+ is_local: False # use local offline ckpt or not
46
+ local_path: null # local vocoder path
47
+
48
+ ckpts:
49
+ logger: wandb # wandb | tensorboard | null
50
+ log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples
51
+ save_per_updates: 50000 # save checkpoint per updates
52
+ keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
53
+ last_per_updates: 5000 # save last checkpoint per updates
54
+ save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
F5-TTS/src/f5_tts/configs/F5TTS_Small.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+
5
+ datasets:
6
+ name: Emilia_ZH_EN
7
+ batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
8
+ batch_size_type: frame # frame | sample
9
+ max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
10
+ num_workers: 16
11
+
12
+ optim:
13
+ epochs: 11 # only suitable for Emilia, if you want to train it on LibriTTS, set epoch 686
14
+ learning_rate: 7.5e-5
15
+ num_warmup_updates: 20000 # warmup updates
16
+ grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
17
+ max_grad_norm: 1.0 # gradient clipping
18
+ bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
19
+
20
+ model:
21
+ name: F5TTS_Small
22
+ tokenizer: pinyin
23
+ tokenizer_path: null # if using the 'custom' tokenizer, the path to the vocab file to use (should be a vocab.txt)
24
+ backbone: DiT
25
+ arch:
26
+ dim: 768
27
+ depth: 18
28
+ heads: 12
29
+ ff_mult: 2
30
+ text_dim: 512
31
+ text_mask_padding: False
32
+ conv_layers: 4
33
+ pe_attn_head: 1
34
+ attn_backend: torch # torch | flash_attn
35
+ attn_mask_enabled: False
36
+ checkpoint_activations: False # recompute activations and save memory for extra compute
37
+ mel_spec:
38
+ target_sample_rate: 24000
39
+ n_mel_channels: 100
40
+ hop_length: 256
41
+ win_length: 1024
42
+ n_fft: 1024
43
+ mel_spec_type: vocos # vocos | bigvgan
44
+ vocoder:
45
+ is_local: False # use local offline ckpt or not
46
+ local_path: null # local vocoder path
47
+
48
+ ckpts:
49
+ logger: wandb # wandb | tensorboard | null
50
+ log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples
51
+ save_per_updates: 50000 # save checkpoint per updates
52
+ keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
53
+ last_per_updates: 5000 # save last checkpoint per updates
54
+ save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
F5-TTS/src/f5_tts/configs/F5TTS_v1_Base.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+
5
+ datasets:
6
+ name: Emilia_ZH_EN # dataset name
7
+ batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
8
+ batch_size_type: frame # frame | sample
9
+ max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
10
+ num_workers: 16
11
+
12
+ optim:
13
+ epochs: 11
14
+ learning_rate: 7.5e-5
15
+ num_warmup_updates: 20000 # warmup updates
16
+ grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
17
+ max_grad_norm: 1.0 # gradient clipping
18
+ bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
19
+
20
+ model:
21
+ name: F5TTS_v1_Base # model name
22
+ tokenizer: pinyin # tokenizer type
23
+ tokenizer_path: null # if using the 'custom' tokenizer, the path to the vocab file to use (should be a vocab.txt)
24
+ backbone: DiT
25
+ arch:
26
+ dim: 1024
27
+ depth: 22
28
+ heads: 16
29
+ ff_mult: 2
30
+ text_dim: 512
31
+ text_mask_padding: True
32
+ qk_norm: null # null | rms_norm
33
+ conv_layers: 4
34
+ pe_attn_head: null
35
+ attn_backend: torch # torch | flash_attn
36
+ attn_mask_enabled: False
37
+ checkpoint_activations: False # recompute activations and save memory for extra compute
38
+ mel_spec:
39
+ target_sample_rate: 24000
40
+ n_mel_channels: 100
41
+ hop_length: 256
42
+ win_length: 1024
43
+ n_fft: 1024
44
+ mel_spec_type: vocos # vocos | bigvgan
45
+ vocoder:
46
+ is_local: False # use local offline ckpt or not
47
+ local_path: null # local vocoder path
48
+
49
+ ckpts:
50
+ logger: wandb # wandb | tensorboard | null
51
+ log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples
52
+ save_per_updates: 50000 # save checkpoint per updates
53
+ keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
54
+ last_per_updates: 5000 # save last checkpoint per updates
55
+ save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
F5-TTS/src/f5_tts/eval/README.md ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Evaluation
3
+
4
+ Install packages for evaluation:
5
+
6
+ ```bash
7
+ pip install -e .[eval]
8
+ ```
9
+
10
+ ## Generating Samples for Evaluation
11
+
12
+ ### Prepare Test Datasets
13
+
14
+ 1. *Seed-TTS testset*: Download from [seed-tts-eval](https://github.com/BytedanceSpeech/seed-tts-eval).
15
+ 2. *LibriSpeech test-clean*: Download from [OpenSLR](http://www.openslr.org/12/).
16
+ 3. Unzip the downloaded datasets and place them in the `data/` directory.
17
+ 4. Update the path for *LibriSpeech test-clean* data in `src/f5_tts/eval/eval_infer_batch.py`
18
+ 5. Our filtered LibriSpeech-PC 4-10s subset: `data/librispeech_pc_test_clean_cross_sentence.lst`
19
+
20
+ ### Batch Inference for Test Set
21
+
22
+ To run batch inference for evaluations, execute the following commands:
23
+
24
+ ```bash
25
+ # batch inference for evaluations
26
+ accelerate config # if not set before
27
+ bash src/f5_tts/eval/eval_infer_batch.sh
28
+ ```
29
+
30
+ ## Objective Evaluation on Generated Results
31
+
32
+ ### Download Evaluation Model Checkpoints
33
+
34
+ 1. Chinese ASR Model: [Paraformer-zh](https://huggingface.co/funasr/paraformer-zh)
35
+ 2. English ASR Model: [Faster-Whisper](https://huggingface.co/Systran/faster-whisper-large-v3)
36
+ 3. WavLM Model: Download from [Google Drive](https://drive.google.com/file/d/1-aE1NfzpRCLxA4GUxX9ITI3F9LlbtEGP/view).
37
+
38
+ Then update the following scripts with the paths where you placed the evaluation model checkpoints.
39
+
40
+ ### Objective Evaluation
41
+
42
+ Update the path to point to your batch-inference results, then run the WER / SIM / UTMOS evaluations:
43
+ ```bash
44
+ # Evaluation [WER] for Seed-TTS test [ZH] set
45
+ python src/f5_tts/eval/eval_seedtts_testset.py --eval_task wer --lang zh --gen_wav_dir <GEN_WAV_DIR> --gpu_nums 8
46
+
47
+ # Evaluation [SIM] for LibriSpeech-PC test-clean (cross-sentence)
48
+ python src/f5_tts/eval/eval_librispeech_test_clean.py --eval_task sim --gen_wav_dir <GEN_WAV_DIR> --librispeech_test_clean_path <TEST_CLEAN_PATH>
49
+
50
+ # Evaluation [UTMOS]. --ext: Audio extension
51
+ python src/f5_tts/eval/eval_utmos.py --audio_dir <WAV_DIR> --ext wav
52
+ ```
F5-TTS/src/f5_tts/eval/ecapa_tdnn.py ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # just for speaker similarity evaluation, third-party code
2
+
3
+ # From https://github.com/microsoft/UniSpeech/blob/main/downstreams/speaker_verification/models/
4
+ # part of the code is borrowed from https://github.com/lawlict/ECAPA-TDNN
5
+
6
+ import os
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+
12
+
13
+ """ Res2Conv1d + BatchNorm1d + ReLU
14
+ """
15
+
16
+
17
class Res2Conv1dReluBn(nn.Module):
    """Res2Net-style 1-D block: the input is split into ``scale`` channel groups.

    Each of the first ``nums`` groups is summed with the previous group's output
    and passed through conv -> relu -> batch norm; when ``scale != 1`` the last
    group is passed through unchanged.

    in_channels == out_channels == channels
    """

    def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True, scale=4):
        super().__init__()
        assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
        self.scale = scale
        self.width = channels // scale
        self.nums = scale if scale == 1 else scale - 1

        # One conv / batch-norm pair per processed channel group.
        self.convs = nn.ModuleList(
            [
                nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias)
                for _ in range(self.nums)
            ]
        )
        self.bns = nn.ModuleList([nn.BatchNorm1d(self.width) for _ in range(self.nums)])

    def forward(self, x):
        groups = torch.split(x, self.width, 1)
        pieces = []
        carry = None
        for idx, (conv, bn) in enumerate(zip(self.convs, self.bns)):
            # The first group is used as-is; later groups add the previous output.
            carry = groups[idx] if carry is None else carry + groups[idx]
            # Order: conv -> relu -> bn
            carry = bn(F.relu(conv(carry)))
            pieces.append(carry)
        if self.scale != 1:
            pieces.append(groups[self.nums])

        return torch.cat(pieces, dim=1)
54
+
55
+
56
+ """ Conv1d + BatchNorm1d + ReLU
57
+ """
58
+
59
+
60
class Conv1dReluBn(nn.Module):
    """1-D convolution followed by ReLU and then batch normalization."""

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True):
        super().__init__()
        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )
        self.bn = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        # Note the order: the activation is applied before the normalization.
        activated = F.relu(self.conv(x))
        return self.bn(activated)
68
+
69
+
70
+ """ The SE connection of 1D case.
71
+ """
72
+
73
+
74
class SE_Connect(nn.Module):
    """Squeeze-and-excitation gate for 3-D inputs, averaging over the last axis."""

    def __init__(self, channels, se_bottleneck_dim=128):
        super().__init__()
        self.linear1 = nn.Linear(channels, se_bottleneck_dim)
        self.linear2 = nn.Linear(se_bottleneck_dim, channels)

    def forward(self, x):
        # Squeeze: average over dim 2, then bottleneck -> per-channel sigmoid gate.
        squeezed = x.mean(dim=2)
        hidden = F.relu(self.linear1(squeezed))
        gate = torch.sigmoid(self.linear2(hidden))
        # Excite: rescale every position of each channel by its gate value.
        return x * gate.unsqueeze(2)
87
+
88
+
89
+ """ SE-Res2Block of the ECAPA-TDNN architecture.
90
+ """
91
+
92
+ # def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale):
93
+ # return nn.Sequential(
94
+ # Conv1dReluBn(channels, 512, kernel_size=1, stride=1, padding=0),
95
+ # Res2Conv1dReluBn(512, kernel_size, stride, padding, dilation, scale=scale),
96
+ # Conv1dReluBn(512, channels, kernel_size=1, stride=1, padding=0),
97
+ # SE_Connect(channels)
98
+ # )
99
+
100
+
101
class SE_Res2Block(nn.Module):
    """Residual block: Conv1dReluBn -> Res2Conv1dReluBn -> Conv1dReluBn -> SE_Connect.

    When ``in_channels`` differs from ``out_channels`` the residual path goes
    through a kernel-size-1 convolution so both branches can be added.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, scale, se_bottleneck_dim):
        super().__init__()
        self.Conv1dReluBn1 = Conv1dReluBn(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.Res2Conv1dReluBn = Res2Conv1dReluBn(out_channels, kernel_size, stride, padding, dilation, scale=scale)
        self.Conv1dReluBn2 = Conv1dReluBn(out_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.SE_Connect = SE_Connect(out_channels, se_bottleneck_dim)

        # Projection for the skip connection, only needed when channel counts differ.
        self.shortcut = (
            nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1)
            if in_channels != out_channels
            else None
        )

    def forward(self, x):
        residual = x if self.shortcut is None else self.shortcut(x)

        hidden = self.Conv1dReluBn1(x)
        hidden = self.Res2Conv1dReluBn(hidden)
        hidden = self.Conv1dReluBn2(hidden)
        hidden = self.SE_Connect(hidden)

        return hidden + residual
128
+
129
+
130
+ """ Attentive weighted mean and standard deviation pooling.
131
+ """
132
+
133
+
134
class AttentiveStatsPool(nn.Module):
    """Attention-weighted mean and standard deviation pooling over the last axis.

    The forward pass returns the weighted mean and weighted standard deviation
    concatenated along dim 1.
    """

    def __init__(self, in_dim, attention_channels=128, global_context_att=False):
        super().__init__()
        self.global_context_att = global_context_att

        # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
        # With global context, the input is concatenated with its mean and std (3x channels).
        attention_in_dim = in_dim * 3 if global_context_att else in_dim
        self.linear1 = nn.Conv1d(attention_in_dim, attention_channels, kernel_size=1)  # equals W and b in the paper
        self.linear2 = nn.Conv1d(attention_channels, in_dim, kernel_size=1)  # equals V and k in the paper

    def forward(self, x):
        attention_input = x
        if self.global_context_att:
            context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
            context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
            attention_input = torch.cat((x, context_mean, context_std), dim=1)

        # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
        scores = self.linear2(torch.tanh(self.linear1(attention_input)))
        alpha = torch.softmax(scores, dim=2)

        mean = torch.sum(alpha * x, dim=2)
        second_moment = torch.sum(alpha * (x**2), dim=2)
        # Clamp before the square root so a slightly negative variance cannot produce NaN.
        std = torch.sqrt((second_moment - mean**2).clamp(min=1e-9))
        return torch.cat([mean, std], dim=1)
162
+
163
+
164
class ECAPA_TDNN(nn.Module):
    """ECAPA-TDNN embedding network on top of a ``torch.hub`` feature extractor.

    The extractor named by ``feat_type`` is loaded in ``__init__`` (first from a
    local s3prl checkout, then from the ``s3prl/s3prl`` hub repo). Its selected
    features are combined with learned softmax weights, instance-normalized and
    passed through three SE-Res2 blocks, attentive statistics pooling and a final
    linear layer producing ``emb_dim`` values per input.
    """

    def __init__(
        self,
        feat_dim=80,
        channels=512,
        emb_dim=192,
        global_context_att=False,
        feat_type="wavlm_large",
        sr=16000,
        feature_selection="hidden_states",
        update_extract=False,
        config_path=None,
    ):
        """Build the feature extractor and the ECAPA-TDNN layers.

        Args:
            feat_dim: Channel count expected by the instance norm and first conv layer.
            channels: Width of the first four stages.
            emb_dim: Size of the output embedding.
            global_context_att: Forwarded to ``AttentiveStatsPool``.
            feat_type: Entry-point name passed to ``torch.hub.load``.
            sr: Length of the random probe waveform used by ``get_feat_num``.
            feature_selection: Key used to pick features from the extractor output.
            update_extract: When false, all extractor parameters are frozen and
                feature extraction runs under ``torch.no_grad()``.
            config_path: Forwarded to the local ``torch.hub.load`` call only.
        """
        super().__init__()

        self.feat_type = feat_type
        self.feature_selection = feature_selection
        self.update_extract = update_extract
        self.sr = sr

        # Replace torch.hub's private repo-validation hook with a no-op.
        torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
        try:
            local_s3prl_path = os.path.expanduser("~/.cache/torch/hub/s3prl_s3prl_main")
            self.feature_extract = torch.hub.load(local_s3prl_path, feat_type, source="local", config_path=config_path)
        except Exception:
            # Best-effort fallback to the hub repo. ``Exception`` (not a bare except)
            # so that Ctrl-C / SystemExit are not turned into a download.
            self.feature_extract = torch.hub.load("s3prl/s3prl", feat_type)

        # For 24-layer encoders, switch off fp32 attention on layers 23 and 11 when present.
        encoder_layers = self.feature_extract.model.encoder.layers
        if len(encoder_layers) == 24:
            for layer_idx in (23, 11):
                self_attn = encoder_layers[layer_idx].self_attn
                if hasattr(self_attn, "fp32_attention"):
                    self_attn.fp32_attention = False

        self.feat_num = self.get_feat_num()
        self.feature_weight = nn.Parameter(torch.zeros(self.feat_num))

        if feat_type not in ("fbank", "mfcc"):
            # These extractor sub-modules are never trained, even when update_extract is set.
            freeze_list = ["final_proj", "label_embs_concat", "mask_emb", "project_q", "quantizer"]
            for name, param in self.feature_extract.named_parameters():
                if any(freeze_val in name for freeze_val in freeze_list):
                    param.requires_grad = False

        if not self.update_extract:
            for param in self.feature_extract.parameters():
                param.requires_grad = False

        self.instance_norm = nn.InstanceNorm1d(feat_dim)
        # The last stage is fixed at 1536 channels (equal to channels * 3 only when channels == 512).
        self.channels = [channels] * 4 + [1536]

        self.layer1 = Conv1dReluBn(feat_dim, self.channels[0], kernel_size=5, padding=2)
        self.layer2 = SE_Res2Block(
            self.channels[0],
            self.channels[1],
            kernel_size=3,
            stride=1,
            padding=2,
            dilation=2,
            scale=8,
            se_bottleneck_dim=128,
        )
        self.layer3 = SE_Res2Block(
            self.channels[1],
            self.channels[2],
            kernel_size=3,
            stride=1,
            padding=3,
            dilation=3,
            scale=8,
            se_bottleneck_dim=128,
        )
        self.layer4 = SE_Res2Block(
            self.channels[2],
            self.channels[3],
            kernel_size=3,
            stride=1,
            padding=4,
            dilation=4,
            scale=8,
            se_bottleneck_dim=128,
        )

        # The outputs of layer2..layer4 are concatenated before this 1x1 convolution.
        cat_channels = channels * 3
        self.conv = nn.Conv1d(cat_channels, self.channels[-1], kernel_size=1)
        self.pooling = AttentiveStatsPool(
            self.channels[-1], attention_channels=128, global_context_att=global_context_att
        )
        self.bn = nn.BatchNorm1d(self.channels[-1] * 2)
        self.linear = nn.Linear(self.channels[-1] * 2, emb_dim)

    def get_feat_num(self):
        """Return how many feature tensors the extractor yields for ``feature_selection``.

        Runs the extractor once (in eval mode, without gradients) on a random
        waveform of ``self.sr`` samples; a list/tuple result gives its length,
        anything else counts as 1.
        """
        self.feature_extract.eval()
        wav = [torch.randn(self.sr).to(next(self.feature_extract.parameters()).device)]
        with torch.no_grad():
            features = self.feature_extract(wav)
        select_feature = features[self.feature_selection]
        if isinstance(select_feature, (list, tuple)):
            return len(select_feature)
        return 1

    def get_feat(self, x):
        """Extract, combine and instance-normalize the features for a batch ``x``."""
        if self.update_extract:
            x = self.feature_extract(list(x))
        else:
            with torch.no_grad():
                if self.feat_type in ("fbank", "mfcc"):
                    x = self.feature_extract(x) + 1e-6  # B x feat_dim x time_len
                else:
                    x = self.feature_extract(list(x))

        if self.feat_type == "fbank":
            x = x.log()

        if self.feat_type not in ("fbank", "mfcc"):
            x = x[self.feature_selection]
            if isinstance(x, (list, tuple)):
                x = torch.stack(x, dim=0)
            else:
                x = x.unsqueeze(0)
            # Softmax-weighted sum over the stacked feature tensors (dim 0).
            norm_weights = F.softmax(self.feature_weight, dim=-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
            x = (norm_weights * x).sum(dim=0)
            x = torch.transpose(x, 1, 2) + 1e-6

        x = self.instance_norm(x)
        return x

    def forward(self, x):
        """Return one ``emb_dim``-sized embedding per item of the batch ``x``."""
        x = self.get_feat(x)

        out1 = self.layer1(x)
        out2 = self.layer2(out1)
        out3 = self.layer3(out2)
        out4 = self.layer4(out3)

        out = torch.cat([out2, out3, out4], dim=1)
        out = F.relu(self.conv(out))
        out = self.bn(self.pooling(out))
        out = self.linear(out)

        return out
311
+
312
+
313
def ECAPA_TDNN_SMALL(
    feat_dim,
    emb_dim=256,
    feat_type="wavlm_large",
    sr=16000,
    feature_selection="hidden_states",
    update_extract=False,
    config_path=None,
):
    """Build an ``ECAPA_TDNN`` with ``channels`` fixed at 512.

    All other arguments are forwarded unchanged; ``emb_dim`` defaults to 256 here.
    """
    settings = dict(
        feat_dim=feat_dim,
        channels=512,
        emb_dim=emb_dim,
        feat_type=feat_type,
        sr=sr,
        feature_selection=feature_selection,
        update_extract=update_extract,
        config_path=config_path,
    )
    return ECAPA_TDNN(**settings)
F5-TTS/src/f5_tts/eval/eval_infer_batch.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+
5
+ sys.path.append(os.getcwd())
6
+
7
+ import argparse
8
+ import time
9
+ from importlib.resources import files
10
+
11
+ import torch
12
+ import torchaudio
13
+ from accelerate import Accelerator
14
+ from hydra.utils import get_class
15
+ from omegaconf import OmegaConf
16
+ from tqdm import tqdm
17
+
18
+ from f5_tts.eval.utils_eval import (
19
+ get_inference_prompt,
20
+ get_librispeech_test_clean_metainfo,
21
+ get_seedtts_testset_metainfo,
22
+ )
23
+ from f5_tts.infer.utils_infer import load_checkpoint, load_vocoder
24
+ from f5_tts.model import CFM
25
+ from f5_tts.model.utils import get_tokenizer
26
+
27
+
28
+ accelerator = Accelerator()
29
+ device = f"cuda:{accelerator.process_index}"
30
+
31
+
32
+ use_ema = True
33
+ target_rms = 0.1
34
+
35
+
36
+ rel_path = str(files("f5_tts").joinpath("../../"))
37
+
38
+
39
+ def main():
40
+ parser = argparse.ArgumentParser(description="batch inference")
41
+
42
+ parser.add_argument("-s", "--seed", default=None, type=int)
43
+ parser.add_argument("-n", "--expname", required=True)
44
+ parser.add_argument("-c", "--ckptstep", default=1250000, type=int)
45
+
46
+ parser.add_argument("-nfe", "--nfestep", default=32, type=int)
47
+ parser.add_argument("-o", "--odemethod", default="euler")
48
+ parser.add_argument("-ss", "--swaysampling", default=-1, type=float)
49
+
50
+ parser.add_argument("-t", "--testset", required=True)
51
+
52
+ args = parser.parse_args()
53
+
54
+ seed = args.seed
55
+ exp_name = args.expname
56
+ ckpt_step = args.ckptstep
57
+
58
+ nfe_step = args.nfestep
59
+ ode_method = args.odemethod
60
+ sway_sampling_coef = args.swaysampling
61
+
62
+ testset = args.testset
63
+
64
+ infer_batch_size = 1 # max frames. 1 for ddp single inference (recommended)
65
+ cfg_strength = 2.0
66
+ speed = 1.0
67
+ use_truth_duration = False
68
+ no_ref_audio = False
69
+
70
+ model_cfg = OmegaConf.load(str(files("f5_tts").joinpath(f"configs/{exp_name}.yaml")))
71
+ model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
72
+ model_arc = model_cfg.model.arch
73
+
74
+ dataset_name = model_cfg.datasets.name
75
+ tokenizer = model_cfg.model.tokenizer
76
+
77
+ mel_spec_type = model_cfg.model.mel_spec.mel_spec_type
78
+ target_sample_rate = model_cfg.model.mel_spec.target_sample_rate
79
+ n_mel_channels = model_cfg.model.mel_spec.n_mel_channels
80
+ hop_length = model_cfg.model.mel_spec.hop_length
81
+ win_length = model_cfg.model.mel_spec.win_length
82
+ n_fft = model_cfg.model.mel_spec.n_fft
83
+
84
+ if testset == "ls_pc_test_clean":
85
+ metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
86
+ librispeech_test_clean_path = "<SOME_PATH>/LibriSpeech/test-clean" # test-clean path
87
+ metainfo = get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path)
88
+
89
+ elif testset == "seedtts_test_zh":
90
+ metalst = rel_path + "/data/seedtts_testset/zh/meta.lst"
91
+ metainfo = get_seedtts_testset_metainfo(metalst)
92
+
93
+ elif testset == "seedtts_test_en":
94
+ metalst = rel_path + "/data/seedtts_testset/en/meta.lst"
95
+ metainfo = get_seedtts_testset_metainfo(metalst)
96
+
97
+ # path to save genereted wavs
98
+ output_dir = (
99
+ f"{rel_path}/"
100
+ f"results/{exp_name}_{ckpt_step}/{testset}/"
101
+ f"seed{seed}_{ode_method}_nfe{nfe_step}_{mel_spec_type}"
102
+ f"{f'_ss{sway_sampling_coef}' if sway_sampling_coef else ''}"
103
+ f"_cfg{cfg_strength}_speed{speed}"
104
+ f"{'_gt-dur' if use_truth_duration else ''}"
105
+ f"{'_no-ref-audio' if no_ref_audio else ''}"
106
+ )
107
+
108
+ # -------------------------------------------------#
109
+
110
+ prompts_all = get_inference_prompt(
111
+ metainfo,
112
+ speed=speed,
113
+ tokenizer=tokenizer,
114
+ target_sample_rate=target_sample_rate,
115
+ n_mel_channels=n_mel_channels,
116
+ hop_length=hop_length,
117
+ mel_spec_type=mel_spec_type,
118
+ target_rms=target_rms,
119
+ use_truth_duration=use_truth_duration,
120
+ infer_batch_size=infer_batch_size,
121
+ )
122
+
123
+ # Vocoder model
124
+ local = False
125
+ if mel_spec_type == "vocos":
126
+ vocoder_local_path = "../checkpoints/charactr/vocos-mel-24khz"
127
+ elif mel_spec_type == "bigvgan":
128
+ vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
129
+ vocoder = load_vocoder(vocoder_name=mel_spec_type, is_local=local, local_path=vocoder_local_path)
130
+
131
+ # Tokenizer
132
+ vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)
133
+
134
+ # Model
135
+ model = CFM(
136
+ transformer=model_cls(**model_arc, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
137
+ mel_spec_kwargs=dict(
138
+ n_fft=n_fft,
139
+ hop_length=hop_length,
140
+ win_length=win_length,
141
+ n_mel_channels=n_mel_channels,
142
+ target_sample_rate=target_sample_rate,
143
+ mel_spec_type=mel_spec_type,
144
+ ),
145
+ odeint_kwargs=dict(
146
+ method=ode_method,
147
+ ),
148
+ vocab_char_map=vocab_char_map,
149
+ ).to(device)
150
+
151
+ ckpt_prefix = rel_path + f"/ckpts/{exp_name}/model_{ckpt_step}"
152
+ if os.path.exists(ckpt_prefix + ".pt"):
153
+ ckpt_path = ckpt_prefix + ".pt"
154
+ elif os.path.exists(ckpt_prefix + ".safetensors"):
155
+ ckpt_path = ckpt_prefix + ".safetensors"
156
+ else:
157
+ print("Loading from self-organized training checkpoints rather than released pretrained.")
158
+ ckpt_path = rel_path + f"/{model_cfg.ckpts.save_dir}/model_{ckpt_step}.pt"
159
+
160
+ dtype = torch.float32 if mel_spec_type == "bigvgan" else None
161
+ model = load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)
162
+
163
+ if not os.path.exists(output_dir) and accelerator.is_main_process:
164
+ os.makedirs(output_dir)
165
+
166
+ # start batch inference
167
+ accelerator.wait_for_everyone()
168
+ start = time.time()
169
+
170
+ with accelerator.split_between_processes(prompts_all) as prompts:
171
+ for prompt in tqdm(prompts, disable=not accelerator.is_local_main_process):
172
+ utts, ref_rms_list, ref_mels, ref_mel_lens, total_mel_lens, final_text_list = prompt
173
+ ref_mels = ref_mels.to(device)
174
+ ref_mel_lens = torch.tensor(ref_mel_lens, dtype=torch.long).to(device)
175
+ total_mel_lens = torch.tensor(total_mel_lens, dtype=torch.long).to(device)
176
+
177
+ # Inference
178
+ with torch.inference_mode():
179
+ generated, _ = model.sample(
180
+ cond=ref_mels,
181
+ text=final_text_list,
182
+ duration=total_mel_lens,
183
+ lens=ref_mel_lens,
184
+ steps=nfe_step,
185
+ cfg_strength=cfg_strength,
186
+ sway_sampling_coef=sway_sampling_coef,
187
+ no_ref_audio=no_ref_audio,
188
+ seed=seed,
189
+ )
190
+ # Final result
191
+ for i, gen in enumerate(generated):
192
+ gen = gen[ref_mel_lens[i] : total_mel_lens[i], :].unsqueeze(0)
193
+ gen_mel_spec = gen.permute(0, 2, 1).to(torch.float32)
194
+ if mel_spec_type == "vocos":
195
+ generated_wave = vocoder.decode(gen_mel_spec).cpu()
196
+ elif mel_spec_type == "bigvgan":
197
+ generated_wave = vocoder(gen_mel_spec).squeeze(0).cpu()
198
+
199
+ if ref_rms_list[i] < target_rms:
200
+ generated_wave = generated_wave * ref_rms_list[i] / target_rms
201
+ torchaudio.save(f"{output_dir}/{utts[i]}.wav", generated_wave, target_sample_rate)
202
+
203
+ accelerator.wait_for_everyone()
204
+ if accelerator.is_main_process:
205
+ timediff = time.time() - start
206
+ print(f"Done batch inference in {timediff / 60:.2f} minutes.")
207
+
208
+
209
+ if __name__ == "__main__":
210
+ main()
F5-TTS/src/f5_tts/eval/eval_infer_batch.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# e.g. F5-TTS, 16 NFE
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_v1_Base" -t "seedtts_test_zh" -nfe 16
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_v1_Base" -t "seedtts_test_en" -nfe 16
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_v1_Base" -t "ls_pc_test_clean" -nfe 16

# e.g. Vanilla E2 TTS, 32 NFE
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -c 1200000 -t "seedtts_test_zh" -o "midpoint" -ss 0
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -c 1200000 -t "seedtts_test_en" -o "midpoint" -ss 0
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -c 1200000 -t "ls_pc_test_clean" -o "midpoint" -ss 0

# e.g. evaluate F5-TTS 16 NFE result on Seed-TTS test-zh
# The results directory name encodes the inference settings, so "-nfe 16" above produces "..._nfe16_...".
python src/f5_tts/eval/eval_seedtts_testset.py -e wer -l zh --gen_wav_dir results/F5TTS_v1_Base_1250000/seedtts_test_zh/seed0_euler_nfe16_vocos_ss-1_cfg2.0_speed1.0 --gpu_nums 8
python src/f5_tts/eval/eval_seedtts_testset.py -e sim -l zh --gen_wav_dir results/F5TTS_v1_Base_1250000/seedtts_test_zh/seed0_euler_nfe16_vocos_ss-1_cfg2.0_speed1.0 --gpu_nums 8
python src/f5_tts/eval/eval_utmos.py --audio_dir results/F5TTS_v1_Base_1250000/seedtts_test_zh/seed0_euler_nfe16_vocos_ss-1_cfg2.0_speed1.0

# etc.
F5-TTS/src/f5_tts/eval/eval_librispeech_test_clean.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Evaluate with Librispeech test-clean, ~3s prompt to generate 4-10s audio (the way of valle/voicebox evaluation)
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ import sys
7
+
8
+
9
+ sys.path.append(os.getcwd())
10
+
11
+ import multiprocessing as mp
12
+ from importlib.resources import files
13
+
14
+ import numpy as np
15
+
16
+ from f5_tts.eval.utils_eval import get_librispeech_test, run_asr_wer, run_sim
17
+
18
+
19
+ rel_path = str(files("f5_tts").joinpath("../../"))
20
+
21
+
22
def get_args():
    """Parse the command-line options of the LibriSpeech test-clean evaluation.

    Returns:
        argparse.Namespace with ``eval_task``, ``lang``, ``gen_wav_dir``,
        ``librispeech_test_clean_path``, ``gpu_nums`` and ``local``.
    """
    cli = argparse.ArgumentParser()
    # (flags, keyword settings) for every option, registered in order below.
    option_specs = (
        (("-e", "--eval_task"), dict(type=str, default="wer", choices=["sim", "wer"])),
        (("-l", "--lang"), dict(type=str, default="en")),
        (("-g", "--gen_wav_dir"), dict(type=str, required=True)),
        (("-p", "--librispeech_test_clean_path"), dict(type=str, required=True)),
        (("-n", "--gpu_nums"), dict(type=int, default=8, help="Number of GPUs to use")),
        (("--local",), dict(action="store_true", help="Use local custom checkpoint directory")),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()
31
+
32
+
33
def main():
    """Score generated LibriSpeech test-clean audio with WER or speaker similarity.

    Splits the test set into one shard per GPU, runs the selected metric in a
    process pool, writes one JSON line per utterance plus the mean score to
    ``<gen_wav_dir>/_<eval_task>_results.jsonl`` and prints a summary.

    Raises:
        ValueError: If the evaluation task is neither ``"wer"`` nor ``"sim"``.
    """
    args = get_args()
    eval_task = args.eval_task
    lang = args.lang
    librispeech_test_clean_path = args.librispeech_test_clean_path  # test-clean path
    gen_wav_dir = args.gen_wav_dir
    metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"

    gpus = list(range(args.gpu_nums))
    test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path)

    ## In LibriSpeech, some speakers utilized varying voice characteristics for different characters in the book,
    ## leading to a low similarity for the ground truth in some cases.
    # test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth = True)  # eval ground truth

    if args.local:  # use local custom checkpoint dir
        asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
    else:
        asr_ckpt_dir = ""  # auto download to cache dir
    wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"

    # --------------------------------------------------------------------------

    # Pick the worker and its per-shard argument tuples once; the pool handling is shared.
    if eval_task == "wer":
        worker = run_asr_wer
        jobs = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
    elif eval_task == "sim":
        worker = run_sim
        jobs = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
    else:
        raise ValueError(f"Unknown metric type: {eval_task}")

    full_results = []
    with mp.Pool(processes=len(gpus)) as pool:
        for shard_results in pool.map(worker, jobs):
            full_results.extend(shard_results)

    metrics = []
    result_path = f"{gen_wav_dir}/_{eval_task}_results.jsonl"
    # ensure_ascii=False emits raw non-ASCII text, so the file encoding must not depend on the locale.
    with open(result_path, "w", encoding="utf-8") as f:
        for line in full_results:
            metrics.append(line[eval_task])
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
        metric = round(np.mean(metrics), 5)
        f.write(f"\n{eval_task.upper()}: {metric}\n")

    print(f"\nTotal {len(metrics)} samples")
    print(f"{eval_task.upper()}: {metric}")
    print(f"{eval_task.upper()} results saved to {result_path}")


if __name__ == "__main__":
    main()
F5-TTS/src/f5_tts/eval/eval_seedtts_testset.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Evaluate with Seed-TTS testset
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ import sys
7
+
8
+
9
+ sys.path.append(os.getcwd())
10
+
11
+ import multiprocessing as mp
12
+ from importlib.resources import files
13
+
14
+ import numpy as np
15
+
16
+ from f5_tts.eval.utils_eval import get_seed_tts_test, run_asr_wer, run_sim
17
+
18
+
19
+ rel_path = str(files("f5_tts").joinpath("../../"))
20
+
21
+
22
def get_args():
    """Parse the command-line options of the Seed-TTS test set evaluation.

    Returns:
        argparse.Namespace with ``eval_task``, ``lang``, ``gen_wav_dir``,
        ``gpu_nums`` and ``local``.
    """
    cli = argparse.ArgumentParser()
    # (flags, keyword settings) for every option, registered in order below.
    option_specs = (
        (("-e", "--eval_task"), dict(type=str, default="wer", choices=["sim", "wer"])),
        (("-l", "--lang"), dict(type=str, default="en", choices=["zh", "en"])),
        (("-g", "--gen_wav_dir"), dict(type=str, required=True)),
        (("-n", "--gpu_nums"), dict(type=int, default=8, help="Number of GPUs to use")),
        (("--local",), dict(action="store_true", help="Use local custom checkpoint directory")),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()
30
+
31
+
32
def main():
    """Score generated Seed-TTS test set audio with WER or speaker similarity.

    Splits the test set into one shard per GPU, runs the selected metric in a
    process pool, writes one JSON line per utterance plus the mean score to
    ``<gen_wav_dir>/_<eval_task>_results.jsonl`` and prints a summary.

    Raises:
        ValueError: If the evaluation task is neither ``"wer"`` nor ``"sim"``.
    """
    args = get_args()
    eval_task = args.eval_task
    lang = args.lang
    gen_wav_dir = args.gen_wav_dir
    metalst = rel_path + f"/data/seedtts_testset/{lang}/meta.lst"  # seed-tts testset

    # NOTE. paraformer-zh result will be slightly different according to the number of gpus, cuz batchsize is different
    #       zh 1.254 seems a result of 4 workers wer_seed_tts
    gpus = list(range(args.gpu_nums))
    test_set = get_seed_tts_test(metalst, gen_wav_dir, gpus)

    if args.local:  # use local custom checkpoint dir
        if lang == "zh":
            asr_ckpt_dir = "../checkpoints/funasr"  # paraformer-zh dir under funasr
        elif lang == "en":
            asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
    else:
        asr_ckpt_dir = ""  # auto download to cache dir
    wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"

    # --------------------------------------------------------------------------

    # Pick the worker and its per-shard argument tuples once; the pool handling is shared.
    if eval_task == "wer":
        worker = run_asr_wer
        jobs = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
    elif eval_task == "sim":
        worker = run_sim
        jobs = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
    else:
        raise ValueError(f"Unknown metric type: {eval_task}")

    full_results = []
    with mp.Pool(processes=len(gpus)) as pool:
        for shard_results in pool.map(worker, jobs):
            full_results.extend(shard_results)

    metrics = []
    result_path = f"{gen_wav_dir}/_{eval_task}_results.jsonl"
    # ensure_ascii=False emits raw Chinese text, so the file encoding must not depend on the locale.
    with open(result_path, "w", encoding="utf-8") as f:
        for line in full_results:
            metrics.append(line[eval_task])
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
        metric = round(np.mean(metrics), 5)
        f.write(f"\n{eval_task.upper()}: {metric}\n")

    print(f"\nTotal {len(metrics)} samples")
    print(f"{eval_task.upper()}: {metric}")
    print(f"{eval_task.upper()} results saved to {result_path}")


if __name__ == "__main__":
    main()
F5-TTS/src/f5_tts/eval/eval_utmos.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ from pathlib import Path
4
+
5
+ import librosa
6
+ import torch
7
+ from tqdm import tqdm
8
+
9
+
10
def main():
    """Compute UTMOS scores for every audio file below a directory.

    Loads the ``utmos22_strong`` predictor through ``torch.hub``, scores each
    ``*.<ext>`` file found recursively under ``--audio_dir``, writes one JSON
    line per file plus the average to ``<audio_dir>/_utmos_results.jsonl`` and
    prints the average.
    """
    parser = argparse.ArgumentParser(description="UTMOS Evaluation")
    parser.add_argument("--audio_dir", type=str, required=True, help="Audio file path.")
    parser.add_argument("--ext", type=str, default="wav", help="Audio extension.")
    args = parser.parse_args()

    # torch.xpu is guarded with hasattr so that builds without that attribute fall back to CPU
    # instead of raising AttributeError.
    if torch.cuda.is_available():
        device = "cuda"
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        device = "xpu"
    else:
        device = "cpu"

    predictor = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)
    predictor = predictor.to(device)

    audio_paths = list(Path(args.audio_dir).rglob(f"*.{args.ext}"))
    utmos_score = 0

    utmos_result_path = Path(args.audio_dir) / "_utmos_results.jsonl"
    with open(utmos_result_path, "w", encoding="utf-8") as f:
        for audio_path in tqdm(audio_paths, desc="Processing"):
            # sr=None keeps the file's native sample rate, which is handed to the predictor.
            wav, sr = librosa.load(audio_path, sr=None, mono=True)
            wav_tensor = torch.from_numpy(wav).to(device).unsqueeze(0)
            # Scoring only; no gradients are needed.
            with torch.no_grad():
                score = predictor(wav_tensor, sr).item()
            line = {"wav": str(audio_path.stem), "utmos": score}
            utmos_score += score
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
        avg_score = utmos_score / len(audio_paths) if len(audio_paths) > 0 else 0
        f.write(f"\nUTMOS: {avg_score:.4f}\n")

    print(f"UTMOS: {avg_score:.4f}")
    print(f"UTMOS results saved to {utmos_result_path}")


if __name__ == "__main__":
    main()
F5-TTS/src/f5_tts/eval/utils_eval.py ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import os
3
+ import random
4
+ import string
5
+ from pathlib import Path
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import torchaudio
10
+ from tqdm import tqdm
11
+
12
+ from f5_tts.eval.ecapa_tdnn import ECAPA_TDNN_SMALL
13
+ from f5_tts.model.modules import MelSpec
14
+ from f5_tts.model.utils import convert_char_to_pinyin
15
+
16
+
17
+ # seedtts testset metainfo: utt, prompt_text, prompt_wav, gt_text, gt_wav
18
def get_seedtts_testset_metainfo(metalst):
    """Read a Seed-TTS style ``meta.lst`` file into a list of prompt tuples.

    Every non-blank line holds ``|``-separated fields, either
    ``utt|prompt_text|prompt_wav|gt_text|gt_wav`` or the same without
    ``gt_wav``; in the latter case ``gt_wav`` becomes
    ``<list dir>/wavs/<utt>.wav``.  A relative ``prompt_wav`` is resolved
    against the directory that contains the list file.

    Args:
        metalst: Path of the list file (read as UTF-8).

    Returns:
        List of ``(utt, prompt_text, prompt_wav, gt_text, gt_wav)`` tuples in
        file order.

    Raises:
        ValueError: If a non-blank line has neither 4 nor 5 fields.
    """
    list_dir = os.path.dirname(metalst)
    metainfo = []
    # Explicit encoding: the prompt/target texts may be non-ASCII.
    with open(metalst, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines used to silently repeat the previous entry (or fail on the first line).
                continue
            fields = line.split("|")
            if len(fields) == 5:
                utt, prompt_text, prompt_wav, gt_text, gt_wav = fields
            elif len(fields) == 4:
                utt, prompt_text, prompt_wav, gt_text = fields
                gt_wav = os.path.join(list_dir, "wavs", utt + ".wav")
            else:
                raise ValueError(f"{metalst}:{line_no}: expected 4 or 5 '|'-separated fields, got {len(fields)}")
            if not os.path.isabs(prompt_wav):
                prompt_wav = os.path.join(list_dir, prompt_wav)
            metainfo.append((utt, prompt_text, prompt_wav, gt_text, gt_wav))
    return metainfo
33
+
34
+
35
+ # librispeech test-clean metainfo: gen_utt, ref_txt, ref_wav, gen_txt, gen_wav
36
def get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path):
    """Read the LibriSpeech cross-sentence list into a list of prompt tuples.

    Every non-blank line holds six tab-separated fields:
    ``ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt``.  Utterance ids
    have the form ``<speaker>-<chapter>-<index>`` and map to
    ``<root>/<speaker>/<chapter>/<utt>.flac``.

    Args:
        metalst: Path of the list file (read as UTF-8).
        librispeech_test_clean_path: Root directory of LibriSpeech test-clean.

    Returns:
        List of ``(gen_utt, ref_txt, ref_wav, " " + gen_txt, gen_wav)`` tuples
        in file order; the text to generate is prefixed with one space.

    Raises:
        ValueError: If a non-blank line does not have six fields or an
            utterance id does not have three ``-``-separated parts.
    """
    metainfo = []
    with open(metalst, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line used to crash the unpacking below.
                continue
            ref_utt, _ref_dur, ref_txt, gen_utt, _gen_dur, gen_txt = line.split("\t")

            # ref_txt = ref_txt[0] + ref_txt[1:].lower() + '.'  # if use librispeech test-clean (no-pc)
            ref_spk_id, ref_chaptr_id, _ = ref_utt.split("-")
            ref_wav = os.path.join(librispeech_test_clean_path, ref_spk_id, ref_chaptr_id, ref_utt + ".flac")

            # gen_txt = gen_txt[0] + gen_txt[1:].lower() + '.'  # if use librispeech test-clean (no-pc)
            gen_spk_id, gen_chaptr_id, _ = gen_utt.split("-")
            gen_wav = os.path.join(librispeech_test_clean_path, gen_spk_id, gen_chaptr_id, gen_utt + ".flac")

            metainfo.append((gen_utt, ref_txt, ref_wav, " " + gen_txt, gen_wav))

    return metainfo
55
+
56
+
57
+ # padded to max length mel batch
58
def padded_mel_batch(ref_mels):
    """Zero-pad mels along their last dimension to a common length and stack them.

    Args:
        ref_mels: Sequence of tensors that differ only in the size of the last dimension.

    Returns:
        The stacked batch with its last two dimensions swapped via ``permute(0, 2, 1)``.
    """
    frame_counts = [mel.shape[-1] for mel in ref_mels]
    longest = torch.LongTensor(frame_counts).amax()
    # Right-pad every mel with zeros up to the longest one, then stack into a batch.
    batch = torch.stack([F.pad(mel, (0, longest - frames), value=0) for mel, frames in zip(ref_mels, frame_counts)])
    return batch.permute(0, 2, 1)
67
+
68
+
69
+ # get prompts from metainfo containing: utt, prompt_text, prompt_wav, gt_text, gt_wav
70
+
71
+
72
def get_inference_prompt(
    metainfo,
    speed=1.0,
    tokenizer="pinyin",
    polyphone=True,
    target_sample_rate=24000,
    n_fft=1024,
    win_length=1024,
    n_mel_channels=100,
    hop_length=256,
    mel_spec_type="vocos",
    target_rms=0.1,
    use_truth_duration=False,
    infer_batch_size=1,
    num_buckets=200,
    min_secs=3,
    max_secs=40,
):
    """Turn test-set metainfo into length-bucketed inference batches.

    For each ``(utt, prompt_text, prompt_wav, gt_text, gt_wav)`` entry the
    prompt audio is loaded, boosted to ``target_rms`` if quieter, resampled if
    needed and converted to a mel spectrogram.  The total length in mel frames
    is taken from the ground-truth audio when ``use_truth_duration`` is set,
    otherwise it is estimated from the UTF-8 byte lengths of the prompt and
    target texts.  Entries are grouped into ``num_buckets`` buckets by total
    length; a bucket is emitted as a batch once its accumulated frame count
    reaches ``infer_batch_size``, and leftovers are emitted at the end.

    Returns:
        A list, shuffled with a fixed seed, of tuples ``(utts, ref_rms_list,
        padded_ref_mels, ref_mel_lens, total_mel_lens, final_text_list)``.

    Raises:
        AssertionError: If a prompt has 5000 samples or fewer, if
            ``infer_batch_size`` is not positive, or if a total length falls
            outside ``[min_secs, max_secs]``.
    """
    frames_lo = min_secs * target_sample_rate // hop_length
    frames_hi = max_secs * target_sample_rate // hop_length

    def fresh_bucket():
        # One list per field of an emitted batch.
        return {"utts": [], "rms": [], "mels": [], "ref_lens": [], "total_lens": [], "texts": []}

    def pack(bucket):
        # Freeze a bucket into the tuple layout consumed by the caller.
        return (
            bucket["utts"],
            bucket["rms"],
            padded_mel_batch(bucket["mels"]),
            bucket["ref_lens"],
            bucket["total_lens"],
            bucket["texts"],
        )

    buckets = [fresh_bucket() for _ in range(num_buckets)]
    frames_pending = [0] * num_buckets
    batches = []

    to_mel = MelSpec(
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        n_mel_channels=n_mel_channels,
        target_sample_rate=target_sample_rate,
        mel_spec_type=mel_spec_type,
    )

    for utt, prompt_text, prompt_wav, gt_text, gt_wav in tqdm(metainfo, desc="Processing prompts..."):
        # Audio
        prompt_audio, prompt_sr = torchaudio.load(prompt_wav)
        prompt_rms = torch.sqrt(torch.mean(torch.square(prompt_audio)))
        if prompt_rms < target_rms:
            prompt_audio = prompt_audio * target_rms / prompt_rms
        assert prompt_audio.shape[-1] > 5000, f"Empty prompt wav: {prompt_wav}, or torchaudio backend issue."
        if prompt_sr != target_sample_rate:
            prompt_audio = torchaudio.transforms.Resample(prompt_sr, target_sample_rate)(prompt_audio)

        # Text: a prompt ending in a single-byte character gets a separating space appended.
        if len(prompt_text[-1].encode("utf-8")) == 1:
            prompt_text = prompt_text + " "
        joined_text = [prompt_text + gt_text]
        texts = convert_char_to_pinyin(joined_text, polyphone=polyphone) if tokenizer == "pinyin" else joined_text

        # to mel spectrogram
        prompt_mel = to_mel(prompt_audio).squeeze(0)

        # Duration, mel frame length
        prompt_frames = prompt_mel.shape[-1]

        if use_truth_duration:
            truth_audio, truth_sr = torchaudio.load(gt_wav)
            if truth_sr != target_sample_rate:
                truth_audio = torchaudio.transforms.Resample(truth_sr, target_sample_rate)(truth_audio)
            total_frames = prompt_frames + int(truth_audio.shape[-1] / hop_length / speed)
        else:
            # Scale the prompt's frames-per-byte rate to the byte length of the target text.
            prompt_bytes = len(prompt_text.encode("utf-8"))
            target_bytes = len(gt_text.encode("utf-8"))
            total_frames = prompt_frames + int(prompt_frames / prompt_bytes * target_bytes / speed)

        # deal with batch
        assert infer_batch_size > 0, "infer_batch_size should be greater than 0."
        assert frames_lo <= total_frames <= frames_hi, (
            f"Audio {utt} has duration {total_frames * hop_length // target_sample_rate}s out of range [{min_secs}, {max_secs}]."
        )
        slot = math.floor((total_frames - frames_lo) / (frames_hi - frames_lo + 1) * num_buckets)

        bucket = buckets[slot]
        bucket["utts"].append(utt)
        bucket["rms"].append(prompt_rms)
        bucket["mels"].append(prompt_mel)
        bucket["ref_lens"].append(prompt_frames)
        bucket["total_lens"].append(total_frames)
        bucket["texts"].extend(texts)

        frames_pending[slot] += total_frames

        if frames_pending[slot] >= infer_batch_size:
            batches.append(pack(bucket))
            frames_pending[slot] = 0
            buckets[slot] = fresh_bucket()

    # add residual
    for slot, pending in enumerate(frames_pending):
        if pending > 0:
            batches.append(pack(buckets[slot]))

    # not only leave easy work for last workers
    random.seed(666)
    random.shuffle(batches)

    return batches
206
+
207
+
208
+ # get wav_res_ref_text of seed-tts test metalst
209
+ # https://github.com/BytedanceSpeech/seed-tts-eval
210
+
211
+
212
def get_seed_tts_test(metalst, gen_wav_dir, gpus):
    """Build per-GPU evaluation shards for the Seed-TTS test set.

    Reads the ``|``-separated list file (4 or 5 fields per line, see
    ``utt|prompt_text|prompt_wav|gt_text[|gt_wav]``), keeps the entries whose
    generated wav ``<gen_wav_dir>/<utt>.wav`` exists and splits them into
    consecutive chunks, one per entry of ``gpus``.

    Args:
        metalst: Path of the list file (read as UTF-8).
        gen_wav_dir: Directory that holds the generated ``<utt>.wav`` files.
        gpus: List of GPU ids; one shard is produced per id.

    Returns:
        List of ``(gpu_id, [(gen_wav, prompt_wav, gt_text), ...])`` pairs.

    Raises:
        ValueError: If a non-blank line has neither 4 nor 5 fields.
    """
    with open(metalst, encoding="utf-8") as f:
        lines = f.readlines()

    list_dir = os.path.dirname(metalst)
    test_set_ = []
    for line_no, line in enumerate(tqdm(lines), start=1):
        line = line.strip()
        if not line:
            # Blank lines used to silently reuse the previous line's fields.
            continue
        fields = line.split("|")
        if len(fields) not in (4, 5):
            raise ValueError(f"{metalst}:{line_no}: expected 4 or 5 '|'-separated fields, got {len(fields)}")
        # Only the utterance id, the prompt wav and the target text are needed here.
        utt, _prompt_text, prompt_wav, gt_text = fields[:4]

        gen_wav = os.path.join(gen_wav_dir, utt + ".wav")
        if not os.path.exists(gen_wav):
            continue
        if not os.path.isabs(prompt_wav):
            prompt_wav = os.path.join(list_dir, prompt_wav)

        test_set_.append((gen_wav, prompt_wav, gt_text))

    num_jobs = len(gpus)
    if num_jobs == 1:
        return [(gpus[0], test_set_)]

    # Chunk size is floor(n / jobs) + 1, so trailing shards may be shorter or empty.
    wav_per_job = len(test_set_) // num_jobs + 1
    test_set = []
    for i in range(num_jobs):
        test_set.append((gpus[i], test_set_[i * wav_per_job : (i + 1) * wav_per_job]))

    return test_set
242
+
243
+
244
+ # get librispeech test-clean cross sentence test
245
+
246
+
247
def get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth=False):
    """Build per-GPU evaluation shards for the LibriSpeech cross-sentence test.

    Every non-blank line of the list holds six tab-separated fields:
    ``ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt``.

    Args:
        metalst: Path of the list file (read as UTF-8).
        gen_wav_dir: Directory that holds the generated ``<gen_utt>.wav`` files.
        gpus: List of GPU ids; one shard is produced per id.
        librispeech_test_clean_path: Root directory of LibriSpeech test-clean.
        eval_ground_truth: When true, evaluate the original ``.flac`` recording
            of ``gen_utt`` instead of the generated wav.

    Returns:
        List of ``(gpu_id, [(gen_wav, ref_wav, gen_txt), ...])`` pairs.

    Raises:
        FileNotFoundError: If a generated wav is missing (only when
            ``eval_ground_truth`` is false).
        ValueError: If a non-blank line does not have six fields.
    """
    with open(metalst, encoding="utf-8") as f:
        lines = f.readlines()

    test_set_ = []
    for line in tqdm(lines):
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line used to crash the unpacking below.
            continue
        ref_utt, _ref_dur, _ref_txt, gen_utt, _gen_dur, gen_txt = line.split("\t")

        if eval_ground_truth:
            gen_spk_id, gen_chaptr_id, _ = gen_utt.split("-")
            gen_wav = os.path.join(librispeech_test_clean_path, gen_spk_id, gen_chaptr_id, gen_utt + ".flac")
        else:
            gen_wav = os.path.join(gen_wav_dir, gen_utt + ".wav")
            if not os.path.exists(gen_wav):
                raise FileNotFoundError(f"Generated wav not found: {gen_utt}")

        ref_spk_id, ref_chaptr_id, _ = ref_utt.split("-")
        ref_wav = os.path.join(librispeech_test_clean_path, ref_spk_id, ref_chaptr_id, ref_utt + ".flac")

        test_set_.append((gen_wav, ref_wav, gen_txt))

    num_jobs = len(gpus)
    if num_jobs == 1:
        return [(gpus[0], test_set_)]

    # Chunk size is floor(n / jobs) + 1, so trailing shards may be shorter or empty.
    wav_per_job = len(test_set_) // num_jobs + 1
    test_set = []
    for i in range(num_jobs):
        test_set.append((gpus[i], test_set_[i * wav_per_job : (i + 1) * wav_per_job]))

    return test_set
279
+
280
+
281
+ # load asr model
282
+
283
+
284
def load_asr_model(lang, ckpt_dir=""):
    """Load the ASR model used for WER evaluation.

    Args:
        lang: ``"zh"`` loads funasr ``paraformer-zh``; ``"en"`` loads
            faster-whisper ``large-v3``.
        ckpt_dir: Optional local checkpoint location.  For ``"zh"`` it is the
            directory containing ``paraformer-zh``; for ``"en"`` it replaces
            the model name when non-empty.

    Returns:
        The loaded model object.

    Raises:
        NotImplementedError: If ``lang`` is neither ``"zh"`` nor ``"en"``
            (previously this surfaced as an UnboundLocalError).
    """
    if lang not in ("zh", "en"):
        raise NotImplementedError(
            "lang support only 'zh' (funasr paraformer-zh), 'en' (faster-whisper-large-v3), for now."
        )

    if lang == "zh":
        from funasr import AutoModel

        model = AutoModel(
            model=os.path.join(ckpt_dir, "paraformer-zh"),
            # vad_model = os.path.join(ckpt_dir, "fsmn-vad"),
            # punc_model = os.path.join(ckpt_dir, "ct-punc"),
            # spk_model = os.path.join(ckpt_dir, "cam++"),
            disable_update=True,
        )  # following seed-tts setting
    else:
        from faster_whisper import WhisperModel

        model_size = "large-v3" if ckpt_dir == "" else ckpt_dir
        model = WhisperModel(model_size, device="cuda", compute_type="float16")
    return model
301
+
302
+
303
+ # WER Evaluation, the way Seed-TTS does
304
+
305
+
306
def run_asr_wer(args):
    """Transcribe generated audio on one GPU and compute per-utterance WER.

    Args:
        args: Tuple ``(rank, lang, test_set, ckpt_dir)``; ``rank`` is the GPU
            id, ``lang`` is ``"zh"`` or ``"en"``, ``test_set`` is a list of
            ``(gen_wav, prompt_wav, truth)`` and ``ckpt_dir`` is forwarded to
            ``load_asr_model``.

    Returns:
        List of dicts with keys ``"wav"`` (file stem), ``"truth"`` and
        ``"hypo"`` (texts before normalisation) and ``"wer"``.

    Raises:
        NotImplementedError: If ``lang`` is neither ``"zh"`` nor ``"en"``.
    """
    rank, lang, test_set, ckpt_dir = args

    if lang == "zh":
        import zhconv

        torch.cuda.set_device(rank)
    elif lang == "en":
        os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
    else:
        raise NotImplementedError(
            "lang support only 'zh' (funasr paraformer-zh), 'en' (faster-whisper-large-v3), for now."
        )

    asr_model = load_asr_model(lang, ckpt_dir=ckpt_dir)

    from zhon.hanzi import punctuation

    # One translation table deletes every Chinese and ASCII punctuation character in a single pass.
    strip_punctuation = str.maketrans("", "", punctuation + string.punctuation)
    wer_results = []

    # NOTE(review): compute_measures appears to be deprecated in newer jiwer releases — confirm the pinned version.
    from jiwer import compute_measures

    for gen_wav, prompt_wav, truth in tqdm(test_set):
        if lang == "zh":
            res = asr_model.generate(input=gen_wav, batch_size_s=300, disable_pbar=True)
            hypo = res[0]["text"]
            hypo = zhconv.convert(hypo, "zh-cn")
        elif lang == "en":
            segments, _ = asr_model.transcribe(gen_wav, beam_size=5, language="en")
            hypo = "".join(" " + segment.text for segment in segments)

        raw_truth = truth
        raw_hypo = hypo

        truth = truth.translate(strip_punctuation)
        hypo = hypo.translate(strip_punctuation)

        # Collapse doubled spaces left behind by the removed punctuation.
        truth = truth.replace("  ", " ")
        hypo = hypo.replace("  ", " ")

        if lang == "zh":
            # Character-level scoring: put a space between every character.
            truth = " ".join([x for x in truth])
            hypo = " ".join([x for x in hypo])
        elif lang == "en":
            truth = truth.lower()
            hypo = hypo.lower()

        measures = compute_measures(truth, hypo)
        wer = measures["wer"]

        # ref_list = truth.split(" ")
        # subs = measures["substitutions"] / len(ref_list)
        # dele = measures["deletions"] / len(ref_list)
        # inse = measures["insertions"] / len(ref_list)

        wer_results.append(
            {
                "wav": Path(gen_wav).stem,
                "truth": raw_truth,
                "hypo": raw_hypo,
                "wer": wer,
            }
        )

    return wer_results
375
+
376
+
377
+ # SIM Evaluation
378
+
379
+
380
def run_sim(args):
    """Compute speaker similarity between generated and prompt audio on one GPU.

    Args:
        args: Tuple ``(rank, test_set, ckpt_dir)``; ``rank`` is the GPU id,
            ``test_set`` is a list of ``(gen_wav, prompt_wav, truth)`` and
            ``ckpt_dir`` is the path of the speaker-verification checkpoint.

    Returns:
        List of dicts with keys ``"wav"`` (file stem of the generated audio)
        and ``"sim"`` (cosine similarity of the two embeddings).
    """
    rank, test_set, ckpt_dir = args
    device = f"cuda:{rank}"

    model = ECAPA_TDNN_SMALL(feat_dim=1024, feat_type="wavlm_large", config_path=None)
    # Keep the loaded tensors on the storage they were deserialised to; the model is moved afterwards.
    checkpoint = torch.load(ckpt_dir, weights_only=True, map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["model"], strict=False)

    on_gpu = bool(torch.cuda.is_available())
    if on_gpu:
        model = model.cuda(device)
    model.eval()

    def load_16k(path):
        # Load an audio file, resample it to 16 kHz and move it to the GPU when one is used.
        audio, rate = torchaudio.load(path)
        audio = torchaudio.transforms.Resample(orig_freq=rate, new_freq=16000)(audio)
        return audio.cuda(device) if on_gpu else audio

    sim_results = []
    for gen_wav, prompt_wav, truth in tqdm(test_set):
        generated = load_16k(gen_wav)
        reference = load_16k(prompt_wav)

        with torch.no_grad():
            generated_emb = model(generated)
            reference_emb = model(reference)

        score = F.cosine_similarity(generated_emb, reference_emb)[0].item()
        # print(f"VSim score between two audios: {sim:.4f} (-1.0, 1.0).")
        sim_results.append({"wav": Path(gen_wav).stem, "sim": score})

    return sim_results
F5-TTS/src/f5_tts/infer/README.md ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Inference
2
+
3
+ The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or will be automatically downloaded when running inference scripts.
4
+
5
+ **More checkpoints with whole community efforts can be found in [SHARED.md](SHARED.md), supporting more languages.**
6
+
7
+ Currently a **single generation supports up to 30s**, which is the **total length** (same logic if `fix_duration` is set) including both prompt and output audio. However, `infer_cli` and `infer_gradio` will automatically do chunked generation for longer text. Long reference audio will be **clipped to ~12s**.
8
+
9
+ To avoid possible inference failures, make sure you have read through the following instructions.
10
+
11
+ - Use reference audio <12s and leave proper silence space (e.g. 1s) at the end. Otherwise there is a risk of truncating in the middle of a word, leading to suboptimal generation.
12
+ - <ins>Uppercased letters</ins> (best with form like K.F.C.) will be uttered letter by letter, and lowercased letters used for common words.
13
+ - Add some spaces (blank: " ") or punctuations (e.g. "," ".") <ins>to explicitly introduce some pauses</ins>.
14
+ - If English punctuation marks the end of a sentence, make sure there is a space " " after it. Otherwise it is not regarded as a sentence boundary when chunking.
15
+ - <ins>Preprocess numbers</ins> to Chinese characters if you want to have them read in Chinese; otherwise they will be read in English.
16
+ - If the generation output is blank (pure silence), <ins>check for FFmpeg installation</ins>.
17
+ - Try <ins>turning off `use_ema` if using an early-stage</ins> finetuned checkpoint (one that has gone through just a few updates).
18
+
19
+
20
+ ## Gradio App
21
+
22
+ Currently supported features:
23
+
24
+ - Basic TTS with Chunk Inference
25
+ - Multi-Style / Multi-Speaker Generation
26
+ - Voice Chat powered by Qwen2.5-3B-Instruct
27
+ - [Custom inference with more language support](SHARED.md)
28
+
29
+ The cli command `f5-tts_infer-gradio` is equivalent to `python src/f5_tts/infer/infer_gradio.py`, which launches a Gradio APP (web interface) for inference.
30
+
31
+ The script will load model checkpoints from Huggingface. You can also manually download files and update the path to `load_model()` in `infer_gradio.py`. Currently only the TTS models are loaded at first; the ASR model is loaded to do transcription if `ref_text` is not provided, and the LLM model is loaded if Voice Chat is used.
32
+
33
+ More flag options:
34
+
35
+ ```bash
36
+ # Automatically launch the interface in the default web browser
37
+ f5-tts_infer-gradio --inbrowser
38
+
39
+ # Set the root path of the application, if it's not served from the root ("/") of the domain
40
+ # For example, if the application is served at "https://example.com/myapp"
41
+ f5-tts_infer-gradio --root_path "/myapp"
42
+ ```
43
+
44
+ It can also be used as a component of a larger application:
45
+ ```python
46
+ import gradio as gr
47
+ from f5_tts.infer.infer_gradio import app
48
+
49
+ with gr.Blocks() as main_app:
50
+ gr.Markdown("# This is an example of using F5-TTS within a bigger Gradio app")
51
+
52
+ # ... other Gradio components
53
+
54
+ app.render()
55
+
56
+ main_app.launch()
57
+ ```
58
+
59
+
60
+ ## CLI Inference
61
+
62
+ The cli command `f5-tts_infer-cli` is equivalent to `python src/f5_tts/infer/infer_cli.py`, which is a command line tool for inference.
63
+
64
+ The script will load model checkpoints from Huggingface. You can also manually download files and use `--ckpt_file` to specify the model you want to load, or directly update in `infer_cli.py`.
65
+
66
+ To change the vocabulary, use `--vocab_file` to provide your `vocab.txt` file.
67
+
68
+ Basically you can run inference with flags:
69
+ ```bash
70
+ # Leave --ref_text "" will have ASR model transcribe (extra GPU memory usage)
71
+ f5-tts_infer-cli \
72
+ --model F5TTS_v1_Base \
73
+ --ref_audio "ref_audio.wav" \
74
+ --ref_text "The content, subtitle or transcription of reference audio." \
75
+ --gen_text "Some text you want TTS model generate for you."
76
+
77
+ # Use BigVGAN as vocoder. Currently only support F5TTS_Base.
78
+ f5-tts_infer-cli --model F5TTS_Base --vocoder_name bigvgan --load_vocoder_from_local
79
+
80
+ # Use custom path checkpoint, e.g.
81
+ f5-tts_infer-cli --ckpt_file ckpts/F5TTS_v1_Base/model_1250000.safetensors
82
+
83
+ # More instructions
84
+ f5-tts_infer-cli --help
85
+ ```
86
+
87
+ And a `.toml` file would help with more flexible usage.
88
+
89
+ ```bash
90
+ f5-tts_infer-cli -c custom.toml
91
+ ```
92
+
93
+ For example, you can use `.toml` to pass in variables, refer to `src/f5_tts/infer/examples/basic/basic.toml`:
94
+
95
+ ```toml
96
+ # F5TTS_v1_Base | E2TTS_Base
97
+ model = "F5TTS_v1_Base"
98
+ ref_audio = "infer/examples/basic/basic_ref_en.wav"
99
+ # If an empty "", transcribes the reference audio automatically.
100
+ ref_text = "Some call me nature, others call me mother nature."
101
+ gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
102
+ # File with text to generate. Ignores the text above.
103
+ gen_file = ""
104
+ remove_silence = false
105
+ output_dir = "tests"
106
+ ```
107
+
108
+ You can also leverage `.toml` file to do multi-style generation, refer to `src/f5_tts/infer/examples/multi/story.toml`.
109
+
110
+ ```toml
111
+ # F5TTS_v1_Base | E2TTS_Base
112
+ model = "F5TTS_v1_Base"
113
+ ref_audio = "infer/examples/multi/main.flac"
114
+ # If an empty "", transcribes the reference audio automatically.
115
+ ref_text = ""
116
+ gen_text = ""
117
+ # File with text to generate. Ignores the text above.
118
+ gen_file = "infer/examples/multi/story.txt"
119
+ remove_silence = true
120
+ output_dir = "tests"
121
+
122
+ [voices.town]
123
+ ref_audio = "infer/examples/multi/town.flac"
124
+ ref_text = ""
125
+
126
+ [voices.country]
127
+ ref_audio = "infer/examples/multi/country.flac"
128
+ ref_text = ""
129
+ ```
130
+ You should mark the voice with `[main]` `[town]` `[country]` whenever you want to change voice, refer to `src/f5_tts/infer/examples/multi/story.txt`.
131
+
132
+ ## API Usage
133
+
134
+ ```python
135
+ from importlib.resources import files
136
+ from f5_tts.api import F5TTS
137
+
138
+ f5tts = F5TTS()
139
+ wav, sr, spec = f5tts.infer(
140
+ ref_file=str(files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav")),
141
+ ref_text="some call me nature, others call me mother nature.",
142
+ gen_text="""I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences.""",
143
+ file_wave=str(files("f5_tts").joinpath("../../tests/api_out.wav")),
144
+ file_spec=str(files("f5_tts").joinpath("../../tests/api_out.png")),
145
+ seed=None,
146
+ )
147
+ ```
148
+ Check [api.py](../api.py) for more details.
149
+
150
+ ## TensorRT-LLM Deployment
151
+
152
+ See [detailed instructions](../runtime/triton_trtllm/README.md) for more information.
153
+
154
+ ## Socket Real-time Service
155
+
156
+ Real-time voice output with chunk stream:
157
+
158
+ ```bash
159
+ # Start socket server
160
+ python src/f5_tts/socket_server.py
161
+
162
+ # If PyAudio not installed
163
+ sudo apt-get install portaudio19-dev
164
+ pip install pyaudio
165
+
166
+ # Communicate with socket client
167
+ python src/f5_tts/socket_client.py
168
+ ```
169
+
170
+ ## Speech Editing
171
+
172
+ To test speech editing capabilities, use the following command:
173
+
174
+ ```bash
175
+ python src/f5_tts/infer/speech_edit.py
176
+ ```
177
+
F5-TTS/src/f5_tts/infer/SHARED.md ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- omit in toc -->
2
+ # Shared Model Cards
3
+
4
+ <!-- omit in toc -->
5
+ ### **Prerequisites of using**
6
+ - This document serves as a quick lookup table for the community training/finetuning results, with support for various languages.
7
+ - The models in this repository are open source and are based on voluntary contributions from contributors.
8
+ - The use of models must be conditioned on respect for the respective creators. The convenience brought comes from their efforts.
9
+
10
+ <!-- omit in toc -->
11
+ ### **Welcome to share here**
12
+ - Have a pretrained/finetuned result: model checkpoint (pruned best to facilitate inference, i.e. leave only `ema_model_state_dict`) and corresponding vocab file (for tokenization).
13
+ - Host a public [huggingface model repository](https://huggingface.co/new) and upload the model related files.
14
+ - Make a pull request adding a model card to the current page, i.e. `src/f5_tts/infer/SHARED.md`.
15
+
16
+ <!-- omit in toc -->
17
+ ### Supported Languages
18
+ - [Multilingual](#multilingual)
19
+ - [F5-TTS v1 v0 Base @ zh \& en @ F5-TTS](#f5-tts-v1-v0-base--zh--en--f5-tts)
20
+ - [English](#english)
21
+ - [Finnish](#finnish)
22
+ - [F5-TTS Base @ fi @ AsmoKoskinen](#f5-tts-base--fi--asmokoskinen)
23
+ - [French](#french)
24
+ - [F5-TTS Base @ fr @ RASPIAUDIO](#f5-tts-base--fr--raspiaudio)
25
+ - [German](#german)
26
+ - [F5-TTS Base @ de @ hvoss-techfak](#f5-tts-base--de--hvoss-techfak)
27
+ - [Hindi](#hindi)
28
+ - [F5-TTS Small @ hi @ SPRINGLab](#f5-tts-small--hi--springlab)
29
+ - [Italian](#italian)
30
+ - [F5-TTS Base @ it @ alien79](#f5-tts-base--it--alien79)
31
+ - [Japanese](#japanese)
32
+ - [F5-TTS Base @ ja @ Jmica](#f5-tts-base--ja--jmica)
33
+ - [Mandarin](#mandarin)
34
+ - [Russian](#russian)
35
+ - [F5-TTS Base @ ru @ HotDro4illa](#f5-tts-base--ru--hotdro4illa)
36
+ - [Spanish](#spanish)
37
+ - [F5-TTS Base @ es @ jpgallegoar](#f5-tts-base--es--jpgallegoar)
38
+
39
+
40
+ ## Multilingual
41
+
42
+ #### F5-TTS v1 v0 Base @ zh & en @ F5-TTS
43
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
44
+ |:---:|:------------:|:-----------:|:-------------:|
45
+ |F5-TTS v1 Base|[ckpt & vocab](https://huggingface.co/SWivid/F5-TTS/tree/main/F5TTS_v1_Base)|[Emilia 95K zh&en](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07)|cc-by-nc-4.0|
46
+
47
+ ```bash
48
+ Model: hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors
49
+ # A Variant Model: hf://SWivid/F5-TTS/F5TTS_v1_Base_no_zero_init/model_1250000.safetensors
50
+ Vocab: hf://SWivid/F5-TTS/F5TTS_v1_Base/vocab.txt
51
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
52
+ ```
53
+
54
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
55
+ |:---:|:------------:|:-----------:|:-------------:|
56
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/SWivid/F5-TTS/tree/main/F5TTS_Base)|[Emilia 95K zh&en](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07)|cc-by-nc-4.0|
57
+
58
+ ```bash
59
+ Model: hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors
60
+ Vocab: hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt
61
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
62
+ ```
63
+
64
+ *Other infos, e.g. Author info, Github repo, Link to some sampled results, Usage instruction, Tutorial (Blog, Video, etc.) ...*
65
+
66
+
67
+ ## English
68
+
69
+
70
+ ## Finnish
71
+
72
+ #### F5-TTS Base @ fi @ AsmoKoskinen
73
+ |Model|🤗Hugging Face|Data|Model License|
74
+ |:---:|:------------:|:-----------:|:-------------:|
75
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/AsmoKoskinen/F5-TTS_Finnish_Model)|[Common Voice](https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0), [Vox Populi](https://huggingface.co/datasets/facebook/voxpopuli)|cc-by-nc-4.0|
76
+
77
+ ```bash
78
+ Model: hf://AsmoKoskinen/F5-TTS_Finnish_Model/model_common_voice_fi_vox_populi_fi_20241206.safetensors
79
+ Vocab: hf://AsmoKoskinen/F5-TTS_Finnish_Model/vocab.txt
80
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
81
+ ```
82
+
83
+
84
+ ## French
85
+
86
+ #### F5-TTS Base @ fr @ RASPIAUDIO
87
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
88
+ |:---:|:------------:|:-----------:|:-------------:|
89
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/RASPIAUDIO/F5-French-MixedSpeakers-reduced)|[LibriVox](https://librivox.org/)|cc-by-nc-4.0|
90
+
91
+ ```bash
92
+ Model: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/model_last_reduced.pt
93
+ Vocab: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/vocab.txt
94
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
95
+ ```
96
+
97
+ - [Online Inference with Hugging Face Space](https://huggingface.co/spaces/RASPIAUDIO/f5-tts_french).
98
+ - [Tutorial video to train a new language model](https://www.youtube.com/watch?v=UO4usaOojys).
99
+ - [Discussion about this training can be found here](https://github.com/SWivid/F5-TTS/issues/434).
100
+
101
+
102
+ ## German
103
+
104
+ #### F5-TTS Base @ de @ hvoss-techfak
105
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
106
+ |:---:|:------------:|:-----------:|:-------------:|
107
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/hvoss-techfak/F5-TTS-German)|[Mozilla Common Voice 19.0](https://commonvoice.mozilla.org/en/datasets) & 800 hours Crowdsourced |cc-by-nc-4.0|
108
+
109
+ ```bash
110
+ Model: hf://hvoss-techfak/F5-TTS-German/model_f5tts_german.pt
111
+ Vocab: hf://hvoss-techfak/F5-TTS-German/vocab.txt
112
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
113
+ ```
114
+
115
+ - Finetuned by [@hvoss-techfak](https://github.com/hvoss-techfak)
116
+
117
+
118
+ ## Hindi
119
+
120
+ #### F5-TTS Small @ hi @ SPRINGLab
121
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
122
+ |:---:|:------------:|:-----------:|:-------------:|
123
+ |F5-TTS Small|[ckpt & vocab](https://huggingface.co/SPRINGLab/F5-Hindi-24KHz)|[IndicTTS Hi](https://huggingface.co/datasets/SPRINGLab/IndicTTS-Hindi) & [IndicVoices-R Hi](https://huggingface.co/datasets/SPRINGLab/IndicVoices-R_Hindi) |cc-by-4.0|
124
+
125
+ ```bash
126
+ Model: hf://SPRINGLab/F5-Hindi-24KHz/model_2500000.safetensors
127
+ Vocab: hf://SPRINGLab/F5-Hindi-24KHz/vocab.txt
128
+ Config: {"dim": 768, "depth": 18, "heads": 12, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
129
+ ```
130
+
131
+ - Authors: SPRING Lab, Indian Institute of Technology, Madras
132
+ - Website: https://asr.iitm.ac.in/
133
+
134
+
135
+ ## Italian
136
+
137
+ #### F5-TTS Base @ it @ alien79
138
+ |Model|🤗Hugging Face|Data|Model License|
139
+ |:---:|:------------:|:-----------:|:-------------:|
140
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/alien79/F5-TTS-italian)|[ylacombe/cml-tts](https://huggingface.co/datasets/ylacombe/cml-tts) |cc-by-nc-4.0|
141
+
142
+ ```bash
143
+ Model: hf://alien79/F5-TTS-italian/model_159600.safetensors
144
+ Vocab: hf://alien79/F5-TTS-italian/vocab.txt
145
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
146
+ ```
147
+
148
+ - Trained by [Mithril Man](https://github.com/MithrilMan)
149
+ - Model details on [hf project home](https://huggingface.co/alien79/F5-TTS-italian)
150
+ - Open to collaborations to further improve the model
151
+
152
+
153
+ ## Japanese
154
+
155
+ #### F5-TTS Base @ ja @ Jmica
156
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
157
+ |:---:|:------------:|:-----------:|:-------------:|
158
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/Jmica/F5TTS/tree/main/JA_21999120)|[Emilia 1.7k JA](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07) & [Galgame Dataset 5.4k](https://huggingface.co/datasets/OOPPEENN/Galgame_Dataset)|cc-by-nc-4.0|
159
+
160
+ ```bash
161
+ Model: hf://Jmica/F5TTS/JA_21999120/model_21999120.pt
162
+ Vocab: hf://Jmica/F5TTS/JA_21999120/vocab_japanese.txt
163
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
164
+ ```
165
+
166
+
167
+ ## Mandarin
168
+
169
+
170
+ ## Russian
171
+
172
+ #### F5-TTS Base @ ru @ HotDro4illa
173
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
174
+ |:---:|:------------:|:-----------:|:-------------:|
175
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/hotstone228/F5-TTS-Russian)|[Common voice](https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0)|cc-by-nc-4.0|
176
+
177
+ ```bash
178
+ Model: hf://hotstone228/F5-TTS-Russian/model_last.safetensors
179
+ Vocab: hf://hotstone228/F5-TTS-Russian/vocab.txt
180
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
181
+ ```
182
+ - Finetuned by [HotDro4illa](https://github.com/HotDro4illa)
183
+ - Any improvements are welcome
184
+
185
+
186
+ ## Spanish
187
+
188
+ #### F5-TTS Base @ es @ jpgallegoar
189
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
190
+ |:---:|:------------:|:-----------:|:-------------:|
191
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/jpgallegoar/F5-Spanish)|[Voxpopuli](https://huggingface.co/datasets/facebook/voxpopuli) & Crowdsourced & TEDx, 218 hours|cc0-1.0|
192
+
193
+ - @jpgallegoar [GitHub repo](https://github.com/jpgallegoar/Spanish-F5), Jupyter Notebook and Gradio usage for Spanish model.
F5-TTS/src/f5_tts/infer/__pycache__/utils_infer.cpython-311.pyc ADDED
Binary file (24.5 kB). View file
 
F5-TTS/src/f5_tts/infer/examples/basic/basic.toml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # F5TTS_v1_Base | E2TTS_Base
2
+ model = "F5TTS_v1_Base"
3
+ ref_audio = "infer/examples/basic/basic_ref_en.wav"
4
+ # If an empty "", transcribes the reference audio automatically.
5
+ ref_text = "Some call me nature, others call me mother nature."
6
+ gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
7
+ # File with text to generate. Ignores the text above.
8
+ gen_file = ""
9
+ remove_silence = false
10
+ output_dir = "tests"
11
+ output_file = "infer_cli_basic.wav"
F5-TTS/src/f5_tts/infer/examples/basic/basic_ref_en.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0e22048e72414fcc1e6b6342e47a774d748a195ed34e4a5b3fcf416707f2b71
3
+ size 256018
F5-TTS/src/f5_tts/infer/examples/basic/basic_ref_zh.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96724a113240d1f82c6ded1334122f0176b96c9226ccd3c919e625bcfd2a3ede
3
+ size 324558
F5-TTS/src/f5_tts/infer/examples/multi/country.flac ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb15708b4b3875e37beec46591a5d89e1a9a63fdad3b8fe4a5c8738f4f554400
3
+ size 180321
F5-TTS/src/f5_tts/infer/examples/multi/main.flac ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4abb1107771ce7e14926fde879b959dde6db6e572476b98684f04e45e978ab19
3
+ size 279219
F5-TTS/src/f5_tts/infer/examples/multi/story.toml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # F5TTS_v1_Base | E2TTS_Base
2
+ model = "F5TTS_v1_Base"
3
+ ref_audio = "infer/examples/multi/main.flac"
4
+ # If an empty "", transcribes the reference audio automatically.
5
+ ref_text = ""
6
+ gen_text = ""
7
+ # File with text to generate. Ignores the text above.
8
+ gen_file = "infer/examples/multi/story.txt"
9
+ remove_silence = true
10
+ output_dir = "tests"
11
+ output_file = "infer_cli_story.wav"
12
+
13
+ [voices.town]
14
+ ref_audio = "infer/examples/multi/town.flac"
15
+ ref_text = ""
16
+ speed = 0.8 # will ignore global speed
17
+
18
+ [voices.country]
19
+ ref_audio = "infer/examples/multi/country.flac"
20
+ ref_text = ""
F5-TTS/src/f5_tts/infer/examples/multi/story.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ A Town Mouse and a Country Mouse were acquaintances, and the Country Mouse one day invited his friend to come and see him at his home in the fields. The Town Mouse came, and they sat down to a dinner of barleycorns and roots, the latter of which had a distinctly earthy flavour. The fare was not much to the taste of the guest, and presently he broke out with [town] "My poor dear friend, you live here no better than the ants! Now, you should just see how I fare! My larder is a regular horn of plenty. You must come and stay with me, and I promise you you shall live on the fat of the land." [main] So when he returned to town he took the Country Mouse with him, and showed him into a larder containing flour and oatmeal and figs and honey and dates. The Country Mouse had never seen anything like it, and sat down to enjoy the luxuries his friend provided: but before they had well begun, the door of the larder opened and someone came in. The two Mice scampered off and hid themselves in a narrow and exceedingly uncomfortable hole. Presently, when all was quiet, they ventured out again; but someone else came in, and off they scuttled again. This was too much for the visitor. [country] "Goodbye," [main] said he, [country] "I'm off. You live in the lap of luxury, I can see, but you are surrounded by dangers; whereas at home I can enjoy my simple dinner of roots and corn in peace."
F5-TTS/src/f5_tts/infer/examples/multi/town.flac ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7d069b8ebd5180c3b30fde5d378f0a1ddac96722d62cf43537efc3c3f3a3ce8
3
+ size 229383
F5-TTS/src/f5_tts/infer/examples/vocab.txt ADDED
@@ -0,0 +1,2545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ !
3
+ "
4
+ #
5
+ $
6
+ %
7
+ &
8
+ '
9
+ (
10
+ )
11
+ *
12
+ +
13
+ ,
14
+ -
15
+ .
16
+ /
17
+ 0
18
+ 1
19
+ 2
20
+ 3
21
+ 4
22
+ 5
23
+ 6
24
+ 7
25
+ 8
26
+ 9
27
+ :
28
+ ;
29
+ =
30
+ >
31
+ ?
32
+ @
33
+ A
34
+ B
35
+ C
36
+ D
37
+ E
38
+ F
39
+ G
40
+ H
41
+ I
42
+ J
43
+ K
44
+ L
45
+ M
46
+ N
47
+ O
48
+ P
49
+ Q
50
+ R
51
+ S
52
+ T
53
+ U
54
+ V
55
+ W
56
+ X
57
+ Y
58
+ Z
59
+ [
60
+ \
61
+ ]
62
+ _
63
+ a
64
+ a1
65
+ ai1
66
+ ai2
67
+ ai3
68
+ ai4
69
+ an1
70
+ an3
71
+ an4
72
+ ang1
73
+ ang2
74
+ ang4
75
+ ao1
76
+ ao2
77
+ ao3
78
+ ao4
79
+ b
80
+ ba
81
+ ba1
82
+ ba2
83
+ ba3
84
+ ba4
85
+ bai1
86
+ bai2
87
+ bai3
88
+ bai4
89
+ ban1
90
+ ban2
91
+ ban3
92
+ ban4
93
+ bang1
94
+ bang2
95
+ bang3
96
+ bang4
97
+ bao1
98
+ bao2
99
+ bao3
100
+ bao4
101
+ bei
102
+ bei1
103
+ bei2
104
+ bei3
105
+ bei4
106
+ ben1
107
+ ben2
108
+ ben3
109
+ ben4
110
+ beng
111
+ beng1
112
+ beng2
113
+ beng3
114
+ beng4
115
+ bi1
116
+ bi2
117
+ bi3
118
+ bi4
119
+ bian1
120
+ bian2
121
+ bian3
122
+ bian4
123
+ biao1
124
+ biao2
125
+ biao3
126
+ bie1
127
+ bie2
128
+ bie3
129
+ bie4
130
+ bin1
131
+ bin4
132
+ bing1
133
+ bing2
134
+ bing3
135
+ bing4
136
+ bo
137
+ bo1
138
+ bo2
139
+ bo3
140
+ bo4
141
+ bu2
142
+ bu3
143
+ bu4
144
+ c
145
+ ca1
146
+ cai1
147
+ cai2
148
+ cai3
149
+ cai4
150
+ can1
151
+ can2
152
+ can3
153
+ can4
154
+ cang1
155
+ cang2
156
+ cao1
157
+ cao2
158
+ cao3
159
+ ce4
160
+ cen1
161
+ cen2
162
+ ceng1
163
+ ceng2
164
+ ceng4
165
+ cha1
166
+ cha2
167
+ cha3
168
+ cha4
169
+ chai1
170
+ chai2
171
+ chan1
172
+ chan2
173
+ chan3
174
+ chan4
175
+ chang1
176
+ chang2
177
+ chang3
178
+ chang4
179
+ chao1
180
+ chao2
181
+ chao3
182
+ che1
183
+ che2
184
+ che3
185
+ che4
186
+ chen1
187
+ chen2
188
+ chen3
189
+ chen4
190
+ cheng1
191
+ cheng2
192
+ cheng3
193
+ cheng4
194
+ chi1
195
+ chi2
196
+ chi3
197
+ chi4
198
+ chong1
199
+ chong2
200
+ chong3
201
+ chong4
202
+ chou1
203
+ chou2
204
+ chou3
205
+ chou4
206
+ chu1
207
+ chu2
208
+ chu3
209
+ chu4
210
+ chua1
211
+ chuai1
212
+ chuai2
213
+ chuai3
214
+ chuai4
215
+ chuan1
216
+ chuan2
217
+ chuan3
218
+ chuan4
219
+ chuang1
220
+ chuang2
221
+ chuang3
222
+ chuang4
223
+ chui1
224
+ chui2
225
+ chun1
226
+ chun2
227
+ chun3
228
+ chuo1
229
+ chuo4
230
+ ci1
231
+ ci2
232
+ ci3
233
+ ci4
234
+ cong1
235
+ cong2
236
+ cou4
237
+ cu1
238
+ cu4
239
+ cuan1
240
+ cuan2
241
+ cuan4
242
+ cui1
243
+ cui3
244
+ cui4
245
+ cun1
246
+ cun2
247
+ cun4
248
+ cuo1
249
+ cuo2
250
+ cuo4
251
+ d
252
+ da
253
+ da1
254
+ da2
255
+ da3
256
+ da4
257
+ dai1
258
+ dai2
259
+ dai3
260
+ dai4
261
+ dan1
262
+ dan2
263
+ dan3
264
+ dan4
265
+ dang1
266
+ dang2
267
+ dang3
268
+ dang4
269
+ dao1
270
+ dao2
271
+ dao3
272
+ dao4
273
+ de
274
+ de1
275
+ de2
276
+ dei3
277
+ den4
278
+ deng1
279
+ deng2
280
+ deng3
281
+ deng4
282
+ di1
283
+ di2
284
+ di3
285
+ di4
286
+ dia3
287
+ dian1
288
+ dian2
289
+ dian3
290
+ dian4
291
+ diao1
292
+ diao3
293
+ diao4
294
+ die1
295
+ die2
296
+ die4
297
+ ding1
298
+ ding2
299
+ ding3
300
+ ding4
301
+ diu1
302
+ dong1
303
+ dong3
304
+ dong4
305
+ dou1
306
+ dou2
307
+ dou3
308
+ dou4
309
+ du1
310
+ du2
311
+ du3
312
+ du4
313
+ duan1
314
+ duan2
315
+ duan3
316
+ duan4
317
+ dui1
318
+ dui4
319
+ dun1
320
+ dun3
321
+ dun4
322
+ duo1
323
+ duo2
324
+ duo3
325
+ duo4
326
+ e
327
+ e1
328
+ e2
329
+ e3
330
+ e4
331
+ ei2
332
+ en1
333
+ en4
334
+ er
335
+ er2
336
+ er3
337
+ er4
338
+ f
339
+ fa1
340
+ fa2
341
+ fa3
342
+ fa4
343
+ fan1
344
+ fan2
345
+ fan3
346
+ fan4
347
+ fang1
348
+ fang2
349
+ fang3
350
+ fang4
351
+ fei1
352
+ fei2
353
+ fei3
354
+ fei4
355
+ fen1
356
+ fen2
357
+ fen3
358
+ fen4
359
+ feng1
360
+ feng2
361
+ feng3
362
+ feng4
363
+ fo2
364
+ fou2
365
+ fou3
366
+ fu1
367
+ fu2
368
+ fu3
369
+ fu4
370
+ g
371
+ ga1
372
+ ga2
373
+ ga3
374
+ ga4
375
+ gai1
376
+ gai2
377
+ gai3
378
+ gai4
379
+ gan1
380
+ gan2
381
+ gan3
382
+ gan4
383
+ gang1
384
+ gang2
385
+ gang3
386
+ gang4
387
+ gao1
388
+ gao2
389
+ gao3
390
+ gao4
391
+ ge1
392
+ ge2
393
+ ge3
394
+ ge4
395
+ gei2
396
+ gei3
397
+ gen1
398
+ gen2
399
+ gen3
400
+ gen4
401
+ geng1
402
+ geng3
403
+ geng4
404
+ gong1
405
+ gong3
406
+ gong4
407
+ gou1
408
+ gou2
409
+ gou3
410
+ gou4
411
+ gu
412
+ gu1
413
+ gu2
414
+ gu3
415
+ gu4
416
+ gua1
417
+ gua2
418
+ gua3
419
+ gua4
420
+ guai1
421
+ guai2
422
+ guai3
423
+ guai4
424
+ guan1
425
+ guan2
426
+ guan3
427
+ guan4
428
+ guang1
429
+ guang2
430
+ guang3
431
+ guang4
432
+ gui1
433
+ gui2
434
+ gui3
435
+ gui4
436
+ gun3
437
+ gun4
438
+ guo1
439
+ guo2
440
+ guo3
441
+ guo4
442
+ h
443
+ ha1
444
+ ha2
445
+ ha3
446
+ hai1
447
+ hai2
448
+ hai3
449
+ hai4
450
+ han1
451
+ han2
452
+ han3
453
+ han4
454
+ hang1
455
+ hang2
456
+ hang4
457
+ hao1
458
+ hao2
459
+ hao3
460
+ hao4
461
+ he1
462
+ he2
463
+ he4
464
+ hei1
465
+ hen2
466
+ hen3
467
+ hen4
468
+ heng1
469
+ heng2
470
+ heng4
471
+ hong1
472
+ hong2
473
+ hong3
474
+ hong4
475
+ hou1
476
+ hou2
477
+ hou3
478
+ hou4
479
+ hu1
480
+ hu2
481
+ hu3
482
+ hu4
483
+ hua1
484
+ hua2
485
+ hua4
486
+ huai2
487
+ huai4
488
+ huan1
489
+ huan2
490
+ huan3
491
+ huan4
492
+ huang1
493
+ huang2
494
+ huang3
495
+ huang4
496
+ hui1
497
+ hui2
498
+ hui3
499
+ hui4
500
+ hun1
501
+ hun2
502
+ hun4
503
+ huo
504
+ huo1
505
+ huo2
506
+ huo3
507
+ huo4
508
+ i
509
+ j
510
+ ji1
511
+ ji2
512
+ ji3
513
+ ji4
514
+ jia
515
+ jia1
516
+ jia2
517
+ jia3
518
+ jia4
519
+ jian1
520
+ jian2
521
+ jian3
522
+ jian4
523
+ jiang1
524
+ jiang2
525
+ jiang3
526
+ jiang4
527
+ jiao1
528
+ jiao2
529
+ jiao3
530
+ jiao4
531
+ jie1
532
+ jie2
533
+ jie3
534
+ jie4
535
+ jin1
536
+ jin2
537
+ jin3
538
+ jin4
539
+ jing1
540
+ jing2
541
+ jing3
542
+ jing4
543
+ jiong3
544
+ jiu1
545
+ jiu2
546
+ jiu3
547
+ jiu4
548
+ ju1
549
+ ju2
550
+ ju3
551
+ ju4
552
+ juan1
553
+ juan2
554
+ juan3
555
+ juan4
556
+ jue1
557
+ jue2
558
+ jue4
559
+ jun1
560
+ jun4
561
+ k
562
+ ka1
563
+ ka2
564
+ ka3
565
+ kai1
566
+ kai2
567
+ kai3
568
+ kai4
569
+ kan1
570
+ kan2
571
+ kan3
572
+ kan4
573
+ kang1
574
+ kang2
575
+ kang4
576
+ kao1
577
+ kao2
578
+ kao3
579
+ kao4
580
+ ke1
581
+ ke2
582
+ ke3
583
+ ke4
584
+ ken3
585
+ keng1
586
+ kong1
587
+ kong3
588
+ kong4
589
+ kou1
590
+ kou2
591
+ kou3
592
+ kou4
593
+ ku1
594
+ ku2
595
+ ku3
596
+ ku4
597
+ kua1
598
+ kua3
599
+ kua4
600
+ kuai3
601
+ kuai4
602
+ kuan1
603
+ kuan2
604
+ kuan3
605
+ kuang1
606
+ kuang2
607
+ kuang4
608
+ kui1
609
+ kui2
610
+ kui3
611
+ kui4
612
+ kun1
613
+ kun3
614
+ kun4
615
+ kuo4
616
+ l
617
+ la
618
+ la1
619
+ la2
620
+ la3
621
+ la4
622
+ lai2
623
+ lai4
624
+ lan2
625
+ lan3
626
+ lan4
627
+ lang1
628
+ lang2
629
+ lang3
630
+ lang4
631
+ lao1
632
+ lao2
633
+ lao3
634
+ lao4
635
+ le
636
+ le1
637
+ le4
638
+ lei
639
+ lei1
640
+ lei2
641
+ lei3
642
+ lei4
643
+ leng1
644
+ leng2
645
+ leng3
646
+ leng4
647
+ li
648
+ li1
649
+ li2
650
+ li3
651
+ li4
652
+ lia3
653
+ lian2
654
+ lian3
655
+ lian4
656
+ liang2
657
+ liang3
658
+ liang4
659
+ liao1
660
+ liao2
661
+ liao3
662
+ liao4
663
+ lie1
664
+ lie2
665
+ lie3
666
+ lie4
667
+ lin1
668
+ lin2
669
+ lin3
670
+ lin4
671
+ ling2
672
+ ling3
673
+ ling4
674
+ liu1
675
+ liu2
676
+ liu3
677
+ liu4
678
+ long1
679
+ long2
680
+ long3
681
+ long4
682
+ lou1
683
+ lou2
684
+ lou3
685
+ lou4
686
+ lu1
687
+ lu2
688
+ lu3
689
+ lu4
690
+ luan2
691
+ luan3
692
+ luan4
693
+ lun1
694
+ lun2
695
+ lun4
696
+ luo1
697
+ luo2
698
+ luo3
699
+ luo4
700
+ lv2
701
+ lv3
702
+ lv4
703
+ lve3
704
+ lve4
705
+ m
706
+ ma
707
+ ma1
708
+ ma2
709
+ ma3
710
+ ma4
711
+ mai2
712
+ mai3
713
+ mai4
714
+ man1
715
+ man2
716
+ man3
717
+ man4
718
+ mang2
719
+ mang3
720
+ mao1
721
+ mao2
722
+ mao3
723
+ mao4
724
+ me
725
+ mei2
726
+ mei3
727
+ mei4
728
+ men
729
+ men1
730
+ men2
731
+ men4
732
+ meng
733
+ meng1
734
+ meng2
735
+ meng3
736
+ meng4
737
+ mi1
738
+ mi2
739
+ mi3
740
+ mi4
741
+ mian2
742
+ mian3
743
+ mian4
744
+ miao1
745
+ miao2
746
+ miao3
747
+ miao4
748
+ mie1
749
+ mie4
750
+ min2
751
+ min3
752
+ ming2
753
+ ming3
754
+ ming4
755
+ miu4
756
+ mo1
757
+ mo2
758
+ mo3
759
+ mo4
760
+ mou1
761
+ mou2
762
+ mou3
763
+ mu2
764
+ mu3
765
+ mu4
766
+ n
767
+ n2
768
+ na1
769
+ na2
770
+ na3
771
+ na4
772
+ nai2
773
+ nai3
774
+ nai4
775
+ nan1
776
+ nan2
777
+ nan3
778
+ nan4
779
+ nang1
780
+ nang2
781
+ nang3
782
+ nao1
783
+ nao2
784
+ nao3
785
+ nao4
786
+ ne
787
+ ne2
788
+ ne4
789
+ nei3
790
+ nei4
791
+ nen4
792
+ neng2
793
+ ni1
794
+ ni2
795
+ ni3
796
+ ni4
797
+ nian1
798
+ nian2
799
+ nian3
800
+ nian4
801
+ niang2
802
+ niang4
803
+ niao2
804
+ niao3
805
+ niao4
806
+ nie1
807
+ nie4
808
+ nin2
809
+ ning2
810
+ ning3
811
+ ning4
812
+ niu1
813
+ niu2
814
+ niu3
815
+ niu4
816
+ nong2
817
+ nong4
818
+ nou4
819
+ nu2
820
+ nu3
821
+ nu4
822
+ nuan3
823
+ nuo2
824
+ nuo4
825
+ nv2
826
+ nv3
827
+ nve4
828
+ o
829
+ o1
830
+ o2
831
+ ou1
832
+ ou2
833
+ ou3
834
+ ou4
835
+ p
836
+ pa1
837
+ pa2
838
+ pa4
839
+ pai1
840
+ pai2
841
+ pai3
842
+ pai4
843
+ pan1
844
+ pan2
845
+ pan4
846
+ pang1
847
+ pang2
848
+ pang4
849
+ pao1
850
+ pao2
851
+ pao3
852
+ pao4
853
+ pei1
854
+ pei2
855
+ pei4
856
+ pen1
857
+ pen2
858
+ pen4
859
+ peng1
860
+ peng2
861
+ peng3
862
+ peng4
863
+ pi1
864
+ pi2
865
+ pi3
866
+ pi4
867
+ pian1
868
+ pian2
869
+ pian4
870
+ piao1
871
+ piao2
872
+ piao3
873
+ piao4
874
+ pie1
875
+ pie2
876
+ pie3
877
+ pin1
878
+ pin2
879
+ pin3
880
+ pin4
881
+ ping1
882
+ ping2
883
+ po1
884
+ po2
885
+ po3
886
+ po4
887
+ pou1
888
+ pu1
889
+ pu2
890
+ pu3
891
+ pu4
892
+ q
893
+ qi1
894
+ qi2
895
+ qi3
896
+ qi4
897
+ qia1
898
+ qia3
899
+ qia4
900
+ qian1
901
+ qian2
902
+ qian3
903
+ qian4
904
+ qiang1
905
+ qiang2
906
+ qiang3
907
+ qiang4
908
+ qiao1
909
+ qiao2
910
+ qiao3
911
+ qiao4
912
+ qie1
913
+ qie2
914
+ qie3
915
+ qie4
916
+ qin1
917
+ qin2
918
+ qin3
919
+ qin4
920
+ qing1
921
+ qing2
922
+ qing3
923
+ qing4
924
+ qiong1
925
+ qiong2
926
+ qiu1
927
+ qiu2
928
+ qiu3
929
+ qu1
930
+ qu2
931
+ qu3
932
+ qu4
933
+ quan1
934
+ quan2
935
+ quan3
936
+ quan4
937
+ que1
938
+ que2
939
+ que4
940
+ qun2
941
+ r
942
+ ran2
943
+ ran3
944
+ rang1
945
+ rang2
946
+ rang3
947
+ rang4
948
+ rao2
949
+ rao3
950
+ rao4
951
+ re2
952
+ re3
953
+ re4
954
+ ren2
955
+ ren3
956
+ ren4
957
+ reng1
958
+ reng2
959
+ ri4
960
+ rong1
961
+ rong2
962
+ rong3
963
+ rou2
964
+ rou4
965
+ ru2
966
+ ru3
967
+ ru4
968
+ ruan2
969
+ ruan3
970
+ rui3
971
+ rui4
972
+ run4
973
+ ruo4
974
+ s
975
+ sa1
976
+ sa2
977
+ sa3
978
+ sa4
979
+ sai1
980
+ sai4
981
+ san1
982
+ san2
983
+ san3
984
+ san4
985
+ sang1
986
+ sang3
987
+ sang4
988
+ sao1
989
+ sao2
990
+ sao3
991
+ sao4
992
+ se4
993
+ sen1
994
+ seng1
995
+ sha1
996
+ sha2
997
+ sha3
998
+ sha4
999
+ shai1
1000
+ shai2
1001
+ shai3
1002
+ shai4
1003
+ shan1
1004
+ shan3
1005
+ shan4
1006
+ shang
1007
+ shang1
1008
+ shang3
1009
+ shang4
1010
+ shao1
1011
+ shao2
1012
+ shao3
1013
+ shao4
1014
+ she1
1015
+ she2
1016
+ she3
1017
+ she4
1018
+ shei2
1019
+ shen1
1020
+ shen2
1021
+ shen3
1022
+ shen4
1023
+ sheng1
1024
+ sheng2
1025
+ sheng3
1026
+ sheng4
1027
+ shi
1028
+ shi1
1029
+ shi2
1030
+ shi3
1031
+ shi4
1032
+ shou1
1033
+ shou2
1034
+ shou3
1035
+ shou4
1036
+ shu1
1037
+ shu2
1038
+ shu3
1039
+ shu4
1040
+ shua1
1041
+ shua2
1042
+ shua3
1043
+ shua4
1044
+ shuai1
1045
+ shuai3
1046
+ shuai4
1047
+ shuan1
1048
+ shuan4
1049
+ shuang1
1050
+ shuang3
1051
+ shui2
1052
+ shui3
1053
+ shui4
1054
+ shun3
1055
+ shun4
1056
+ shuo1
1057
+ shuo4
1058
+ si1
1059
+ si2
1060
+ si3
1061
+ si4
1062
+ song1
1063
+ song3
1064
+ song4
1065
+ sou1
1066
+ sou3
1067
+ sou4
1068
+ su1
1069
+ su2
1070
+ su4
1071
+ suan1
1072
+ suan4
1073
+ sui1
1074
+ sui2
1075
+ sui3
1076
+ sui4
1077
+ sun1
1078
+ sun3
1079
+ suo
1080
+ suo1
1081
+ suo2
1082
+ suo3
1083
+ t
1084
+ ta1
1085
+ ta2
1086
+ ta3
1087
+ ta4
1088
+ tai1
1089
+ tai2
1090
+ tai4
1091
+ tan1
1092
+ tan2
1093
+ tan3
1094
+ tan4
1095
+ tang1
1096
+ tang2
1097
+ tang3
1098
+ tang4
1099
+ tao1
1100
+ tao2
1101
+ tao3
1102
+ tao4
1103
+ te4
1104
+ teng2
1105
+ ti1
1106
+ ti2
1107
+ ti3
1108
+ ti4
1109
+ tian1
1110
+ tian2
1111
+ tian3
1112
+ tiao1
1113
+ tiao2
1114
+ tiao3
1115
+ tiao4
1116
+ tie1
1117
+ tie2
1118
+ tie3
1119
+ tie4
1120
+ ting1
1121
+ ting2
1122
+ ting3
1123
+ tong1
1124
+ tong2
1125
+ tong3
1126
+ tong4
1127
+ tou
1128
+ tou1
1129
+ tou2
1130
+ tou4
1131
+ tu1
1132
+ tu2
1133
+ tu3
1134
+ tu4
1135
+ tuan1
1136
+ tuan2
1137
+ tui1
1138
+ tui2
1139
+ tui3
1140
+ tui4
1141
+ tun1
1142
+ tun2
1143
+ tun4
1144
+ tuo1
1145
+ tuo2
1146
+ tuo3
1147
+ tuo4
1148
+ u
1149
+ v
1150
+ w
1151
+ wa
1152
+ wa1
1153
+ wa2
1154
+ wa3
1155
+ wa4
1156
+ wai1
1157
+ wai3
1158
+ wai4
1159
+ wan1
1160
+ wan2
1161
+ wan3
1162
+ wan4
1163
+ wang1
1164
+ wang2
1165
+ wang3
1166
+ wang4
1167
+ wei1
1168
+ wei2
1169
+ wei3
1170
+ wei4
1171
+ wen1
1172
+ wen2
1173
+ wen3
1174
+ wen4
1175
+ weng1
1176
+ weng4
1177
+ wo1
1178
+ wo2
1179
+ wo3
1180
+ wo4
1181
+ wu1
1182
+ wu2
1183
+ wu3
1184
+ wu4
1185
+ x
1186
+ xi1
1187
+ xi2
1188
+ xi3
1189
+ xi4
1190
+ xia1
1191
+ xia2
1192
+ xia4
1193
+ xian1
1194
+ xian2
1195
+ xian3
1196
+ xian4
1197
+ xiang1
1198
+ xiang2
1199
+ xiang3
1200
+ xiang4
1201
+ xiao1
1202
+ xiao2
1203
+ xiao3
1204
+ xiao4
1205
+ xie1
1206
+ xie2
1207
+ xie3
1208
+ xie4
1209
+ xin1
1210
+ xin2
1211
+ xin4
1212
+ xing1
1213
+ xing2
1214
+ xing3
1215
+ xing4
1216
+ xiong1
1217
+ xiong2
1218
+ xiu1
1219
+ xiu3
1220
+ xiu4
1221
+ xu
1222
+ xu1
1223
+ xu2
1224
+ xu3
1225
+ xu4
1226
+ xuan1
1227
+ xuan2
1228
+ xuan3
1229
+ xuan4
1230
+ xue1
1231
+ xue2
1232
+ xue3
1233
+ xue4
1234
+ xun1
1235
+ xun2
1236
+ xun4
1237
+ y
1238
+ ya
1239
+ ya1
1240
+ ya2
1241
+ ya3
1242
+ ya4
1243
+ yan1
1244
+ yan2
1245
+ yan3
1246
+ yan4
1247
+ yang1
1248
+ yang2
1249
+ yang3
1250
+ yang4
1251
+ yao1
1252
+ yao2
1253
+ yao3
1254
+ yao4
1255
+ ye1
1256
+ ye2
1257
+ ye3
1258
+ ye4
1259
+ yi
1260
+ yi1
1261
+ yi2
1262
+ yi3
1263
+ yi4
1264
+ yin1
1265
+ yin2
1266
+ yin3
1267
+ yin4
1268
+ ying1
1269
+ ying2
1270
+ ying3
1271
+ ying4
1272
+ yo1
1273
+ yong1
1274
+ yong2
1275
+ yong3
1276
+ yong4
1277
+ you1
1278
+ you2
1279
+ you3
1280
+ you4
1281
+ yu1
1282
+ yu2
1283
+ yu3
1284
+ yu4
1285
+ yuan1
1286
+ yuan2
1287
+ yuan3
1288
+ yuan4
1289
+ yue1
1290
+ yue4
1291
+ yun1
1292
+ yun2
1293
+ yun3
1294
+ yun4
1295
+ z
1296
+ za1
1297
+ za2
1298
+ za3
1299
+ zai1
1300
+ zai3
1301
+ zai4
1302
+ zan1
1303
+ zan2
1304
+ zan3
1305
+ zan4
1306
+ zang1
1307
+ zang4
1308
+ zao1
1309
+ zao2
1310
+ zao3
1311
+ zao4
1312
+ ze2
1313
+ ze4
1314
+ zei2
1315
+ zen3
1316
+ zeng1
1317
+ zeng4
1318
+ zha1
1319
+ zha2
1320
+ zha3
1321
+ zha4
1322
+ zhai1
1323
+ zhai2
1324
+ zhai3
1325
+ zhai4
1326
+ zhan1
1327
+ zhan2
1328
+ zhan3
1329
+ zhan4
1330
+ zhang1
1331
+ zhang2
1332
+ zhang3
1333
+ zhang4
1334
+ zhao1
1335
+ zhao2
1336
+ zhao3
1337
+ zhao4
1338
+ zhe
1339
+ zhe1
1340
+ zhe2
1341
+ zhe3
1342
+ zhe4
1343
+ zhen1
1344
+ zhen2
1345
+ zhen3
1346
+ zhen4
1347
+ zheng1
1348
+ zheng2
1349
+ zheng3
1350
+ zheng4
1351
+ zhi1
1352
+ zhi2
1353
+ zhi3
1354
+ zhi4
1355
+ zhong1
1356
+ zhong2
1357
+ zhong3
1358
+ zhong4
1359
+ zhou1
1360
+ zhou2
1361
+ zhou3
1362
+ zhou4
1363
+ zhu1
1364
+ zhu2
1365
+ zhu3
1366
+ zhu4
1367
+ zhua1
1368
+ zhua2
1369
+ zhua3
1370
+ zhuai1
1371
+ zhuai3
1372
+ zhuai4
1373
+ zhuan1
1374
+ zhuan2
1375
+ zhuan3
1376
+ zhuan4
1377
+ zhuang1
1378
+ zhuang4
1379
+ zhui1
1380
+ zhui4
1381
+ zhun1
1382
+ zhun2
1383
+ zhun3
1384
+ zhuo1
1385
+ zhuo2
1386
+ zi
1387
+ zi1
1388
+ zi2
1389
+ zi3
1390
+ zi4
1391
+ zong1
1392
+ zong2
1393
+ zong3
1394
+ zong4
1395
+ zou1
1396
+ zou2
1397
+ zou3
1398
+ zou4
1399
+ zu1
1400
+ zu2
1401
+ zu3
1402
+ zuan1
1403
+ zuan3
1404
+ zuan4
1405
+ zui2
1406
+ zui3
1407
+ zui4
1408
+ zun1
1409
+ zuo
1410
+ zuo1
1411
+ zuo2
1412
+ zuo3
1413
+ zuo4
1414
+ {
1415
+ ~
1416
+ ¡
1417
+ ¢
1418
+ £
1419
+ ¥
1420
+ §
1421
+ ¨
1422
+ ©
1423
+ «
1424
+ ®
1425
+ ¯
1426
+ °
1427
+ ±
1428
+ ²
1429
+ ³
1430
+ ´
1431
+ µ
1432
+ ·
1433
+ ¹
1434
+ º
1435
+ »
1436
+ ¼
1437
+ ½
1438
+ ¾
1439
+ ¿
1440
+ À
1441
+ Á
1442
+ Â
1443
+ Ã
1444
+ Ä
1445
+ Å
1446
+ Æ
1447
+ Ç
1448
+ È
1449
+ É
1450
+ Ê
1451
+ Í
1452
+ Î
1453
+ Ñ
1454
+ Ó
1455
+ Ö
1456
+ ×
1457
+ Ø
1458
+ Ú
1459
+ Ü
1460
+ Ý
1461
+ Þ
1462
+ ß
1463
+ à
1464
+ á
1465
+ â
1466
+ ã
1467
+ ä
1468
+ å
1469
+ æ
1470
+ ç
1471
+ è
1472
+ é
1473
+ ê
1474
+ ë
1475
+ ì
1476
+ í
1477
+ î
1478
+ ï
1479
+ ð
1480
+ ñ
1481
+ ò
1482
+ ó
1483
+ ô
1484
+ õ
1485
+ ö
1486
+ ø
1487
+ ù
1488
+ ú
1489
+ û
1490
+ ü
1491
+ ý
1492
+ Ā
1493
+ ā
1494
+ ă
1495
+ ą
1496
+ ć
1497
+ Č
1498
+ č
1499
+ Đ
1500
+ đ
1501
+ ē
1502
+ ė
1503
+ ę
1504
+ ě
1505
+ ĝ
1506
+ ğ
1507
+ ħ
1508
+ ī
1509
+ į
1510
+ İ
1511
+ ı
1512
+ Ł
1513
+ ł
1514
+ ń
1515
+ ņ
1516
+ ň
1517
+ ŋ
1518
+ Ō
1519
+ ō
1520
+ ő
1521
+ œ
1522
+ ř
1523
+ Ś
1524
+ ś
1525
+ Ş
1526
+ ş
1527
+ Š
1528
+ š
1529
+ Ť
1530
+ ť
1531
+ ũ
1532
+ ū
1533
+ ź
1534
+ Ż
1535
+ ż
1536
+ Ž
1537
+ ž
1538
+ ơ
1539
+ ư
1540
+ ǎ
1541
+ ǐ
1542
+ ǒ
1543
+ ǔ
1544
+ ǚ
1545
+ ș
1546
+ ț
1547
+ ɑ
1548
+ ɔ
1549
+ ɕ
1550
+ ə
1551
+ ɛ
1552
+ ɜ
1553
+ ɡ
1554
+ ɣ
1555
+ ɪ
1556
+ ɫ
1557
+ ɴ
1558
+ ɹ
1559
+ ɾ
1560
+ ʃ
1561
+ ʊ
1562
+ ʌ
1563
+ ʒ
1564
+ ʔ
1565
+ ʰ
1566
+ ʷ
1567
+ ʻ
1568
+ ʾ
1569
+ ʿ
1570
+ ˈ
1571
+ ː
1572
+ ˙
1573
+ ˜
1574
+ ˢ
1575
+ ́
1576
+ ̅
1577
+ Α
1578
+ Β
1579
+ Δ
1580
+ Ε
1581
+ Θ
1582
+ Κ
1583
+ Λ
1584
+ Μ
1585
+ Ξ
1586
+ Π
1587
+ Σ
1588
+ Τ
1589
+ Φ
1590
+ Χ
1591
+ Ψ
1592
+ Ω
1593
+ ά
1594
+ έ
1595
+ ή
1596
+ ί
1597
+ α
1598
+ β
1599
+ γ
1600
+ δ
1601
+ ε
1602
+ ζ
1603
+ η
1604
+ θ
1605
+ ι
1606
+ κ
1607
+ λ
1608
+ μ
1609
+ ν
1610
+ ξ
1611
+ ο
1612
+ π
1613
+ ρ
1614
+ ς
1615
+ σ
1616
+ τ
1617
+ υ
1618
+ φ
1619
+ χ
1620
+ ψ
1621
+ ω
1622
+ ϊ
1623
+ ό
1624
+ ύ
1625
+ ώ
1626
+ ϕ
1627
+ ϵ
1628
+ Ё
1629
+ А
1630
+ Б
1631
+ В
1632
+ Г
1633
+ Д
1634
+ Е
1635
+ Ж
1636
+ З
1637
+ И
1638
+ Й
1639
+ К
1640
+ Л
1641
+ М
1642
+ Н
1643
+ О
1644
+ П
1645
+ Р
1646
+ С
1647
+ Т
1648
+ У
1649
+ Ф
1650
+ Х
1651
+ Ц
1652
+ Ч
1653
+ Ш
1654
+ Щ
1655
+ Ы
1656
+ Ь
1657
+ Э
1658
+ Ю
1659
+ Я
1660
+ а
1661
+ б
1662
+ в
1663
+ г
1664
+ д
1665
+ е
1666
+ ж
1667
+ з
1668
+ и
1669
+ й
1670
+ к
1671
+ л
1672
+ м
1673
+ н
1674
+ о
1675
+ п
1676
+ р
1677
+ с
1678
+ т
1679
+ у
1680
+ ф
1681
+ х
1682
+ ц
1683
+ ч
1684
+ ш
1685
+ щ
1686
+ ъ
1687
+ ы
1688
+ ь
1689
+ э
1690
+ ю
1691
+ я
1692
+ ё
1693
+ і
1694
+ ְ
1695
+ ִ
1696
+ ֵ
1697
+ ֶ
1698
+ ַ
1699
+ ָ
1700
+ ֹ
1701
+ ּ
1702
+ ־
1703
+ ׁ
1704
+ א
1705
+ ב
1706
+ ג
1707
+ ד
1708
+ ה
1709
+ ו
1710
+ ז
1711
+ ח
1712
+ ט
1713
+ י
1714
+ כ
1715
+ ל
1716
+ ם
1717
+ מ
1718
+ ן
1719
+ נ
1720
+ ס
1721
+ ע
1722
+ פ
1723
+ ק
1724
+ ר
1725
+ ש
1726
+ ת
1727
+ أ
1728
+ ب
1729
+ ة
1730
+ ت
1731
+ ج
1732
+ ح
1733
+ د
1734
+ ر
1735
+ ز
1736
+ س
1737
+ ص
1738
+ ط
1739
+ ع
1740
+ ق
1741
+ ك
1742
+ ل
1743
+ م
1744
+ ن
1745
+ ه
1746
+ و
1747
+ ي
1748
+ َ
1749
+ ُ
1750
+ ِ
1751
+ ْ
1752
+
1753
+
1754
+
1755
+
1756
+
1757
+
1758
+
1759
+
1760
+
1761
+
1762
+
1763
+
1764
+
1765
+
1766
+
1767
+
1768
+
1769
+
1770
+
1771
+
1772
+
1773
+
1774
+
1775
+
1776
+
1777
+
1778
+
1779
+
1780
+
1781
+
1782
+
1783
+
1784
+
1785
+
1786
+
1787
+
1788
+
1789
+
1790
+
1791
+
1792
+
1793
+
1794
+
1795
+
1796
+
1797
+
1798
+
1799
+
1800
+ ế
1801
+
1802
+
1803
+
1804
+
1805
+
1806
+
1807
+
1808
+
1809
+
1810
+
1811
+
1812
+
1813
+
1814
+
1815
+
1816
+
1817
+
1818
+
1819
+
1820
+
1821
+
1822
+
1823
+
1824
+
1825
+
1826
+
1827
+
1828
+
1829
+
1830
+ ���
1831
+
1832
+
1833
+
1834
+
1835
+
1836
+
1837
+
1838
+
1839
+
1840
+
1841
+
1842
+
1843
+
1844
+
1845
+
1846
+
1847
+
1848
+
1849
+
1850
+
1851
+
1852
+
1853
+
1854
+
1855
+
1856
+
1857
+
1858
+
1859
+
1860
+
1861
+
1862
+
1863
+
1864
+
1865
+
1866
+
1867
+
1868
+
1869
+
1870
+
1871
+
1872
+
1873
+
1874
+
1875
+
1876
+
1877
+
1878
+
1879
+
1880
+
1881
+
1882
+
1883
+
1884
+
1885
+
1886
+
1887
+
1888
+
1889
+
1890
+
1891
+
1892
+
1893
+
1894
+
1895
+
1896
+
1897
+
1898
+
1899
+
1900
+
1901
+
1902
+
1903
+
1904
+
1905
+
1906
+
1907
+
1908
+
1909
+
1910
+
1911
+
1912
+
1913
+
1914
+
1915
+
1916
+
1917
+
1918
+
1919
+
1920
+
1921
+
1922
+
1923
+
1924
+
1925
+
1926
+
1927
+
1928
+
1929
+
1930
+
1931
+
1932
+
1933
+
1934
+
1935
+
1936
+
1937
+
1938
+
1939
+
1940
+
1941
+
1942
+
1943
+
1944
+
1945
+
1946
+
1947
+
1948
+
1949
+
1950
+
1951
+
1952
+
1953
+
1954
+
1955
+
1956
+
1957
+
1958
+
1959
+
1960
+
1961
+
1962
+
1963
+
1964
+
1965
+
1966
+
1967
+
1968
+
1969
+
1970
+
1971
+
1972
+
1973
+
1974
+
1975
+
1976
+
1977
+
1978
+
1979
+
1980
+
1981
+
1982
+
1983
+
1984
+
1985
+
1986
+
1987
+
1988
+
1989
+
1990
+
1991
+
1992
+
1993
+
1994
+
1995
+
1996
+
1997
+
1998
+
1999
+
2000
+
2001
+
2002
+
2003
+
2004
+
2005
+
2006
+
2007
+
2008
+
2009
+
2010
+
2011
+
2012
+
2013
+
2014
+
2015
+
2016
+
2017
+
2018
+
2019
+
2020
+
2021
+
2022
+
2023
+
2024
+
2025
+
2026
+
2027
+
2028
+
2029
+
2030
+
2031
+
2032
+
2033
+
2034
+
2035
+
2036
+
2037
+
2038
+
2039
+
2040
+
2041
+
2042
+
2043
+
2044
+
2045
+
2046
+
2047
+
2048
+
2049
+
2050
+
2051
+
2052
+
2053
+
2054
+
2055
+
2056
+
2057
+
2058
+
2059
+
2060
+
2061
+
2062
+
2063
+
2064
+
2065
+
2066
+
2067
+
2068
+
2069
+
2070
+
2071
+
2072
+
2073
+
2074
+
2075
+
2076
+
2077
+
2078
+
2079
+
2080
+
2081
+
2082
+
2083
+
2084
+
2085
+
2086
+
2087
+
2088
+
2089
+
2090
+
2091
+
2092
+
2093
+
2094
+
2095
+
2096
+
2097
+
2098
+
2099
+
2100
+
2101
+
2102
+
2103
+
2104
+
2105
+
2106
+
2107
+
2108
+
2109
+
2110
+
2111
+
2112
+
2113
+
2114
+
2115
+
2116
+
2117
+
2118
+
2119
+
2120
+
2121
+
2122
+
2123
+
2124
+
2125
+
2126
+
2127
+
2128
+
2129
+
2130
+
2131
+
2132
+
2133
+
2134
+
2135
+
2136
+
2137
+
2138
+
2139
+
2140
+
2141
+
2142
+
2143
+
2144
+
2145
+
2146
+
2147
+
2148
+
2149
+
2150
+
2151
+
2152
+
2153
+
2154
+
2155
+
2156
+
2157
+
2158
+
2159
+
2160
+
2161
+
2162
+
2163
+
2164
+
2165
+
2166
+
2167
+
2168
+
2169
+
2170
+
2171
+
2172
+
2173
+
2174
+
2175
+
2176
+
2177
+
2178
+
2179
+
2180
+
2181
+
2182
+
2183
+
2184
+
2185
+
2186
+
2187
+
2188
+
2189
+
2190
+
2191
+
2192
+
2193
+
2194
+
2195
+
2196
+
2197
+
2198
+
2199
+
2200
+
2201
+
2202
+
2203
+
2204
+
2205
+
2206
+
2207
+
2208
+
2209
+
2210
+
2211
+
2212
+
2213
+
2214
+
2215
+
2216
+
2217
+
2218
+
2219
+
2220
+
2221
+
2222
+
2223
+
2224
+
2225
+
2226
+
2227
+
2228
+
2229
+
2230
+
2231
+
2232
+
2233
+
2234
+
2235
+
2236
+
2237
+
2238
+
2239
+
2240
+
2241
+
2242
+
2243
+
2244
+
2245
+
2246
+
2247
+
2248
+
2249
+
2250
+
2251
+
2252
+
2253
+
2254
+
2255
+
2256
+
2257
+
2258
+
2259
+
2260
+
2261
+
2262
+
2263
+
2264
+
2265
+
2266
+
2267
+
2268
+
2269
+
2270
+
2271
+
2272
+
2273
+
2274
+
2275
+
2276
+
2277
+
2278
+
2279
+
2280
+
2281
+
2282
+
2283
+
2284
+
2285
+
2286
+
2287
+
2288
+
2289
+
2290
+
2291
+
2292
+
2293
+
2294
+
2295
+
2296
+
2297
+
2298
+
2299
+
2300
+
2301
+
2302
+
2303
+
2304
+
2305
+
2306
+
2307
+
2308
+
2309
+
2310
+
2311
+
2312
+
2313
+
2314
+
2315
+
2316
+
2317
+
2318
+
2319
+
2320
+
2321
+
2322
+
2323
+
2324
+
2325
+
2326
+
2327
+
2328
+
2329
+
2330
+
2331
+
2332
+
2333
+
2334
+
2335
+
2336
+
2337
+
2338
+
2339
+
2340
+
2341
+
2342
+
2343
+
2344
+
2345
+
2346
+
2347
+
2348
+
2349
+
2350
+
2351
+
2352
+
2353
+
2354
+
2355
+
2356
+
2357
+
2358
+
2359
+
2360
+
2361
+
2362
+
2363
+
2364
+
2365
+
2366
+
2367
+
2368
+
2369
+
2370
+
2371
+
2372
+
2373
+
2374
+
2375
+
2376
+
2377
+
2378
+
2379
+
2380
+
2381
+
2382
+
2383
+
2384
+
2385
+
2386
+
2387
+
2388
+
2389
+
2390
+
2391
+
2392
+
2393
+
2394
+
2395
+
2396
+
2397
+
2398
+
2399
+
2400
+
2401
+
2402
+
2403
+
2404
+
2405
+
2406
+
2407
+
2408
+
2409
+
2410
+
2411
+
2412
+
2413
+
2414
+
2415
+
2416
+
2417
+
2418
+
2419
+
2420
+
2421
+
2422
+
2423
+
2424
+
2425
+
2426
+
2427
+
2428
+
2429
+
2430
+
2431
+
2432
+
2433
+
2434
+
2435
+
2436
+
2437
+
2438
+
2439
+
2440
+
2441
+
2442
+
2443
+
2444
+
2445
+
2446
+
2447
+
2448
+
2449
+
2450
+
2451
+
2452
+
2453
+
2454
+
2455
+
2456
+
2457
+
2458
+
2459
+
2460
+
2461
+
2462
+
2463
+
2464
+
2465
+
2466
+
2467
+
2468
+
2469
+
2470
+
2471
+
2472
+
2473
+
2474
+
2475
+
2476
+
2477
+
2478
+
2479
+
2480
+
2481
+
2482
+
2483
+
2484
+
2485
+
2486
+
2487
+
2488
+
2489
+
2490
+
2491
+
2492
+
2493
+
2494
+
2495
+
2496
+
2497
+
2498
+
2499
+
2500
+
2501
+
2502
+
2503
+
2504
+
2505
+
2506
+
2507
+
2508
+
2509
+
2510
+
2511
+
2512
+
2513
+
2514
+
2515
+
2516
+
2517
+
2518
+
2519
+
2520
+
2521
+
2522
+
2523
+
2524
+
2525
+
2526
+
2527
+
2528
+
2529
+
2530
+
2531
+
2532
+
2533
+
2534
+
2535
+
2536
+
2537
+
2538
+
2539
+
2540
+
2541
+
2542
+
2543
+
2544
+
2545
+ 𠮶
F5-TTS/src/f5_tts/infer/infer_cli.py ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import codecs
3
+ import os
4
+ import re
5
+ from datetime import datetime
6
+ from importlib.resources import files
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ import soundfile as sf
11
+ import tomli
12
+ from cached_path import cached_path
13
+ from hydra.utils import get_class
14
+ from omegaconf import OmegaConf
15
+ from unidecode import unidecode
16
+
17
+ from f5_tts.infer.utils_infer import (
18
+ cfg_strength,
19
+ cross_fade_duration,
20
+ device,
21
+ fix_duration,
22
+ infer_process,
23
+ load_model,
24
+ load_vocoder,
25
+ mel_spec_type,
26
+ nfe_step,
27
+ preprocess_ref_audio_text,
28
+ remove_silence_for_generated_wav,
29
+ speed,
30
+ sway_sampling_coef,
31
+ target_rms,
32
+ )
33
+
34
+
35
# Command-line interface.
#
# Apart from --config, no option declares an explicit default: an omitted value
# option parses as None and an omitted store_true switch parses as False, so the
# settings code that follows can fall back to the config file. Note that
# --no_legacy_text is a store_false switch: args.no_legacy_text is True unless
# the flag is present on the command line.
parser = argparse.ArgumentParser(
    prog="python3 infer-cli.py",
    description="Commandline interface for E2/F5 TTS with Advanced Batch Processing.",
    epilog="Specify options above to override one or more settings from config.",
)
parser.add_argument(
    "-c",
    "--config",
    type=str,
    default=os.path.join(files("f5_tts").joinpath("infer/examples/basic"), "basic.toml"),
    help="The configuration file, default see infer/examples/basic/basic.toml",
)

# Model selection and checkpoint locations.
parser.add_argument(
    "-m", "--model", type=str, help="The model name: F5TTS_v1_Base | F5TTS_Base | E2TTS_Base | etc."
)
parser.add_argument("-mc", "--model_cfg", type=str, help="The path to F5-TTS model config file .yaml")
parser.add_argument(
    "-p", "--ckpt_file", type=str, help="The path to model checkpoint .pt, leave blank to use default"
)
parser.add_argument(
    "-v", "--vocab_file", type=str, help="The path to vocab file .txt, leave blank to use default"
)

# Reference voice and the text to synthesize.
parser.add_argument("-r", "--ref_audio", type=str, help="The reference audio file.")
parser.add_argument("-s", "--ref_text", type=str, help="The transcript/subtitle for the reference audio")
parser.add_argument("-t", "--gen_text", type=str, help="The text to make model synthesize a speech")
parser.add_argument(
    "-f", "--gen_file", type=str, help="The file with text to generate, will ignore --gen_text"
)

# Output location and post-processing switches.
parser.add_argument("-o", "--output_dir", type=str, help="The path to output folder")
parser.add_argument("-w", "--output_file", type=str, help="The name of output file")
parser.add_argument("--save_chunk", action="store_true", help="To save each audio chunks during inference")
parser.add_argument(
    "--no_legacy_text",
    action="store_false",
    help="Not to use lossy ASCII transliterations of unicode text in saved file names.",
)
parser.add_argument("--remove_silence", action="store_true", help="To remove long silence found in output")

# Vocoder selection.
parser.add_argument(
    "--load_vocoder_from_local",
    action="store_true",
    help="To load vocoder from local dir, default to ../checkpoints/vocos-mel-24khz",
)
parser.add_argument(
    "--vocoder_name",
    type=str,
    choices=["vocos", "bigvgan"],
    help=f"Used vocoder name: vocos | bigvgan, default {mel_spec_type}",
)

# Sampling / synthesis parameters; the help text shows the library defaults.
parser.add_argument(
    "--target_rms",
    type=float,
    help=f"Target output speech loudness normalization value, default {target_rms}",
)
parser.add_argument(
    "--cross_fade_duration",
    type=float,
    help=f"Duration of cross-fade between audio segments in seconds, default {cross_fade_duration}",
)
parser.add_argument(
    "--nfe_step",
    type=int,
    help=f"The number of function evaluation (denoising steps), default {nfe_step}",
)
parser.add_argument(
    "--cfg_strength",
    type=float,
    help=f"Classifier-free guidance strength, default {cfg_strength}",
)
parser.add_argument(
    "--sway_sampling_coef",
    type=float,
    help=f"Sway Sampling coefficient, default {sway_sampling_coef}",
)
parser.add_argument("--speed", type=float, help=f"The speed of the generated audio, default {speed}")
parser.add_argument(
    "--fix_duration",
    type=float,
    help=f"Fix the total duration (ref and gen audios) in seconds, default {fix_duration}",
)
parser.add_argument("--device", type=str, help="Specify the device to run on")
args = parser.parse_args()
178
+
179
+
180
# config file

# Parse the TOML settings; the context manager closes the handle afterwards.
with open(args.config, "rb") as _config_fh:
    config = tomli.load(_config_fh)


def _cli_or_config(cli_value, key, default):
    """Return the CLI value if one was given, else the config entry, else *default*.

    Only ``None`` counts as "not given", so falsy but meaningful numbers such as
    ``0`` or ``0.0`` passed on the command line are honoured instead of being
    silently replaced by the config/default value.
    """
    if cli_value is not None:
        return cli_value
    return config.get(key, default)


# command-line interface parameters
# Precedence everywhere: command line > config file > built-in default.

# String options keep the `or` fallback so an empty CLI value still defers to
# the config file (the help text says "leave blank to use default").
model = args.model or config.get("model", "F5TTS_v1_Base")
ckpt_file = args.ckpt_file or config.get("ckpt_file", "")
vocab_file = args.vocab_file or config.get("vocab_file", "")

ref_audio = args.ref_audio or config.get("ref_audio", "infer/examples/basic/basic_ref_en.wav")
# An explicitly empty --ref_text is kept as-is, hence the None check.
ref_text = _cli_or_config(args.ref_text, "ref_text", "Some call me nature, others call me mother nature.")
gen_text = args.gen_text or config.get("gen_text", "Here we generate something just for test.")
gen_file = args.gen_file or config.get("gen_file", "")

output_dir = args.output_dir or config.get("output_dir", "tests")
output_file = args.output_file or config.get(
    "output_file", f"infer_cli_{datetime.now().strftime(r'%Y%m%d_%H%M%S')}.wav"
)

save_chunk = args.save_chunk or config.get("save_chunk", False)
# args.no_legacy_text is a store_false arg: True means "flag absent, keep legacy
# names". Legacy transliteration stays on unless the CLI flag OR the config key
# opts out of it.
use_legacy_text = args.no_legacy_text and not config.get("no_legacy_text", False)
if save_chunk and use_legacy_text:
    print(
        "\nWarning to --save_chunk: lossy ASCII transliterations of unicode text for legacy (.wav) file names, --no_legacy_text to disable.\n"
    )

remove_silence = args.remove_silence or config.get("remove_silence", False)
load_vocoder_from_local = args.load_vocoder_from_local or config.get("load_vocoder_from_local", False)

vocoder_name = args.vocoder_name or config.get("vocoder_name", mel_spec_type)
# Numeric options: the right-hand names are still the library defaults imported
# from utils_infer at this point; they are rebound to the resolved values.
target_rms = _cli_or_config(args.target_rms, "target_rms", target_rms)
cross_fade_duration = _cli_or_config(args.cross_fade_duration, "cross_fade_duration", cross_fade_duration)
nfe_step = _cli_or_config(args.nfe_step, "nfe_step", nfe_step)
cfg_strength = _cli_or_config(args.cfg_strength, "cfg_strength", cfg_strength)
sway_sampling_coef = _cli_or_config(args.sway_sampling_coef, "sway_sampling_coef", sway_sampling_coef)
speed = _cli_or_config(args.speed, "speed", speed)
fix_duration = _cli_or_config(args.fix_duration, "fix_duration", fix_duration)
device = args.device or config.get("device", device)
224
+
225
+
226
# patches for pip pkg user
# Paths that point into the bundled examples are resolved against the installed
# f5_tts package so they work regardless of the current working directory.
if "infer/examples/" in ref_audio:
    ref_audio = str(files("f5_tts").joinpath(f"{ref_audio}"))
if "infer/examples/" in gen_file:
    gen_file = str(files("f5_tts").joinpath(f"{gen_file}"))
if "voices" in config:
    for voice in config["voices"]:
        voice_ref_audio = config["voices"][voice]["ref_audio"]
        if "infer/examples/" in voice_ref_audio:
            config["voices"][voice]["ref_audio"] = str(files("f5_tts").joinpath(f"{voice_ref_audio}"))


# ignore gen_text if gen_file provided

if gen_file:
    # Read inside a context manager so the file handle is closed right away.
    with codecs.open(gen_file, "r", "utf-8") as _gen_fh:
        gen_text = _gen_fh.read()


# output path

wave_path = Path(output_dir) / output_file
# spectrogram_path = Path(output_dir) / "infer_cli_out.png"
if save_chunk:
    # Per-chunk wavs go to "<output_dir>/<output file stem>_chunks".
    output_chunk_dir = os.path.join(output_dir, f"{Path(output_file).stem}_chunks")
    os.makedirs(output_chunk_dir, exist_ok=True)
252
+
253
+
254
# load vocoder

# Local checkpoint directory per supported vocoder (only used by load_vocoder
# when is_local is set).
_VOCODER_LOCAL_PATHS = {
    "vocos": "../checkpoints/vocos-mel-24khz",
    "bigvgan": "../checkpoints/bigvgan_v2_24khz_100band_256x",
}
# The CLI restricts --vocoder_name via `choices`, but a value coming from the
# config file is unchecked; fail with a clear message instead of a NameError.
if vocoder_name not in _VOCODER_LOCAL_PATHS:
    raise ValueError(
        f"Unsupported vocoder_name {vocoder_name!r}; expected one of {sorted(_VOCODER_LOCAL_PATHS)}"
    )
vocoder_local_path = _VOCODER_LOCAL_PATHS[vocoder_name]

vocoder = load_vocoder(
    vocoder_name=vocoder_name, is_local=load_vocoder_from_local, local_path=vocoder_local_path, device=device
)


# load TTS model

model_cfg = OmegaConf.load(
    args.model_cfg or config.get("model_cfg", str(files("f5_tts").joinpath(f"configs/{model}.yaml")))
)
model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
model_arc = model_cfg.model.arch

# Defaults for locating the pretrained checkpoint on the hub.
repo_name, ckpt_step, ckpt_type = "F5-TTS", 1250000, "safetensors"

if model != "F5TTS_Base":
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    _expected_mel_spec_type = model_cfg.model.mel_spec.mel_spec_type
    if vocoder_name != _expected_mel_spec_type:
        raise ValueError(
            f"vocoder_name {vocoder_name!r} does not match mel_spec_type "
            f"{_expected_mel_spec_type!r} in the model config for {model}"
        )

# override for previous models
if model == "F5TTS_Base":
    if vocoder_name == "vocos":
        ckpt_step = 1200000
    elif vocoder_name == "bigvgan":
        model = "F5TTS_Base_bigvgan"
        ckpt_type = "pt"
elif model == "E2TTS_Base":
    repo_name = "E2-TTS"
    ckpt_step = 1200000

# Fetch the default checkpoint only when none was given explicitly.
if not ckpt_file:
    ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{model}/model_{ckpt_step}.{ckpt_type}"))

print(f"Using {model}...")
ema_model = load_model(
    model_cls, model_arc, ckpt_file, mel_spec_type=vocoder_name, vocab_file=vocab_file, device=device
)
297
+
298
+
299
+ # inference process
300
+
301
+
302
def main():
    """Synthesize ``gen_text`` with the configured voices and write the result.

    The text is split at ``[voice]`` tags; each chunk is generated with the
    reference audio/text of the tagged voice (falling back to ``main`` when the
    tag is missing or unknown). All generated segments are concatenated and
    written to ``wave_path``; with ``save_chunk`` each segment is additionally
    written to ``output_chunk_dir``. Reads its settings from the module-level
    names resolved above.
    """
    main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
    if "voices" not in config:
        voices = {"main": main_voice}
    else:
        voices = config["voices"]
        voices["main"] = main_voice
    for voice in voices:
        print("Voice:", voice)
        print("ref_audio ", voices[voice]["ref_audio"])
        # Store the preprocessed reference pair back on the voice entry.
        voices[voice]["ref_audio"], voices[voice]["ref_text"] = preprocess_ref_audio_text(
            voices[voice]["ref_audio"], voices[voice]["ref_text"]
        )
        print("ref_audio_", voices[voice]["ref_audio"], "\n\n")

    generated_audio_segments = []
    # Zero-width split right before every "[tag]" so the tag stays attached to
    # the chunk it introduces; compiled once instead of per chunk.
    split_before_tag = re.compile(r"(?=\[\w+\])")
    voice_tag = re.compile(r"\[(\w+)\]")
    for text in split_before_tag.split(gen_text):
        if not text.strip():
            continue
        match = voice_tag.match(text)
        if match:
            voice = match[1]
        else:
            print("No voice tag found, using main.")
            voice = "main"
        if voice not in voices:
            print(f"Voice {voice} not found, using main.")
            voice = "main"
        text = voice_tag.sub("", text)
        ref_audio_ = voices[voice]["ref_audio"]
        ref_text_ = voices[voice]["ref_text"]
        # A voice entry may carry its own speed; otherwise use the global one.
        local_speed = voices[voice].get("speed", speed)
        gen_text_ = text.strip()
        print(f"Voice: {voice}")
        audio_segment, final_sample_rate, _spectrogram = infer_process(
            ref_audio_,
            ref_text_,
            gen_text_,
            ema_model,
            vocoder,
            mel_spec_type=vocoder_name,
            target_rms=target_rms,
            cross_fade_duration=cross_fade_duration,
            nfe_step=nfe_step,
            cfg_strength=cfg_strength,
            sway_sampling_coef=sway_sampling_coef,
            speed=local_speed,
            fix_duration=fix_duration,
            device=device,
        )
        generated_audio_segments.append(audio_segment)

        if save_chunk:
            chunk_label = gen_text_
            if len(chunk_label) > 200:
                chunk_label = chunk_label[:200] + " ... "
            if use_legacy_text:
                chunk_label = unidecode(chunk_label)
            # A path separator inside the text would make os.path.join point
            # into a non-existent subdirectory, so neutralise it.
            for separator in filter(None, (os.sep, os.altsep)):
                chunk_label = chunk_label.replace(separator, "_")
            sf.write(
                os.path.join(output_chunk_dir, f"{len(generated_audio_segments) - 1}_{chunk_label}.wav"),
                audio_segment,
                final_sample_rate,
            )

    if generated_audio_segments:
        final_wave = np.concatenate(generated_audio_segments)

        os.makedirs(output_dir, exist_ok=True)

        # Let soundfile open the target itself; holding a second Python handle
        # on the same path while it is written (and later rewritten by the
        # silence removal) is unnecessary.
        output_path = str(wave_path)
        sf.write(output_path, final_wave, final_sample_rate)
        # Remove silence
        if remove_silence:
            remove_silence_for_generated_wav(output_path)
        print(output_path)


if __name__ == "__main__":
    main()
F5-TTS/src/f5_tts/infer/infer_gradio.py ADDED
@@ -0,0 +1,1121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ruff: noqa: E402
2
+ # Above allows ruff to ignore E402: module level import not at top of file
3
+
4
+ import gc
5
+ import json
6
+ import os
7
+ import re
8
+ import tempfile
9
+ from collections import OrderedDict
10
+ from functools import lru_cache
11
+ from importlib.resources import files
12
+
13
+ import click
14
+ import gradio as gr
15
+ import numpy as np
16
+ import soundfile as sf
17
+ import torch
18
+ import torchaudio
19
+ from cached_path import cached_path
20
+ from transformers import AutoModelForCausalLM, AutoTokenizer
21
+
22
+
23
+ try:
24
+ import spaces
25
+
26
+ USING_SPACES = True
27
+ except ImportError:
28
+ USING_SPACES = False
29
+
30
+
31
def gpu_decorator(func):
    """Wrap ``func`` with ``spaces.GPU`` when the ``spaces`` package is in use.

    ``USING_SPACES`` records whether ``import spaces`` succeeded; when it did
    not, ``func`` is handed back untouched and the decorator is a no-op.
    """
    return spaces.GPU(func) if USING_SPACES else func
36
+
37
+
38
+ from f5_tts.infer.utils_infer import (
39
+ infer_process,
40
+ load_model,
41
+ load_vocoder,
42
+ preprocess_ref_audio_text,
43
+ remove_silence_for_generated_wav,
44
+ save_spectrogram,
45
+ tempfile_kwargs,
46
+ )
47
+ from f5_tts.model import DiT, UNetT
48
+
49
+
50
# Name of the default TTS model; infer() compares its ``model`` argument against it.
DEFAULT_TTS_MODEL = "F5-TTS_v1"
# Currently selected model; passed as the ``model`` argument of infer() by the UI callbacks.
tts_model_choice = DEFAULT_TTS_MODEL

# Default model description, indexed positionally:
#   [0] checkpoint location, [1] vocab file location,
#   [2] JSON-encoded keyword arguments for the DiT architecture.
DEFAULT_TTS_MODEL_CFG = [
    "hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors",
    "hf://SWivid/F5-TTS/F5TTS_v1_Base/vocab.txt",
    json.dumps(dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)),
]
58
+
59
+
60
+ # load models
61
+
62
+ vocoder = load_vocoder()
63
+
64
+
65
def load_f5tts():
    """Load the default F5-TTS model described by ``DEFAULT_TTS_MODEL_CFG``.

    Entry 0 of the config is the checkpoint location (resolved through
    ``cached_path``); entry 2 is the JSON-encoded DiT architecture kwargs.
    """
    local_checkpoint = str(cached_path(DEFAULT_TTS_MODEL_CFG[0]))
    arch_kwargs = json.loads(DEFAULT_TTS_MODEL_CFG[2])
    return load_model(DiT, arch_kwargs, local_checkpoint)
69
+
70
+
71
def load_e2tts():
    """Load the E2-TTS base model (UNetT backbone) from its fixed hf:// checkpoint."""
    arch_kwargs = {
        "dim": 1024,
        "depth": 24,
        "heads": 16,
        "ff_mult": 4,
        "text_mask_padding": False,
        "pe_attn_head": 1,
    }
    local_checkpoint = cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors")
    return load_model(UNetT, arch_kwargs, str(local_checkpoint))
75
+
76
+
77
def load_custom(ckpt_path: str, vocab_path="", model_cfg=None):
    """Load a user-supplied DiT checkpoint.

    Args:
        ckpt_path: Checkpoint location; ``hf://`` locations go through ``cached_path``.
        vocab_path: Vocab file location, handled the same way as ``ckpt_path``.
        model_cfg: Architecture kwargs as a dict or a JSON string. ``None``
            falls back to the default config in ``DEFAULT_TTS_MODEL_CFG[2]``.

    Returns:
        Whatever ``load_model`` returns for the resolved checkpoint.
    """

    def _resolve(location):
        # Only hf:// locations are fetched; anything else is used exactly as given.
        return str(cached_path(location)) if location.startswith("hf://") else location

    ckpt_path, vocab_path = ckpt_path.strip(), vocab_path.strip()
    ckpt_path = _resolve(ckpt_path)
    vocab_path = _resolve(vocab_path)

    if isinstance(model_cfg, str):
        model_cfg = json.loads(model_cfg)
    elif model_cfg is None:
        model_cfg = json.loads(DEFAULT_TTS_MODEL_CFG[2])

    return load_model(DiT, model_cfg, ckpt_path, vocab_file=vocab_path)
88
+
89
+
90
# The default model is loaded eagerly at import time.
F5TTS_ema_model = load_f5tts()
# E2-TTS is loaded up front only when running with `spaces`; otherwise infer()
# loads it on first use (it checks for None).
E2TTS_ema_model = load_e2tts() if USING_SPACES else None
# Last custom model loaded by infer() and the checkpoint path it was loaded from,
# so the same path is not reloaded on every call.
custom_ema_model, pre_custom_path = None, ""

# Chat LLM and its tokenizer; populated by load_chat_model().
chat_model_state = None
chat_tokenizer_state = None
96
+
97
+
98
@gpu_decorator
def chat_model_inference(messages, model, tokenizer):
    """Generate one chat reply for ``messages`` with the given model and tokenizer.

    Args:
        messages: Conversation passed to ``tokenizer.apply_chat_template``.
        model: Object exposing ``device`` and ``generate``.
        tokenizer: Tokenizer paired with ``model``.

    Returns:
        The decoded reply for the single prompt in the batch.
    """
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    encoded = tokenizer([prompt], return_tensors="pt").to(model.device)
    sequences = model.generate(**encoded, max_new_tokens=512, temperature=0.7, top_p=0.95)

    # Drop the first len(prompt) tokens of each output so only the newly
    # generated continuation is decoded.
    continuations = []
    for prompt_ids, full_ids in zip(encoded.input_ids, sequences):
        continuations.append(full_ids[len(prompt_ids) :])

    decoded = tokenizer.batch_decode(continuations, skip_special_tokens=True)
    return decoded[0]
119
+
120
+
121
@gpu_decorator
def load_text_from_file(file):
    """Return a Gradio update carrying the stripped UTF-8 contents of ``file``.

    A falsy ``file`` (nothing uploaded) produces an update with an empty string.
    """
    if not file:
        return gr.update(value="")

    with open(file, "r", encoding="utf-8") as handle:
        contents = handle.read().strip()
    return gr.update(value=contents)
129
+
130
+
131
@lru_cache(maxsize=1000)  # NOTE. need to ensure params of infer() hashable
@gpu_decorator
def infer(
    ref_audio_orig,
    ref_text,
    gen_text,
    model,
    remove_silence,
    seed,
    cross_fade_duration=0.15,
    nfe_step=32,
    speed=1,
    show_info=gr.Info,
):
    """Synthesize ``gen_text`` in the voice of the reference audio.

    Results are memoized by ``lru_cache``, so every argument must be hashable
    and a repeated call with identical arguments returns the cached result.

    Args:
        ref_audio_orig: Path of the reference audio clip.
        ref_text: Transcript of the reference audio (may be empty).
        gen_text: Text to synthesize.
        model: ``DEFAULT_TTS_MODEL``, ``"E2-TTS"``, or a tuple
            ``("Custom", ckpt_path, vocab_path, model_cfg)``.
        remove_silence: When true, post-process the audio with
            ``remove_silence_for_generated_wav``.
        seed: Torch seed; values outside ``0 .. 2**31 - 1`` are replaced by a
            random one.
        cross_fade_duration: Forwarded to ``infer_process``.
        nfe_step: Forwarded to ``infer_process``.
        speed: Forwarded to ``infer_process``.
        show_info: Callable used for progress messages.

    Returns:
        Always a 4-tuple ``(audio, spectrogram_path, ref_text, used_seed)``.
        On success ``audio`` is ``(sample_rate, waveform)``. When the reference
        audio or the text is missing, ``audio`` and ``spectrogram_path`` are
        no-op ``gr.update()`` objects; callers unpack four values, so the early
        exits must have the same arity as the success path.

    Raises:
        ValueError: If ``model`` is not one of the supported choices.
    """
    global E2TTS_ema_model, custom_ema_model, pre_custom_path

    if not ref_audio_orig:
        gr.Warning("Please provide reference audio.")
        return gr.update(), gr.update(), ref_text, seed

    # Set inference seed
    if seed < 0 or seed > 2**31 - 1:
        gr.Warning("Seed must in range 0 ~ 2147483647. Using random seed instead.")
        seed = np.random.randint(0, 2**31 - 1)
    torch.manual_seed(seed)
    used_seed = seed

    if not gen_text.strip():
        gr.Warning("Please enter text to generate or upload a text file.")
        return gr.update(), gr.update(), ref_text, used_seed

    ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)

    if model == DEFAULT_TTS_MODEL:
        ema_model = F5TTS_ema_model
    elif model == "E2-TTS":
        # Loaded on first use when it was not loaded at import time
        if E2TTS_ema_model is None:
            show_info("Loading E2-TTS model...")
            E2TTS_ema_model = load_e2tts()
        ema_model = E2TTS_ema_model
    elif isinstance(model, tuple) and model[0] == "Custom":
        assert not USING_SPACES, "Only official checkpoints allowed in Spaces."
        # Reload only when the checkpoint path differs from the last one loaded
        if pre_custom_path != model[1]:
            show_info("Loading Custom TTS model...")
            custom_ema_model = load_custom(model[1], vocab_path=model[2], model_cfg=model[3])
            pre_custom_path = model[1]
        ema_model = custom_ema_model
    else:
        # Previously fell through and failed later with an unbound local
        raise ValueError(f"Unknown TTS model choice: {model!r}")

    final_wave, final_sample_rate, combined_spectrogram = infer_process(
        ref_audio,
        ref_text,
        gen_text,
        ema_model,
        vocoder,
        cross_fade_duration=cross_fade_duration,
        nfe_step=nfe_step,
        speed=speed,
        show_info=show_info,
        progress=gr.Progress(),
    )

    # Remove silence
    if remove_silence:
        # The temporary file is only used to obtain a unique path; the audio is
        # written, processed in place and read back through that path.
        with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
            temp_path = f.name
        try:
            sf.write(temp_path, final_wave, final_sample_rate)
            remove_silence_for_generated_wav(temp_path)
            final_wave, _ = torchaudio.load(temp_path)
        finally:
            os.unlink(temp_path)
        final_wave = final_wave.squeeze().cpu().numpy()

    # Save the spectrogram
    with tempfile.NamedTemporaryFile(suffix=".png", **tempfile_kwargs) as tmp_spectrogram:
        spectrogram_path = tmp_spectrogram.name
    save_spectrogram(combined_spectrogram, spectrogram_path)

    return (final_sample_rate, final_wave), spectrogram_path, ref_text, used_seed
210
+
211
+
212
+ with gr.Blocks() as app_tts:
213
+ gr.Markdown("# Batched TTS")
214
+ ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
215
+ with gr.Row():
216
+ gen_text_input = gr.Textbox(
217
+ label="Text to Generate",
218
+ lines=10,
219
+ max_lines=40,
220
+ scale=4,
221
+ )
222
+ gen_text_file = gr.File(label="Load Text to Generate from File (.txt)", file_types=[".txt"], scale=1)
223
+ generate_btn = gr.Button("Synthesize", variant="primary")
224
+ with gr.Accordion("Advanced Settings", open=False):
225
+ with gr.Row():
226
+ ref_text_input = gr.Textbox(
227
+ label="Reference Text",
228
+ info="Leave blank to automatically transcribe the reference audio. If you enter text or upload a file, it will override automatic transcription.",
229
+ lines=2,
230
+ scale=4,
231
+ )
232
+ ref_text_file = gr.File(label="Load Reference Text from File (.txt)", file_types=[".txt"], scale=1)
233
+ with gr.Row():
234
+ randomize_seed = gr.Checkbox(
235
+ label="Randomize Seed",
236
+ info="Check to use a random seed for each generation. Uncheck to use the seed specified.",
237
+ value=True,
238
+ scale=3,
239
+ )
240
+ seed_input = gr.Number(show_label=False, value=0, precision=0, scale=1)
241
+ with gr.Column(scale=4):
242
+ remove_silence = gr.Checkbox(
243
+ label="Remove Silences",
244
+ info="If undesired long silence(s) produced, turn on to automatically detect and crop.",
245
+ value=False,
246
+ )
247
+ speed_slider = gr.Slider(
248
+ label="Speed",
249
+ minimum=0.3,
250
+ maximum=2.0,
251
+ value=1.0,
252
+ step=0.1,
253
+ info="Adjust the speed of the audio.",
254
+ )
255
+ nfe_slider = gr.Slider(
256
+ label="NFE Steps",
257
+ minimum=4,
258
+ maximum=64,
259
+ value=32,
260
+ step=2,
261
+ info="Set the number of denoising steps.",
262
+ )
263
+ cross_fade_duration_slider = gr.Slider(
264
+ label="Cross-Fade Duration (s)",
265
+ minimum=0.0,
266
+ maximum=1.0,
267
+ value=0.15,
268
+ step=0.01,
269
+ info="Set the duration of the cross-fade between audio clips.",
270
+ )
271
+
272
+ audio_output = gr.Audio(label="Synthesized Audio")
273
+ spectrogram_output = gr.Image(label="Spectrogram")
274
+
275
@gpu_decorator
def basic_tts(
    ref_audio_input,
    ref_text_input,
    gen_text_input,
    remove_silence,
    randomize_seed,
    seed_input,
    cross_fade_duration_slider,
    nfe_slider,
    speed_slider,
):
    """Callback of the "Synthesize" button: run infer() with the current UI values.

    When ``randomize_seed`` is set, the seed from the UI is replaced by a fresh
    random one. The model is the module-level ``tts_model_choice``.

    Returns:
        ``(audio, spectrogram_path, ref_text, used_seed)`` as produced by infer().
    """
    chosen_seed = np.random.randint(0, 2**31 - 1) if randomize_seed else seed_input

    result = infer(
        ref_audio_input,
        ref_text_input,
        gen_text_input,
        tts_model_choice,
        remove_silence,
        seed=chosen_seed,
        cross_fade_duration=cross_fade_duration_slider,
        nfe_step=nfe_slider,
        speed=speed_slider,
    )
    audio, spectrogram, transcript, seed_used = result
    return audio, spectrogram, transcript, seed_used
302
+
303
+ gen_text_file.upload(
304
+ load_text_from_file,
305
+ inputs=[gen_text_file],
306
+ outputs=[gen_text_input],
307
+ )
308
+
309
+ ref_text_file.upload(
310
+ load_text_from_file,
311
+ inputs=[ref_text_file],
312
+ outputs=[ref_text_input],
313
+ )
314
+
315
+ ref_audio_input.clear(
316
+ lambda: [None, None],
317
+ None,
318
+ [ref_text_input, ref_text_file],
319
+ )
320
+
321
+ generate_btn.click(
322
+ basic_tts,
323
+ inputs=[
324
+ ref_audio_input,
325
+ ref_text_input,
326
+ gen_text_input,
327
+ remove_silence,
328
+ randomize_seed,
329
+ seed_input,
330
+ cross_fade_duration_slider,
331
+ nfe_slider,
332
+ speed_slider,
333
+ ],
334
+ outputs=[audio_output, spectrogram_output, ref_text_input, seed_input],
335
+ )
336
+
337
+
338
def parse_speechtypes_text(gen_text):
    """Split a multi-style script into per-speech-type text segments.

    A tag is either ``{Name}`` or a JSON object such as
    ``{"name": "Name", "seed": 3, "speed": 1.2}``. Each tag applies to the text
    that follows it, up to the next tag; text before the first tag uses the
    ``Regular`` type.

    Args:
        gen_text: The script to parse. ``None`` is treated as an empty script.

    Returns:
        A list of dicts, one per non-empty text span, each with the keys
        ``name``, ``seed``, ``speed`` and ``text``. Keys missing from a JSON
        tag are filled with the defaults (``"Regular"``, ``-1``, ``1.0``) so
        consumers can index them unconditionally.
    """
    defaults = {"name": "Regular", "seed": -1, "speed": 1.0}

    # Pattern to find {str} or {"name": str, "seed": int, "speed": float}.
    # The capturing group makes re.split keep the tags: even indices are plain
    # text, odd indices are tags.
    tokens = re.split(r"(\{.*?\})", gen_text or "")

    segments = []
    current_type = dict(defaults)
    for index, token in enumerate(tokens):
        token = token.strip()
        if index % 2:
            try:  # JSON-style tag
                parsed = json.loads(token)
            except json.JSONDecodeError:
                parsed = {"name": token[1:-1]}  # plain {Name} tag: remove braces
            current_type = {**defaults, **parsed}
        elif token:
            # Each segment gets its own dict so later edits cannot alias
            segments.append({**current_type, "text": token})

    return segments
370
+
371
+
372
+ with gr.Blocks() as app_multistyle:
373
+ # New section for multistyle generation
374
+ gr.Markdown(
375
+ """
376
+ # Multiple Speech-Type Generation
377
+
378
+ This section allows you to generate multiple speech types or multiple people's voices. Enter your text in the format shown below, or upload a .txt file with the same format. The system will generate speech using the appropriate type. If unspecified, the model will use the regular speech type. The current speech type will be used until the next speech type is specified.
379
+ """
380
+ )
381
+
382
+ with gr.Row():
383
+ gr.Markdown(
384
+ """
385
+ **Example Input:** <br>
386
+ {Regular} Hello, I'd like to order a sandwich please. <br>
387
+ {Surprised} What do you mean you're out of bread? <br>
388
+ {Sad} I really wanted a sandwich though... <br>
389
+ {Angry} You know what, darn you and your little shop! <br>
390
+ {Whisper} I'll just go back home and cry now. <br>
391
+ {Shouting} Why me?!
392
+ """
393
+ )
394
+
395
+ gr.Markdown(
396
+ """
397
+ **Example Input 2:** <br>
398
+ {"name": "Speaker1_Happy", "seed": -1, "speed": 1} Hello, I'd like to order a sandwich please. <br>
399
+ {"name": "Speaker2_Regular", "seed": -1, "speed": 1} Sorry, we're out of bread. <br>
400
+ {"name": "Speaker1_Sad", "seed": -1, "speed": 1} I really wanted a sandwich though... <br>
401
+ {"name": "Speaker2_Whisper", "seed": -1, "speed": 1} I'll give you the last one I was hiding.
402
+ """
403
+ )
404
+
405
+ gr.Markdown(
406
+ 'Upload different audio clips for each speech type. The first speech type is mandatory. You can add additional speech types by clicking the "Add Speech Type" button.'
407
+ )
408
+
409
+ # Regular speech type (mandatory)
410
+ with gr.Row(variant="compact") as regular_row:
411
+ with gr.Column(scale=1, min_width=160):
412
+ regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
413
+ regular_insert = gr.Button("Insert Label", variant="secondary")
414
+ with gr.Column(scale=3):
415
+ regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
416
+ with gr.Column(scale=3):
417
+ regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=4)
418
+ with gr.Row():
419
+ regular_seed_slider = gr.Slider(
420
+ show_label=False, minimum=-1, maximum=999, value=-1, step=1, info="Seed, -1 for random"
421
+ )
422
+ regular_speed_slider = gr.Slider(
423
+ show_label=False, minimum=0.3, maximum=2.0, value=1.0, step=0.1, info="Adjust the speed"
424
+ )
425
+ with gr.Column(scale=1, min_width=160):
426
+ regular_ref_text_file = gr.File(label="Load Reference Text from File (.txt)", file_types=[".txt"])
427
+
428
+ # Bookkeeping lists for all speech types (max 100 rows; Regular is index 0)
429
+ max_speech_types = 100
430
+ speech_type_rows = [regular_row]
431
+ speech_type_names = [regular_name]
432
+ speech_type_audios = [regular_audio]
433
+ speech_type_ref_texts = [regular_ref_text]
434
+ speech_type_ref_text_files = [regular_ref_text_file]
435
+ speech_type_seeds = [regular_seed_slider]
436
+ speech_type_speeds = [regular_speed_slider]
437
+ speech_type_delete_btns = [None]
438
+ speech_type_insert_btns = [regular_insert]
439
+
440
+ # Additional speech types (99 more)
441
+ for i in range(max_speech_types - 1):
442
+ with gr.Row(variant="compact", visible=False) as row:
443
+ with gr.Column(scale=1, min_width=160):
444
+ name_input = gr.Textbox(label="Speech Type Name")
445
+ insert_btn = gr.Button("Insert Label", variant="secondary")
446
+ delete_btn = gr.Button("Delete Type", variant="stop")
447
+ with gr.Column(scale=3):
448
+ audio_input = gr.Audio(label="Reference Audio", type="filepath")
449
+ with gr.Column(scale=3):
450
+ ref_text_input = gr.Textbox(label="Reference Text", lines=4)
451
+ with gr.Row():
452
+ seed_input = gr.Slider(
453
+ show_label=False, minimum=-1, maximum=999, value=-1, step=1, info="Seed. -1 for random"
454
+ )
455
+ speed_input = gr.Slider(
456
+ show_label=False, minimum=0.3, maximum=2.0, value=1.0, step=0.1, info="Adjust the speed"
457
+ )
458
+ with gr.Column(scale=1, min_width=160):
459
+ ref_text_file_input = gr.File(label="Load Reference Text from File (.txt)", file_types=[".txt"])
460
+ speech_type_rows.append(row)
461
+ speech_type_names.append(name_input)
462
+ speech_type_audios.append(audio_input)
463
+ speech_type_ref_texts.append(ref_text_input)
464
+ speech_type_ref_text_files.append(ref_text_file_input)
465
+ speech_type_seeds.append(seed_input)
466
+ speech_type_speeds.append(speed_input)
467
+ speech_type_delete_btns.append(delete_btn)
468
+ speech_type_insert_btns.append(insert_btn)
469
+
470
+ # Global logic for all speech types
471
+ for i in range(max_speech_types):
472
+ speech_type_audios[i].clear(
473
+ lambda: [None, None],
474
+ None,
475
+ [speech_type_ref_texts[i], speech_type_ref_text_files[i]],
476
+ )
477
+ speech_type_ref_text_files[i].upload(
478
+ load_text_from_file,
479
+ inputs=[speech_type_ref_text_files[i]],
480
+ outputs=[speech_type_ref_texts[i]],
481
+ )
482
+
483
+ # Button to add speech type
484
+ add_speech_type_btn = gr.Button("Add Speech Type")
485
+
486
+ # Keep track of autoincrement of speech types, no roll back
487
+ speech_type_count = 1
488
+
489
+ # Function to add a speech type
490
def add_speech_type_fn():
    """Reveal the next hidden speech-type row.

    Returns one update per row: a no-op for every row except the one at the
    current ``speech_type_count``, which is made visible. The counter only ever
    grows; once it reaches ``max_speech_types`` a warning is shown and no row
    changes.
    """
    global speech_type_count

    updates = [gr.update() for _ in range(max_speech_types)]
    if speech_type_count >= max_speech_types:
        gr.Warning("Exhausted maximum number of speech types. Consider restart the app.")
        return updates

    updates[speech_type_count] = gr.update(visible=True)
    speech_type_count += 1
    return updates
499
+
500
+ add_speech_type_btn.click(add_speech_type_fn, outputs=speech_type_rows)
501
+
502
+ # Function to delete a speech type
503
def delete_speech_type_fn():
    """Hide a speech-type row and clear its four inputs.

    The five return values map, in order, to the row, its name, its audio, its
    reference text and its reference-text file.
    """
    hide_row = gr.update(visible=False)
    cleared = (None,) * 4
    return (hide_row, *cleared)
505
+
506
+ # Update delete button clicks and ref text file changes
507
+ for i in range(1, len(speech_type_delete_btns)):
508
+ speech_type_delete_btns[i].click(
509
+ delete_speech_type_fn,
510
+ outputs=[
511
+ speech_type_rows[i],
512
+ speech_type_names[i],
513
+ speech_type_audios[i],
514
+ speech_type_ref_texts[i],
515
+ speech_type_ref_text_files[i],
516
+ ],
517
+ )
518
+
519
+ # Text input for the prompt
520
+ with gr.Row():
521
+ gen_text_input_multistyle = gr.Textbox(
522
+ label="Text to Generate",
523
+ lines=10,
524
+ max_lines=40,
525
+ scale=4,
526
+ placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
527
+ )
528
+ gen_text_file_multistyle = gr.File(label="Load Text to Generate from File (.txt)", file_types=[".txt"], scale=1)
529
+
530
def make_insert_speech_type_fn(index):
    """Build the "Insert Label" callback for one speech-type row.

    ``index`` is accepted for the caller's convenience; the returned callback
    does not read it.
    """

    def insert_speech_type_fn(current_text, speech_type_name, speech_type_seed, speech_type_speed):
        """Append a JSON tag for this speech type, plus a trailing space, to the script."""
        existing = current_text if current_text else ""
        if speech_type_name:
            tag = json.dumps(
                {
                    "name": speech_type_name,
                    "seed": speech_type_seed,
                    "speed": speech_type_speed,
                }
            )
            return existing + tag + " "

        # Without a name there is nothing meaningful to insert
        gr.Warning("Please enter speech type name before insert.")
        return existing

    return insert_speech_type_fn
545
+
546
+ for i, insert_btn in enumerate(speech_type_insert_btns):
547
+ insert_fn = make_insert_speech_type_fn(i)
548
+ insert_btn.click(
549
+ insert_fn,
550
+ inputs=[gen_text_input_multistyle, speech_type_names[i], speech_type_seeds[i], speech_type_speeds[i]],
551
+ outputs=gen_text_input_multistyle,
552
+ )
553
+
554
+ with gr.Accordion("Advanced Settings", open=True):
555
+ with gr.Row():
556
+ with gr.Column():
557
+ show_cherrypick_multistyle = gr.Checkbox(
558
+ label="Show Cherry-pick Interface",
559
+ info="Turn on to show interface, picking seeds from previous generations.",
560
+ value=False,
561
+ )
562
+ with gr.Column():
563
+ remove_silence_multistyle = gr.Checkbox(
564
+ label="Remove Silences",
565
+ info="Turn on to automatically detect and crop long silences.",
566
+ value=True,
567
+ )
568
+
569
+ # Generate button
570
+ generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
571
+
572
+ # Output audio
573
+ audio_output_multistyle = gr.Audio(label="Synthesized Audio")
574
+
575
+ # Used seed gallery
576
+ cherrypick_interface_multistyle = gr.Textbox(
577
+ label="Cherry-pick Interface",
578
+ lines=10,
579
+ max_lines=40,
580
+ show_copy_button=True,
581
+ interactive=False,
582
+ visible=False,
583
+ )
584
+
585
+ # Logic control to show/hide the cherrypick interface
586
+ show_cherrypick_multistyle.change(
587
+ lambda is_visible: gr.update(visible=is_visible),
588
+ show_cherrypick_multistyle,
589
+ cherrypick_interface_multistyle,
590
+ )
591
+
592
+ # Function to load text to generate from file
593
+ gen_text_file_multistyle.upload(
594
+ load_text_from_file,
595
+ inputs=[gen_text_file_multistyle],
596
+ outputs=[gen_text_input_multistyle],
597
+ )
598
+
599
@gpu_decorator
def generate_multistyle_speech(
    gen_text,
    *args,
):
    """Synthesize a multi-style script segment by segment and join the audio.

    ``args`` is the flat list of component values wired up by the click
    handler: ``max_speech_types`` names, then as many reference audios, then as
    many reference texts, then the remove-silence flag.

    Returns a list matching the handler's outputs: the audio (or ``None``), one
    reference text per speech-type entry, and the inference metadata text (or
    ``None``).
    """
    speech_type_names_list = args[:max_speech_types]
    speech_type_audios_list = args[max_speech_types : 2 * max_speech_types]
    speech_type_ref_texts_list = args[2 * max_speech_types : 3 * max_speech_types]
    remove_silence = args[3 * max_speech_types]
    # Collect the speech types and their audios into a dict
    speech_types = OrderedDict()

    ref_text_idx = 0
    for name_input, audio_input, ref_text_input in zip(
        speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list
    ):
        if name_input and audio_input:
            speech_types[name_input] = {"audio": audio_input, "ref_text": ref_text_input}
        else:
            # Incomplete rows get a placeholder key so the dict still has one
            # entry per row for the ref-text outputs below.
            speech_types[f"@{ref_text_idx}@"] = {"audio": "", "ref_text": ""}
        ref_text_idx += 1

    # Parse the gen_text into segments
    segments = parse_speechtypes_text(gen_text)

    # For each segment, generate speech
    generated_audio_segments = []
    current_type_name = "Regular"
    inference_meta_data = ""

    for segment in segments:
        name = segment["name"]
        seed_input = segment["seed"]
        speed = segment["speed"]
        text = segment["text"]

        if name in speech_types:
            current_type_name = name
        else:
            gr.Warning(f"Type {name} is not available, will use Regular as default.")
            current_type_name = "Regular"

        try:
            ref_audio = speech_types[current_type_name]["audio"]
        except KeyError:
            # Reached when the fallback "Regular" entry itself is not defined
            gr.Warning(f"Please provide reference audio for type {current_type_name}.")
            return [None] + [speech_types[name]["ref_text"] for name in speech_types] + [None]
        ref_text = speech_types[current_type_name].get("ref_text", "")

        # -1 means "pick a random seed for this segment"
        if seed_input == -1:
            seed_input = np.random.randint(0, 2**31 - 1)

        # Generate or retrieve speech for this segment
        audio_out, _, ref_text_out, used_seed = infer(
            ref_audio,
            ref_text,
            text,
            tts_model_choice,
            remove_silence,
            seed=seed_input,
            cross_fade_duration=0,
            speed=speed,
            show_info=print,  # no pull to top when generating
        )
        sr, audio_data = audio_out

        generated_audio_segments.append(audio_data)
        # Keep the reference text returned by infer() for this type so later
        # segments and the UI reuse it.
        speech_types[current_type_name]["ref_text"] = ref_text_out
        # One line per segment, in the same JSON-tag format the parser accepts
        inference_meta_data += json.dumps(dict(name=name, seed=used_seed, speed=speed)) + f" {text}\n"

    # Concatenate all audio segments
    if generated_audio_segments:
        final_audio_data = np.concatenate(generated_audio_segments)
        return (
            [(sr, final_audio_data)]
            + [speech_types[name]["ref_text"] for name in speech_types]
            + [inference_meta_data]
        )
    else:
        gr.Warning("No audio generated.")
        return [None] + [speech_types[name]["ref_text"] for name in speech_types] + [None]
680
+
681
+ generate_multistyle_btn.click(
682
+ generate_multistyle_speech,
683
+ inputs=[
684
+ gen_text_input_multistyle,
685
+ ]
686
+ + speech_type_names
687
+ + speech_type_audios
688
+ + speech_type_ref_texts
689
+ + [
690
+ remove_silence_multistyle,
691
+ ],
692
+ outputs=[audio_output_multistyle] + speech_type_ref_texts + [cherrypick_interface_multistyle],
693
+ )
694
+
695
+ # Validation function to disable Generate button if speech types are missing
696
def validate_speech_types(gen_text, regular_name, *args):
    """Enable the Generate button only if every speech type used in the script is named.

    Args:
        gen_text: The script to check.
        regular_name: Name of the mandatory first speech type.
        *args: Names of the remaining speech types; empty ones are ignored.

    Returns:
        A Gradio update setting the button's ``interactive`` flag.
    """
    # Names the user has actually filled in
    available = {name for name in (regular_name, *args) if name}

    # Names referenced by the script
    requested = {segment["name"] for segment in parse_speechtypes_text(gen_text)}

    # Interactive only when no requested name is missing
    return gr.update(interactive=requested <= available)
720
+
721
+ gen_text_input_multistyle.change(
722
+ validate_speech_types,
723
+ inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
724
+ outputs=generate_multistyle_btn,
725
+ )
726
+
727
+
728
+ with gr.Blocks() as app_chat:
729
+ gr.Markdown(
730
+ """
731
+ # Voice Chat
732
+ Have a conversation with an AI using your reference voice!
733
+ 1. Upload a reference audio clip and optionally its transcript (via text or .txt file).
734
+ 2. Load the chat model.
735
+ 3. Record your message through your microphone or type it.
736
+ 4. The AI will respond using the reference voice.
737
+ """
738
+ )
739
+
740
+ chat_model_name_list = [
741
+ "Qwen/Qwen2.5-3B-Instruct",
742
+ "microsoft/Phi-4-mini-instruct",
743
+ ]
744
+
745
@gpu_decorator
def load_chat_model(chat_model_name):
    """Load a chat model and tokenizer into the module-level chat state.

    Any previously loaded model is dropped first. The two returned updates hide
    the load button and show the chat interface container.
    """
    global chat_model_state, chat_tokenizer_state
    notify = gr.Info

    if chat_model_state is not None:
        # Drop the old references before collecting; presumably this lets the
        # previous model's memory be reclaimed before the new one is loaded.
        chat_model_state = chat_tokenizer_state = None
        gc.collect()
        torch.cuda.empty_cache()

    notify(f"Loading chat model: {chat_model_name}")
    chat_model_state = AutoModelForCausalLM.from_pretrained(chat_model_name, torch_dtype="auto", device_map="auto")
    chat_tokenizer_state = AutoTokenizer.from_pretrained(chat_model_name)
    notify(f"Chat model {chat_model_name} loaded successfully!")

    hide_button = gr.update(visible=False)
    show_chat = gr.update(visible=True)
    return hide_button, show_chat
761
+
762
+ if USING_SPACES:
763
+ load_chat_model(chat_model_name_list[0])
764
+
765
+ chat_model_name_input = gr.Dropdown(
766
+ choices=chat_model_name_list,
767
+ value=chat_model_name_list[0],
768
+ label="Chat Model Name",
769
+ info="Enter the name of a HuggingFace chat model",
770
+ allow_custom_value=not USING_SPACES,
771
+ )
772
+ load_chat_model_btn = gr.Button("Load Chat Model", variant="primary", visible=not USING_SPACES)
773
+ chat_interface_container = gr.Column(visible=USING_SPACES)
774
+
775
+ chat_model_name_input.change(
776
+ lambda: gr.update(visible=True),
777
+ None,
778
+ load_chat_model_btn,
779
+ show_progress="hidden",
780
+ )
781
+ load_chat_model_btn.click(
782
+ load_chat_model, inputs=[chat_model_name_input], outputs=[load_chat_model_btn, chat_interface_container]
783
+ )
784
+
785
+ with chat_interface_container:
786
+ with gr.Row():
787
+ with gr.Column():
788
+ ref_audio_chat = gr.Audio(label="Reference Audio", type="filepath")
789
+ with gr.Column():
790
+ with gr.Accordion("Advanced Settings", open=False):
791
+ with gr.Row():
792
+ ref_text_chat = gr.Textbox(
793
+ label="Reference Text",
794
+ info="Optional: Leave blank to auto-transcribe",
795
+ lines=2,
796
+ scale=3,
797
+ )
798
+ ref_text_file_chat = gr.File(
799
+ label="Load Reference Text from File (.txt)", file_types=[".txt"], scale=1
800
+ )
801
+ with gr.Row():
802
+ randomize_seed_chat = gr.Checkbox(
803
+ label="Randomize Seed",
804
+ value=True,
805
+ info="Uncheck to use the seed specified.",
806
+ scale=3,
807
+ )
808
+ seed_input_chat = gr.Number(show_label=False, value=0, precision=0, scale=1)
809
+ remove_silence_chat = gr.Checkbox(
810
+ label="Remove Silences",
811
+ value=True,
812
+ )
813
+ system_prompt_chat = gr.Textbox(
814
+ label="System Prompt",
815
+ value="You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
816
+ lines=2,
817
+ )
818
+
819
+ chatbot_interface = gr.Chatbot(label="Conversation", type="messages")
820
+
821
+ with gr.Row():
822
+ with gr.Column():
823
+ audio_input_chat = gr.Microphone(
824
+ label="Speak your message",
825
+ type="filepath",
826
+ )
827
+ audio_output_chat = gr.Audio(autoplay=True)
828
+ with gr.Column():
829
+ text_input_chat = gr.Textbox(
830
+ label="Type your message",
831
+ lines=1,
832
+ )
833
+ send_btn_chat = gr.Button("Send Message")
834
+ clear_btn_chat = gr.Button("Clear Conversation")
835
+
836
+ # Modify process_audio_input to generate user input
837
@gpu_decorator
def process_audio_input(conv_state, audio_path, text):
    """Handle audio or text input from user.

    Args:
        conv_state: Conversation history; a user message is appended in place.
        audio_path: Path of a recorded message, or a falsy value when none.
        text: Typed message. May be ``None`` once the textbox has been reset,
            which is treated as an empty string.

    Returns:
        ``conv_state``, with the new user message appended when there is one.
    """
    # The event chain resets the textbox to None after each message, so guard
    # before calling str methods.
    text = text or ""

    if not audio_path and not text.strip():
        return conv_state

    if audio_path:
        # Second element of the result is used as the message text
        text = preprocess_ref_audio_text(audio_path, text)[1]
    if not text.strip():
        return conv_state

    conv_state.append({"role": "user", "content": text})
    return conv_state
851
+
852
+ # Use model and tokenizer from state to get text response
853
@gpu_decorator
def generate_text_response(conv_state, system_prompt):
    """Generate text response from AI.

    This step runs after every input event, including ones where no user
    message was added (empty submit). In that case the history is returned
    unchanged instead of producing an extra assistant reply.

    Args:
        conv_state: Conversation history as a list of ``{"role", "content"}`` dicts.
        system_prompt: Text of the system message put in front of the history.

    Returns:
        ``conv_state``, with the assistant reply appended when one was generated.
    """
    # Only answer when the latest message is a new user turn
    if not conv_state or conv_state[-1]["role"] != "user":
        return conv_state

    system_prompt_state = [{"role": "system", "content": system_prompt}]
    response = chat_model_inference(system_prompt_state + conv_state, chat_model_state, chat_tokenizer_state)

    conv_state.append({"role": "assistant", "content": response})
    return conv_state
862
+
863
@gpu_decorator
def generate_audio_response(conv_state, ref_audio, ref_text, remove_silence, randomize_seed, seed_input):
    """Generate TTS audio for the latest AI response.

    Returns ``(audio, ref_text, seed)``. ``audio`` is ``None`` when there is no
    history, no reference audio, or the last message is not a non-empty
    assistant reply; in those cases ``ref_text`` and ``seed_input`` are passed
    back unchanged.
    """
    nothing_to_speak = (None, ref_text, seed_input)
    if not (conv_state and ref_audio):
        return nothing_to_speak

    latest = conv_state[-1]
    reply_text = latest["content"]
    if not reply_text or latest["role"] != "assistant":
        return nothing_to_speak

    if randomize_seed:
        seed_input = np.random.randint(0, 2**31 - 1)

    speech, _, transcript, seed_used = infer(
        ref_audio,
        ref_text,
        reply_text,
        tts_model_choice,
        remove_silence,
        seed=seed_input,
        cross_fade_duration=0.15,
        speed=1.0,
        show_info=print,  # show_info=print no pull to top when generating
    )
    return speech, transcript, seed_used
888
+
889
def clear_conversation():
    """Return the values that reset the chat: an empty history and no audio."""
    fresh_history = []
    cleared_audio = None
    return fresh_history, cleared_audio
892
+
893
+ ref_text_file_chat.upload(
894
+ load_text_from_file,
895
+ inputs=[ref_text_file_chat],
896
+ outputs=[ref_text_chat],
897
+ )
898
+
899
+ for user_operation in [audio_input_chat.stop_recording, text_input_chat.submit, send_btn_chat.click]:
900
+ user_operation(
901
+ process_audio_input,
902
+ inputs=[chatbot_interface, audio_input_chat, text_input_chat],
903
+ outputs=[chatbot_interface],
904
+ ).then(
905
+ generate_text_response,
906
+ inputs=[chatbot_interface, system_prompt_chat],
907
+ outputs=[chatbot_interface],
908
+ ).then(
909
+ generate_audio_response,
910
+ inputs=[
911
+ chatbot_interface,
912
+ ref_audio_chat,
913
+ ref_text_chat,
914
+ remove_silence_chat,
915
+ randomize_seed_chat,
916
+ seed_input_chat,
917
+ ],
918
+ outputs=[audio_output_chat, ref_text_chat, seed_input_chat],
919
+ ).then(
920
+ lambda: [None, None],
921
+ None,
922
+ [audio_input_chat, text_input_chat],
923
+ )
924
+
925
+ # Handle clear button or system prompt change and reset conversation
926
+ for user_operation in [clear_btn_chat.click, system_prompt_chat.change, chatbot_interface.clear]:
927
+ user_operation(
928
+ clear_conversation,
929
+ outputs=[chatbot_interface, audio_output_chat],
930
+ )
931
+
932
+
933
# "Credits" tab: a single static Markdown panel listing contributors.
with gr.Blocks() as app_credits:
    gr.Markdown("""
# Credits

* [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
* [RootingInLoad](https://github.com/RootingInLoad) for initial chunk generation and podcast app exploration
* [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation & voice chat
""")
941
+
942
+
943
+ with gr.Blocks() as app:
944
+ gr.Markdown(
945
+ f"""
946
+ # E2/F5 TTS
947
+
948
+ This is {"a local web UI for [F5 TTS](https://github.com/SWivid/F5-TTS)" if not USING_SPACES else "an online demo for [F5-TTS](https://github.com/SWivid/F5-TTS)"} with advanced batch processing support. This app supports the following TTS models:
949
+
950
+ * [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
951
+ * [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS)
952
+
953
+ The checkpoints currently support English and Chinese.
954
+
955
+ If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 12s with ✂ in the bottom right corner (otherwise might have non-optimal auto-trimmed result).
956
+
957
+ **NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<12s). Ensure the audio is fully uploaded before generating.**
958
+ """
959
+ )
960
+
961
+ last_used_custom = files("f5_tts").joinpath("infer/.cache/last_used_custom_model_info_v1.txt")
962
+
963
def load_last_used_custom():
    """Return the last used custom model settings as three strings.

    The cache file ``last_used_custom`` stores one value per line in the order
    checkpoint path, vocab path, model config.

    Returns:
        A three-item sequence ``[ckpt_path, vocab_path, model_cfg]``. Falls back
        to ``DEFAULT_TTS_MODEL_CFG`` when the cache file does not exist (its
        parent directory is created so a later save can succeed) or when it
        holds fewer than three lines.
    """
    try:
        with open(last_used_custom, "r", encoding="utf-8") as f:
            custom = [line.strip() for line in f]
    except FileNotFoundError:
        last_used_custom.parent.mkdir(parents=True, exist_ok=True)
        return DEFAULT_TTS_MODEL_CFG

    # Callers unpack or index exactly three entries; a truncated or hand-edited
    # file must not crash the UI, so use the defaults instead.
    if len(custom) < 3:
        return DEFAULT_TTS_MODEL_CFG
    return custom[:3]
973
+
974
def switch_tts_model(new_choice):
    """Record the selected TTS model and show or hide the custom-model fields.

    Updates the module-level ``tts_model_choice`` and returns three
    ``gr.update`` objects, one per custom field (checkpoint, vocab, config).
    """
    global tts_model_choice

    if new_choice != "Custom":
        tts_model_choice = new_choice
        return tuple(gr.update(visible=False) for _ in range(3))

    # override in case webpage is refreshed
    saved_ckpt, saved_vocab, saved_cfg = load_last_used_custom()
    tts_model_choice = ("Custom", saved_ckpt, saved_vocab, saved_cfg)
    return tuple(gr.update(visible=True, value=saved) for saved in (saved_ckpt, saved_vocab, saved_cfg))
987
+
988
def set_custom_model(custom_ckpt_path, custom_vocab_path, custom_model_cfg):
    """Select the given custom model and persist its settings, one per line."""
    global tts_model_choice

    settings = (custom_ckpt_path, custom_vocab_path, custom_model_cfg)
    tts_model_choice = ("Custom", *settings)
    with open(last_used_custom, "w", encoding="utf-8") as f:
        f.write("\n".join(settings) + "\n")
993
+
994
# Read the persisted custom-model settings once so all three dropdowns start
# from the same snapshot instead of re-reading the cache file per field.
last_used_settings = load_last_used_custom()

with gr.Row():
    if not USING_SPACES:
        choose_tts_model = gr.Radio(
            choices=[DEFAULT_TTS_MODEL, "E2-TTS", "Custom"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
        )
    else:
        # No "Custom" choice is offered when USING_SPACES is set
        choose_tts_model = gr.Radio(
            choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
        )
    custom_ckpt_path = gr.Dropdown(
        choices=[DEFAULT_TTS_MODEL_CFG[0]],
        value=last_used_settings[0],
        allow_custom_value=True,
        label="Model: local_path | hf://user_id/repo_id/model_ckpt",
        visible=False,
    )
    custom_vocab_path = gr.Dropdown(
        choices=[DEFAULT_TTS_MODEL_CFG[1]],
        value=last_used_settings[1],
        allow_custom_value=True,
        label="Vocab: local_path | hf://user_id/repo_id/vocab_file",
        visible=False,
    )
    custom_model_cfg = gr.Dropdown(
        choices=[
            DEFAULT_TTS_MODEL_CFG[2],
            json.dumps(
                dict(
                    dim=1024,
                    depth=22,
                    heads=16,
                    ff_mult=2,
                    text_dim=512,
                    text_mask_padding=False,
                    conv_layers=4,
                    pe_attn_head=1,
                )
            ),
            json.dumps(
                dict(
                    dim=768,
                    depth=18,
                    heads=12,
                    ff_mult=2,
                    text_dim=512,
                    text_mask_padding=False,
                    conv_layers=4,
                    pe_attn_head=1,
                )
            ),
        ],
        value=last_used_settings[2],
        allow_custom_value=True,
        label="Config: in a dictionary form",
        visible=False,
    )

# Switching the model toggles visibility of the three custom fields
choose_tts_model.change(
    switch_tts_model,
    inputs=[choose_tts_model],
    outputs=[custom_ckpt_path, custom_vocab_path, custom_model_cfg],
    show_progress="hidden",
)
# Editing any custom field re-applies and persists all three values
for custom_field in (custom_ckpt_path, custom_vocab_path, custom_model_cfg):
    custom_field.change(
        set_custom_model,
        inputs=[custom_ckpt_path, custom_vocab_path, custom_model_cfg],
        show_progress="hidden",
    )

gr.TabbedInterface(
    [app_tts, app_multistyle, app_chat, app_credits],
    ["Basic-TTS", "Multi-Speech", "Voice-Chat", "Credits"],
)
1077
+
1078
+
1079
@click.command()
@click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
@click.option("--host", "-H", default=None, help="Host to run the app on")
@click.option(
    "--share",
    "-s",
    default=False,
    is_flag=True,
    help="Share the app via Gradio share link",
)
# Paired on/off switch: a lone flag whose default is already True offers no
# clear way to turn API access off, so expose an explicit --no-api.
@click.option("--api/--no-api", "-a", default=True, help="Allow API access")
@click.option(
    "--root_path",
    "-r",
    default=None,
    type=str,
    help='The root path (or "mount point") of the application, if it\'s not served from the root ("/") of the domain. Often used when the application is behind a reverse proxy that forwards requests to the application, e.g. set "/myapp" or full URL for application served at "https://example.com/myapp".',
)
@click.option(
    "--inbrowser",
    "-i",
    is_flag=True,
    default=False,
    help="Automatically launch the interface in the default web browser",
)
def main(port, host, share, api, root_path, inbrowser):
    """Launch the Gradio app with the options given on the command line.

    Args:
        port: Server port, or ``None`` to leave the choice to Gradio.
        host: Server host name, or ``None`` to leave the choice to Gradio.
        share: Whether to create a Gradio share link.
        api: Whether API access is allowed; forwarded to both the queue
            (``api_open``) and the launcher (``show_api``).
        root_path: Mount point when served behind a reverse proxy.
        inbrowser: Whether to open the UI in the default web browser.
    """
    print("Starting app...")
    app.queue(api_open=api).launch(
        server_name=host,
        server_port=port,
        share=share,
        show_api=api,
        root_path=root_path,
        inbrowser=inbrowser,
    )
1115
+
1116
+
1117
# Script entry point: with USING_SPACES set the app launches with default
# options; otherwise the click command parses the command line.
if __name__ == "__main__":
    if USING_SPACES:
        app.queue().launch()
    else:
        main()
F5-TTS/src/f5_tts/infer/speech_edit.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+
4
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # for MPS device compatibility
5
+
6
+ from importlib.resources import files
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ import torchaudio
11
+ from cached_path import cached_path
12
+ from hydra.utils import get_class
13
+ from omegaconf import OmegaConf
14
+
15
+ from f5_tts.infer.utils_infer import load_checkpoint, load_vocoder, save_spectrogram
16
+ from f5_tts.model import CFM
17
+ from f5_tts.model.utils import convert_char_to_pinyin, get_tokenizer
18
+
19
+
20
# Pick the compute device in order of preference: CUDA, XPU, MPS, then CPU.
# torch.xpu is missing from older PyTorch builds, so probe for it before use.
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    device = "xpu"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
29
+
30
+
31
# ---------------------- infer setting ---------------------- #

seed = None  # int | None; forwarded to model.sample(seed=...)

exp_name = "F5TTS_v1_Base"  # F5TTS_v1_Base | E2TTS_Base; selects configs/<exp_name>.yaml and the checkpoint folder
ckpt_step = 1250000  # checkpoint file is model_<ckpt_step>.safetensors

nfe_step = 32  # 16, 32; passed to model.sample as `steps`
cfg_strength = 2.0
ode_method = "euler"  # euler | midpoint
sway_sampling_coef = -1.0
speed = 1.0  # not referenced anywhere else in this script
target_rms = 0.1  # input quieter than this RMS is scaled up to it


# Model architecture, tokenizer and mel-spectrogram parameters all come from the experiment's YAML config
model_cfg = OmegaConf.load(str(files("f5_tts").joinpath(f"configs/{exp_name}.yaml")))
model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
model_arc = model_cfg.model.arch

dataset_name = model_cfg.datasets.name
tokenizer = model_cfg.model.tokenizer

mel_spec_type = model_cfg.model.mel_spec.mel_spec_type
target_sample_rate = model_cfg.model.mel_spec.target_sample_rate
n_mel_channels = model_cfg.model.mel_spec.n_mel_channels
hop_length = model_cfg.model.mel_spec.hop_length
win_length = model_cfg.model.mel_spec.win_length
n_fft = model_cfg.model.mel_spec.n_fft


# Alternative: load the checkpoint from a local ckpts/ folder
# ckpt_path = str(files("f5_tts").joinpath("../../")) + f"/ckpts/{exp_name}/model_{ckpt_step}.safetensors"
ckpt_path = str(cached_path(f"hf://SWivid/F5-TTS/{exp_name}/model_{ckpt_step}.safetensors"))
output_dir = "tests"  # the result spectrogram and wav are written here


# How to obtain the timestamps for parts_to_edit below:
# [leverage https://github.com/MahmoudAshraf97/ctc-forced-aligner to get char level alignment]
# pip install git+https://github.com/MahmoudAshraf97/ctc-forced-aligner.git
# [write the origin_text into a file, e.g. tests/test_edit.txt]
# ctc-forced-aligner --audio_path "src/f5_tts/infer/examples/basic/basic_ref_en.wav" --text_path "tests/test_edit.txt" --language "zho" --romanize --split_size "char"
# [result will be saved at same path of audio file]
# [--language "zho" for Chinese, "eng" for English]
# [if local ckpt, set --alignment_model "../checkpoints/mms-300m-1130-forced-aligner"]

audio_to_edit = str(files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav"))
origin_text = "Some call me nature, others call me mother nature."
target_text = "Some call me optimist, others call me realist."
parts_to_edit = [
    [1.42, 2.44],
    [4.04, 4.9],
]  # start/end pairs of "nature" & "mother nature", in seconds
fix_duration = [
    1.2,
    1,
]  # fix duration for "optimist" & "realist", in seconds; one entry per part, or None to keep each part's original length

# Chinese example (commented out); swap it in for the English settings above
# audio_to_edit = "src/f5_tts/infer/examples/basic/basic_ref_zh.wav"
# origin_text = "对,这就是我,万人敬仰的太乙真人。"
# target_text = "对,那就是你,万人敬仰的太白金星。"
# parts_to_edit = [[0.84, 1.4], [1.92, 2.4], [4.26, 6.26], ]
# fix_duration = None # use origin text duration
91
+
92
+
93
+ # -------------------------------------------------#
94
+
95
use_ema = True

# exist_ok makes this safe if the directory already exists or is created concurrently
os.makedirs(output_dir, exist_ok=True)

# Vocoder model
local = False
if mel_spec_type == "vocos":
    vocoder_local_path = "../checkpoints/charactr/vocos-mel-24khz"
elif mel_spec_type == "bigvgan":
    vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
else:
    # Fail with a clear message instead of an undefined vocoder_local_path below
    raise ValueError(f"Unsupported mel_spec_type: {mel_spec_type!r} (expected 'vocos' or 'bigvgan')")
vocoder = load_vocoder(vocoder_name=mel_spec_type, is_local=local, local_path=vocoder_local_path)

# Tokenizer
vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)

# Model
model = CFM(
    transformer=model_cls(**model_arc, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
    mel_spec_kwargs=dict(
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        n_mel_channels=n_mel_channels,
        target_sample_rate=target_sample_rate,
        mel_spec_type=mel_spec_type,
    ),
    odeint_kwargs=dict(
        method=ode_method,
    ),
    vocab_char_map=vocab_char_map,
).to(device)

# bigvgan is pinned to float32; otherwise load_checkpoint picks the dtype itself
dtype = torch.float32 if mel_spec_type == "bigvgan" else None
model = load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)
130
+
131
# Audio
# Load the clip to edit and down-mix multi-channel audio to mono by averaging.
audio, sr = torchaudio.load(audio_to_edit)
if audio.shape[0] > 1:
    audio = torch.mean(audio, dim=0, keepdim=True)
# Only clips quieter than target_rms are boosted; the inverse scaling is applied
# to the generated waveform at the end of the script.
rms = torch.sqrt(torch.mean(torch.square(audio)))
if rms < target_rms:
    audio = audio * target_rms / rms
if sr != target_sample_rate:
    resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
    audio = resampler(audio)
# Rebuild the waveform span by span: untouched audio is copied over, and each
# edited part is replaced by zeros of the requested length. edit_mask is built
# in parallel at mel-frame resolution (samples / hop_length): True for copied
# frames, False for replaced ones (presumably the frames the model regenerates).
offset = 0  # end of the previous edited part, in samples
audio_ = torch.zeros(1, 0)
edit_mask = torch.zeros(1, 0, dtype=torch.bool)
for part in parts_to_edit:
    start, end = part
    # NOTE: pop(0) consumes fix_duration in place, one entry per edited part
    part_dur = end - start if fix_duration is None else fix_duration.pop(0)
    # seconds -> samples
    part_dur = part_dur * target_sample_rate
    start = start * target_sample_rate
    audio_ = torch.cat((audio_, audio[:, round(offset) : round(start)], torch.zeros(1, round(part_dur))), dim=-1)
    edit_mask = torch.cat(
        (
            edit_mask,
            torch.ones(1, round((start - offset) / hop_length), dtype=torch.bool),
            torch.zeros(1, round(part_dur / hop_length), dtype=torch.bool),
        ),
        dim=-1,
    )
    offset = end * target_sample_rate
# Append whatever follows the last edited part
audio = torch.cat((audio_, audio[:, round(offset) :]), dim=-1)
# Pad the mask with True (keep) up to audio.shape[-1] // hop_length + 1 frames
edit_mask = F.pad(edit_mask, (0, audio.shape[-1] // hop_length - edit_mask.shape[-1] + 1), value=True)
audio = audio.to(device)
edit_mask = edit_mask.to(device)
163
+
164
# Text
# text_list is the batch: a list holding the single target sentence.
text_list = [target_text]
if tokenizer == "pinyin":
    final_text_list = convert_char_to_pinyin(text_list)
else:
    # The batch is already a list of strings; wrapping it again would hand the
    # model one item that is itself a list instead of one sentence.
    final_text_list = text_list
print(f"text : {text_list}")
print(f"pinyin: {final_text_list}")
172
+
173
# Duration
# No reference prefix is cut from the output; the whole clip length (in mel frames) is generated.
ref_audio_len = 0
duration = audio.shape[-1] // hop_length

# Inference
with torch.inference_mode():
    generated, trajectory = model.sample(
        cond=audio,
        text=final_text_list,
        duration=duration,
        steps=nfe_step,
        cfg_strength=cfg_strength,
        sway_sampling_coef=sway_sampling_coef,
        seed=seed,
        edit_mask=edit_mask,
    )
    print(f"Generated mel: {generated.shape}")

    # Final result
    generated = generated.to(torch.float32)[:, ref_audio_len:, :]
    gen_mel_spec = generated.permute(0, 2, 1)
    # Decode the mel spectrogram with whichever vocoder was loaded
    if mel_spec_type == "vocos":
        generated_wave = vocoder.decode(gen_mel_spec).cpu()
    elif mel_spec_type == "bigvgan":
        generated_wave = vocoder(gen_mel_spec).squeeze(0).cpu()

    # Undo the loudness boost applied to the input clip
    if rms < target_rms:
        generated_wave = generated_wave * rms / target_rms

    save_spectrogram(gen_mel_spec[0].cpu().numpy(), f"{output_dir}/speech_edit_out.png")
    torchaudio.save(f"{output_dir}/speech_edit_out.wav", generated_wave, target_sample_rate)
    print(f"Generated wav: {generated_wave.shape}")
F5-TTS/src/f5_tts/infer/utils_infer.py ADDED
@@ -0,0 +1,605 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A unified script for inference process
2
+ # Make adjustments inside functions, and consider both gradio and cli scripts if need to change func output format
3
+ import os
4
+ import sys
5
+ from concurrent.futures import ThreadPoolExecutor
6
+
7
+
8
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # for MPS device compatibility
9
+ sys.path.append(f"{os.path.dirname(os.path.abspath(__file__))}/../../third_party/BigVGAN/")
10
+
11
+ import hashlib
12
+ import re
13
+ import tempfile
14
+ from importlib.resources import files
15
+
16
+ import matplotlib
17
+
18
+
19
+ matplotlib.use("Agg")
20
+
21
+ import matplotlib.pylab as plt
22
+ import numpy as np
23
+ import torch
24
+ import torchaudio
25
+ import tqdm
26
+ from huggingface_hub import hf_hub_download
27
+ from pydub import AudioSegment, silence
28
+ from transformers import pipeline
29
+ from vocos import Vocos
30
+
31
+ from f5_tts.model import CFM
32
+ from f5_tts.model.utils import convert_char_to_pinyin, get_tokenizer
33
+
34
+
35
+ _ref_audio_cache = {}
36
+ _ref_text_cache = {}
37
+
38
# Pick the compute device in order of preference: CUDA, XPU, MPS, then CPU.
# torch.xpu is missing from older PyTorch builds, so probe for it before use.
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    device = "xpu"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
47
+
48
+ tempfile_kwargs = {"delete_on_close": False} if sys.version_info >= (3, 12) else {"delete": False}
49
+
50
+ # -----------------------------------------
51
+
52
# Module-level defaults; several are used as default argument values by the
# loader and inference functions below.
target_sample_rate = 24000  # reference audio is resampled to this rate before inference
n_mel_channels = 100
hop_length = 256  # samples per mel frame; converts sample counts to frame counts
win_length = 1024
n_fft = 1024
mel_spec_type = "vocos"  # default vocoder family; "bigvgan" is the other value handled in this module
target_rms = 0.1  # reference audio quieter than this RMS is scaled up to it
cross_fade_duration = 0.15
ode_method = "euler"
nfe_step = 32  # 16, 32
cfg_strength = 2.0
sway_sampling_coef = -1.0
speed = 1.0
fix_duration = None  # when None, the output duration is estimated from the text lengths
66
+
67
+ # -----------------------------------------
68
+
69
+
70
+ # chunk text into smaller pieces
71
+
72
+
73
def chunk_text(text, max_chars=135):
    """
    Splits the input text into chunks, each with a maximum number of characters.

    Text is first cut into sentence-like pieces, which are then packed greedily
    into chunks. Sizes are measured in UTF-8 bytes, so multi-byte (e.g. CJK)
    characters count for more than one. A single piece longer than
    ``max_chars`` is kept whole as its own chunk.

    Args:
        text (str): The text to be split.
        max_chars (int): The maximum number of characters per chunk.

    Returns:
        List[str]: A list of text chunks.
    """
    chunks = []
    current_chunk = ""
    # Split after ASCII punctuation only when whitespace follows (so "3.14" or
    # "a,b" stay intact), and directly after full-width CJK punctuation, which
    # is not followed by a space. The CJK marks are written as \u escapes so the
    # pattern survives Unicode normalisation of this source file.
    sentences = re.split(r"(?<=[;:,.!?])\s+|(?<=[\uff1b\uff1a\uff0c\u3002\uff01\uff1f])", text)

    for sentence in sentences:
        # The split consumed the whitespace after single-byte punctuation; put a
        # space back unless the piece ends in a multi-byte character.
        if sentence and len(sentence[-1].encode("utf-8")) == 1:
            piece = sentence + " "
        else:
            piece = sentence

        if len(current_chunk.encode("utf-8")) + len(sentence.encode("utf-8")) <= max_chars:
            current_chunk += piece
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = piece

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
101
+
102
+
103
+ # load vocoder
104
def load_vocoder(vocoder_name="vocos", is_local=False, local_path="", device=device, hf_cache_dir=None):
    """Load a pretrained vocoder, put it in eval mode and move it to ``device``.

    Args:
        vocoder_name: ``"vocos"`` or ``"bigvgan"``.
        is_local: Load from ``local_path`` instead of downloading from the
            Hugging Face hub.
        local_path: Directory holding the vocoder files; used only when
            ``is_local`` is true.
        device: Device the vocoder is moved to.
        hf_cache_dir: Cache directory forwarded to the hub download calls.

    Returns:
        The loaded vocoder module.

    Raises:
        ImportError: ``"bigvgan"`` was requested but the BigVGAN submodule
            cannot be imported.
        ValueError: ``vocoder_name`` is not one of the supported names.
    """
    if vocoder_name == "vocos":
        # vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(device)
        if is_local:
            print(f"Load vocos from local path {local_path}")
            config_path = f"{local_path}/config.yaml"
            model_path = f"{local_path}/pytorch_model.bin"
        else:
            print("Download Vocos from huggingface charactr/vocos-mel-24khz")
            repo_id = "charactr/vocos-mel-24khz"
            config_path = hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename="config.yaml")
            model_path = hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename="pytorch_model.bin")
        vocoder = Vocos.from_hparams(config_path)
        state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
        from vocos.feature_extractors import EncodecFeatures

        if isinstance(vocoder.feature_extractor, EncodecFeatures):
            # Fill in the Encodec weights from the freshly built extractor
            # (presumably the checkpoint omits them) so load_state_dict finds every key.
            encodec_parameters = {
                "feature_extractor.encodec." + key: value
                for key, value in vocoder.feature_extractor.encodec.state_dict().items()
            }
            state_dict.update(encodec_parameters)
        vocoder.load_state_dict(state_dict)
        vocoder = vocoder.eval().to(device)
    elif vocoder_name == "bigvgan":
        try:
            from third_party.BigVGAN import bigvgan
        except ImportError as err:
            # Stop here with the setup hint rather than continuing with an unbound name
            raise ImportError(
                "You need to follow the README to init submodule and change the BigVGAN source code."
            ) from err
        if is_local:
            # download generator from https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x/tree/main
            vocoder = bigvgan.BigVGAN.from_pretrained(local_path, use_cuda_kernel=False)
        else:
            vocoder = bigvgan.BigVGAN.from_pretrained(
                "nvidia/bigvgan_v2_24khz_100band_256x", use_cuda_kernel=False, cache_dir=hf_cache_dir
            )

        vocoder.remove_weight_norm()
        vocoder = vocoder.eval().to(device)
    else:
        raise ValueError(f"Unsupported vocoder_name: {vocoder_name!r} (expected 'vocos' or 'bigvgan')")
    return vocoder
144
+
145
+
146
+ # load asr pipeline
147
+
148
+ asr_pipe = None
149
+
150
+
151
def initialize_asr_pipeline(device: str = device, dtype=None):
    """Create the Whisper speech-recognition pipeline and store it in ``asr_pipe``.

    Args:
        device: Device the pipeline runs on.
        dtype: Torch dtype for the model. When ``None``, float16 is chosen on a
            CUDA device with compute capability major >= 7 whose name does not
            end in ``[ZLUDA]``; float32 otherwise.
    """
    global asr_pipe

    if dtype is None:
        half_precision_ok = (
            "cuda" in device
            and torch.cuda.get_device_properties(device).major >= 7
            and not torch.cuda.get_device_name().endswith("[ZLUDA]")
        )
        dtype = torch.float16 if half_precision_ok else torch.float32

    asr_pipe = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-large-v3-turbo",
        torch_dtype=dtype,
        device=device,
    )
167
+
168
+
169
+ # transcribe
170
+
171
+
172
def transcribe(ref_audio, language=None):
    """Transcribe ``ref_audio`` with the shared ASR pipeline and return the stripped text.

    The pipeline is created on first use. ``language`` is forwarded to the
    generator only when it is truthy.
    """
    global asr_pipe
    if asr_pipe is None:
        initialize_asr_pipeline(device=device)

    generate_kwargs = {"task": "transcribe"}
    if language:
        generate_kwargs["language"] = language

    result = asr_pipe(
        ref_audio,
        chunk_length_s=30,
        batch_size=128,
        generate_kwargs=generate_kwargs,
        return_timestamps=False,
    )
    return result["text"].strip()
183
+
184
+
185
+ # load model checkpoint for inference
186
+
187
+
188
def load_checkpoint(model, ckpt_path, device: str, dtype=None, use_ema=True):
    """Load weights from ``ckpt_path`` into ``model`` and return it on ``device``.

    Args:
        model: Module to receive the weights; it is first cast to ``dtype``.
        ckpt_path: Checkpoint file. A ``.safetensors`` file is read as a flat
            state dict; any other extension is read with ``torch.load``.
        device: Device used for loading and for the returned model.
        dtype: Target dtype. When ``None``, float16 is chosen on a CUDA device
            with compute capability major >= 7 whose name does not end in
            ``[ZLUDA]``; float32 otherwise.
        use_ema: Load the EMA weights (``ema_model_state_dict``) instead of
            ``model_state_dict``.

    Returns:
        The model with weights loaded, moved to ``device``.
    """
    if dtype is None:
        half_precision_ok = (
            "cuda" in device
            and torch.cuda.get_device_properties(device).major >= 7
            and not torch.cuda.get_device_name().endswith("[ZLUDA]")
        )
        dtype = torch.float16 if half_precision_ok else torch.float32
    model = model.to(dtype)

    is_safetensors = ckpt_path.split(".")[-1] == "safetensors"
    if is_safetensors:
        from safetensors.torch import load_file

        # A safetensors file is a bare state dict; wrap it under the key read below
        wrapper_key = "ema_model_state_dict" if use_ema else "model_state_dict"
        checkpoint = {wrapper_key: load_file(ckpt_path, device=device)}
    else:
        checkpoint = torch.load(ckpt_path, map_location=device, weights_only=True)

    if use_ema:
        # Strip the EMA wrapper prefix and drop its bookkeeping entries
        checkpoint["model_state_dict"] = {
            name.replace("ema_model.", ""): tensor
            for name, tensor in checkpoint["ema_model_state_dict"].items()
            if name not in ("initted", "step")
        }

        # patch for backward compatibility, 305e3ea
        for stale_key in ("mel_spec.mel_stft.mel_scale.fb", "mel_spec.mel_stft.spectrogram.window"):
            checkpoint["model_state_dict"].pop(stale_key, None)

    model.load_state_dict(checkpoint["model_state_dict"])

    del checkpoint
    torch.cuda.empty_cache()

    return model.to(device)
231
+
232
+
233
+ # load model for inference
234
+
235
+
236
def load_model(
    model_cls,
    model_cfg,
    ckpt_path,
    mel_spec_type=mel_spec_type,
    vocab_file="",
    ode_method=ode_method,
    use_ema=True,
    device=device,
):
    """Build a CFM model around ``model_cls`` and load its checkpoint.

    Args:
        model_cls: Backbone class, instantiated with ``**model_cfg``.
        model_cfg: Keyword arguments for the backbone.
        ckpt_path: Checkpoint file passed to ``load_checkpoint``.
        mel_spec_type: Mel-spectrogram / vocoder type; ``"bigvgan"`` forces float32.
        vocab_file: Vocabulary file; an empty string selects the bundled
            ``infer/examples/vocab.txt``.
        ode_method: ODE solver name passed through ``odeint_kwargs``.
        use_ema: Whether to load the EMA weights.
        device: Device for the model.

    Returns:
        The model with weights loaded.
    """
    if vocab_file == "":
        vocab_file = str(files("f5_tts").joinpath("infer/examples/vocab.txt"))
    tokenizer = "custom"

    print("\nvocab : ", vocab_file)
    print("token : ", tokenizer)
    print("model : ", ckpt_path, "\n")

    vocab_char_map, vocab_size = get_tokenizer(vocab_file, tokenizer)

    backbone = model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels)
    mel_settings = dict(
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        n_mel_channels=n_mel_channels,
        target_sample_rate=target_sample_rate,
        mel_spec_type=mel_spec_type,
    )
    model = CFM(
        transformer=backbone,
        mel_spec_kwargs=mel_settings,
        odeint_kwargs=dict(method=ode_method),
        vocab_char_map=vocab_char_map,
    ).to(device)

    # bigvgan is pinned to float32; otherwise load_checkpoint picks the dtype
    load_dtype = torch.float32 if mel_spec_type == "bigvgan" else None
    return load_checkpoint(model, ckpt_path, device, dtype=load_dtype, use_ema=use_ema)
275
+
276
+
277
def remove_silence_edges(audio, silence_threshold=-42):
    """Trim leading and trailing silence from ``audio``.

    Args:
        audio: Audio segment to trim (the caller in this file passes a pydub
            ``AudioSegment``).
        silence_threshold: Level in dBFS; material at or below it counts as silence.

    Returns:
        The segment with silent edges removed.
    """
    # Remove silence from the start
    non_silent_start_idx = silence.detect_leading_silence(audio, silence_threshold=silence_threshold)
    audio = audio[non_silent_start_idx:]

    # Remove silence from the end
    # Walk backwards until something louder than the threshold is found,
    # shortening the kept duration by 0.001 s per step
    # (presumably each iterated item is a 1 ms slice - TODO confirm).
    non_silent_end_duration = audio.duration_seconds
    for ms in reversed(audio):
        if ms.dBFS > silence_threshold:
            break
        non_silent_end_duration -= 0.001
    # Slice bound is in milliseconds
    trimmed_audio = audio[: int(non_silent_end_duration * 1000)]

    return trimmed_audio
291
+
292
+
293
+ # preprocess reference audio and text
294
+
295
+
296
def preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=print):
    """Prepare the reference audio and reference text for inference.

    The audio is clipped to at most about 12 s (preferring cuts at silences),
    stripped of silent edges, and exported to a temporary WAV file. Results are
    cached by the MD5 of the original file's bytes. A blank ``ref_text`` is
    replaced by a (cached) transcription of the processed audio, and the text
    is made to end with ``". "`` unless it already ends with ``"。"``.

    Args:
        ref_audio_orig: Path of the original reference audio file.
        ref_text: Reference transcript; blank to transcribe automatically.
        show_info: Callable used for progress messages.

    Returns:
        ``(ref_audio, ref_text)``: path of the processed WAV file and the final text.
    """
    global _ref_audio_cache, _ref_text_cache

    def _join_until_long_enough(segments, clip_message):
        # Concatenate segments; stop once past 6 s if the next one would exceed 12 s
        joined = AudioSegment.silent(duration=0)
        for segment in segments:
            if len(joined) > 6000 and len(joined + segment) > 12000:
                show_info(clip_message)
                break
            joined += segment
        return joined

    show_info("Converting audio...")

    # Compute a hash of the reference audio file
    with open(ref_audio_orig, "rb") as audio_file:
        audio_hash = hashlib.md5(audio_file.read()).hexdigest()

    if audio_hash in _ref_audio_cache:
        show_info("Using cached preprocessed reference audio...")
        ref_audio = _ref_audio_cache[audio_hash]
    else:  # first pass, do preprocess
        # Only the generated file name is needed; the audio is exported to it below
        with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
            temp_path = f.name

        aseg = AudioSegment.from_file(ref_audio_orig)

        # 1. try to find long silence for clipping
        long_pause_segs = silence.split_on_silence(
            aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
        )
        non_silent_wave = _join_until_long_enough(long_pause_segs, "Audio is over 12s, clipping short. (1)")

        # 2. try to find short silence for clipping if 1. failed
        if len(non_silent_wave) > 12000:
            short_pause_segs = silence.split_on_silence(
                aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
            )
            non_silent_wave = _join_until_long_enough(short_pause_segs, "Audio is over 12s, clipping short. (2)")

        aseg = non_silent_wave

        # 3. if no proper silence found for clipping
        if len(aseg) > 12000:
            aseg = aseg[:12000]
            show_info("Audio is over 12s, clipping short. (3)")

        aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
        aseg.export(temp_path, format="wav")
        ref_audio = temp_path

        # Cache the processed reference audio
        _ref_audio_cache[audio_hash] = ref_audio

    if ref_text.strip():
        show_info("Using custom reference text...")
    elif audio_hash in _ref_text_cache:
        # Use cached asr transcription
        show_info("Using cached reference text...")
        ref_text = _ref_text_cache[audio_hash]
    else:
        show_info("No reference text provided, transcribing reference audio...")
        ref_text = transcribe(ref_audio)
        # Cache the transcribed text (not caching custom ref_text, enabling users to do manual tweak)
        _ref_text_cache[audio_hash] = ref_text

    # Ensure ref_text ends with a proper sentence-ending punctuation
    if not ref_text.endswith((". ", "。")):
        ref_text += " " if ref_text.endswith(".") else ". "

    print("\nref_text ", ref_text)

    return ref_audio, ref_text
377
+
378
+
379
+ # infer process: chunk text -> infer batches [i.e. infer_batch_process()]
380
+
381
+
382
def infer_process(
    ref_audio,
    ref_text,
    gen_text,
    model_obj,
    vocoder,
    mel_spec_type=mel_spec_type,
    show_info=print,
    progress=tqdm,
    target_rms=target_rms,
    cross_fade_duration=cross_fade_duration,
    nfe_step=nfe_step,
    cfg_strength=cfg_strength,
    sway_sampling_coef=sway_sampling_coef,
    speed=speed,
    fix_duration=fix_duration,
    device=device,
):
    """Chunk ``gen_text`` and run batched inference against the reference audio.

    The chunk size is derived from the reference: bytes of ``ref_text`` per
    second of reference audio, times the seconds left in a 22 s budget, times
    ``speed``.

    Args:
        ref_audio: Path of the (preprocessed) reference audio file.
        ref_text: Transcript of the reference audio.
        gen_text: Text to synthesize.
        model_obj: Loaded model passed through to ``infer_batch_process``.
        vocoder: Loaded vocoder passed through to ``infer_batch_process``.
        show_info: Callable used for the progress message.
        Remaining keyword arguments are forwarded unchanged to
        ``infer_batch_process``.

    Returns:
        The first item produced by ``infer_batch_process``.
    """
    # Split the input text into batches
    audio, sr = torchaudio.load(ref_audio)
    max_chars = int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (22 - audio.shape[-1] / sr) * speed)
    gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
    for batch_index, batch_text in enumerate(gen_text_batches):
        print(f"gen_text {batch_index}", batch_text)
    print("\n")

    show_info(f"Generating audio in {len(gen_text_batches)} batches...")

    batch_results = infer_batch_process(
        (audio, sr),
        ref_text,
        gen_text_batches,
        model_obj,
        vocoder,
        mel_spec_type=mel_spec_type,
        progress=progress,
        target_rms=target_rms,
        cross_fade_duration=cross_fade_duration,
        nfe_step=nfe_step,
        cfg_strength=cfg_strength,
        sway_sampling_coef=sway_sampling_coef,
        speed=speed,
        fix_duration=fix_duration,
        device=device,
    )
    return next(batch_results)
428
+
429
+
430
+ # infer batches
431
+
432
+
433
def infer_batch_process(
    ref_audio,
    ref_text,
    gen_text_batches,
    model_obj,
    vocoder,
    mel_spec_type="vocos",
    progress=tqdm,
    target_rms=0.1,
    cross_fade_duration=0.15,
    nfe_step=32,
    cfg_strength=2.0,
    sway_sampling_coef=-1,
    speed=1,
    fix_duration=None,
    device=None,
    streaming=False,
    chunk_size=2048,
):
    """Generate audio for each text batch, conditioned on a reference clip.

    This is a generator.

    Args:
        ref_audio: ``(waveform, sample_rate)`` pair; multi-channel input is
            averaged down to a single channel.
        ref_text: Transcript of the reference clip. Must be non-empty.
        gen_text_batches: Iterable of text chunks to synthesize, in order.
        model_obj: Object exposing ``sample(cond=, text=, duration=, steps=,
            cfg_strength=, sway_sampling_coef=)``.
        vocoder: Mel-to-waveform decoder; called as ``vocoder.decode(mel)`` for
            ``"vocos"`` and ``vocoder(mel)`` for ``"bigvgan"``.
        mel_spec_type: ``"vocos"`` or ``"bigvgan"``.
        progress: Object with a ``tqdm`` attribute used to wrap the batch
            iterable, or ``None`` to disable progress reporting.
        target_rms: Reference clips quieter than this are boosted to it, and the
            generated audio is scaled back by the same factor.
        cross_fade_duration: Seconds of linear cross-fade between consecutive
            batches; ``<= 0`` means plain concatenation.
        nfe_step, cfg_strength, sway_sampling_coef: Forwarded to
            ``model_obj.sample``.
        speed: Divides the estimated generated duration.
        fix_duration: If given, total duration in seconds (reference included)
            used instead of the estimate.
        device: Device the reference audio is moved to.
        streaming: Selects the streaming output mode described below.
        chunk_size: Number of samples per streamed chunk.

    Yields:
        * ``streaming=True``: ``(wave_chunk, target_sample_rate)`` for every
          ``chunk_size`` slice of every batch.
        * ``streaming=False``: exactly one tuple,
          ``(final_wave, target_sample_rate, combined_spectrogram)``, or
          ``(None, target_sample_rate, None)`` when nothing was generated.

    Raises:
        ValueError: If ``ref_text`` is empty, or ``mel_spec_type`` is not a
            supported value once a batch is decoded.
    """
    if not ref_text:
        # Previously surfaced as an opaque IndexError from ref_text[-1].
        raise ValueError("ref_text must be a non-empty string")

    audio, sr = ref_audio
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)

    rms = torch.sqrt(torch.mean(torch.square(audio)))
    # Only rescale when the clip is quiet but not silent: rms == 0 would divide
    # by zero and fill the conditioning audio with NaN.
    boost_reference = bool(0 < rms < target_rms)
    if boost_reference:
        audio = audio * target_rms / rms
    if sr != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
        audio = resampler(audio)
    audio = audio.to(device)

    # Separate reference and generated text when the reference ends in a
    # single-byte (ASCII) character.
    if len(ref_text[-1].encode("utf-8")) == 1:
        ref_text = ref_text + " "

    def synthesize(gen_text):
        """Return ``(waveform, mel)`` as numpy arrays for one text batch.

        ``mel`` is ``None`` in streaming mode, where it is never consumed.
        """
        local_speed = speed
        if len(gen_text.encode("utf-8")) < 10:
            # Very short texts are slowed down so they get enough frames.
            local_speed = 0.3

        # Prepare the text
        final_text_list = convert_char_to_pinyin([ref_text + gen_text])

        ref_audio_len = audio.shape[-1] // hop_length
        if fix_duration is not None:
            duration = int(fix_duration * target_sample_rate / hop_length)
        else:
            # Estimate frames from the reference's frames-per-text-byte ratio.
            ref_text_len = len(ref_text.encode("utf-8"))
            gen_text_len = len(gen_text.encode("utf-8"))
            duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / local_speed)

        # All tensor work stays inside inference_mode; results leave as numpy so
        # nothing is yielded while the mode is active.
        with torch.inference_mode():
            generated, _ = model_obj.sample(
                cond=audio,
                text=final_text_list,
                duration=duration,
                steps=nfe_step,
                cfg_strength=cfg_strength,
                sway_sampling_coef=sway_sampling_coef,
            )
            del _

            generated = generated.to(torch.float32)  # generated mel spectrogram
            generated = generated[:, ref_audio_len:, :]  # drop the reference part
            generated = generated.permute(0, 2, 1)
            if mel_spec_type == "vocos":
                generated_wave = vocoder.decode(generated)
            elif mel_spec_type == "bigvgan":
                generated_wave = vocoder(generated)
            else:
                raise ValueError(f"Unsupported mel_spec_type: {mel_spec_type!r}")
            if boost_reference:
                # Undo the loudness boost applied to the reference.
                generated_wave = generated_wave * rms / target_rms

            # wav -> numpy
            generated_wave = generated_wave.squeeze().cpu().numpy()
            generated_mel = None if streaming else generated[0].cpu().numpy()
            del generated

        return generated_wave, generated_mel

    batches = progress.tqdm(gen_text_batches) if progress is not None else gen_text_batches

    if streaming:
        for gen_text in batches:
            wave, _ = synthesize(gen_text)
            for start in range(0, len(wave), chunk_size):
                yield wave[start : start + chunk_size], target_sample_rate
        return

    # Batches run sequentially in the calling thread. The previous thread pool
    # only created generator objects, so it never ran any work in parallel.
    generated_waves = []
    spectrograms = []
    for gen_text in batches:
        wave, mel = synthesize(gen_text)
        generated_waves.append(wave)
        spectrograms.append(mel)

    if not generated_waves:
        yield None, target_sample_rate, None
        return

    if cross_fade_duration <= 0:
        # Simply concatenate
        final_wave = np.concatenate(generated_waves)
    else:
        # Combine all generated waves with cross-fading
        requested_fade = int(cross_fade_duration * target_sample_rate)
        final_wave = generated_waves[0]
        for next_wave in generated_waves[1:]:
            prev_wave = final_wave

            # The fade can never be longer than either neighbour.
            cross_fade_samples = min(requested_fade, len(prev_wave), len(next_wave))
            if cross_fade_samples <= 0:
                # No overlap possible, concatenate
                final_wave = np.concatenate([prev_wave, next_wave])
                continue

            # Linear fade-out of the tail blended with a fade-in of the head.
            fade_out = np.linspace(1, 0, cross_fade_samples)
            fade_in = np.linspace(0, 1, cross_fade_samples)
            cross_faded_overlap = (
                prev_wave[-cross_fade_samples:] * fade_out + next_wave[:cross_fade_samples] * fade_in
            )

            final_wave = np.concatenate(
                [prev_wave[:-cross_fade_samples], cross_faded_overlap, next_wave[cross_fade_samples:]]
            )

    # Create a combined spectrogram
    combined_spectrogram = np.concatenate(spectrograms, axis=1)

    yield final_wave, target_sample_rate, combined_spectrogram
580
+
581
+
582
+ # remove silence from generated wav
583
+
584
+
585
def remove_silence_for_generated_wav(filename):
    """Cut long silences out of an audio file and overwrite it as WAV.

    The file is split with ``silence.split_on_silence`` and the resulting
    segments are joined back together in order, then exported to the same
    ``filename``.
    """
    original = AudioSegment.from_file(filename)
    voiced_parts = silence.split_on_silence(
        original, min_silence_len=1000, silence_thresh=-50, keep_silence=500, seek_step=10
    )

    # Start from an empty segment and append every kept part.
    stitched = AudioSegment.silent(duration=0)
    for part in voiced_parts:
        stitched += part

    stitched.export(filename, format="wav")
595
+
596
+
597
+ # save spectrogram
598
+
599
+
600
def save_spectrogram(spectrogram, path):
    """Render ``spectrogram`` as an image with a colorbar and save it to ``path``.

    The figure is always closed, even when drawing or saving fails, so repeated
    failures do not accumulate open matplotlib figures.
    """
    fig = plt.figure(figsize=(12, 4))
    try:
        plt.imshow(spectrogram, origin="lower", aspect="auto")
        plt.colorbar()
        plt.savefig(path)
    finally:
        plt.close(fig)
F5-TTS/src/f5_tts/model/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from f5_tts.model.backbones.dit import DiT
2
+ from f5_tts.model.backbones.mmdit import MMDiT
3
+ from f5_tts.model.backbones.unett import UNetT
4
+ from f5_tts.model.cfm import CFM
5
+ from f5_tts.model.trainer import Trainer
6
+
7
+
8
+ __all__ = ["CFM", "UNetT", "DiT", "MMDiT", "Trainer"]
F5-TTS/src/f5_tts/model/backbones/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Backbones quick introduction
2
+
3
+
4
+ ### unett.py
5
+ - flat unet transformer
6
+ - structure same as in e2-tts & voicebox paper except using rotary pos emb
7
+ - possible abs pos emb & convnextv2 blocks for embedded text before concat
8
+
9
+ ### dit.py
10
+ - adaln-zero dit
11
+ - embedded timestep as condition
12
+ - concatted noised_input + masked_cond + embedded_text, linear proj in
13
+ - possible abs pos emb & convnextv2 blocks for embedded text before concat
14
+ - possible long skip connection (first layer to last layer)
15
+
16
+ ### mmdit.py
17
+ - stable diffusion 3 block structure
18
+ - timestep as condition
19
+ - left stream: text embedded, with an abs pos emb applied
20
+ - right stream: masked_cond & noised_input concatted and with same conv pos emb as unett
F5-TTS/src/f5_tts/model/backbones/dit.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import torch
13
+ import torch.nn.functional as F
14
+ from torch import nn
15
+ from x_transformers.x_transformers import RotaryEmbedding
16
+
17
+ from f5_tts.model.modules import (
18
+ AdaLayerNorm_Final,
19
+ ConvNeXtV2Block,
20
+ ConvPositionEmbedding,
21
+ DiTBlock,
22
+ TimestepEmbedding,
23
+ get_pos_embed_indices,
24
+ precompute_freqs_cis,
25
+ )
26
+
27
+
28
+ # Text embedding
29
+
30
+
31
class TextEmbedding(nn.Module):
    """Embed text token ids and stretch them to the audio sequence length.

    Index 0 of the embedding table is reserved as the filler token. When
    ``conv_layers > 0`` a positional embedding and a stack of ConvNeXtV2 blocks
    are applied on top of the raw embedding.
    """

    def __init__(self, text_num_embeds, text_dim, mask_padding=True, conv_layers=0, conv_mult=2):
        super().__init__()
        # One extra row because index 0 is the filler token.
        self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim)

        # Whether filler / batch-padding positions are zeroed out.
        self.mask_padding = mask_padding

        self.extra_modeling = conv_layers > 0
        if self.extra_modeling:
            self.precompute_max_pos = 4096  # ~44s of 24khz audio
            self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
            self.text_blocks = nn.Sequential(
                *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
            )

    def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
        """Return embeddings of shape ``(b, seq_len, text_dim)``.

        ``text`` is truncated or right-padded with the filler token to
        ``seq_len``. With ``drop_text`` every position becomes the filler token
        (classifier-free guidance for text).
        """
        # Shift ids by one so batch padding (-1, see list_str_to_idx()) maps to filler 0.
        tokens = (text + 1)[:, :seq_len]  # curtail if there are more text tokens than mel frames
        batch, text_len = tokens.shape[0], tokens.shape[1]
        tokens = F.pad(tokens, (0, seq_len - text_len), value=0)

        # Mask is taken before drop_text so it reflects the real padding.
        pad_mask = (tokens == 0).unsqueeze(-1) if self.mask_padding else None

        if drop_text:  # cfg for text
            tokens = torch.zeros_like(tokens)

        embedded = self.text_embed(tokens)  # b n -> b n d

        if not self.extra_modeling:
            return embedded

        # sinus pos emb
        batch_start = torch.zeros((batch,), dtype=torch.long)
        pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
        embedded = embedded + self.freqs_cis[pos_idx]

        # convnextv2 blocks
        if pad_mask is None:
            return self.text_blocks(embedded)

        # Re-zero padded positions before and after every block.
        embedded = embedded.masked_fill(pad_mask, 0.0)
        for block in self.text_blocks:
            embedded = block(embedded).masked_fill(pad_mask, 0.0)
        return embedded
79
+
80
+
81
+ # noised input audio and context mixing embedding
82
+
83
+
84
class InputEmbedding(nn.Module):
    """Fuse noised audio, conditioning audio and text embeddings into one stream."""

    def __init__(self, mel_dim, text_dim, out_dim):
        super().__init__()
        # Input is [noised mel | cond mel | text embedding] along the feature axis.
        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False):  # noqa: F722
        """Project the concatenated inputs and add the conv position embedding residually.

        With ``drop_audio_cond`` the conditioning audio is replaced by zeros
        (classifier-free guidance for the audio condition).
        """
        audio_cond = torch.zeros_like(cond) if drop_audio_cond else cond

        fused = torch.cat((x, audio_cond, text_embed), dim=-1)
        hidden = self.proj(fused)
        return self.conv_pos_embed(hidden) + hidden
97
+
98
+
99
+ # Transformer backbone using DiT blocks
100
+
101
+
102
class DiT(nn.Module):
    """Transformer backbone built from ``DiTBlock`` layers.

    Text embeddings can be cached between forward calls (``cache=True``) in
    ``text_cond`` / ``text_uncond``; call ``clear_cache`` before feeding a
    different text.
    """

    def __init__(
        self,
        *,
        dim,
        depth=8,
        heads=8,
        dim_head=64,
        dropout=0.1,
        ff_mult=4,
        mel_dim=100,
        text_num_embeds=256,
        text_dim=None,
        text_mask_padding=True,
        qk_norm=None,
        conv_layers=0,
        pe_attn_head=None,
        attn_backend="torch",  # "torch" | "flash_attn"
        attn_mask_enabled=False,
        long_skip_connection=False,
        checkpoint_activations=False,
    ):
        super().__init__()

        self.time_embed = TimestepEmbedding(dim)

        # Text embedding width defaults to the mel dimension.
        text_dim = mel_dim if text_dim is None else text_dim
        self.text_embed = TextEmbedding(
            text_num_embeds, text_dim, mask_padding=text_mask_padding, conv_layers=conv_layers
        )
        self.text_cond, self.text_uncond = None, None  # text cache
        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

        self.dim = dim
        self.depth = depth

        block_kwargs = dict(
            dim=dim,
            heads=heads,
            dim_head=dim_head,
            ff_mult=ff_mult,
            dropout=dropout,
            qk_norm=qk_norm,
            pe_attn_head=pe_attn_head,
            attn_backend=attn_backend,
            attn_mask_enabled=attn_mask_enabled,
        )
        self.transformer_blocks = nn.ModuleList([DiTBlock(**block_kwargs) for _ in range(depth)])

        # Optional projection merging the first-layer input back in after the last block.
        self.long_skip_connection = nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None

        self.norm_out = AdaLayerNorm_Final(dim)  # final modulation
        self.proj_out = nn.Linear(dim, mel_dim)

        self.checkpoint_activations = checkpoint_activations

        self.initialize_weights()

    def initialize_weights(self):
        """Zero the AdaLN modulation layers of every block and the output layers."""
        zeroed_layers = [block.attn_norm.linear for block in self.transformer_blocks]
        zeroed_layers += [self.norm_out.linear, self.proj_out]
        for layer in zeroed_layers:
            nn.init.constant_(layer.weight, 0)
            nn.init.constant_(layer.bias, 0)

    def ckpt_wrapper(self, module):
        """Wrap ``module`` in a positional-args-only callable for activation checkpointing."""

        # https://github.com/chuanyangjin/fast-DiT/blob/main/models.py
        def ckpt_forward(*inputs):
            return module(*inputs)

        return ckpt_forward

    def get_input_embed(
        self,
        x,  # b n d
        cond,  # b n d
        text,  # b nt
        drop_audio_cond: bool = False,
        drop_text: bool = False,
        cache: bool = True,
    ):
        """Build the fused input embedding, optionally reusing cached text embeddings.

        With ``cache`` the conditional and unconditional text embeddings are
        computed once and stored on the module; later calls reuse them without
        looking at ``text`` again.
        """
        seq_len = x.shape[1]

        if not cache:
            text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
        elif drop_text:
            if self.text_uncond is None:
                self.text_uncond = self.text_embed(text, seq_len, drop_text=True)
            text_embed = self.text_uncond
        else:
            if self.text_cond is None:
                self.text_cond = self.text_embed(text, seq_len, drop_text=False)
            text_embed = self.text_cond

        return self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)

    def clear_cache(self):
        """Drop the cached conditional / unconditional text embeddings."""
        self.text_cond, self.text_uncond = None, None

    def forward(
        self,
        x: float["b n d"],  # noised input audio  # noqa: F722
        cond: float["b n d"],  # masked cond audio  # noqa: F722
        text: int["b nt"],  # text  # noqa: F722
        time: float["b"] | float[""],  # time step  # noqa: F821 F722
        mask: bool["b n"] | None = None,  # noqa: F722
        drop_audio_cond: bool = False,  # cfg for cond audio
        drop_text: bool = False,  # cfg for text
        cfg_infer: bool = False,  # cfg inference, pack cond & uncond forward
        cache: bool = False,
    ):
        """Run the backbone and return the ``proj_out`` projection to ``mel_dim``.

        With ``cfg_infer`` the conditional and unconditional passes are stacked
        along the batch axis, so the output batch size is doubled.
        """
        batch, seq_len = x.shape[:2]
        if time.ndim == 0:
            # Broadcast a scalar time step over the batch.
            time = time.repeat(batch)

        # t: conditioning time, text: text, x: noised audio + cond audio + text
        t = self.time_embed(time)
        if cfg_infer:  # pack cond & uncond forward: b n d -> 2b n d
            x_cond = self.get_input_embed(x, cond, text, drop_audio_cond=False, drop_text=False, cache=cache)
            x_uncond = self.get_input_embed(x, cond, text, drop_audio_cond=True, drop_text=True, cache=cache)
            x = torch.cat((x_cond, x_uncond), dim=0)
            t = torch.cat((t, t), dim=0)
            if mask is not None:
                mask = torch.cat((mask, mask), dim=0)
        else:
            x = self.get_input_embed(x, cond, text, drop_audio_cond=drop_audio_cond, drop_text=drop_text, cache=cache)

        rope = self.rotary_embed.forward_from_seq_len(seq_len)

        use_long_skip = self.long_skip_connection is not None
        if use_long_skip:
            residual = x

        for block in self.transformer_blocks:
            if self.checkpoint_activations:
                # https://pytorch.org/docs/stable/checkpoint.html#torch.utils.checkpoint.checkpoint
                x = torch.utils.checkpoint.checkpoint(self.ckpt_wrapper(block), x, t, mask, rope, use_reentrant=False)
            else:
                x = block(x, t, mask=mask, rope=rope)

        if use_long_skip:
            x = self.long_skip_connection(torch.cat((x, residual), dim=-1))

        x = self.norm_out(x, t)
        return self.proj_out(x)
F5-TTS/src/f5_tts/model/backbones/mmdit.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import torch
13
+ from torch import nn
14
+ from x_transformers.x_transformers import RotaryEmbedding
15
+
16
+ from f5_tts.model.modules import (
17
+ AdaLayerNorm_Final,
18
+ ConvPositionEmbedding,
19
+ MMDiTBlock,
20
+ TimestepEmbedding,
21
+ get_pos_embed_indices,
22
+ precompute_freqs_cis,
23
+ )
24
+
25
+
26
+ # text embedding
27
+
28
+
29
class TextEmbedding(nn.Module):
    """Embed text token ids and add a precomputed positional embedding.

    Index 0 of the embedding table is reserved as the filler token.
    """

    def __init__(self, out_dim, text_num_embeds, mask_padding=True):
        super().__init__()
        # One extra row because index 0 will be used as the filler token.
        self.text_embed = nn.Embedding(text_num_embeds + 1, out_dim)

        # Whether filler / batch-padding positions are zeroed out.
        self.mask_padding = mask_padding

        self.precompute_max_pos = 1024
        self.register_buffer("freqs_cis", precompute_freqs_cis(out_dim, self.precompute_max_pos), persistent=False)

    def forward(self, text: int["b nt"], drop_text=False) -> int["b nt d"]:  # noqa: F722
        """Return positional text embeddings; padded positions are zeroed when masking is on."""
        # Shift ids by one so batch padding (-1, see list_str_to_idx()) maps to filler 0.
        tokens = text + 1

        # Mask is taken before drop_text so it reflects the real padding.
        pad_mask = (tokens == 0).unsqueeze(-1) if self.mask_padding else None

        if drop_text:  # cfg for text
            tokens = torch.zeros_like(tokens)

        embedded = self.text_embed(tokens)  # b nt -> b nt d

        # sinus pos emb
        batch_start = torch.zeros((embedded.shape[0],), dtype=torch.long)
        batch_text_len = embedded.shape[1]
        pos_idx = get_pos_embed_indices(batch_start, batch_text_len, max_pos=self.precompute_max_pos)
        embedded = embedded + self.freqs_cis[pos_idx]

        if pad_mask is not None:
            embedded = embedded.masked_fill(pad_mask, 0.0)

        return embedded
61
+
62
+
63
+ # noised input & masked cond audio embedding
64
+
65
+
66
class AudioEmbedding(nn.Module):
    """Embed the noised input together with the masked conditioning audio."""

    def __init__(self, in_dim, out_dim):
        super().__init__()
        # Input is [noised mel | cond mel] along the feature axis.
        self.linear = nn.Linear(2 * in_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], drop_audio_cond=False):  # noqa: F722
        """Project the concatenated audio inputs and add the conv position embedding residually.

        With ``drop_audio_cond`` the conditioning audio is replaced by zeros.
        """
        audio_cond = torch.zeros_like(cond) if drop_audio_cond else cond

        hidden = self.linear(torch.cat((x, audio_cond), dim=-1))
        return self.conv_pos_embed(hidden) + hidden
79
+
80
+
81
+ # Transformer backbone using MM-DiT blocks
82
+
83
+
84
class MMDiT(nn.Module):
    """Transformer backbone built from ``MMDiTBlock`` layers with separate text and audio streams.

    Text embeddings can be cached between forward calls (``cache=True``) in
    ``text_cond`` / ``text_uncond``; call ``clear_cache`` before feeding a
    different text.
    """

    def __init__(
        self,
        *,
        dim,
        depth=8,
        heads=8,
        dim_head=64,
        dropout=0.1,
        ff_mult=4,
        mel_dim=100,
        text_num_embeds=256,
        text_mask_padding=True,
        qk_norm=None,
    ):
        super().__init__()

        self.time_embed = TimestepEmbedding(dim)
        self.text_embed = TextEmbedding(dim, text_num_embeds, mask_padding=text_mask_padding)
        self.text_cond, self.text_uncond = None, None  # text cache
        self.audio_embed = AudioEmbedding(mel_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

        self.dim = dim
        self.depth = depth

        last_idx = depth - 1
        self.transformer_blocks = nn.ModuleList(
            [
                MMDiTBlock(
                    dim=dim,
                    heads=heads,
                    dim_head=dim_head,
                    dropout=dropout,
                    ff_mult=ff_mult,
                    context_pre_only=(layer_idx == last_idx),  # only the final block gets True
                    qk_norm=qk_norm,
                )
                for layer_idx in range(depth)
            ]
        )
        self.norm_out = AdaLayerNorm_Final(dim)  # final modulation
        self.proj_out = nn.Linear(dim, mel_dim)

        self.initialize_weights()

    def initialize_weights(self):
        """Zero the AdaLN modulation layers of every block and the output layers."""
        zeroed_layers = []
        for block in self.transformer_blocks:
            zeroed_layers.extend((block.attn_norm_x.linear, block.attn_norm_c.linear))
        zeroed_layers.extend((self.norm_out.linear, self.proj_out))

        for layer in zeroed_layers:
            nn.init.constant_(layer.weight, 0)
            nn.init.constant_(layer.bias, 0)

    def get_input_embed(
        self,
        x,  # b n d
        cond,  # b n d
        text,  # b nt
        drop_audio_cond: bool = False,
        drop_text: bool = False,
        cache: bool = True,
    ):
        """Return ``(audio_embedding, text_embedding)``, optionally reusing cached text embeddings.

        With ``cache`` the conditional and unconditional text embeddings are
        computed once and stored on the module; later calls reuse them without
        looking at ``text`` again.
        """
        if not cache:
            c = self.text_embed(text, drop_text=drop_text)
        elif drop_text:
            if self.text_uncond is None:
                self.text_uncond = self.text_embed(text, drop_text=True)
            c = self.text_uncond
        else:
            if self.text_cond is None:
                self.text_cond = self.text_embed(text, drop_text=False)
            c = self.text_cond

        audio = self.audio_embed(x, cond, drop_audio_cond=drop_audio_cond)
        return audio, c

    def clear_cache(self):
        """Drop the cached conditional / unconditional text embeddings."""
        self.text_cond, self.text_uncond = None, None

    def forward(
        self,
        x: float["b n d"],  # noised input audio  # noqa: F722
        cond: float["b n d"],  # masked cond audio  # noqa: F722
        text: int["b nt"],  # text  # noqa: F722
        time: float["b"] | float[""],  # time step  # noqa: F821 F722
        mask: bool["b n"] | None = None,  # noqa: F722
        drop_audio_cond: bool = False,  # cfg for cond audio
        drop_text: bool = False,  # cfg for text
        cfg_infer: bool = False,  # cfg inference, pack cond & uncond forward
        cache: bool = False,
    ):
        """Run the backbone and return the ``proj_out`` projection to ``mel_dim``.

        With ``cfg_infer`` the conditional and unconditional passes are stacked
        along the batch axis, so the output batch size is doubled.
        """
        batch = x.shape[0]
        if time.ndim == 0:
            # Broadcast a scalar time step over the batch.
            time = time.repeat(batch)

        # t: conditioning (time), c: context (text + masked cond audio), x: noised input audio
        t = self.time_embed(time)
        if cfg_infer:  # pack cond & uncond forward: b n d -> 2b n d
            x_cond, c_cond = self.get_input_embed(x, cond, text, drop_audio_cond=False, drop_text=False, cache=cache)
            x_uncond, c_uncond = self.get_input_embed(x, cond, text, drop_audio_cond=True, drop_text=True, cache=cache)
            x = torch.cat((x_cond, x_uncond), dim=0)
            c = torch.cat((c_cond, c_uncond), dim=0)
            t = torch.cat((t, t), dim=0)
            if mask is not None:
                mask = torch.cat((mask, mask), dim=0)
        else:
            x, c = self.get_input_embed(
                x, cond, text, drop_audio_cond=drop_audio_cond, drop_text=drop_text, cache=cache
            )

        # Separate rotary tables for the audio stream and the text stream.
        rope_audio = self.rotary_embed.forward_from_seq_len(x.shape[1])
        rope_text = self.rotary_embed.forward_from_seq_len(text.shape[1])

        for block in self.transformer_blocks:
            c, x = block(x, c, t, mask=mask, rope=rope_audio, c_rope=rope_text)

        x = self.norm_out(x, t)
        return self.proj_out(x)
F5-TTS/src/f5_tts/model/backbones/unett.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Literal
13
+
14
+ import torch
15
+ import torch.nn.functional as F
16
+ from torch import nn
17
+ from x_transformers import RMSNorm
18
+ from x_transformers.x_transformers import RotaryEmbedding
19
+
20
+ from f5_tts.model.modules import (
21
+ Attention,
22
+ AttnProcessor,
23
+ ConvNeXtV2Block,
24
+ ConvPositionEmbedding,
25
+ FeedForward,
26
+ TimestepEmbedding,
27
+ get_pos_embed_indices,
28
+ precompute_freqs_cis,
29
+ )
30
+
31
+
32
+ # Text embedding
33
+
34
+
35
class TextEmbedding(nn.Module):
    """Embed text token ids and stretch them to the audio sequence length.

    Index 0 of the embedding table is reserved as the filler token. When
    ``conv_layers > 0`` a positional embedding and a stack of ConvNeXtV2 blocks
    are applied on top of the raw embedding.
    """

    def __init__(self, text_num_embeds, text_dim, mask_padding=True, conv_layers=0, conv_mult=2):
        super().__init__()
        # One extra row because index 0 is the filler token.
        self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim)

        # Whether filler / batch-padding positions are zeroed out.
        self.mask_padding = mask_padding

        self.extra_modeling = conv_layers > 0
        if self.extra_modeling:
            self.precompute_max_pos = 4096  # ~44s of 24khz audio
            self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
            self.text_blocks = nn.Sequential(
                *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
            )

    def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
        """Return embeddings of shape ``(b, seq_len, text_dim)``.

        ``text`` is truncated or right-padded with the filler token to
        ``seq_len``. With ``drop_text`` every position becomes the filler token
        (classifier-free guidance for text).
        """
        # Shift ids by one so batch padding (-1, see list_str_to_idx()) maps to filler 0.
        tokens = (text + 1)[:, :seq_len]  # curtail if there are more text tokens than mel frames
        batch, text_len = tokens.shape[0], tokens.shape[1]
        tokens = F.pad(tokens, (0, seq_len - text_len), value=0)

        # Mask is taken before drop_text so it reflects the real padding.
        pad_mask = (tokens == 0).unsqueeze(-1) if self.mask_padding else None

        if drop_text:  # cfg for text
            tokens = torch.zeros_like(tokens)

        embedded = self.text_embed(tokens)  # b n -> b n d

        if not self.extra_modeling:
            return embedded

        # sinus pos emb
        batch_start = torch.zeros((batch,), dtype=torch.long)
        pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
        embedded = embedded + self.freqs_cis[pos_idx]

        # convnextv2 blocks
        if pad_mask is None:
            return self.text_blocks(embedded)

        # Re-zero padded positions before and after every block.
        embedded = embedded.masked_fill(pad_mask, 0.0)
        for block in self.text_blocks:
            embedded = block(embedded).masked_fill(pad_mask, 0.0)
        return embedded
83
+
84
+
85
+ # noised input audio and context mixing embedding
86
+
87
+
88
class InputEmbedding(nn.Module):
    """Fuse noised audio, conditioning audio and text embeddings into one stream."""

    def __init__(self, mel_dim, text_dim, out_dim):
        super().__init__()
        # Input is [noised mel | cond mel | text embedding] along the feature axis.
        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False):  # noqa: F722
        """Project the concatenated inputs and add the conv position embedding residually.

        With ``drop_audio_cond`` the conditioning audio is replaced by zeros
        (classifier-free guidance for the audio condition).
        """
        audio_cond = torch.zeros_like(cond) if drop_audio_cond else cond

        fused = torch.cat((x, audio_cond, text_embed), dim=-1)
        hidden = self.proj(fused)
        return self.conv_pos_embed(hidden) + hidden
101
+
102
+
103
+ # Flat UNet Transformer backbone
104
+
105
+
106
class UNetT(nn.Module):
    """Flat UNet-style transformer: first-half layer inputs are fed back into the second half.

    Text embeddings can be cached between forward calls (``cache=True``) in
    ``text_cond`` / ``text_uncond``; call ``clear_cache`` before feeding a
    different text.
    """

    def __init__(
        self,
        *,
        dim,
        depth=8,
        heads=8,
        dim_head=64,
        dropout=0.1,
        ff_mult=4,
        mel_dim=100,
        text_num_embeds=256,
        text_dim=None,
        text_mask_padding=True,
        qk_norm=None,
        conv_layers=0,
        pe_attn_head=None,
        skip_connect_type: Literal["add", "concat", "none"] = "concat",
    ):
        super().__init__()
        assert depth % 2 == 0, "UNet-Transformer's depth should be even."

        self.time_embed = TimestepEmbedding(dim)

        # Text embedding width defaults to the mel dimension.
        text_dim = mel_dim if text_dim is None else text_dim
        self.text_embed = TextEmbedding(
            text_num_embeds, text_dim, mask_padding=text_mask_padding, conv_layers=conv_layers
        )
        self.text_cond, self.text_uncond = None, None  # text cache
        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

        # transformer layers & skip connections

        self.dim = dim
        self.skip_connect_type = skip_connect_type
        self.depth = depth
        self.layers = nn.ModuleList([])

        half_depth = depth // 2
        for layer_idx in range(depth):
            attn_norm = RMSNorm(dim)
            attn = Attention(
                processor=AttnProcessor(pe_attn_head=pe_attn_head),
                dim=dim,
                heads=heads,
                dim_head=dim_head,
                dropout=dropout,
                qk_norm=qk_norm,
            )

            ff_norm = RMSNorm(dim)
            ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

            # Only second-half layers in "concat" mode need a projection back to dim.
            if skip_connect_type == "concat" and layer_idx >= half_depth:
                skip_proj = nn.Linear(dim * 2, dim, bias=False)
            else:
                skip_proj = None

            self.layers.append(nn.ModuleList([skip_proj, attn_norm, attn, ff_norm, ff]))

        self.norm_out = RMSNorm(dim)
        self.proj_out = nn.Linear(dim, mel_dim)

    def get_input_embed(
        self,
        x,  # b n d
        cond,  # b n d
        text,  # b nt
        drop_audio_cond: bool = False,
        drop_text: bool = False,
        cache: bool = True,
    ):
        """Build the fused input embedding, optionally reusing cached text embeddings.

        With ``cache`` the conditional and unconditional text embeddings are
        computed once and stored on the module; later calls reuse them without
        looking at ``text`` again.
        """
        seq_len = x.shape[1]

        if not cache:
            text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
        elif drop_text:
            if self.text_uncond is None:
                self.text_uncond = self.text_embed(text, seq_len, drop_text=True)
            text_embed = self.text_uncond
        else:
            if self.text_cond is None:
                self.text_cond = self.text_embed(text, seq_len, drop_text=False)
            text_embed = self.text_cond

        return self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)

    def clear_cache(self):
        """Drop the cached conditional / unconditional text embeddings."""
        self.text_cond, self.text_uncond = None, None

    def forward(
        self,
        x: float["b n d"],  # noised input audio  # noqa: F722
        cond: float["b n d"],  # masked cond audio  # noqa: F722
        text: int["b nt"],  # text  # noqa: F722
        time: float["b"] | float[""],  # time step  # noqa: F821 F722
        mask: bool["b n"] | None = None,  # noqa: F722
        drop_audio_cond: bool = False,  # cfg for cond audio
        drop_text: bool = False,  # cfg for text
        cfg_infer: bool = False,  # cfg inference, pack cond & uncond forward
        cache: bool = False,
    ):
        """Run the backbone and return the ``proj_out`` projection to ``mel_dim``.

        With ``cfg_infer`` the conditional and unconditional passes are stacked
        along the batch axis, so the output batch size is doubled.
        """
        batch, seq_len = x.shape[:2]
        if time.ndim == 0:
            # Broadcast a scalar time step over the batch.
            time = time.repeat(batch)

        # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
        t = self.time_embed(time)
        if cfg_infer:  # pack cond & uncond forward: b n d -> 2b n d
            x_cond = self.get_input_embed(x, cond, text, drop_audio_cond=False, drop_text=False, cache=cache)
            x_uncond = self.get_input_embed(x, cond, text, drop_audio_cond=True, drop_text=True, cache=cache)
            x = torch.cat((x_cond, x_uncond), dim=0)
            t = torch.cat((t, t), dim=0)
            if mask is not None:
                mask = torch.cat((mask, mask), dim=0)
        else:
            x = self.get_input_embed(x, cond, text, drop_audio_cond=drop_audio_cond, drop_text=drop_text, cache=cache)

        # Prepend the time embedding as an extra leading token: [b n d] -> [b n+1 d]
        x = torch.cat([t.unsqueeze(1), x], dim=1)
        if mask is not None:
            # The extra leading time token is always attended to.
            mask = F.pad(mask, (1, 0), value=1)

        rope = self.rotary_embed.forward_from_seq_len(seq_len + 1)

        # flat unet transformer
        skip_mode = self.skip_connect_type
        half_depth = self.depth // 2
        skips = []
        for layer_idx, (maybe_skip_proj, attn_norm, attn, ff_norm, ff) in enumerate(self.layers):
            if layer_idx < half_depth:
                # First half: remember the layer input for the mirrored layer.
                skips.append(x)
            else:
                # Second half: consume the most recently stored input (LIFO).
                skip = skips.pop()
                if skip_mode == "concat":
                    x = maybe_skip_proj(torch.cat((x, skip), dim=-1))
                elif skip_mode == "add":
                    x = x + skip

            # attention and feedforward blocks
            x = attn(attn_norm(x), rope=rope, mask=mask) + x
            x = ff(ff_norm(x)) + x

        assert len(skips) == 0

        # Strip the leading time token again before projecting out.
        x = self.norm_out(x)[:, 1:, :]

        return self.proj_out(x)
F5-TTS/src/f5_tts/model/cfm.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from random import random
13
+ from typing import Callable
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+ from torch import nn
18
+ from torch.nn.utils.rnn import pad_sequence
19
+ from torchdiffeq import odeint
20
+
21
+ from f5_tts.model.modules import MelSpec
22
+ from f5_tts.model.utils import (
23
+ default,
24
+ exists,
25
+ get_epss_timesteps,
26
+ lens_to_mask,
27
+ list_str_to_idx,
28
+ list_str_to_tensor,
29
+ mask_from_frac_lengths,
30
+ )
31
+
32
+
33
class CFM(nn.Module):
    """Flow-matching wrapper around a backbone ``transformer``.

    ``forward`` trains the backbone to predict the flow ``x1 - x0`` on a random
    masked span; ``sample`` integrates the predicted flow with ``odeint``.

    NOTE(review): tensor shapes in the annotations follow the "ein notation"
    header of this module (b / n / nt / nw / d) -- confirm against callers.
    """

    def __init__(
        self,
        transformer: nn.Module,
        sigma=0.0,
        odeint_kwargs: dict | None = None,
        audio_drop_prob=0.3,
        cond_drop_prob=0.2,
        num_channels=None,
        mel_spec_module: nn.Module | None = None,
        mel_spec_kwargs: dict | None = None,
        frac_lengths_mask: tuple[float, float] = (0.7, 1.0),
        vocab_char_map: dict[str, int] | None = None,
    ):
        """Store the backbone and the training / sampling hyper-parameters.

        Args:
            transformer: backbone module; must expose a ``dim`` attribute.
            sigma: stored on the instance as ``self.sigma``.
            odeint_kwargs: extra keyword arguments for ``odeint``. ``None``
                means ``{"method": "euler"}``; a fresh dict is created per
                instance so instances never share a mutable default.
            audio_drop_prob: probability of dropping the audio condition in
                ``forward``.
            cond_drop_prob: probability of dropping both audio and text
                conditions in ``forward``.
            num_channels: mel channel count; defaults to
                ``self.mel_spec.n_mel_channels``.
            mel_spec_module: module that maps raw waves to mel spectrograms.
                When omitted, ``MelSpec(**mel_spec_kwargs)`` is built.
            mel_spec_kwargs: keyword arguments for the default ``MelSpec``.
            frac_lengths_mask: ``(low, high)`` range for the masked-span
                fraction sampled in ``forward``.
            vocab_char_map: optional mapping used to tokenize ``list[str]`` text.
        """
        super().__init__()

        self.frac_lengths_mask = frac_lengths_mask

        # mel spec (only build the default extractor when none was supplied)
        if mel_spec_module is None:
            mel_spec_module = MelSpec(**(mel_spec_kwargs or {}))
        self.mel_spec = mel_spec_module
        if num_channels is None:
            num_channels = self.mel_spec.n_mel_channels
        self.num_channels = num_channels

        # classifier-free guidance
        self.audio_drop_prob = audio_drop_prob
        self.cond_drop_prob = cond_drop_prob

        # transformer
        self.transformer = transformer
        dim = transformer.dim
        self.dim = dim

        # conditional flow related
        self.sigma = sigma

        # sampling related
        # e.g. dict(atol=1e-5, rtol=1e-5, method="midpoint") may be passed instead
        self.odeint_kwargs = dict(method="euler") if odeint_kwargs is None else odeint_kwargs

        # vocab map for tokenization
        self.vocab_char_map = vocab_char_map

    @property
    def device(self):
        """Device of the first parameter of this module."""
        return next(self.parameters()).device

    @torch.no_grad()
    def sample(
        self,
        cond: float["b n d"] | float["b nw"],  # noqa: F722
        text: int["b nt"] | list[str],  # noqa: F722
        duration: int | int["b"],  # noqa: F821
        *,
        lens: int["b"] | None = None,  # noqa: F821
        steps=32,
        cfg_strength=1.0,
        sway_sampling_coef=None,
        seed: int | None = None,
        max_duration=4096,
        vocoder: Callable[[float["b d n"]], float["b nw"]] | None = None,  # noqa: F722
        use_epss=True,
        no_ref_audio=False,
        duplicate_test=False,
        t_inter=0.1,
        edit_mask=None,
    ):
        """Generate audio features by integrating the predicted flow.

        Side effects: switches the module to eval mode, reseeds the global
        torch RNG before each noise draw when ``seed`` is given, and clears
        the transformer cache after integration.

        Args:
            cond: conditioning audio; a 2-D input is treated as raw wave and
                converted with ``self.mel_spec``.
            text: token tensor or list of strings (tokenized with
                ``vocab_char_map`` when available).
            duration: target length per sample (int is broadcast to the batch).
            lens: valid length of ``cond`` per sample; defaults to its full length.
            steps: number of integration steps.
            cfg_strength: classifier-free guidance weight; values below 1e-5
                skip the unconditional pass.
            sway_sampling_coef: optional coefficient for warping the time grid.
            seed: optional RNG seed for the initial noise.
            max_duration: upper clamp applied to ``duration``.
            vocoder: optional callable applied to the (permuted) output.
            use_epss: use ``get_epss_timesteps`` when starting from t=0.
            no_ref_audio: zero out the conditioning audio.
            duplicate_test: start from a blend of noise and a shifted copy of
                ``cond`` at time ``t_inter`` (inner time step observation).
            t_inter: start time used when ``duplicate_test`` is set.
            edit_mask: optional mask AND-ed into the conditioning mask.

        Returns:
            ``(out, trajectory)``: the final sample (vocoded if ``vocoder`` is
            given) and the full ``odeint`` trajectory.
        """
        self.eval()
        # raw wave

        if cond.ndim == 2:
            cond = self.mel_spec(cond)
            cond = cond.permute(0, 2, 1)
            assert cond.shape[-1] == self.num_channels

        cond = cond.to(next(self.parameters()).dtype)

        batch, cond_seq_len, device = *cond.shape[:2], cond.device
        if not exists(lens):
            lens = torch.full((batch,), cond_seq_len, device=device, dtype=torch.long)

        # text

        if isinstance(text, list):
            if exists(self.vocab_char_map):
                text = list_str_to_idx(text, self.vocab_char_map).to(device)
            else:
                text = list_str_to_tensor(text).to(device)
            assert text.shape[0] == batch

        # duration

        cond_mask = lens_to_mask(lens)
        if edit_mask is not None:
            cond_mask = cond_mask & edit_mask

        if isinstance(duration, int):
            duration = torch.full((batch,), duration, device=device, dtype=torch.long)

        duration = torch.maximum(
            torch.maximum((text != -1).sum(dim=-1), lens) + 1, duration
        )  # duration at least text/audio prompt length plus one token, so something is generated
        duration = duration.clamp(max=max_duration)
        max_duration = duration.amax()

        # duplicate test corner for inner time step observation
        if duplicate_test:
            test_cond = F.pad(cond, (0, 0, cond_seq_len, max_duration - 2 * cond_seq_len), value=0.0)

        cond = F.pad(cond, (0, 0, 0, max_duration - cond_seq_len), value=0.0)
        if no_ref_audio:
            cond = torch.zeros_like(cond)

        cond_mask = F.pad(cond_mask, (0, max_duration - cond_mask.shape[-1]), value=False)
        cond_mask = cond_mask.unsqueeze(-1)
        step_cond = torch.where(
            cond_mask, cond, torch.zeros_like(cond)
        )  # allow direct control (cut cond audio) with lens passed in

        if batch > 1:
            mask = lens_to_mask(duration)
        else:  # save memory and speed up, as single inference need no mask currently
            mask = None

        # neural ode

        def fn(t, x):
            # at each step, conditioning is fixed
            # step_cond = torch.where(cond_mask, cond, torch.zeros_like(cond))

            # predict flow (cond)
            if cfg_strength < 1e-5:
                pred = self.transformer(
                    x=x,
                    cond=step_cond,
                    text=text,
                    time=t,
                    mask=mask,
                    drop_audio_cond=False,
                    drop_text=False,
                    cache=True,
                )
                return pred

            # predict flow (cond and uncond), for classifier-free guidance
            pred_cfg = self.transformer(
                x=x,
                cond=step_cond,
                text=text,
                time=t,
                mask=mask,
                cfg_infer=True,
                cache=True,
            )
            pred, null_pred = torch.chunk(pred_cfg, 2, dim=0)
            return pred + (pred - null_pred) * cfg_strength

        # noise input
        # to make sure batch inference result is same with different batch size, and for sure single inference
        # still some difference maybe due to convolutional layers
        y0 = []
        for dur in duration:
            if exists(seed):
                # reseed per sample so each item's noise does not depend on batch composition
                torch.manual_seed(seed)
            y0.append(torch.randn(dur, self.num_channels, device=self.device, dtype=step_cond.dtype))
        y0 = pad_sequence(y0, padding_value=0, batch_first=True)

        t_start = 0

        # duplicate test corner for inner time step observation
        if duplicate_test:
            t_start = t_inter
            y0 = (1 - t_start) * y0 + t_start * test_cond
            steps = int(steps * (1 - t_start))

        if t_start == 0 and use_epss:  # use Empirically Pruned Step Sampling for low NFE
            t = get_epss_timesteps(steps, device=self.device, dtype=step_cond.dtype)
        else:
            t = torch.linspace(t_start, 1, steps + 1, device=self.device, dtype=step_cond.dtype)
        if sway_sampling_coef is not None:
            t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)

        trajectory = odeint(fn, y0, t, **self.odeint_kwargs)
        self.transformer.clear_cache()

        sampled = trajectory[-1]
        out = sampled
        # keep the given conditioning frames verbatim, generated frames elsewhere
        out = torch.where(cond_mask, cond, out)

        if exists(vocoder):
            out = out.permute(0, 2, 1)
            out = vocoder(out)

        return out, trajectory

    def forward(
        self,
        inp: float["b n d"] | float["b nw"],  # mel or raw wave # noqa: F722
        text: int["b nt"] | list[str],  # noqa: F722
        *,
        lens: int["b"] | None = None,  # noqa: F821
        noise_scheduler: str | None = None,
    ):
        """Compute the flow-matching training loss for one batch.

        Args:
            inp: mel features, or raw wave when 2-D (converted with ``self.mel_spec``).
            text: token tensor or list of strings.
            lens: valid length per sample; defaults to the full sequence length.
            noise_scheduler: currently unused (see TODO below).

        Returns:
            ``(loss, cond, pred)``: mean MSE over the masked span, the
            conditioning tensor fed to the backbone, and the backbone output.
        """
        # handle raw wave
        if inp.ndim == 2:
            inp = self.mel_spec(inp)
            inp = inp.permute(0, 2, 1)
            assert inp.shape[-1] == self.num_channels

        batch, seq_len, dtype, device = *inp.shape[:2], inp.dtype, self.device

        # handle text as string
        if isinstance(text, list):
            if exists(self.vocab_char_map):
                text = list_str_to_idx(text, self.vocab_char_map).to(device)
            else:
                text = list_str_to_tensor(text).to(device)
            assert text.shape[0] == batch

        # lens and mask
        if not exists(lens):
            lens = torch.full((batch,), seq_len, device=device)

        mask = lens_to_mask(lens, length=seq_len)  # useless here, as collate_fn will pad to max length in batch

        # get a random span to mask out for training conditionally
        frac_lengths = torch.zeros((batch,), device=self.device).float().uniform_(*self.frac_lengths_mask)
        rand_span_mask = mask_from_frac_lengths(lens, frac_lengths)

        if exists(mask):
            rand_span_mask &= mask

        # mel is x1
        x1 = inp

        # x0 is gaussian noise
        x0 = torch.randn_like(x1)

        # time step
        time = torch.rand((batch,), dtype=dtype, device=self.device)
        # TODO. noise_scheduler

        # sample xt (φ_t(x) in the paper)
        t = time.unsqueeze(-1).unsqueeze(-1)
        φ = (1 - t) * x0 + t * x1
        flow = x1 - x0

        # only predict what is within the random mask span for infilling
        cond = torch.where(rand_span_mask[..., None], torch.zeros_like(x1), x1)

        # transformer and cfg training with a drop rate
        drop_audio_cond = random() < self.audio_drop_prob  # p_drop in voicebox paper
        if random() < self.cond_drop_prob:  # p_uncond in voicebox paper
            drop_audio_cond = True
            drop_text = True
        else:
            drop_text = False

        # apply mask will use more memory; might adjust batchsize or batchsampler long sequence threshold
        pred = self.transformer(
            x=φ, cond=cond, text=text, time=time, drop_audio_cond=drop_audio_cond, drop_text=drop_text, mask=mask
        )

        # flow matching loss, restricted to the masked (to-be-infilled) span
        loss = F.mse_loss(pred, flow, reduction="none")
        loss = loss[rand_span_mask]

        return loss.mean(), cond, pred
F5-TTS/src/f5_tts/model/dataset.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from importlib.resources import files
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+ import torchaudio
7
+ from datasets import Dataset as Dataset_
8
+ from datasets import load_from_disk
9
+ from torch import nn
10
+ from torch.utils.data import Dataset, Sampler
11
+ from tqdm import tqdm
12
+
13
+ from f5_tts.model.modules import MelSpec
14
+ from f5_tts.model.utils import default
15
+
16
+
17
class HFDataset(Dataset):
    """Dataset over rows that carry decoded audio under ``row["audio"]``.

    Each row is read as ``row["audio"]["array"]``, ``row["audio"]["sampling_rate"]``
    and ``row["text"]``; items are returned as ``{"mel_spec", "text"}``.
    """

    def __init__(
        self,
        hf_dataset: Dataset,
        target_sample_rate=24_000,
        n_mel_channels=100,
        hop_length=256,
        n_fft=1024,
        win_length=1024,
        mel_spec_type="vocos",
    ):
        self.data = hf_dataset
        self.target_sample_rate = target_sample_rate
        self.hop_length = hop_length

        self.mel_spectrogram = MelSpec(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            n_mel_channels=n_mel_channels,
            target_sample_rate=target_sample_rate,
            mel_spec_type=mel_spec_type,
        )

    def get_frame_len(self, index):
        """Return the estimated number of mel frames for row ``index`` after resampling."""
        row = self.data[index]
        audio = row["audio"]["array"]
        sample_rate = row["audio"]["sampling_rate"]
        return audio.shape[-1] / sample_rate * self.target_sample_rate / self.hop_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return the mel spectrogram and text of the first valid row at or after ``index``.

        Rows shorter than 0.3 s or longer than 30 s are skipped (wrapping
        around the dataset).

        Raises:
            RuntimeError: if every row has an out-of-range duration. The scan
                is an explicit bounded loop instead of unbounded recursion.
        """
        skipped = 0
        while True:
            row = self.data[index]
            audio = row["audio"]["array"]
            sample_rate = row["audio"]["sampling_rate"]
            duration = audio.shape[-1] / sample_rate

            if not (duration > 30 or duration < 0.3):
                break  # valid

            skipped += 1
            if skipped >= len(self.data):
                raise RuntimeError("No sample with a duration between 0.3 and 30 seconds was found in the dataset")
            index = (index + 1) % len(self.data)

        audio_tensor = torch.from_numpy(audio).float()

        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate)
            audio_tensor = resampler(audio_tensor)

        audio_tensor = audio_tensor.unsqueeze(0)  # 't -> 1 t'

        mel_spec = self.mel_spectrogram(audio_tensor)

        mel_spec = mel_spec.squeeze(0)  # '1 d t -> d t'

        text = row["text"]

        return dict(
            mel_spec=mel_spec,
            text=text,
        )
80
+
81
+
82
+ class CustomDataset(Dataset):
83
+ def __init__(
84
+ self,
85
+ custom_dataset: Dataset,
86
+ durations=None,
87
+ target_sample_rate=24_000,
88
+ hop_length=256,
89
+ n_mel_channels=100,
90
+ n_fft=1024,
91
+ win_length=1024,
92
+ mel_spec_type="vocos",
93
+ preprocessed_mel=False,
94
+ mel_spec_module: nn.Module | None = None,
95
+ ):
96
+ self.data = custom_dataset
97
+ self.durations = durations
98
+ self.target_sample_rate = target_sample_rate
99
+ self.hop_length = hop_length
100
+ self.n_fft = n_fft
101
+ self.win_length = win_length
102
+ self.mel_spec_type = mel_spec_type
103
+ self.preprocessed_mel = preprocessed_mel
104
+
105
+ if not preprocessed_mel:
106
+ self.mel_spectrogram = default(
107
+ mel_spec_module,
108
+ MelSpec(
109
+ n_fft=n_fft,
110
+ hop_length=hop_length,
111
+ win_length=win_length,
112
+ n_mel_channels=n_mel_channels,
113
+ target_sample_rate=target_sample_rate,
114
+ mel_spec_type=mel_spec_type,
115
+ ),
116
+ )
117
+
118
+ def get_frame_len(self, index):
119
+ if (
120
+ self.durations is not None
121
+ ): # Please make sure the separately provided durations are correct, otherwise 99.99% OOM
122
+ return self.durations[index] * self.target_sample_rate / self.hop_length
123
+ return self.data[index]["duration"] * self.target_sample_rate / self.hop_length
124
+
125
+ def __len__(self):
126
+ return len(self.data)
127
+
128
+ def __getitem__(self, index):
129
+ while True:
130
+ row = self.data[index]
131
+ audio_path = row["audio_path"]
132
+ text = row["text"]
133
+ duration = row["duration"]
134
+
135
+ # filter by given length
136
+ if 0.3 <= duration <= 30:
137
+ break # valid
138
+
139
+ index = (index + 1) % len(self.data)
140
+
141
+ if self.preprocessed_mel:
142
+ mel_spec = torch.tensor(row["mel_spec"])
143
+ else:
144
+ audio, source_sample_rate = torchaudio.load(audio_path)
145
+
146
+ # make sure mono input
147
+ if audio.shape[0] > 1:
148
+ audio = torch.mean(audio, dim=0, keepdim=True)
149
+
150
+ # resample if necessary
151
+ if source_sample_rate != self.target_sample_rate:
152
+ resampler = torchaudio.transforms.Resample(source_sample_rate, self.target_sample_rate)
153
+ audio = resampler(audio)
154
+
155
+ # to mel spectrogram
156
+ mel_spec = self.mel_spectrogram(audio)
157
+ mel_spec = mel_spec.squeeze(0) # '1 d t -> d t'
158
+
159
+ return {
160
+ "mel_spec": mel_spec,
161
+ "text": text,
162
+ }
163
+
164
+
165
+ # Dynamic Batch Sampler
166
class DynamicBatchSampler(Sampler[list[int]]):
    """Batch sampler that groups length-sorted samples under a frame budget.

    Batches are built once at construction: indices are sorted by
    ``data_source.get_frame_len`` and packed greedily so that each batch holds
    at most ``frames_threshold`` frames (and at most ``max_samples`` items when
    that is non-zero). Samples longer than the budget are left out. When
    ``random_seed`` is set, batch order is shuffled deterministically per epoch.
    """

    def __init__(
        self, sampler: Sampler[int], frames_threshold: int, max_samples=0, random_seed=None, drop_residual: bool = False
    ):
        self.sampler = sampler
        self.frames_threshold = frames_threshold
        self.max_samples = max_samples
        self.random_seed = random_seed
        self.epoch = 0

        data_source = self.sampler.data_source

        # (index, frame length) pairs, shortest first, so neighbours need little padding
        sized = [
            (idx, data_source.get_frame_len(idx))
            for idx in tqdm(
                self.sampler, desc="Sorting with sampler... if slow, check whether dataset is provided with duration"
            )
        ]
        sized.sort(key=lambda pair: pair[1])

        batches = []
        current, current_frames = [], 0
        for idx, frame_len in tqdm(
            sized, desc=f"Creating dynamic batches with {frames_threshold} audio frames per gpu"
        ):
            has_room = max_samples == 0 or len(current) < max_samples
            if current_frames + frame_len <= self.frames_threshold and has_room:
                current.append(idx)
                current_frames += frame_len
                continue

            # the running batch is full: flush it and start over with this sample
            if current:
                batches.append(current)
            if frame_len <= self.frames_threshold:
                current, current_frames = [idx], frame_len
            else:
                # a single sample above the budget is not batched at all
                current, current_frames = [], 0

        if current and not drop_residual:
            batches.append(current)

        del sized
        self.batches = batches

        # Ensure even batches with accelerate BatchSamplerShard cls under frame_per_batch setting
        self.drop_last = True

    def set_epoch(self, epoch: int) -> None:
        """Record the epoch number used to vary the shuffle seed."""
        self.epoch = epoch

    def __iter__(self):
        if self.random_seed is None:
            return iter(self.batches)

        # seed + epoch: reproducible, yet a different order every epoch
        generator = torch.Generator()
        generator.manual_seed(self.random_seed + self.epoch)
        order = torch.randperm(len(self.batches), generator=generator).tolist()
        return iter([self.batches[i] for i in order])

    def __len__(self):
        return len(self.batches)
238
+
239
+
240
+ # Load dataset
241
+
242
+
243
def _load_raw_split(root: str):
    """Load ``{root}/raw`` with ``load_from_disk``, falling back to ``{root}/raw.arrow``."""
    try:
        return load_from_disk(f"{root}/raw")
    except Exception:  # best-effort: any loading failure falls back to the single arrow file
        return Dataset_.from_file(f"{root}/raw.arrow")


def _load_durations(root: str):
    """Read the ``"duration"`` entry of ``{root}/duration.json``."""
    with open(f"{root}/duration.json", "r", encoding="utf-8") as f:
        return json.load(f)["duration"]


def load_dataset(
    dataset_name: str,
    tokenizer: str = "pinyin",
    dataset_type: str = "CustomDataset",
    audio_type: str = "raw",
    mel_spec_module: nn.Module | None = None,
    mel_spec_kwargs: dict | None = None,
) -> CustomDataset | HFDataset:
    """
    dataset_type    - "CustomDataset" if you want to use tokenizer name and default data path to load for train_dataset
                    - "CustomDatasetPath" if you just want to pass the full path to a preprocessed dataset without relying on tokenizer
                    - "HFDataset" to load ``{pre}/{pre}`` (split ``train.{post}``) from the Hugging Face hub,
                      where ``dataset_name`` is ``"{pre}_{post}"``

    audio_type      - "raw" or "mel"; only consulted for dataset_type "CustomDataset"

    Raises:
        ValueError: for an unknown ``dataset_type`` or ``audio_type``.
    """

    print("Loading dataset ...")

    mel_spec_kwargs = {} if mel_spec_kwargs is None else mel_spec_kwargs

    if dataset_type == "CustomDataset":
        if audio_type not in ("raw", "mel"):
            raise ValueError(f"Unsupported audio_type: {audio_type!r} (expected 'raw' or 'mel')")

        rel_data_path = str(files("f5_tts").joinpath(f"../../data/{dataset_name}_{tokenizer}"))
        if audio_type == "raw":
            train_dataset = _load_raw_split(rel_data_path)
            preprocessed_mel = False
        else:
            train_dataset = Dataset_.from_file(f"{rel_data_path}/mel.arrow")
            preprocessed_mel = True
        durations = _load_durations(rel_data_path)
        train_dataset = CustomDataset(
            train_dataset,
            durations=durations,
            preprocessed_mel=preprocessed_mel,
            mel_spec_module=mel_spec_module,
            **mel_spec_kwargs,
        )

    elif dataset_type == "CustomDatasetPath":
        train_dataset = _load_raw_split(dataset_name)
        durations = _load_durations(dataset_name)
        # this branch only ever loads raw audio, so mels are never preprocessed
        train_dataset = CustomDataset(
            train_dataset,
            durations=durations,
            preprocessed_mel=False,
            mel_spec_module=mel_spec_module,
            **mel_spec_kwargs,
        )

    elif dataset_type == "HFDataset":
        # Local import under an alias: this function shadows the Hugging Face
        # loader of the same name, so calling ``load_dataset`` here would recurse.
        from datasets import load_dataset as hf_load_dataset

        print(
            "Should manually modify the path of huggingface dataset to your need.\n"
            + "May also the corresponding script cuz different dataset may have different format."
        )
        pre, post = dataset_name.split("_")
        train_dataset = HFDataset(
            hf_load_dataset(
                f"{pre}/{pre}", split=f"train.{post}", cache_dir=str(files("f5_tts").joinpath("../../data"))
            ),
        )

    else:
        raise ValueError(
            f"Unsupported dataset_type: {dataset_type!r} "
            "(expected 'CustomDataset', 'CustomDatasetPath' or 'HFDataset')"
        )

    return train_dataset
304
+
305
+
306
+ # collation
307
+
308
+
309
def collate_fn(batch):
    """Pad a list of ``{"mel_spec", "text"}`` items into one batch.

    Mel spectrograms are right-padded with zeros along their last axis to the
    longest item and stacked. Returns a dict with ``mel``, ``mel_lengths``
    (unpadded lengths, for the padding mask), ``text`` (list, untouched) and
    ``text_lengths`` (``len`` of each text entry).
    """
    specs = [sample["mel_spec"].squeeze(0) for sample in batch]
    mel_lengths = torch.LongTensor([spec.shape[-1] for spec in specs])
    longest = mel_lengths.amax()

    mel = torch.stack([F.pad(spec, (0, longest - spec.size(-1)), value=0) for spec in specs])

    text = [sample["text"] for sample in batch]
    text_lengths = torch.LongTensor([len(entry) for entry in text])

    return {
        "mel": mel,
        "mel_lengths": mel_lengths,  # records for padding mask
        "text": text,
        "text_lengths": text_lengths,
    }
F5-TTS/src/f5_tts/model/modules.py ADDED
@@ -0,0 +1,784 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+ # flake8: noqa
10
+
11
+ from __future__ import annotations
12
+
13
+ import math
14
+ from typing import Optional
15
+
16
+ import torch
17
+ import torch.nn.functional as F
18
+ import torchaudio
19
+ from librosa.filters import mel as librosa_mel_fn
20
+ from torch import nn
21
+ from x_transformers.x_transformers import apply_rotary_pos_emb
22
+
23
+ from f5_tts.model.utils import is_package_available
24
+
25
+
26
+ # raw wav to mel spec
27
+
28
+
29
# Per-configuration caches (keyed by STFT/mel settings and device) so the mel
# filterbank and Hann window are built only once.
mel_basis_cache = {}
hann_window_cache = {}


def get_bigvgan_mel_spectrogram(
    waveform,
    n_fft=1024,
    n_mel_channels=100,
    target_sample_rate=24000,
    hop_length=256,
    win_length=1024,
    fmin=0,
    fmax=None,
    center=False,
):  # Copy from https://github.com/NVIDIA/BigVGAN/tree/main
    """Compute a log-mel spectrogram of ``waveform`` the BigVGAN way.

    The waveform is reflect-padded by ``(n_fft - hop_length) // 2`` on both
    sides, transformed with ``torch.stft``, projected onto a librosa mel
    filterbank, clamped at 1e-5 and passed through ``log``.

    NOTE(review): the ``unsqueeze(1)`` before padding suggests ``waveform`` is
    (batch, samples) -- confirm with callers.
    """
    device = waveform.device
    cache_key = f"{n_fft}_{n_mel_channels}_{target_sample_rate}_{hop_length}_{win_length}_{fmin}_{fmax}_{device}"

    if cache_key not in mel_basis_cache:
        filterbank = librosa_mel_fn(
            sr=target_sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=fmin, fmax=fmax
        )
        mel_basis_cache[cache_key] = torch.from_numpy(filterbank).float().to(device)
        hann_window_cache[cache_key] = torch.hann_window(win_length).to(device)

    pad = (n_fft - hop_length) // 2
    padded = F.pad(waveform.unsqueeze(1), (pad, pad), mode="reflect").squeeze(1)

    stft = torch.stft(
        padded,
        n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=hann_window_cache[cache_key],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    # magnitude with a small epsilon under the square root
    magnitude = torch.sqrt(torch.view_as_real(stft).pow(2).sum(-1) + 1e-9)

    mel = torch.matmul(mel_basis_cache[cache_key], magnitude)
    return torch.log(torch.clamp(mel, min=1e-5))
76
+
77
+
78
def get_vocos_mel_spectrogram(
    waveform,
    n_fft=1024,
    n_mel_channels=100,
    target_sample_rate=24000,
    hop_length=256,
    win_length=1024,
):
    """Compute a log-mel spectrogram with ``torchaudio.transforms.MelSpectrogram``.

    A 3-D input has its second axis squeezed ('b 1 nw -> b nw'); the input must
    then be 2-D. The mel magnitudes (``power=1``) are clamped at 1e-5 before
    the ``log``.
    """
    transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=target_sample_rate,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        n_mels=n_mel_channels,
        power=1,
        center=True,
        normalized=False,
        norm=None,
    )
    transform = transform.to(waveform.device)

    if waveform.dim() == 3:
        waveform = waveform.squeeze(1)  # 'b 1 nw -> b nw'

    assert waveform.dim() == 2

    return transform(waveform).clamp(min=1e-5).log()
105
+
106
+
107
class MelSpec(nn.Module):
    """Module wrapper that turns raw waveforms into log-mel spectrograms.

    ``mel_spec_type`` selects the extractor function: ``"vocos"`` or ``"bigvgan"``.
    """

    def __init__(
        self,
        n_fft=1024,
        hop_length=256,
        win_length=1024,
        n_mel_channels=100,
        target_sample_rate=24_000,
        mel_spec_type="vocos",
    ):
        super().__init__()
        # the message is a plain string so that it is attached to the AssertionError
        assert mel_spec_type in ["vocos", "bigvgan"], "We only support two extract mel backend: vocos or bigvgan"

        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.n_mel_channels = n_mel_channels
        self.target_sample_rate = target_sample_rate

        if mel_spec_type == "vocos":
            self.extractor = get_vocos_mel_spectrogram
        elif mel_spec_type == "bigvgan":
            self.extractor = get_bigvgan_mel_spectrogram

        # non-persistent buffer whose only job is to track the module's device
        self.register_buffer("dummy", torch.tensor(0), persistent=False)

    def forward(self, wav):
        """Return the mel spectrogram of ``wav``, moving this module to ``wav``'s device first if needed."""
        if self.dummy.device != wav.device:
            self.to(wav.device)

        mel = self.extractor(
            waveform=wav,
            n_fft=self.n_fft,
            n_mel_channels=self.n_mel_channels,
            target_sample_rate=self.target_sample_rate,
            hop_length=self.hop_length,
            win_length=self.win_length,
        )

        return mel
147
+
148
+
149
+ # sinusoidal position embedding
150
+
151
+
152
class SinusPositionEmbedding(nn.Module):
    """Sinusoidal embedding of a 1-D tensor of scalar positions/timesteps."""

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x, scale=1000):
        """Return ``cat(sin, cos)`` features of ``scale * x`` with ``dim // 2`` frequencies each."""
        half_dim = self.dim // 2
        # geometric frequency ladder from 1 down to 1/10000
        log_step = math.log(10000) / (half_dim - 1)
        freqs = torch.exp(torch.arange(half_dim, device=x.device).float() * -log_step)
        angles = scale * x.unsqueeze(1) * freqs.unsqueeze(0)
        return torch.cat((angles.sin(), angles.cos()), dim=-1)
165
+
166
+
167
+ # convolutional position embedding
168
+
169
+
170
class ConvPositionEmbedding(nn.Module):
    """Positional embedding from two grouped 1-D convolutions, each followed by Mish."""

    def __init__(self, dim, kernel_size=31, groups=16):
        super().__init__()
        assert kernel_size % 2 != 0  # odd kernel so "same" padding keeps the length
        layers = []
        for _ in range(2):
            layers.append(nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2))
            layers.append(nn.Mish())
        self.conv1d = nn.Sequential(*layers)

    def forward(self, x: float["b n d"], mask: bool["b n"] | None = None):
        """Apply the conv stack along the sequence axis.

        When ``mask`` is given, positions where it is False are zeroed both
        before and after the convolution.
        """
        keep = None if mask is None else mask[..., None]

        if keep is not None:
            x = x.masked_fill(~keep, 0.0)

        # convs expect channels-first: b n d -> b d n -> b n d
        out = self.conv1d(x.permute(0, 2, 1)).permute(0, 2, 1)

        if keep is not None:
            out = out.masked_fill(~keep, 0.0)

        return out
194
+
195
+
196
+ # rotary positional embedding related
197
+
198
+
199
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.0):
    """Precompute rotary-embedding tables for positions ``0 .. end - 1``.

    Returns a tensor of shape ``(end, dim)`` (for even ``dim``) whose first half
    holds cosines and second half holds sines of ``position * inverse_frequency``.
    """
    # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
    # has some connection to NTK literature
    # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
    # https://github.com/lucidrains/rotary-embedding-torch/blob/main/rotary_embedding_torch/rotary_embedding_torch.py
    theta = theta * theta_rescale_factor ** (dim / (dim - 2))
    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    inv_freq = 1.0 / (theta**exponents)
    positions = torch.arange(end, device=inv_freq.device)  # type: ignore
    angles = torch.outer(positions, inv_freq).float()  # type: ignore
    real_part = torch.cos(angles)
    imag_part = torch.sin(angles)
    return torch.cat([real_part, imag_part], dim=-1)
211
+
212
+
213
def get_pos_embed_indices(start, length, max_pos, scale=1.0):
    """Build per-row position indices ``start + floor(arange(length) * scale)``.

    ``scale`` may be a scalar or a per-row tensor. Indices at or beyond
    ``max_pos`` are replaced by ``max_pos - 1`` so lookups never run past the table.
    """
    # length = length if isinstance(length, int) else length.max()
    per_row_scale = scale * torch.ones_like(start, dtype=torch.float32)  # in case scale is a scalar
    steps = torch.arange(length, device=start.device, dtype=torch.float32)
    offsets = (steps.unsqueeze(0) * per_row_scale.unsqueeze(1)).long()
    pos = start.unsqueeze(1) + offsets
    # avoid extra long error.
    return torch.where(pos < max_pos, pos, max_pos - 1)
223
+
224
+
225
+ # Global Response Normalization layer (Instance Normalization ?)
226
+
227
+
228
class GRN(nn.Module):
    """Global Response Normalization with learnable ``gamma``/``beta`` (both initialised to zero)."""

    def __init__(self, dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.zeros(1, 1, dim))
        self.beta = nn.Parameter(torch.zeros(1, 1, dim))

    def forward(self, x):
        """Return ``gamma * (x * N(x)) + beta + x``.

        ``N(x)`` is the L2 norm over axis 1 divided by its mean over the last axis.
        """
        energy = x.norm(p=2, dim=1, keepdim=True)
        normalized = energy / (energy.mean(dim=-1, keepdim=True) + 1e-6)
        return self.gamma * (x * normalized) + self.beta + x
238
+
239
+
240
+ # ConvNeXt-V2 Block https://github.com/facebookresearch/ConvNeXt-V2/blob/main/models/convnextv2.py
241
+ # ref: https://github.com/bfs18/e2_tts/blob/main/rfwave/modules.py#L108
242
+
243
+
244
class ConvNeXtV2Block(nn.Module):
    """Residual block: depthwise conv -> LayerNorm -> linear -> GELU -> GRN -> linear."""

    def __init__(
        self,
        dim: int,
        intermediate_dim: int,
        dilation: int = 1,
    ):
        super().__init__()
        kernel_size = 7
        # "same" padding for a dilated kernel keeps the sequence length unchanged
        same_padding = (dilation * (kernel_size - 1)) // 2
        self.dwconv = nn.Conv1d(
            dim, dim, kernel_size=kernel_size, padding=same_padding, groups=dim, dilation=dilation
        )  # depthwise conv
        self.norm = nn.LayerNorm(dim, eps=1e-6)
        self.pwconv1 = nn.Linear(dim, intermediate_dim)  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.grn = GRN(intermediate_dim)
        self.pwconv2 = nn.Linear(intermediate_dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the block to ``x`` (b n d) and add the input back as a residual."""
        # the conv runs channels-first: b n d -> b d n -> b n d
        hidden = self.dwconv(x.transpose(1, 2)).transpose(1, 2)
        hidden = self.pwconv1(self.norm(hidden))
        hidden = self.grn(self.act(hidden))
        return x + self.pwconv2(hidden)
273
+
274
+
275
+ # RMSNorm
276
+
277
+
278
class RMSNorm(nn.Module):
    """Root-mean-square layer norm with a learnable per-feature ``weight``."""

    def __init__(self, dim: int, eps: float):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))
        # Feature-detect the fused kernel instead of parsing the version string:
        # float(torch.__version__[:3]) reads e.g. "2.10.0" as 2.1 and would
        # wrongly disable the native path.
        self.native_rms_norm = hasattr(F, "rms_norm")

    def forward(self, x):
        """Normalise ``x`` over its last dimension and scale by ``weight``.

        When ``weight`` is half precision (fp16/bf16) the result is cast to
        that dtype.
        """
        if self.native_rms_norm:
            if self.weight.dtype in [torch.float16, torch.bfloat16]:
                x = x.to(self.weight.dtype)
            x = F.rms_norm(x, normalized_shape=(x.shape[-1],), weight=self.weight, eps=self.eps)
        else:
            # fallback: accumulate the mean square in float32
            variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True)
            x = x * torch.rsqrt(variance + self.eps)
            if self.weight.dtype in [torch.float16, torch.bfloat16]:
                x = x.to(self.weight.dtype)
            x = x * self.weight

        return x
298
+
299
+
300
+ # AdaLayerNorm
301
+ # returns the modulated x for the attention input, plus the parameters for the later MLP modulation
302
+
303
+
304
class AdaLayerNorm(nn.Module):
    """Adaptive LayerNorm producing attention-input modulation plus MLP modulation params.

    A conditioning embedding is projected to six chunks of size ``dim``:
    shift/scale/gate for the attention branch and shift/scale/gate for the MLP branch.
    """

    def __init__(self, dim):
        super().__init__()

        self.silu = nn.SiLU()
        self.linear = nn.Linear(dim, dim * 6)

        # affine parameters are disabled; scale and shift come from the embedding
        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, emb=None):
        """Return ``(modulated_x, gate_msa, shift_mlp, scale_mlp, gate_mlp)``."""
        modulation = self.linear(self.silu(emb))
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = modulation.chunk(6, dim=1)

        normed = self.norm(x)
        # broadcast the per-sample scale/shift over the sequence dimension
        modulated = normed * (1 + scale_msa[:, None]) + shift_msa[:, None]
        return modulated, gate_msa, shift_mlp, scale_mlp, gate_mlp
319
+
320
+
321
+ # AdaLayerNorm for final layer
322
+ # returns only the modulated x for the attention input, because there is no later MLP modulation
323
+
324
+
325
class AdaLayerNorm_Final(nn.Module):
    """Adaptive LayerNorm for a final layer: only scale and shift, no gates or MLP params."""

    def __init__(self, dim):
        super().__init__()

        self.silu = nn.SiLU()
        self.linear = nn.Linear(dim, dim * 2)

        # affine parameters are disabled; scale and shift come from the embedding
        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, emb):
        """Return ``norm(x)`` modulated by the scale/shift projected from ``emb``."""
        projected = self.linear(self.silu(emb))
        scale, shift = projected.chunk(2, dim=1)

        normed = self.norm(x)
        # insert a sequence axis so per-sample modulation broadcasts over positions
        return normed * (1 + scale).unsqueeze(1) + shift.unsqueeze(1)
340
+
341
+
342
+ # FeedForward
343
+
344
+
345
class FeedForward(nn.Module):
    """Two-layer MLP: Linear -> GELU -> Dropout -> Linear.

    Args:
        dim: input feature size.
        dim_out: output feature size; falls back to ``dim`` when ``None``.
        mult: hidden width as a multiple of ``dim``.
        dropout: dropout probability applied after the activation.
        approximate: forwarded to ``nn.GELU(approximate=...)``.
    """

    def __init__(self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"):
        super().__init__()
        hidden_dim = int(dim * mult)
        if dim_out is None:
            dim_out = dim

        # the nested Sequential layout is kept so parameter names stay "ff.0.0.*" / "ff.2.*"
        self.ff = nn.Sequential(
            nn.Sequential(nn.Linear(dim, hidden_dim), nn.GELU(approximate=approximate)),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim_out),
        )

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.ff(x)
357
+
358
+
359
+ # Attention with possible joint part
360
+ # modified from diffusers/src/diffusers/models/attention_processor.py
361
+
362
+
363
class Attention(nn.Module):
    """Multi-head attention container whose computation is delegated to a processor.

    This module only owns the projection layers (and optional q/k norms); the
    actual attention math lives in ``processor``, which is called with this
    module as its first argument.

    Args:
        processor: callable implementing the attention computation.
        dim: feature size of the (noised) input ``x``.
        heads: number of attention heads.
        dim_head: feature size per head; ``inner_dim = dim_head * heads``.
        dropout: dropout probability applied after the output projection.
        context_dim: feature size of the context stream; when not ``None`` the
            extra context projections for joint attention are created.
        context_pre_only: when ``True`` no context output projection is created.
        qk_norm: ``None`` for no q/k normalization or ``"rms_norm"``.

    Raises:
        ImportError: if ``F.scaled_dot_product_attention`` is unavailable.
        ValueError: if ``qk_norm`` is neither ``None`` nor ``"rms_norm"``.
    """

    def __init__(
        self,
        processor: JointAttnProcessor | AttnProcessor,
        dim: int,
        heads: int = 8,
        dim_head: int = 64,
        dropout: float = 0.0,
        context_dim: Optional[int] = None,  # if not None -> joint attention
        context_pre_only: bool = False,
        qk_norm: Optional[str] = None,
    ):
        super().__init__()

        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("Attention requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

        self.processor = processor

        self.dim = dim
        self.heads = heads
        self.inner_dim = dim_head * heads
        self.dropout = dropout

        self.context_dim = context_dim
        self.context_pre_only = context_pre_only

        # projections for the (noised) input stream
        self.to_q = nn.Linear(dim, self.inner_dim)
        self.to_k = nn.Linear(dim, self.inner_dim)
        self.to_v = nn.Linear(dim, self.inner_dim)

        if qk_norm is None:
            self.q_norm = None
            self.k_norm = None
        elif qk_norm == "rms_norm":
            self.q_norm = RMSNorm(dim_head, eps=1e-6)
            self.k_norm = RMSNorm(dim_head, eps=1e-6)
        else:
            raise ValueError(f"Unimplemented qk_norm: {qk_norm}")

        # projections for the context stream (joint attention only)
        if self.context_dim is not None:
            self.to_q_c = nn.Linear(context_dim, self.inner_dim)
            self.to_k_c = nn.Linear(context_dim, self.inner_dim)
            self.to_v_c = nn.Linear(context_dim, self.inner_dim)
            # qk_norm was validated above, so only the two supported values reach here
            if qk_norm is None:
                self.c_q_norm = None
                self.c_k_norm = None
            elif qk_norm == "rms_norm":
                self.c_q_norm = RMSNorm(dim_head, eps=1e-6)
                self.c_k_norm = RMSNorm(dim_head, eps=1e-6)

        # kept as a ModuleList so processors index [0] (projection) and [1] (dropout)
        self.to_out = nn.ModuleList([])
        self.to_out.append(nn.Linear(self.inner_dim, dim))
        self.to_out.append(nn.Dropout(dropout))

        if self.context_dim is not None and not self.context_pre_only:
            self.to_out_c = nn.Linear(self.inner_dim, context_dim)

    def forward(
        self,
        x: float["b n d"],  # noised input x
        c: float["b n d"] = None,  # context c
        mask: bool["b n"] | None = None,
        rope=None,  # rotary position embedding for x
        c_rope=None,  # rotary position embedding for c
    ) -> torch.Tensor:
        """Dispatch to the processor; context arguments are only passed when ``c`` is given."""
        if c is not None:
            return self.processor(self, x, c=c, mask=mask, rope=rope, c_rope=c_rope)
        else:
            return self.processor(self, x, mask=mask, rope=rope)
433
+
434
+
435
+ # Attention processor
436
+
437
+ if is_package_available("flash_attn"):
438
+ from flash_attn.bert_padding import pad_input, unpad_input
439
+ from flash_attn import flash_attn_varlen_func, flash_attn_func
440
+
441
+
442
class AttnProcessor:
    """Self-attention processor for ``Attention`` with a torch or flash-attn backend.

    Args:
        pe_attn_head: number of leading heads that receive rotary embedding;
            ``None`` applies it to all heads.
        attn_backend: ``"torch"`` or ``"flash_attn"``.
        attn_mask_enabled: when ``False`` the mask is not used inside the attention
            computation (padded output positions are still zeroed at the end).

    Raises:
        ValueError: if ``attn_backend`` is not one of the supported values.
        AssertionError: if ``"flash_attn"`` is requested but the package is missing.
    """

    def __init__(
        self,
        pe_attn_head: int | None = None,  # number of attention head to apply rope, None for all
        attn_backend: str = "torch",  # "torch" or "flash_attn"
        attn_mask_enabled: bool = True,
    ):
        # Fail fast: an unknown backend would match neither branch in __call__ and the
        # un-attended input would silently flow into the output projection.
        if attn_backend not in ("torch", "flash_attn"):
            raise ValueError(f"Unsupported attn_backend: {attn_backend!r}, expected 'torch' or 'flash_attn'")

        if attn_backend == "flash_attn":
            assert is_package_available("flash_attn"), "Please install flash-attn first."

        self.pe_attn_head = pe_attn_head
        self.attn_backend = attn_backend
        self.attn_mask_enabled = attn_mask_enabled

    def __call__(
        self,
        attn: Attention,
        x: float["b n d"],  # noised input x
        mask: bool["b n"] | None = None,
        rope=None,  # rotary position embedding
    ) -> torch.FloatTensor:
        """Run self-attention on ``x`` using the projection layers owned by ``attn``.

        Args:
            attn: the ``Attention`` module providing projections, norms and head count.
            x: input sequence.
            mask: boolean mask; positions that are ``False`` are zeroed in the output.
            rope: ``(freqs, xpos_scale)`` pair for rotary embedding, or ``None``.

        Returns:
            The attended, output-projected sequence.
        """
        batch_size = x.shape[0]

        # `sample` projections
        query = attn.to_q(x)
        key = attn.to_k(x)
        value = attn.to_v(x)

        # split heads: b n (h d) -> b h n d
        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # qk norm
        if attn.q_norm is not None:
            query = attn.q_norm(query)
        if attn.k_norm is not None:
            key = attn.k_norm(key)

        # apply rotary position embedding
        if rope is not None:
            freqs, xpos_scale = rope
            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)

            if self.pe_attn_head is not None:
                # rotate only the first `pn` heads, leaving the rest position-free
                pn = self.pe_attn_head
                query[:, :pn, :, :] = apply_rotary_pos_emb(query[:, :pn, :, :], freqs, q_xpos_scale)
                key[:, :pn, :, :] = apply_rotary_pos_emb(key[:, :pn, :, :], freqs, k_xpos_scale)
            else:
                query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
                key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)

        if self.attn_backend == "torch":
            # mask. e.g. inference got a batch with different target durations, mask out the padding
            if self.attn_mask_enabled and mask is not None:
                attn_mask = mask
                attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)  # 'b n -> b 1 1 n'
                attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
            else:
                attn_mask = None
            x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
            x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)

        elif self.attn_backend == "flash_attn":
            query = query.transpose(1, 2)  # [b, h, n, d] -> [b, n, h, d]
            key = key.transpose(1, 2)
            value = value.transpose(1, 2)
            if self.attn_mask_enabled and mask is not None:
                # drop padded positions, run variable-length attention, then re-pad
                query, indices, q_cu_seqlens, q_max_seqlen_in_batch, _ = unpad_input(query, mask)
                key, _, k_cu_seqlens, k_max_seqlen_in_batch, _ = unpad_input(key, mask)
                value, _, _, _, _ = unpad_input(value, mask)
                x = flash_attn_varlen_func(
                    query,
                    key,
                    value,
                    q_cu_seqlens,
                    k_cu_seqlens,
                    q_max_seqlen_in_batch,
                    k_max_seqlen_in_batch,
                )
                x = pad_input(x, indices, batch_size, q_max_seqlen_in_batch)
                x = x.reshape(batch_size, -1, attn.heads * head_dim)
            else:
                x = flash_attn_func(query, key, value, dropout_p=0.0, causal=False)
                x = x.reshape(batch_size, -1, attn.heads * head_dim)

        x = x.to(query.dtype)

        # linear proj
        x = attn.to_out[0](x)
        # dropout
        x = attn.to_out[1](x)

        # zero the padded positions regardless of attn_mask_enabled
        if mask is not None:
            mask = mask.unsqueeze(-1)
            x = x.masked_fill(~mask, 0.0)

        return x
542
+
543
+
544
+ # Joint Attention processor for MM-DiT
545
+ # modified from diffusers/src/diffusers/models/attention_processor.py
546
+
547
+
548
class JointAttnProcessor:
    """Joint (MM-DiT style) attention over the concatenation of input and context tokens."""

    def __init__(self):
        pass

    def __call__(
        self,
        attn: Attention,
        x: float["b n d"],  # noised input x
        c: float["b nt d"] = None,  # context c, here text
        mask: bool["b n"] | None = None,
        rope=None,  # rotary position embedding for x
        c_rope=None,  # rotary position embedding for c
    ) -> torch.FloatTensor:
        """Attend jointly over ``x`` and ``c`` and return the pair ``(x_out, c_out)``.

        ``mask`` covers the ``x`` positions only; context positions are never masked.
        """
        num_x_tokens = x.shape[1]
        num_c_tokens = c.shape[1]
        batch_size = c.shape[0]
        heads = attn.heads

        # `sample` projections
        query, key, value = attn.to_q(x), attn.to_k(x), attn.to_v(x)
        # `context` projections
        c_query, c_key, c_value = attn.to_q_c(c), attn.to_k_c(c), attn.to_v_c(c)

        head_dim = key.shape[-1] // heads

        def split_heads(t):
            # b n (h d) -> b h n d
            return t.view(batch_size, -1, heads, head_dim).transpose(1, 2)

        query, key, value = split_heads(query), split_heads(key), split_heads(value)
        c_query, c_key, c_value = split_heads(c_query), split_heads(c_key), split_heads(c_value)

        # qk norm
        if attn.q_norm is not None:
            query = attn.q_norm(query)
        if attn.k_norm is not None:
            key = attn.k_norm(key)
        if attn.c_q_norm is not None:
            c_query = attn.c_q_norm(c_query)
        if attn.c_k_norm is not None:
            c_key = attn.c_k_norm(c_key)

        def rotate(q, k, rotary):
            # rotary = (freqs, xpos_scale); keys use the inverse xpos scale
            freqs, xpos_scale = rotary
            if xpos_scale is not None:
                q_scale, k_scale = xpos_scale, xpos_scale**-1.0
            else:
                q_scale, k_scale = 1.0, 1.0
            return apply_rotary_pos_emb(q, freqs, q_scale), apply_rotary_pos_emb(k, freqs, k_scale)

        # rope is applied to the noised input and the context independently
        if rope is not None:
            query, key = rotate(query, key, rope)
        if c_rope is not None:
            c_query, c_key = rotate(c_query, c_key, c_rope)

        # joint attention: context tokens are appended along the sequence axis
        query = torch.cat([query, c_query], dim=2)
        key = torch.cat([key, c_key], dim=2)
        value = torch.cat([value, c_value], dim=2)

        # mask. e.g. inference got a batch with different target durations, mask out the padding
        attn_mask = None
        if mask is not None:
            joint_mask = F.pad(mask, (0, num_c_tokens), value=True)  # no mask for c (text)
            joint_mask = joint_mask.unsqueeze(1).unsqueeze(1)  # 'b n -> b 1 1 n'
            attn_mask = joint_mask.expand(batch_size, heads, query.shape[-2], key.shape[-2])

        attended = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False
        )
        attended = attended.transpose(1, 2).reshape(batch_size, -1, heads * head_dim)
        attended = attended.to(query.dtype)

        # split the joint output back into its input and context parts
        x = attended[:, :num_x_tokens]
        c = attended[:, num_x_tokens:]

        # linear proj followed by dropout
        x = attn.to_out[1](attn.to_out[0](x))
        if not attn.context_pre_only:
            c = attn.to_out_c(c)

        if mask is not None:
            # zero padded input positions; context (text) is left unmasked
            x = x.masked_fill(~mask.unsqueeze(-1), 0.0)

        return x, c
643
+
644
+
645
+ # DiT Block
646
+
647
+
648
class DiTBlock(nn.Module):
    """DiT transformer block: adaptive-LayerNorm modulated self-attention followed by a gated MLP."""

    def __init__(
        self,
        dim,
        heads,
        dim_head,
        ff_mult=4,
        dropout=0.1,
        qk_norm=None,
        pe_attn_head=None,
        attn_backend="torch",  # "torch" or "flash_attn"
        attn_mask_enabled=True,
    ):
        super().__init__()

        processor = AttnProcessor(
            pe_attn_head=pe_attn_head,
            attn_backend=attn_backend,
            attn_mask_enabled=attn_mask_enabled,
        )

        self.attn_norm = AdaLayerNorm(dim)
        self.attn = Attention(
            processor=processor,
            dim=dim,
            heads=heads,
            dim_head=dim_head,
            dropout=dropout,
            qk_norm=qk_norm,
        )

        self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

    def forward(self, x, t, mask=None, rope=None):  # x: noised input, t: time embedding
        """Return ``x`` after the gated attention and gated feed-forward residual updates."""
        # pre-norm & modulation for the attention input; MLP modulation params come along
        attn_in, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)

        # gated attention residual
        attn_out = self.attn(x=attn_in, mask=mask, rope=rope)
        x = x + gate_msa.unsqueeze(1) * attn_out

        # modulated, gated feed-forward residual
        ff_in = self.ff_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
        x = x + gate_mlp.unsqueeze(1) * self.ff(ff_in)

        return x
695
+
696
+
697
+ # MMDiT Block https://arxiv.org/abs/2403.03206
698
+
699
+
700
class MMDiTBlock(nn.Module):
    r"""
    Two-stream MM-DiT block, modified from diffusers/src/diffusers/models/attention.py

    naming:
        _c: context stream (text, cond, etc. -- left part in sd3 fig2.b)
        _x: noised-input stream (right part)
        context_pre_only: the last layer only applies pre-norm + modulation to the
            context, since no context feed-forward follows it
    """

    def __init__(
        self, dim, heads, dim_head, ff_mult=4, dropout=0.1, context_dim=None, context_pre_only=False, qk_norm=None
    ):
        super().__init__()
        context_dim = dim if context_dim is None else context_dim
        self.context_pre_only = context_pre_only

        if context_pre_only:
            self.attn_norm_c = AdaLayerNorm_Final(context_dim)
        else:
            self.attn_norm_c = AdaLayerNorm(context_dim)
        self.attn_norm_x = AdaLayerNorm(dim)
        self.attn = Attention(
            processor=JointAttnProcessor(),
            dim=dim,
            heads=heads,
            dim_head=dim_head,
            dropout=dropout,
            context_dim=context_dim,
            context_pre_only=context_pre_only,
            qk_norm=qk_norm,
        )

        if context_pre_only:
            # the final block produces no context output, so it has no context MLP
            self.ff_norm_c = None
            self.ff_c = None
        else:
            self.ff_norm_c = nn.LayerNorm(context_dim, elementwise_affine=False, eps=1e-6)
            self.ff_c = FeedForward(dim=context_dim, mult=ff_mult, dropout=dropout, approximate="tanh")
        self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff_x = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

    def forward(self, x, c, t, mask=None, rope=None, c_rope=None):  # x: noised input, c: context, t: time embedding
        """Return ``(c, x)``; ``c`` is ``None`` when ``context_pre_only`` is set."""
        last_layer = self.context_pre_only

        # pre-norm & modulation for attention input
        if last_layer:
            c_in = self.attn_norm_c(c, t)
        else:
            c_in, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(c, emb=t)
        x_in, x_gate_msa, x_shift_mlp, x_scale_mlp, x_gate_mlp = self.attn_norm_x(x, emb=t)

        # joint attention over both streams
        x_attn, c_attn = self.attn(x=x_in, c=c_in, mask=mask, rope=rope, c_rope=c_rope)

        # context stream update
        if last_layer:
            c = None
        else:
            c = c + c_gate_msa.unsqueeze(1) * c_attn

            c_ff_in = self.ff_norm_c(c) * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
            c = c + c_gate_mlp.unsqueeze(1) * self.ff_c(c_ff_in)

        # noised-input stream update
        x = x + x_gate_msa.unsqueeze(1) * x_attn

        x_ff_in = self.ff_norm_x(x) * (1 + x_scale_mlp[:, None]) + x_shift_mlp[:, None]
        x = x + x_gate_mlp.unsqueeze(1) * self.ff_x(x_ff_in)

        return c, x
769
+
770
+
771
+ # time step conditioning embedding
772
+
773
+
774
class TimestepEmbedding(nn.Module):
    """Embed a batch of timesteps: sinusoidal features followed by a two-layer SiLU MLP."""

    def __init__(self, dim, freq_embed_dim=256):
        super().__init__()
        self.time_embed = SinusPositionEmbedding(freq_embed_dim)
        self.time_mlp = nn.Sequential(
            nn.Linear(freq_embed_dim, dim),
            nn.SiLU(),
            nn.Linear(dim, dim),
        )

    def forward(self, timestep: float["b"]):
        """Return the conditioning embedding (``b d``) for ``timestep``."""
        # cast the sinusoidal features back to the dtype of the incoming timesteps
        freq_features = self.time_embed(timestep).to(timestep.dtype)
        return self.time_mlp(freq_features)  # b d
F5-TTS/src/f5_tts/model/trainer.py ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import gc
4
+ import math
5
+ import os
6
+
7
+ import torch
8
+ import torchaudio
9
+ import wandb
10
+ from accelerate import Accelerator
11
+ from accelerate.utils import DistributedDataParallelKwargs
12
+ from ema_pytorch import EMA
13
+ from torch.optim import AdamW
14
+ from torch.optim.lr_scheduler import LinearLR, SequentialLR
15
+ from torch.utils.data import DataLoader, Dataset, SequentialSampler
16
+ from tqdm import tqdm
17
+
18
+ from f5_tts.model import CFM
19
+ from f5_tts.model.dataset import DynamicBatchSampler, collate_fn
20
+ from f5_tts.model.utils import default, exists
21
+
22
+
23
+ # trainer
24
+
25
+
26
class Trainer:
    """Training loop for a CFM model driven by HuggingFace Accelerate.

    Handles optimizer and LR-schedule setup, EMA tracking on the main process,
    checkpoint saving / rotation / resuming, optional wandb or tensorboard
    logging, and optional audio sample logging at checkpoint time.
    """

    def __init__(
        self,
        model: CFM,
        epochs,
        learning_rate,
        num_warmup_updates=20000,
        save_per_updates=1000,
        keep_last_n_checkpoints: int = -1,  # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
        checkpoint_path=None,
        batch_size_per_gpu=32,
        batch_size_type: str = "sample",
        max_samples=32,
        grad_accumulation_steps=1,
        max_grad_norm=1.0,
        noise_scheduler: str | None = None,
        duration_predictor: torch.nn.Module | None = None,
        logger: str | None = "wandb",  # "wandb" | "tensorboard" | None
        wandb_project="test_f5-tts",
        wandb_run_name="test_run",
        wandb_resume_id: str | None = None,
        log_samples: bool = False,
        last_per_updates=None,
        accelerate_kwargs: dict | None = None,
        ema_kwargs: dict | None = None,
        bnb_optimizer: bool = False,
        mel_spec_type: str = "vocos",  # "vocos" | "bigvgan"
        is_local_vocoder: bool = False,  # use local path vocoder
        local_vocoder_path: str = "",  # local vocoder path
        model_cfg_dict: dict | None = None,  # training config
    ):
        """Set up the accelerator, logging, EMA model and optimizer.

        ``accelerate_kwargs``, ``ema_kwargs`` and ``model_cfg_dict`` default to
        ``None`` (treated as empty) so no mutable default is shared between
        instances; a ``model_cfg_dict`` supplied by the caller is copied before
        the ``"gpus"`` entry is added to it.
        """
        accelerate_kwargs = {} if accelerate_kwargs is None else accelerate_kwargs
        ema_kwargs = {} if ema_kwargs is None else ema_kwargs

        ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)

        # without a wandb API key, silently fall back to no logger
        if logger == "wandb" and not wandb.api.api_key:
            logger = None
        self.log_samples = log_samples

        self.accelerator = Accelerator(
            log_with=logger if logger == "wandb" else None,
            kwargs_handlers=[ddp_kwargs],
            gradient_accumulation_steps=grad_accumulation_steps,
            **accelerate_kwargs,
        )

        self.logger = logger
        if self.logger == "wandb":
            init_kwargs = {"wandb": {"resume": "allow", "name": wandb_run_name}}
            if exists(wandb_resume_id):
                init_kwargs["wandb"]["id"] = wandb_resume_id

            if not model_cfg_dict:
                model_cfg_dict = {
                    "epochs": epochs,
                    "learning_rate": learning_rate,
                    "num_warmup_updates": num_warmup_updates,
                    "batch_size_per_gpu": batch_size_per_gpu,
                    "batch_size_type": batch_size_type,
                    "max_samples": max_samples,
                    "grad_accumulation_steps": grad_accumulation_steps,
                    "max_grad_norm": max_grad_norm,
                    "noise_scheduler": noise_scheduler,
                }
            else:
                # copy so the caller's config dict is not mutated below
                model_cfg_dict = dict(model_cfg_dict)
            model_cfg_dict["gpus"] = self.accelerator.num_processes
            self.accelerator.init_trackers(
                project_name=wandb_project,
                init_kwargs=init_kwargs,
                config=model_cfg_dict,
            )

        elif self.logger == "tensorboard":
            from torch.utils.tensorboard import SummaryWriter

            self.writer = SummaryWriter(log_dir=f"runs/{wandb_run_name}")

        self.model = model

        if self.is_main:
            # the EMA copy is only maintained on the main process
            self.ema_model = EMA(model, include_online_model=False, **ema_kwargs)
            self.ema_model.to(self.accelerator.device)

            print(f"Using logger: {logger}")
            if grad_accumulation_steps > 1:
                print(
                    "Gradient accumulation checkpointing with per_updates now, old logic per_steps used with before f992c4e"
                )

        self.epochs = epochs
        self.num_warmup_updates = num_warmup_updates
        self.save_per_updates = save_per_updates
        self.keep_last_n_checkpoints = keep_last_n_checkpoints
        self.last_per_updates = default(last_per_updates, save_per_updates)
        self.checkpoint_path = default(checkpoint_path, "ckpts/test_f5-tts")

        self.batch_size_per_gpu = batch_size_per_gpu
        self.batch_size_type = batch_size_type
        self.max_samples = max_samples
        self.grad_accumulation_steps = grad_accumulation_steps
        self.max_grad_norm = max_grad_norm

        # mel vocoder config
        self.vocoder_name = mel_spec_type
        self.is_local_vocoder = is_local_vocoder
        self.local_vocoder_path = local_vocoder_path

        self.noise_scheduler = noise_scheduler

        self.duration_predictor = duration_predictor

        if bnb_optimizer:
            import bitsandbytes as bnb

            self.optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=learning_rate)
        else:
            self.optimizer = AdamW(model.parameters(), lr=learning_rate)
        self.model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)

    @property
    def is_main(self):
        """Whether this process is the accelerator's main process."""
        return self.accelerator.is_main_process

    def save_checkpoint(self, update, last=False):
        """Save model/optimizer/EMA/scheduler state on the main process.

        Args:
            update: current update count, stored in the checkpoint and used in the file name.
            last: when ``True`` write ``model_last.pt``; otherwise write
                ``model_{update}.pt`` and rotate old checkpoints according to
                ``keep_last_n_checkpoints``.
        """
        self.accelerator.wait_for_everyone()
        if not self.is_main:
            return

        checkpoint = dict(
            model_state_dict=self.accelerator.unwrap_model(self.model).state_dict(),
            optimizer_state_dict=self.optimizer.state_dict(),
            ema_model_state_dict=self.ema_model.state_dict(),
            scheduler_state_dict=self.scheduler.state_dict(),
            update=update,
        )
        # exist_ok avoids the check-then-create race of a separate os.path.exists test
        os.makedirs(self.checkpoint_path, exist_ok=True)

        if last:
            self.accelerator.save(checkpoint, f"{self.checkpoint_path}/model_last.pt")
            print(f"Saved last checkpoint at update {update}")
            return

        if self.keep_last_n_checkpoints == 0:
            return
        self.accelerator.save(checkpoint, f"{self.checkpoint_path}/model_{update}.pt")

        if self.keep_last_n_checkpoints > 0:
            # only numbered training checkpoints rotate; "pretrained_*" files never
            # match the "model_" prefix and model_last.pt is excluded explicitly
            checkpoints = [
                f
                for f in os.listdir(self.checkpoint_path)
                if f.startswith("model_") and f.endswith(".pt") and f != "model_last.pt"
            ]
            checkpoints.sort(key=lambda x: int(x.split("_")[1].split(".")[0]))
            num_stale = max(len(checkpoints) - self.keep_last_n_checkpoints, 0)
            for oldest_checkpoint in checkpoints[:num_stale]:
                os.remove(os.path.join(self.checkpoint_path, oldest_checkpoint))
                print(f"Removed old checkpoint: {oldest_checkpoint}")

    def load_checkpoint(self):
        """Load the most recent checkpoint from ``checkpoint_path``, if there is one.

        Preference order: ``model_last.pt``, then the highest-numbered ``model_*``
        checkpoint, then any ``pretrained_*`` checkpoint.

        Returns:
            The update count to resume from; 0 when nothing usable is found or
            when only (EMA) weights without training state were loaded.
        """
        if (
            not exists(self.checkpoint_path)
            or not os.path.exists(self.checkpoint_path)
            or not any(filename.endswith((".pt", ".safetensors")) for filename in os.listdir(self.checkpoint_path))
        ):
            return 0

        self.accelerator.wait_for_everyone()
        filenames = os.listdir(self.checkpoint_path)
        if "model_last.pt" in filenames:
            latest_checkpoint = "model_last.pt"
        else:
            # pretrained models are considered for loading, but training checkpoints win
            all_checkpoints = [
                f
                for f in filenames
                if f.startswith(("model_", "pretrained_")) and f.endswith((".pt", ".safetensors"))
            ]

            training_checkpoints = [f for f in all_checkpoints if f.startswith("model_") and f != "model_last.pt"]
            if training_checkpoints:
                latest_checkpoint = sorted(
                    training_checkpoints,
                    key=lambda x: int("".join(filter(str.isdigit, x))),
                )[-1]
            else:
                # a directory holding only unrelated weight files has nothing to resume
                # from; a bare next() would leak StopIteration here
                latest_checkpoint = next((f for f in all_checkpoints if f.startswith("pretrained_")), None)
                if latest_checkpoint is None:
                    return 0

        if latest_checkpoint.endswith(".safetensors"):  # always a pretrained checkpoint
            from safetensors.torch import load_file

            checkpoint = load_file(f"{self.checkpoint_path}/{latest_checkpoint}", device="cpu")
            checkpoint = {"ema_model_state_dict": checkpoint}
        elif latest_checkpoint.endswith(".pt"):
            # loaded to CPU first; load_state_dict copies into the already-placed modules
            checkpoint = torch.load(
                f"{self.checkpoint_path}/{latest_checkpoint}", weights_only=True, map_location="cpu"
            )

        # patch for backward compatibility, 305e3ea
        for key in ["ema_model.mel_spec.mel_stft.mel_scale.fb", "ema_model.mel_spec.mel_stft.spectrogram.window"]:
            if key in checkpoint["ema_model_state_dict"]:
                del checkpoint["ema_model_state_dict"][key]

        if self.is_main:
            self.ema_model.load_state_dict(checkpoint["ema_model_state_dict"])

        if "update" in checkpoint or "step" in checkpoint:
            # full training checkpoint: restore model, optimizer and scheduler
            # patch for backward compatibility, with before f992c4e
            if "step" in checkpoint:
                checkpoint["update"] = checkpoint["step"] // self.grad_accumulation_steps
                if self.grad_accumulation_steps > 1 and self.is_main:
                    print(
                        "F5-TTS WARNING: Loading checkpoint saved with per_steps logic (before f992c4e), will convert to per_updates according to grad_accumulation_steps setting, may have unexpected behaviour."
                    )
            # patch for backward compatibility, 305e3ea
            for key in ["mel_spec.mel_stft.mel_scale.fb", "mel_spec.mel_stft.spectrogram.window"]:
                if key in checkpoint["model_state_dict"]:
                    del checkpoint["model_state_dict"][key]

            self.accelerator.unwrap_model(self.model).load_state_dict(checkpoint["model_state_dict"])
            self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
            if self.scheduler:
                self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
            update = checkpoint["update"]
        else:
            # weights-only checkpoint: derive model weights from the EMA state
            checkpoint["model_state_dict"] = {
                k.replace("ema_model.", ""): v
                for k, v in checkpoint["ema_model_state_dict"].items()
                if k not in ["initted", "update", "step"]
            }
            self.accelerator.unwrap_model(self.model).load_state_dict(checkpoint["model_state_dict"])
            update = 0

        del checkpoint
        gc.collect()
        return update

    def train(self, train_dataset: Dataset, num_workers=16, resumable_with_seed: int | None = None):
        """Run the training loop over ``train_dataset``.

        Args:
            train_dataset: dataset whose batches (via ``collate_fn``) provide
                ``"text"``, ``"mel"`` and ``"mel_lengths"``.
            num_workers: DataLoader worker count; 0 loads in the main process.
            resumable_with_seed: seed for reproducible shuffling; when given,
                already-consumed batches are skipped after resuming.

        Raises:
            ValueError: if ``batch_size_type`` is neither ``"sample"`` nor ``"frame"``.
        """
        if self.log_samples:
            from f5_tts.infer.utils_infer import cfg_strength, load_vocoder, nfe_step, sway_sampling_coef

            vocoder = load_vocoder(
                vocoder_name=self.vocoder_name, is_local=self.is_local_vocoder, local_path=self.local_vocoder_path
            )
            target_sample_rate = self.accelerator.unwrap_model(self.model).mel_spec.target_sample_rate
            log_samples_path = f"{self.checkpoint_path}/samples"
            os.makedirs(log_samples_path, exist_ok=True)

        if exists(resumable_with_seed):
            generator = torch.Generator()
            generator.manual_seed(resumable_with_seed)
        else:
            generator = None

        # DataLoader rejects persistent_workers=True when there are no workers
        persistent_workers = num_workers > 0

        if self.batch_size_type == "sample":
            train_dataloader = DataLoader(
                train_dataset,
                collate_fn=collate_fn,
                num_workers=num_workers,
                pin_memory=True,
                persistent_workers=persistent_workers,
                batch_size=self.batch_size_per_gpu,
                shuffle=True,
                generator=generator,
            )
        elif self.batch_size_type == "frame":
            self.accelerator.even_batches = False
            sampler = SequentialSampler(train_dataset)
            batch_sampler = DynamicBatchSampler(
                sampler,
                self.batch_size_per_gpu,
                max_samples=self.max_samples,
                random_seed=resumable_with_seed,  # This enables reproducible shuffling
                drop_residual=False,
            )
            train_dataloader = DataLoader(
                train_dataset,
                collate_fn=collate_fn,
                num_workers=num_workers,
                pin_memory=True,
                persistent_workers=persistent_workers,
                batch_sampler=batch_sampler,
            )
        else:
            raise ValueError(f"batch_size_type must be either 'sample' or 'frame', but received {self.batch_size_type}")

        # accelerator.prepare() dispatches batches to devices;
        # which means the length of dataloader calculated before, should consider the number of devices
        warmup_updates = (
            self.num_warmup_updates * self.accelerator.num_processes
        )  # consider a fixed warmup steps while using accelerate multi-gpu ddp
        # otherwise by default with split_batches=False, warmup steps change with num_processes
        total_updates = math.ceil(len(train_dataloader) / self.grad_accumulation_steps) * self.epochs
        decay_updates = total_updates - warmup_updates
        warmup_scheduler = LinearLR(self.optimizer, start_factor=1e-8, end_factor=1.0, total_iters=warmup_updates)
        decay_scheduler = LinearLR(self.optimizer, start_factor=1.0, end_factor=1e-8, total_iters=decay_updates)
        self.scheduler = SequentialLR(
            self.optimizer, schedulers=[warmup_scheduler, decay_scheduler], milestones=[warmup_updates]
        )
        train_dataloader, self.scheduler = self.accelerator.prepare(
            train_dataloader, self.scheduler
        )  # actual multi_gpu updates = single_gpu updates / gpu nums
        start_update = self.load_checkpoint()
        global_update = start_update

        if exists(resumable_with_seed):
            # translate the resumed update count into whole epochs plus leftover batches
            orig_epoch_step = len(train_dataloader)
            start_step = start_update * self.grad_accumulation_steps
            skipped_epoch = int(start_step // orig_epoch_step)
            skipped_batch = start_step % orig_epoch_step
            skipped_dataloader = self.accelerator.skip_first_batches(train_dataloader, num_batches=skipped_batch)
        else:
            skipped_epoch = 0

        for epoch in range(skipped_epoch, self.epochs):
            self.model.train()
            if exists(resumable_with_seed) and epoch == skipped_epoch:
                # only the first resumed epoch starts part-way through
                progress_bar_initial = math.ceil(skipped_batch / self.grad_accumulation_steps)
                current_dataloader = skipped_dataloader
            else:
                progress_bar_initial = 0
                current_dataloader = train_dataloader

            # Set epoch for the batch sampler if it exists
            if hasattr(train_dataloader, "batch_sampler") and hasattr(train_dataloader.batch_sampler, "set_epoch"):
                train_dataloader.batch_sampler.set_epoch(epoch)

            progress_bar = tqdm(
                range(math.ceil(len(train_dataloader) / self.grad_accumulation_steps)),
                desc=f"Epoch {epoch + 1}/{self.epochs}",
                unit="update",
                disable=not self.accelerator.is_local_main_process,
                initial=progress_bar_initial,
            )

            for batch in current_dataloader:
                with self.accelerator.accumulate(self.model):
                    text_inputs = batch["text"]
                    mel_spec = batch["mel"].permute(0, 2, 1)
                    mel_lengths = batch["mel_lengths"]

                    # TODO. add duration predictor training
                    if self.duration_predictor is not None and self.accelerator.is_local_main_process:
                        dur_loss = self.duration_predictor(mel_spec, lens=batch.get("durations"))
                        self.accelerator.log({"duration loss": dur_loss.item()}, step=global_update)

                    loss, cond, pred = self.model(
                        mel_spec, text=text_inputs, lens=mel_lengths, noise_scheduler=self.noise_scheduler
                    )
                    self.accelerator.backward(loss)

                    if self.max_grad_norm > 0 and self.accelerator.sync_gradients:
                        self.accelerator.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

                    self.optimizer.step()
                    self.scheduler.step()
                    self.optimizer.zero_grad()

                # sync_gradients marks the end of an accumulation window, i.e. one real update
                if self.accelerator.sync_gradients:
                    if self.is_main:
                        self.ema_model.update()

                    global_update += 1
                    progress_bar.update(1)
                    progress_bar.set_postfix(update=str(global_update), loss=loss.item())

                if self.accelerator.is_local_main_process:
                    self.accelerator.log(
                        {"loss": loss.item(), "lr": self.scheduler.get_last_lr()[0]}, step=global_update
                    )
                    if self.logger == "tensorboard":
                        self.writer.add_scalar("loss", loss.item(), global_update)
                        self.writer.add_scalar("lr", self.scheduler.get_last_lr()[0], global_update)

                if global_update % self.last_per_updates == 0 and self.accelerator.sync_gradients:
                    self.save_checkpoint(global_update, last=True)

                if global_update % self.save_per_updates == 0 and self.accelerator.sync_gradients:
                    self.save_checkpoint(global_update)

                    if self.log_samples and self.accelerator.is_local_main_process:
                        # the first item of the batch is the reference; its text is repeated
                        # so the model continues with the same content after the reference
                        ref_audio_len = mel_lengths[0]
                        infer_text = [
                            text_inputs[0] + ([" "] if isinstance(text_inputs[0], list) else " ") + text_inputs[0]
                        ]
                        with torch.inference_mode():
                            generated, _ = self.accelerator.unwrap_model(self.model).sample(
                                cond=mel_spec[0][:ref_audio_len].unsqueeze(0),
                                text=infer_text,
                                duration=ref_audio_len * 2,
                                steps=nfe_step,
                                cfg_strength=cfg_strength,
                                sway_sampling_coef=sway_sampling_coef,
                            )
                            generated = generated.to(torch.float32)
                            gen_mel_spec = generated[:, ref_audio_len:, :].permute(0, 2, 1).to(self.accelerator.device)
                            ref_mel_spec = batch["mel"][0].unsqueeze(0)
                            if self.vocoder_name == "vocos":
                                gen_audio = vocoder.decode(gen_mel_spec).cpu()
                                ref_audio = vocoder.decode(ref_mel_spec).cpu()
                            elif self.vocoder_name == "bigvgan":
                                gen_audio = vocoder(gen_mel_spec).squeeze(0).cpu()
                                ref_audio = vocoder(ref_mel_spec).squeeze(0).cpu()

                        torchaudio.save(
                            f"{log_samples_path}/update_{global_update}_gen.wav", gen_audio, target_sample_rate
                        )
                        torchaudio.save(
                            f"{log_samples_path}/update_{global_update}_ref.wav", ref_audio, target_sample_rate
                        )
                        # return to training mode after sample generation
                        self.model.train()

        self.save_checkpoint(global_update, last=True)

        self.accelerator.end_training()
F5-TTS/src/f5_tts/model/utils.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import random
5
+ from collections import defaultdict
6
+ from importlib.resources import files
7
+
8
+ import jieba
9
+ import torch
10
+ from pypinyin import Style, lazy_pinyin
11
+ from torch.nn.utils.rnn import pad_sequence
12
+
13
+
14
+ # seed everything
15
+
16
+
17
def seed_everything(seed=0):
    """Apply one seed to the stdlib, hash-seed env var and torch generators.

    Also switches cuDNN to deterministic mode and turns its autotuner
    (``benchmark``) off.

    Args:
        seed: Seed value handed to every generator. Defaults to 0.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    # The same value goes to Python's RNG, torch's CPU RNG and the CUDA RNGs.
    for seeder in (random.seed, torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
25
+
26
+
27
+ # helpers
28
+
29
+
30
def exists(v):
    """Return True for any value other than ``None`` (falsy values still count)."""
    return not (v is None)
32
+
33
+
34
def default(v, d):
    """Return ``v`` unless it is ``None``, in which case fall back to ``d``."""
    if v is None:
        return d
    return v
36
+
37
+
38
def is_package_available(package_name: str) -> bool:
    """Report whether ``package_name`` can be located by the import system.

    The package is only looked up, not imported (parents of a dotted name are
    imported by ``find_spec`` as part of the lookup).

    Args:
        package_name: Importable module/package name, e.g. ``"wandb"``.

    Returns:
        True if a module spec was found, False if not or if the lookup failed.
    """
    # `import importlib` alone does not guarantee that the `util` submodule is
    # loaded; import it explicitly so a missing attribute is never mistaken
    # for a missing package.
    import importlib.util

    try:
        return importlib.util.find_spec(package_name) is not None
    except Exception:
        # Best-effort probe: a failing lookup (e.g. missing parent package)
        # is reported as "not available" rather than propagated.
        return False
46
+
47
+
48
+ # tensor helpers
49
+
50
+
51
def lens_to_mask(t: int["b"], length: int | None = None) -> bool["b n"]:  # noqa: F722 F821
    """Build a boolean mask that is True for positions below each length.

    Args:
        t: Tensor of per-sample lengths.
        length: Number of mask columns; defaults to the largest value in ``t``.

    Returns:
        Boolean tensor where entry ``[i, j]`` is ``j < t[i]``.
    """
    if length is None:
        length = t.amax()

    positions = torch.arange(length, device=t.device)
    return positions.unsqueeze(0) < t.unsqueeze(1)
57
+
58
+
59
def mask_from_start_end_indices(seq_len: int["b"], start: int["b"], end: int["b"]):  # noqa: F722 F821
    """Build a boolean mask that is True on the half-open span ``[start, end)`` of each row.

    Args:
        seq_len: Per-sample sequence lengths; only the maximum is used, to size the mask.
        start: Per-sample first masked index (inclusive).
        end: Per-sample end index (exclusive).

    Returns:
        Boolean tensor with one row per sample and ``seq_len.max()`` columns.
    """
    longest = seq_len.max().item()
    positions = torch.arange(longest, device=start.device).long().unsqueeze(0)
    return (positions >= start.unsqueeze(1)) & (positions < end.unsqueeze(1))
65
+
66
+
67
def mask_from_frac_lengths(seq_len: int["b"], frac_lengths: float["b"]):  # noqa: F722 F821
    """Mask a randomly placed span covering a given fraction of each sequence.

    Args:
        seq_len: Per-sample sequence lengths.
        frac_lengths: Per-sample fraction of the sequence to mask.

    Returns:
        Boolean mask produced by ``mask_from_start_end_indices`` for the sampled spans.
    """
    span = (frac_lengths * seq_len).long()
    latest_start = seq_len - span

    # Uniform random start in [0, latest_start]; the clamp guards against negative starts.
    start = (latest_start * torch.rand_like(frac_lengths)).long().clamp(min=0)

    return mask_from_start_end_indices(seq_len, start, start + span)
76
+
77
+
78
def maybe_masked_mean(t: float["b n d"], mask: bool["b n"] = None) -> float["b d"]:  # noqa: F722
    """Average ``t`` over dim 1, optionally counting only positions where ``mask`` is True.

    Args:
        t: Tensor to reduce; dim 1 is averaged away.
        mask: Optional boolean tensor selecting which dim-1 positions contribute.
            When omitted, a plain mean over dim 1 is returned.

    Returns:
        Tensor with dim 1 removed. Rows whose mask is entirely False yield zeros
        (the divisor is clamped to 1).
    """
    if mask is None:
        return t.mean(dim=1)

    mask = mask[:, :, None]  # add a trailing axis so it broadcasts over features
    num = t.masked_fill(~mask, 0.0).sum(dim=1)
    # Keep the trailing axis on the count: dividing (b, d) by a bare (b,) vector
    # would broadcast along the feature axis instead of the batch axis.
    den = mask.float().sum(dim=1)

    return num / den.clamp(min=1.0)
87
+
88
+
89
+ # simple utf-8 tokenizer, since paper went character based
90
def list_str_to_tensor(text: list[str], padding_value=-1) -> int["b nt"]:  # noqa: F722
    """Encode strings as UTF-8 byte ids and pad them into one batch tensor (ByT5 style).

    Args:
        text: Strings to encode.
        padding_value: Fill value for positions past the end of a shorter string.

    Returns:
        Integer tensor with one row per string, padded to the longest byte length.
    """
    # Force an integer dtype: an empty string would otherwise produce a float32
    # tensor and, if it came first, make pad_sequence return a float batch.
    list_tensors = [torch.tensor(list(t.encode("utf-8")), dtype=torch.long) for t in text]
    return pad_sequence(list_tensors, padding_value=padding_value, batch_first=True)
94
+
95
+
96
+ # char tokenizer, based on custom dataset's extracted .txt file
97
def list_str_to_idx(
    text: list[str] | list[list[str]],
    vocab_char_map: dict[str, int],  # {char: idx}
    padding_value=-1,
) -> int["b nt"]:  # noqa: F722
    """Map token sequences to vocabulary indices and pad them into one batch tensor.

    Args:
        text: Batch of strings (iterated per character) or of token lists
            (e.g. pinyin syllables).
        vocab_char_map: Token -> index table. Unknown tokens map to index 0.
        padding_value: Fill value for positions past the end of a shorter sequence.

    Returns:
        Integer tensor with one row per sequence, padded to the longest one.
    """
    # Force an integer dtype: an empty sequence would otherwise produce a float32
    # tensor and, if it came first, make pad_sequence return a float batch.
    list_idx_tensors = [
        torch.tensor([vocab_char_map.get(c, 0) for c in t], dtype=torch.long) for t in text
    ]  # pinyin or char style
    return pad_sequence(list_idx_tensors, padding_value=padding_value, batch_first=True)
105
+
106
+
107
+ # Get tokenizer
108
+
109
+
110
def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
    """
    tokenizer   - "pinyin" do g2p for only chinese characters, need .txt vocab_file
                - "char" for char-wise tokenizer, need .txt vocab_file
                - "byte" for utf-8 tokenizer
                - "custom" if you're directly passing in a path to the vocab.txt you want to use
    vocab_size  - if use "pinyin", all available pinyin types, common alphabets (also those with accent) and symbols
                - if use "char", derived from unfiltered character & symbol counts of custom dataset
                - if use "byte", set to 256 (unicode byte range)

    Args:
        dataset_name: Dataset name used to locate ``<dataset_name>_<tokenizer>/vocab.txt``
            under the package data directory, or the vocab file path itself when
            ``tokenizer == "custom"``.
        tokenizer: One of "pinyin", "char", "byte", "custom".

    Returns:
        ``(vocab_char_map, vocab_size)``; ``vocab_char_map`` is ``None`` for "byte".

    Raises:
        ValueError: If ``tokenizer`` is not one of the supported names.
        AssertionError: For "pinyin"/"char", if the space token is not at index 0.
    """

    def _load_vocab(vocab_path):
        # One token per line; the index is the line number. Only the line
        # terminator is removed, so the " " token survives and a final line
        # without a trailing newline is not truncated.
        with open(vocab_path, "r", encoding="utf-8") as f:
            return {(line[:-1] if line.endswith("\n") else line): i for i, line in enumerate(f)}

    if tokenizer in ["pinyin", "char"]:
        tokenizer_path = os.path.join(files("f5_tts").joinpath("../../data"), f"{dataset_name}_{tokenizer}/vocab.txt")
        vocab_char_map = _load_vocab(tokenizer_path)
        vocab_size = len(vocab_char_map)
        assert vocab_char_map[" "] == 0, "make sure space is of idx 0 in vocab.txt, cuz 0 is used for unknown char"

    elif tokenizer == "byte":
        vocab_char_map = None
        vocab_size = 256

    elif tokenizer == "custom":
        vocab_char_map = _load_vocab(dataset_name)
        vocab_size = len(vocab_char_map)

    else:
        # Previously this fell through to an UnboundLocalError at the return.
        raise ValueError(f"Unknown tokenizer {tokenizer!r}; expected 'pinyin', 'char', 'byte' or 'custom'")

    return vocab_char_map, vocab_size
141
+
142
+
143
+ # convert char to pinyin
144
+
145
+
146
def convert_char_to_pinyin(text_list, polyphone=True):
    """Convert each text into a token list where Chinese characters become pinyin.

    Text is segmented with jieba; Chinese characters are replaced by TONE3-style
    pinyin syllables (via ``lazy_pinyin``) preceded by a space token, while
    single-byte characters are emitted one character per token.

    Args:
        text_list: Iterable of strings to convert.
        polyphone: When True, segments made only of 3-byte characters are
            converted as a whole segment; otherwise they go through the
            per-character branch.

    Returns:
        A list with one token list per input text.
    """
    # Initialise jieba once, with its logger set to CRITICAL first.
    if jieba.dt.initialized is False:
        jieba.default_logger.setLevel(50)  # CRITICAL
        jieba.initialize()

    final_text_list = []
    custom_trans = str.maketrans(
        {";": ",", "“": '"', "”": '"', "‘": "'", "’": "'"}
    )  # add custom trans here, to address oov

    def is_chinese(c):
        return (
            "\u3100" <= c <= "\u9fff"  # common chinese characters
        )

    for text in text_list:
        char_list = []
        text = text.translate(custom_trans)
        for seg in jieba.cut(text):
            seg_byte_len = len(bytes(seg, "UTF-8"))
            if seg_byte_len == len(seg):  # if pure alphabets and symbols
                # Separate a multi-character segment from the previous token with a space,
                # unless that token is already a space, colon or quote.
                if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
                    char_list.append(" ")
                char_list.extend(seg)
            elif polyphone and seg_byte_len == 3 * len(seg):  # if pure east asian characters
                # Whole-segment conversion; seg_ is indexed in step with the characters of seg.
                seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
                for i, c in enumerate(seg):
                    if is_chinese(c):
                        char_list.append(" ")
                    char_list.append(seg_[i])
            else:  # if mixed characters, alphabets and symbols
                for c in seg:
                    if ord(c) < 256:
                        char_list.extend(c)
                    elif is_chinese(c):
                        char_list.append(" ")
                        char_list.extend(lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True))
                    else:
                        char_list.append(c)
        final_text_list.append(char_list)

    return final_text_list
188
+
189
+
190
+ # filter func for dirty data with many repetitions
191
+
192
+
193
def repetition_found(text, length=2, tolerance=10):
    """Detect whether any substring of ``length`` characters repeats too often.

    Args:
        text: String to scan with a sliding window.
        length: Window size in characters.
        tolerance: Highest occurrence count that is still acceptable.

    Returns:
        True if some window occurs more than ``tolerance`` times, else False.
    """
    window_counts = defaultdict(int)
    for start in range(len(text) - length + 1):
        window_counts[text[start : start + length]] += 1
    return any(count > tolerance for count in window_counts.values())
202
+
203
+
204
+ # get the empirically pruned step for sampling
205
+
206
+
207
def get_epss_timesteps(n, device, dtype):
    """Return the ``n + 1`` sampling time points for an ``n``-step schedule.

    For the step counts listed in the table a hand-picked, non-uniform grid
    (in units of 1/32) is used; any other ``n`` gets a uniform grid on [0, 1].

    Args:
        n: Number of sampling steps.
        device: Device for the returned tensor.
        dtype: Dtype for the returned tensor.

    Returns:
        1-D tensor of time points running from 0 to 1.
    """
    dt = 1 / 32
    predefined_timesteps = {
        5: [0, 2, 4, 8, 16, 32],
        6: [0, 2, 4, 6, 8, 16, 32],
        7: [0, 2, 4, 6, 8, 16, 24, 32],
        10: [0, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32],
        12: [0, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32],
        16: [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32],
    }
    if n in predefined_timesteps:
        return dt * torch.tensor(predefined_timesteps[n], device=device, dtype=dtype)
    return torch.linspace(0, 1, n + 1, device=device, dtype=dtype)
F5-TTS/src/f5_tts/runtime/triton_trtllm/Dockerfile.server ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# Base image: NVIDIA Triton Inference Server, release 24.12 (py3 tag).
FROM nvcr.io/nvidia/tritonserver:24.12-py3
# Python packages added on top of the base image: Triton gRPC client, a pinned
# TensorRT-LLM and torchaudio, plus the text (jieba, pypinyin) and audio
# (librosa, vocos) libraries.
RUN pip install tritonclient[grpc] tensorrt-llm==0.16.0 torchaudio==2.5.1 jieba pypinyin librosa vocos
# Default working directory inside the container.
WORKDIR /workspace
F5-TTS/src/f5_tts/runtime/triton_trtllm/README.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Triton Inference Serving Best Practice for F5-TTS
2
+
3
+ ### Quick Start
4
+ Directly launch the service using docker compose.
5
+ ```sh
6
+ # TODO: support F5TTS_v1_Base
7
+ MODEL=F5TTS_Base docker compose up
8
+ ```
9
+
10
+ ### Build Image
11
+ Build the docker image from scratch.
12
+ ```sh
13
+ docker build . -f Dockerfile.server -t soar97/triton-f5-tts:24.12
14
+ ```
15
+
16
+ ### Create Docker Container
17
+ ```sh
18
+ your_mount_dir=/mnt:/mnt
19
+ docker run -it --name "f5-server" --gpus all --net host -v $your_mount_dir --shm-size=2g soar97/triton-f5-tts:24.12
20
+ ```
21
+
22
+ ### Export Models to TensorRT-LLM and Launch Server
23
+ Inside the Docker container, follow the official TensorRT-LLM guide to build the qwen and whisper TensorRT-LLM engines. See [here](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/whisper).
24
+ ```sh
25
+ bash run.sh 0 4 F5TTS_Base
26
+ ```
27
+
28
+ ### HTTP Client
29
+ ```sh
30
+ python3 client_http.py
31
+ ```
32
+
33
+ ### Benchmark using Client-Server Mode
34
+ ```sh
35
+ num_task=2
36
+ python3 client_grpc.py --num-tasks $num_task --huggingface-dataset yuekai/seed_tts --split-name wenetspeech4tts
37
+ ```
38
+
39
+ ### Benchmark using Offline TRT-LLM Mode
40
+ ```sh
41
+ batch_size=1
42
+ split_name=wenetspeech4tts
43
+ backend_type=trt
44
+ log_dir=./log_benchmark_batch_size_${batch_size}_${split_name}_${backend_type}
45
+ rm -r $log_dir
46
+ ln -s model_repo_f5_tts/f5_tts/1/f5_tts_trtllm.py ./
47
+ torchrun --nproc_per_node=1 \
48
+ benchmark.py --output-dir $log_dir \
49
+ --batch-size $batch_size \
50
+ --enable-warmup \
51
+ --split-name $split_name \
52
+ --model-path $F5_TTS_HF_DOWNLOAD_PATH/$model/model_1200000.pt \
53
+ --vocab-file $F5_TTS_HF_DOWNLOAD_PATH/$model/vocab.txt \
54
+ --vocoder-trt-engine-path $vocoder_trt_engine_path \
55
+ --backend-type $backend_type \
56
+ --tllm-model-dir $F5_TTS_TRT_LLM_ENGINE_PATH || exit 1
57
+ ```
58
+
59
+ ### Benchmark Results
60
+ Decoding on a single L20 GPU, using 26 different prompt_audio & target_text pairs, 16 NFE.
61
+
62
+ | Model | Concurrency | Avg Latency | RTF | Mode |
63
+ |---------------------|----------------|-------------|--------|-----------------|
64
+ | F5-TTS Base (Vocos) | 2 | 253 ms | 0.0394 | Client-Server |
65
+ | F5-TTS Base (Vocos) | 1 (Batch_size) | - | 0.0402 | Offline TRT-LLM |
66
+ | F5-TTS Base (Vocos) | 1 (Batch_size) | - | 0.1467 | Offline Pytorch |
67
+
68
+ ### Credits
69
+ 1. [F5-TTS-TRTLLM](https://github.com/Bigfishering/f5-tts-trtllm)
F5-TTS/src/f5_tts/runtime/triton_trtllm/benchmark.py ADDED
@@ -0,0 +1,560 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 Tsinghua Univ. (authors: Xingchen Song)
2
+ # 2025 (authors: Yuekai Zhang)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from https://github.com/xingchensong/S3Tokenizer/blob/main/s3tokenizer/cli.py
16
+ """ Example Usage
17
+ torchrun --nproc_per_node=1 \
18
+ benchmark.py --output-dir $log_dir \
19
+ --batch-size $batch_size \
20
+ --enable-warmup \
21
+ --split-name $split_name \
22
+ --model-path $F5_TTS_HF_DOWNLOAD_PATH/$model/model_1200000.pt \
23
+ --vocab-file $F5_TTS_HF_DOWNLOAD_PATH/$model/vocab.txt \
24
+ --vocoder-trt-engine-path $vocoder_trt_engine_path \
25
+ --backend-type $backend_type \
26
+ --tllm-model-dir $F5_TTS_TRT_LLM_ENGINE_PATH || exit 1
27
+ """
28
+
29
+ import argparse
30
+ import json
31
+ import os
32
+ import time
33
+ from typing import Dict, List, Union
34
+
35
+ import datasets
36
+ import jieba
37
+ import tensorrt as trt
38
+ import torch
39
+ import torch.distributed as dist
40
+ import torch.nn.functional as F
41
+ import torchaudio
42
+ from datasets import load_dataset
43
+ from f5_tts_trtllm import F5TTS
44
+ from huggingface_hub import hf_hub_download
45
+ from pypinyin import Style, lazy_pinyin
46
+ from tensorrt_llm._utils import trt_dtype_to_torch
47
+ from tensorrt_llm.logger import logger
48
+ from tensorrt_llm.runtime.session import Session, TensorInfo
49
+ from torch.nn.utils.rnn import pad_sequence
50
+ from torch.utils.data import DataLoader, DistributedSampler
51
+ from tqdm import tqdm
52
+ from vocos import Vocos
53
+
54
+
55
+ torch.manual_seed(0)
56
+
57
+
58
def get_args():
    """Parse the benchmark's command-line options from ``sys.argv``.

    Required options: ``--output-dir``, ``--vocab-file``, ``--model-path``,
    ``--tllm-model-dir`` and ``--batch-size``.

    Returns:
        argparse.Namespace with the parsed options.
    """
    parser = argparse.ArgumentParser(description="extract speech code")
    parser.add_argument(
        "--split-name",
        type=str,
        default="wenetspeech4tts",
        choices=["wenetspeech4tts", "test_zh", "test_en", "test_hard"],
        help="huggingface dataset split name",
    )
    parser.add_argument("--output-dir", required=True, type=str, help="dir to save result")
    parser.add_argument(
        "--vocab-file",
        required=True,
        type=str,
        help="vocab file",
    )
    parser.add_argument(
        "--model-path",
        required=True,
        type=str,
        help="model path, to load text embedding",
    )
    parser.add_argument(
        "--tllm-model-dir",
        required=True,
        type=str,
        help="tllm model dir",
    )
    parser.add_argument(
        "--batch-size",
        required=True,
        type=int,
        help="batch size (per-device) for inference",
    )
    parser.add_argument("--num-workers", type=int, default=0, help="workers for dataloader")
    parser.add_argument("--prefetch", type=int, default=None, help="prefetch for dataloader")
    parser.add_argument(
        "--vocoder",
        default="vocos",
        type=str,
        help="vocoder name",
    )
    parser.add_argument(
        "--vocoder-trt-engine-path",
        default=None,
        type=str,
        help="vocoder trt engine path",
    )
    parser.add_argument("--enable-warmup", action="store_true")
    parser.add_argument("--remove-input-padding", action="store_true")
    parser.add_argument("--use-perf", action="store_true", help="use nvtx to record performance")
    parser.add_argument(
        "--backend-type",
        type=str,
        # The default must be one of the accepted choices. argparse does not
        # validate defaults, so the former "triton" default slipped through and
        # left the caller without any backend selected.
        default="trt",
        choices=["trt", "pytorch"],
        help="backend type",
    )
    return parser.parse_args()
112
+
113
+
114
def padded_mel_batch(ref_mels, max_seq_len):
    """Zero-pad every mel to ``max_seq_len`` rows and stack them into one tensor.

    Args:
        ref_mels: Iterable of mel tensors whose first dimension is the length to pad.
        max_seq_len: Target length of the first dimension after padding.

    Returns:
        Stacked tensor with a new leading batch dimension.
    """
    # The pad spec (0, 0, 0, k) leaves the last dimension alone and appends k
    # zero rows to the second-to-last one.
    return torch.stack(
        [F.pad(mel, (0, 0, 0, max_seq_len - mel.shape[0]), value=0) for mel in ref_mels]
    )
122
+
123
+
124
def data_collator(batch, vocab_char_map, device="cuda", use_perf=False):
    """Collate dataset items into padded mel and token tensors for one batch.

    Each item is read through the keys ``id``, ``prompt_text``, ``target_text``
    and ``prompt_audio`` (with ``array`` and ``sampling_rate``).

    Args:
        batch: List of dataset items.
        vocab_char_map: Token -> index table used to encode the text.
        device: Device used for mel extraction and for the returned token tensor.
        use_perf: When True, wrap the work in NVTX ranges for profiling.

    Returns:
        Dict with ``ids``, ``ref_mel_batch``, ``ref_mel_len_batch``,
        ``text_pad_sequence`` and ``estimated_reference_target_mel_len``.
    """
    if use_perf:
        torch.cuda.nvtx.range_push("data_collator")
    target_sample_rate = 24000
    target_rms = 0.1
    ids, ref_mel_list, ref_mel_len_list = [], [], []
    estimated_reference_target_mel_len, reference_target_texts_list = [], []
    for i, item in enumerate(batch):
        prompt_text, target_text = item["prompt_text"], item["target_text"]
        ids.append(item["id"])
        reference_target_texts_list.append(prompt_text + target_text)

        prompt_audio = item["prompt_audio"]
        ref_sr = prompt_audio["sampling_rate"]
        ref_audio_org = torch.from_numpy(prompt_audio["array"]).unsqueeze(0).float()
        ref_rms = torch.sqrt(torch.mean(torch.square(ref_audio_org)))
        # Boost quiet prompts up to target_rms. A completely silent clip
        # (rms == 0) is left untouched, since scaling it would produce NaNs.
        if 0 < ref_rms < target_rms:
            ref_audio_org = ref_audio_org * target_rms / ref_rms

        if ref_sr != target_sample_rate:
            resampler = torchaudio.transforms.Resample(ref_sr, target_sample_rate)
            ref_audio = resampler(ref_audio_org)
        else:
            ref_audio = ref_audio_org

        if use_perf:
            torch.cuda.nvtx.range_push(f"mel_spectrogram {i}")
        # Honour the caller's device instead of a hard-coded "cuda".
        ref_mel = mel_spectrogram(ref_audio, vocoder="vocos", device=device)
        if use_perf:
            torch.cuda.nvtx.range_pop()
        # Drop only the leading batch axis; a bare squeeze() would also remove
        # the frame axis of a one-frame clip.
        ref_mel = ref_mel.squeeze(0)
        ref_mel_len = ref_mel.shape[0]
        assert ref_mel.shape[1] == 100

        ref_mel_list.append(ref_mel)
        ref_mel_len_list.append(ref_mel_len)

        # Estimate prompt + target length by scaling the prompt's frame count
        # with the target/prompt ratio of UTF-8 byte lengths.
        estimated_reference_target_mel_len.append(
            int(ref_mel.shape[0] * (1 + len(target_text.encode("utf-8")) / len(prompt_text.encode("utf-8"))))
        )

    max_seq_len = max(estimated_reference_target_mel_len)
    ref_mel_batch = padded_mel_batch(ref_mel_list, max_seq_len)
    ref_mel_len_batch = torch.LongTensor(ref_mel_len_list)

    pinyin_list = convert_char_to_pinyin(reference_target_texts_list, polyphone=True)
    text_pad_sequence = list_str_to_idx(pinyin_list, vocab_char_map)

    for i, item in enumerate(text_pad_sequence):
        # Pad each sample's tokens with -1 to its own estimated length, then
        # shift everything by +1 so that in-sample padding becomes 0.
        text_pad_sequence[i] = F.pad(
            item, (0, estimated_reference_target_mel_len[i] - len(item)), mode="constant", value=-1
        )
        text_pad_sequence[i] += 1  # WAR: 0 is reserved for padding token, hard coding in F5-TTS
    text_pad_sequence = pad_sequence(text_pad_sequence, padding_value=-1, batch_first=True).to(device)
    text_pad_sequence = F.pad(
        text_pad_sequence, (0, max_seq_len - text_pad_sequence.shape[1]), mode="constant", value=-1
    )
    if use_perf:
        torch.cuda.nvtx.range_pop()
    return {
        "ids": ids,
        "ref_mel_batch": ref_mel_batch,
        "ref_mel_len_batch": ref_mel_len_batch,
        "text_pad_sequence": text_pad_sequence,
        "estimated_reference_target_mel_len": estimated_reference_target_mel_len,
    }
201
+
202
+
203
def init_distributed():
    """Bind this process to its GPU and join the NCCL process group.

    World size, local rank and global rank are read from the ``WORLD_SIZE``,
    ``LOCAL_RANK`` and ``RANK`` environment variables (defaults 1, 0 and 0).

    Returns:
        Tuple ``(world_size, local_rank, rank)``.
    """
    env = os.environ
    world_size = int(env.get("WORLD_SIZE", 1))
    local_rank = int(env.get("LOCAL_RANK", 0))
    rank = int(env.get("RANK", 0))
    print(f"Inference on multiple gpus, this gpu {local_rank}, rank {rank}, world_size {world_size}")
    # Select the device before creating the process group.
    torch.cuda.set_device(local_rank)
    dist.init_process_group("nccl")
    return world_size, local_rank, rank
217
+
218
+
219
def get_tokenizer(vocab_file_path: str):
    """Load a vocabulary file that lists one token per line.

    Args:
        vocab_file_path: Path to a UTF-8 text file; a token's index is its
            zero-based line number.

    Returns:
        ``(vocab_char_map, vocab_size)``: the token -> index dict and its size.
    """
    with open(vocab_file_path, "r", encoding="utf-8") as f:
        # Remove only the line terminator, so the " " token survives and a
        # final line without a trailing newline is not truncated.
        vocab_char_map = {(line[:-1] if line.endswith("\n") else line): i for i, line in enumerate(f)}
    vocab_size = len(vocab_char_map)
    return vocab_char_map, vocab_size
235
+
236
+
237
def convert_char_to_pinyin(reference_target_texts_list, polyphone=True):
    """Convert each text into a token list where Chinese characters become pinyin.

    Text is segmented with jieba; Chinese characters are replaced by TONE3-style
    pinyin syllables (via ``lazy_pinyin``) preceded by a space token, while
    single-byte characters are emitted one character per token.

    Args:
        reference_target_texts_list: Iterable of strings to convert.
        polyphone: When True, segments made only of 3-byte characters are
            converted as a whole segment; otherwise they go through the
            per-character branch.

    Returns:
        A list with one token list per input text.
    """
    final_reference_target_texts_list = []
    custom_trans = str.maketrans(
        {";": ",", "“": '"', "”": '"', "‘": "'", "’": "'"}
    )  # add custom trans here, to address oov

    def is_chinese(c):
        return "\u3100" <= c <= "\u9fff"  # common chinese characters

    for text in reference_target_texts_list:
        char_list = []
        text = text.translate(custom_trans)
        for seg in jieba.cut(text):
            seg_byte_len = len(bytes(seg, "UTF-8"))
            if seg_byte_len == len(seg):  # if pure alphabets and symbols
                # Separate a multi-character segment from the previous token with a space,
                # unless that token is already a space, colon or quote.
                if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
                    char_list.append(" ")
                char_list.extend(seg)
            elif polyphone and seg_byte_len == 3 * len(seg):  # if pure east asian characters
                # Whole-segment conversion; seg_ is indexed in step with the characters of seg.
                seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
                for i, c in enumerate(seg):
                    if is_chinese(c):
                        char_list.append(" ")
                    char_list.append(seg_[i])
            else:  # if mixed characters, alphabets and symbols
                for c in seg:
                    if ord(c) < 256:
                        char_list.extend(c)
                    elif is_chinese(c):
                        char_list.append(" ")
                        char_list.extend(lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True))
                    else:
                        char_list.append(c)
        final_reference_target_texts_list.append(char_list)

    return final_reference_target_texts_list
273
+
274
+
275
def list_str_to_idx(
    text: Union[List[str], List[List[str]]],
    vocab_char_map: Dict[str, int],  # {char: idx}
    padding_value=-1,
):
    """Map token sequences to vocabulary indices, one unpadded tensor per sequence.

    Args:
        text: Batch of strings (iterated per character) or of token lists
            (e.g. pinyin syllables).
        vocab_char_map: Token -> index table. Unknown tokens map to index 0.
        padding_value: Unused here; no padding is applied by this function.

    Returns:
        List of 1-D integer tensors, one per input sequence.
    """
    # Force an integer dtype so an empty sequence does not become a float32 tensor.
    return [
        torch.tensor([vocab_char_map.get(c, 0) for c in t], dtype=torch.long) for t in text
    ]  # pinyin or char style
283
+
284
+
285
def load_vocoder(
    vocoder_name="vocos", is_local=False, local_path="", device="cuda", hf_cache_dir=None, vocoder_trt_engine_path=None
):
    """Create the vocoder used to turn mel spectrograms into waveforms.

    Args:
        vocoder_name: "vocos" is supported; "bigvgan" is recognised but not implemented.
        is_local: Load ``config.yaml`` / ``pytorch_model.bin`` from ``local_path``
            instead of downloading them.
        local_path: Directory holding the local Vocos files.
        device: Device the PyTorch vocoder is moved to.
        hf_cache_dir: Cache directory passed to ``hf_hub_download``.
        vocoder_trt_engine_path: If given, a ``VocosTensorRT`` wrapper around this
            engine file is returned instead of the PyTorch model.

    Returns:
        The vocoder object.

    Raises:
        NotImplementedError: For ``vocoder_name == "bigvgan"``.
        ValueError: For any other unsupported ``vocoder_name``.
    """
    if vocoder_name == "bigvgan":
        raise NotImplementedError("BigVGAN is not implemented yet")
    if vocoder_name != "vocos":
        # Previously an unknown name fell through to an UnboundLocalError at the return.
        raise ValueError(f"Unsupported vocoder: {vocoder_name!r}")

    if vocoder_trt_engine_path is not None:
        return VocosTensorRT(engine_path=vocoder_trt_engine_path)

    if is_local:
        print(f"Load vocos from local path {local_path}")
        config_path = f"{local_path}/config.yaml"
        model_path = f"{local_path}/pytorch_model.bin"
    else:
        print("Download Vocos from huggingface charactr/vocos-mel-24khz")
        repo_id = "charactr/vocos-mel-24khz"
        config_path = hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename="config.yaml")
        model_path = hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename="pytorch_model.bin")
    vocoder = Vocos.from_hparams(config_path)
    state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
    from vocos.feature_extractors import EncodecFeatures

    if isinstance(vocoder.feature_extractor, EncodecFeatures):
        # Add the feature extractor's own encodec weights to the loaded state dict
        # under the "feature_extractor.encodec." prefix before load_state_dict.
        encodec_parameters = {
            "feature_extractor.encodec." + key: value
            for key, value in vocoder.feature_extractor.encodec.state_dict().items()
        }
        state_dict.update(encodec_parameters)
    vocoder.load_state_dict(state_dict)
    return vocoder.eval().to(device)
317
+
318
+
319
def mel_spectrogram(waveform, vocoder="vocos", device="cuda"):
    """Compute a log-mel spectrogram with the settings used for the Vocos vocoder.

    Args:
        waveform: Audio tensor passed to ``torchaudio.transforms.MelSpectrogram``.
        vocoder: Vocoder the features are meant for; only "vocos" is supported.
        device: Device on which the transform runs.

    Returns:
        Log-magnitude mel tensor with dims 1 and 2 swapped relative to the
        transform's output.

    Raises:
        ValueError: If ``vocoder`` is not "vocos".
    """
    if vocoder != "vocos":
        # Previously an unknown name fell through to an UnboundLocalError below.
        raise ValueError(f"Unsupported vocoder: {vocoder!r}")

    mel_stft = torchaudio.transforms.MelSpectrogram(
        sample_rate=24000,
        n_fft=1024,
        win_length=1024,
        hop_length=256,
        n_mels=100,
        power=1,
        center=True,
        normalized=False,
        norm=None,
    ).to(device)
    mel = mel_stft(waveform.to(device))
    # Clamp before the log so zero-energy bins do not produce -inf.
    mel = mel.clamp(min=1e-5).log()
    return mel.transpose(1, 2)
335
+
336
+
337
class VocosTensorRT:
    """Vocoder wrapper that runs a serialized TensorRT engine through a TRT-LLM ``Session``."""

    def __init__(self, engine_path="./vocos_vocoder.plan", stream=None):
        """Load the engine file and remember the CUDA stream to run on.

        Args:
            engine_path: Path to the serialized engine file.
            stream: CUDA stream handle; defaults to the current torch CUDA stream.
        """
        trt_logger = trt.Logger(trt.Logger.WARNING)
        trt.init_libnvinfer_plugins(trt_logger, namespace="")
        logger.info(f"Loading vae engine from {engine_path}")
        self.engine_path = engine_path
        with open(engine_path, "rb") as engine_file:
            serialized_engine = engine_file.read()
        self.session = Session.from_serialized_engine(serialized_engine)
        if stream is None:
            stream = torch.cuda.current_stream().cuda_stream
        self.stream = stream

    def decode(self, mels):
        """Run the engine on ``mels`` and return its ``waveform`` output tensor."""
        mels = mels.contiguous()
        mel_info = TensorInfo("mel", trt.DataType.FLOAT, mels.shape)
        # Allocate one CUDA output buffer per tensor reported by shape inference.
        outputs = {}
        for info in self.session.infer_shapes([mel_info]):
            outputs[info.name] = torch.empty(tuple(info.shape), dtype=trt_dtype_to_torch(info.dtype), device="cuda")
        ok = self.session.run({"mel": mels}, outputs, self.stream)

        assert ok, "Runtime execution failed for vae session"

        return outputs["waveform"]
361
+
362
+
363
def main():
    """Run the benchmark: load model and vocoder, synthesize the split, write audio and timing.

    Generated audio is saved as ``<output-dir>/<id>.wav``; rank 0 also prints a
    timing summary and writes it to ``<output-dir>/rtf.txt``.
    """
    args = get_args()
    os.makedirs(args.output_dir, exist_ok=True)

    assert torch.cuda.is_available()
    world_size, local_rank, rank = init_distributed()
    device = torch.device(f"cuda:{local_rank}")

    vocab_char_map, vocab_size = get_tokenizer(args.vocab_file)

    tllm_model_dir = args.tllm_model_dir
    config_file = os.path.join(tllm_model_dir, "config.json")
    with open(config_file) as f:
        config = json.load(f)
    # NOTE(review): `model` is only bound when backend_type is "trt" or "pytorch".
    if args.backend_type == "trt":
        model = F5TTS(
            config, debug_mode=False, tllm_model_dir=tllm_model_dir, model_path=args.model_path, vocab_size=vocab_size
        )
    elif args.backend_type == "pytorch":
        import sys

        # Make the f5_tts package importable relative to this file's location.
        sys.path.append(f"{os.path.dirname(os.path.abspath(__file__))}/../../../../src/")
        from f5_tts.infer.utils_infer import load_model
        from f5_tts.model import DiT

        F5TTS_model_cfg = dict(
            dim=1024,
            depth=22,
            heads=16,
            ff_mult=2,
            text_dim=512,
            conv_layers=4,
            pe_attn_head=1,
            text_mask_padding=False,
        )
        model = load_model(DiT, F5TTS_model_cfg, args.model_path)

    vocoder = load_vocoder(
        vocoder_name=args.vocoder, device=device, vocoder_trt_engine_path=args.vocoder_trt_engine_path
    )

    dataset = load_dataset(
        "yuekai/seed_tts",
        split=args.split_name,
        trust_remote_code=True,
    )

    def add_estimated_duration(example):
        # Estimated output duration in seconds: prompt samples scaled by
        # (1 + target chars / prompt chars), divided by the sampling rate.
        prompt_audio_len = example["prompt_audio"]["array"].shape[0]
        scale_factor = 1 + len(example["target_text"]) / len(example["prompt_text"])
        estimated_duration = prompt_audio_len * scale_factor
        example["estimated_duration"] = estimated_duration / example["prompt_audio"]["sampling_rate"]
        return example

    dataset = dataset.map(add_estimated_duration)
    # Longest estimated items first.
    dataset = dataset.sort("estimated_duration", reverse=True)
    if args.use_perf:
        # Profiling mode: replace the dataset with 8 copies of item 24.
        # dataset_list = [dataset.select(range(1)) for i in range(16)]  # seq_len 1000
        dataset_list_short = [dataset.select([24]) for i in range(8)]  # seq_len 719
        # dataset_list_long = [dataset.select([23]) for i in range(8)]  # seq_len 2002
        # dataset = datasets.concatenate_datasets(dataset_list_short + dataset_list_long)
        dataset = datasets.concatenate_datasets(dataset_list_short)
    if world_size > 1:
        sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    else:
        # This would disable shuffling
        sampler = None

    dataloader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        sampler=sampler,
        shuffle=False,
        num_workers=args.num_workers,
        prefetch_factor=args.prefetch,
        collate_fn=lambda x: data_collator(x, vocab_char_map, use_perf=args.use_perf),
    )

    total_steps = len(dataset)

    if args.enable_warmup:
        # Warmup iterates over the whole dataloader once; outputs are discarded.
        for batch in dataloader:
            ref_mels, ref_mel_lens = batch["ref_mel_batch"].to(device), batch["ref_mel_len_batch"].to(device)
            text_pad_seq = batch["text_pad_sequence"].to(device)
            total_mel_lens = batch["estimated_reference_target_mel_len"]
            if args.backend_type == "trt":
                _ = model.sample(
                    text_pad_seq, ref_mels, ref_mel_lens, total_mel_lens, remove_input_padding=args.remove_input_padding
                )
            elif args.backend_type == "pytorch":
                with torch.inference_mode():
                    # Undo the collator's +1 shift; entries that were -1 become -2 and are reset to -1.
                    text_pad_seq -= 1
                    text_pad_seq[text_pad_seq == -2] = -1
                    total_mel_lens = torch.tensor(total_mel_lens, device=device)
                    # NOTE(review): unlike the timed loop below, this call does not pass
                    # lens=ref_mel_lens - confirm whether that is intentional.
                    generated, _ = model.sample(
                        cond=ref_mels,
                        text=text_pad_seq,
                        duration=total_mel_lens,
                        steps=16,
                        cfg_strength=2.0,
                        sway_sampling_coef=-1,
                    )

    if rank == 0:
        progress_bar = tqdm(total=total_steps, desc="Processing", unit="wavs")

    decoding_time = 0
    vocoder_time = 0
    total_duration = 0
    if args.use_perf:
        torch.cuda.cudart().cudaProfilerStart()
    # Holds the start timestamp here; overwritten with the elapsed time after the loop.
    total_decoding_time = time.time()
    for batch in dataloader:
        if args.use_perf:
            torch.cuda.nvtx.range_push("data sample")
        ref_mels, ref_mel_lens = batch["ref_mel_batch"].to(device), batch["ref_mel_len_batch"].to(device)
        text_pad_seq = batch["text_pad_sequence"].to(device)
        total_mel_lens = batch["estimated_reference_target_mel_len"]

        if args.use_perf:
            torch.cuda.nvtx.range_pop()
        if args.backend_type == "trt":
            generated, cost_time = model.sample(
                text_pad_seq,
                ref_mels,
                ref_mel_lens,
                total_mel_lens,
                remove_input_padding=args.remove_input_padding,
                use_perf=args.use_perf,
            )
        elif args.backend_type == "pytorch":
            total_mel_lens = torch.tensor(total_mel_lens, device=device)
            with torch.inference_mode():
                start_time = time.time()
                # Undo the collator's +1 shift; entries that were -1 become -2 and are reset to -1.
                text_pad_seq -= 1
                text_pad_seq[text_pad_seq == -2] = -1
                generated, _ = model.sample(
                    cond=ref_mels,
                    text=text_pad_seq,
                    duration=total_mel_lens,
                    lens=ref_mel_lens,
                    steps=16,
                    cfg_strength=2.0,
                    sway_sampling_coef=-1,
                )
                cost_time = time.time() - start_time
        decoding_time += cost_time
        vocoder_start_time = time.time()
        for i, gen in enumerate(generated):
            # Keep only the frames after the reference prompt, up to the estimated total length.
            gen = gen[ref_mel_lens[i] : total_mel_lens[i], :].unsqueeze(0)
            gen_mel_spec = gen.permute(0, 2, 1).to(torch.float32)
            if args.vocoder == "vocos":
                if args.use_perf:
                    torch.cuda.nvtx.range_push("vocoder decode")
                generated_wave = vocoder.decode(gen_mel_spec).cpu()
                if args.use_perf:
                    torch.cuda.nvtx.range_pop()
            else:
                generated_wave = vocoder(gen_mel_spec).squeeze(0).cpu()
            target_rms = 0.1
            target_sample_rate = 24_000
            # if ref_rms_list[i] < target_rms:
            #     generated_wave = generated_wave * ref_rms_list[i] / target_rms
            rms = torch.sqrt(torch.mean(torch.square(generated_wave)))
            # Scale quiet outputs up to target_rms.
            if rms < target_rms:
                generated_wave = generated_wave * target_rms / rms
            utt = batch["ids"][i]
            torchaudio.save(
                f"{args.output_dir}/{utt}.wav",
                generated_wave,
                target_sample_rate,
            )
            total_duration += generated_wave.shape[1] / target_sample_rate
        # Vocoder time covers the whole per-sample loop above, including saving the files.
        vocoder_time += time.time() - vocoder_start_time
        if rank == 0:
            progress_bar.update(world_size * len(batch["ids"]))
    total_decoding_time = time.time() - total_decoding_time
    if rank == 0:
        progress_bar.close()
        # NOTE(review): divides by zero if no audio was generated.
        rtf = total_decoding_time / total_duration
        s = f"RTF: {rtf:.4f}\n"
        s += f"total_duration: {total_duration:.3f} seconds\n"
        s += f"({total_duration / 3600:.2f} hours)\n"
        s += f"DiT time: {decoding_time:.3f} seconds ({decoding_time / 3600:.2f} hours)\n"
        s += f"Vocoder time: {vocoder_time:.3f} seconds ({vocoder_time / 3600:.2f} hours)\n"
        s += f"total decoding time: {total_decoding_time:.3f} seconds ({total_decoding_time / 3600:.2f} hours)\n"
        s += f"batch size: {args.batch_size}\n"
        print(s)

        with open(f"{args.output_dir}/rtf.txt", "w") as f:
            f.write(s)

    dist.barrier()
    dist.destroy_process_group()


if __name__ == "__main__":
    main()
F5-TTS/src/f5_tts/runtime/triton_trtllm/client_grpc.py ADDED
@@ -0,0 +1,470 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
3
+ # 2023 Nvidia (authors: Yuekai Zhang)
4
+ # 2023 Recurrent.ai (authors: Songtao Shi)
5
+ # See LICENSE for clarification regarding multiple authors
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ """
19
+ This script loads a dataset from Hugging Face and sends it to the server
20
+ for decoding, in parallel.
21
+
22
+ Usage:
23
+ num_task=2
24
+
25
+ # For offline F5-TTS
26
+ python3 client_grpc.py \
27
+ --server-addr localhost \
28
+ --model-name f5_tts \
29
+ --num-tasks $num_task \
30
+ --huggingface-dataset yuekai/seed_tts \
31
+ --split-name test_zh \
32
+ --log-dir ./log_concurrent_tasks_${num_task}
33
+
34
+ # For offline Spark-TTS-0.5B
35
+ python3 client_grpc.py \
36
+ --server-addr localhost \
37
+ --model-name spark_tts \
38
+ --num-tasks $num_task \
39
+ --huggingface-dataset yuekai/seed_tts \
40
+ --split-name wenetspeech4tts \
41
+ --log-dir ./log_concurrent_tasks_${num_task}
42
+ """
43
+
44
+ import argparse
45
+ import asyncio
46
+ import json
47
+ import os
48
+ import time
49
+ import types
50
+ from pathlib import Path
51
+
52
+ import numpy as np
53
+ import soundfile as sf
54
+ import tritonclient
55
+ import tritonclient.grpc.aio as grpcclient
56
+ from tritonclient.utils import np_to_triton_dtype
57
+
58
+
59
def write_triton_stats(stats, summary_file):
    """Write a human-readable summary of Triton inference statistics.

    Args:
        stats: Mapping with a ``"model_stats"`` list, in the layout produced by
            ``triton_client.get_inference_statistics(..., as_json=True)``.
            Counters are converted with ``int()``, so they may be ints or
            numeric strings.
        summary_file: Path of the text file to create or overwrite.

    Model entries without a ``"last_inference"`` key are skipped. Batch
    entries whose execution count is zero are skipped too, because no average
    can be computed for them.
    """
    with open(summary_file, "w", encoding="utf-8") as summary_f:
        model_stats = stats["model_stats"]
        # Explanatory preamble so the file can be read on its own.
        summary_f.write(
            "The log is parsing from triton_client.get_inference_statistics(), to better human readability. \n"
        )
        summary_f.write("To learn more about the log, please refer to: \n")
        summary_f.write("1. https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md \n")
        summary_f.write("2. https://github.com/triton-inference-server/server/issues/5374 \n\n")
        summary_f.write(
            "To better improve throughput, we always would like let requests wait in the queue for a while, and then execute them with a larger batch size. \n"
        )
        summary_f.write(
            "However, there is a trade-off between the increased queue time and the increased batch size. \n"
        )
        summary_f.write(
            "You may change 'max_queue_delay_microseconds' and 'preferred_batch_size' in the model configuration file to achieve this. \n"
        )
        summary_f.write(
            "See https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#delayed-batching for more details. \n\n"
        )
        for model_state in model_stats:
            if "last_inference" not in model_state:
                continue
            summary_f.write(f"model name is {model_state['name']} \n")
            model_inference_stats = model_state["inference_stats"]
            # Cumulative timers are reported in nanoseconds; convert to seconds.
            total_queue_time_s = int(model_inference_stats["queue"]["ns"]) / 1e9
            total_infer_time_s = int(model_inference_stats["compute_infer"]["ns"]) / 1e9
            total_input_time_s = int(model_inference_stats["compute_input"]["ns"]) / 1e9
            total_output_time_s = int(model_inference_stats["compute_output"]["ns"]) / 1e9
            summary_f.write(
                f"queue time {total_queue_time_s:<5.2f} s, compute infer time {total_infer_time_s:<5.2f} s, compute input time {total_input_time_s:<5.2f} s, compute output time {total_output_time_s:<5.2f} s \n"  # noqa
            )
            model_batch_stats = model_state["batch_stats"]
            for batch in model_batch_stats:
                batch_size = int(batch["batch_size"])
                compute_input = batch["compute_input"]
                compute_output = batch["compute_output"]
                compute_infer = batch["compute_infer"]
                batch_count = int(compute_infer["count"])
                assert compute_infer["count"] == compute_output["count"] == compute_input["count"]
                if batch_count == 0:
                    # Nothing was executed at this batch size; the averages
                    # below would divide by zero.
                    continue
                # Per-batch timers are converted from nanoseconds to milliseconds.
                compute_infer_time_ms = int(compute_infer["ns"]) / 1e6
                compute_input_time_ms = int(compute_input["ns"]) / 1e6
                compute_output_time_ms = int(compute_output["ns"]) / 1e6
                summary_f.write(
                    f"execuate inference with batch_size {batch_size:<2} total {batch_count:<5} times, total_infer_time {compute_infer_time_ms:<9.2f} ms, avg_infer_time {compute_infer_time_ms:<9.2f}/{batch_count:<5}={compute_infer_time_ms / batch_count:.2f} ms, avg_infer_time_per_sample {compute_infer_time_ms:<9.2f}/{batch_count:<5}/{batch_size}={compute_infer_time_ms / batch_count / batch_size:.2f} ms \n"  # noqa
                )
                summary_f.write(
                    f"input {compute_input_time_ms:<9.2f} ms, avg {compute_input_time_ms / batch_count:.2f} ms, "  # noqa
                )
                summary_f.write(
                    f"output {compute_output_time_ms:<9.2f} ms, avg {compute_output_time_ms / batch_count:.2f} ms \n"  # noqa
                )
113
+
114
+
115
def get_args():
    """Parse the command-line options of the gRPC benchmark client.

    Returns:
        argparse.Namespace: Parsed options. Option names, types, defaults and
        choices are unchanged; only the help texts were corrected.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        "--server-addr",
        type=str,
        default="localhost",
        help="Address of the server",
    )

    parser.add_argument(
        "--server-port",
        type=int,
        default=8001,
        help="gRPC port of the Triton server",
    )

    parser.add_argument(
        "--reference-audio",
        type=str,
        default=None,
        help="Path to a single reference audio file. When given, only this one item is synthesized.",
    )

    parser.add_argument(
        "--reference-text",
        type=str,
        default="",
        help="Transcript of the reference audio (used together with --reference-audio)",
    )

    parser.add_argument(
        "--target-text",
        type=str,
        default="",
        help="Text to synthesize (used together with --reference-audio)",
    )

    parser.add_argument(
        "--huggingface-dataset",
        type=str,
        default="yuekai/seed_tts",
        help="dataset name in huggingface dataset hub",
    )

    parser.add_argument(
        "--split-name",
        type=str,
        default="wenetspeech4tts",
        choices=["wenetspeech4tts", "test_zh", "test_en", "test_hard"],
        help="dataset split name",
    )

    parser.add_argument(
        "--manifest-path",
        type=str,
        default=None,
        help="Path to a manifest file with one 'utt|prompt_text|prompt_wav|target_text' entry per line",
    )

    parser.add_argument(
        "--model-name",
        type=str,
        default="f5_tts",
        choices=["f5_tts", "spark_tts"],
        help="Name of the model in the Triton model repository to request",
    )

    parser.add_argument(
        "--num-tasks",
        type=int,
        default=1,
        help="Number of concurrent tasks for sending",
    )

    parser.add_argument(
        "--log-interval",
        type=int,
        default=5,
        help="Controls how frequently we print the log.",
    )

    parser.add_argument(
        "--compute-wer",
        action="store_true",
        default=False,
        help="True to compute WER.",
    )

    parser.add_argument(
        "--log-dir",
        type=str,
        required=False,
        default="./tmp",
        help="log directory",
    )

    parser.add_argument(
        "--batch-size",
        type=int,
        default=1,
        help="Inference batch_size per request for offline mode.",
    )

    return parser.parse_args()
221
+
222
+
223
def load_audio(wav_path, target_sample_rate=24000):
    """Load a reference waveform and resample it to ``target_sample_rate``.

    Args:
        wav_path: Either a path readable by ``soundfile`` or a dict with
            ``"array"`` and ``"sampling_rate"`` keys.
        target_sample_rate: Must be 24000; any other value fails the assert.

    Returns:
        ``(waveform, target_sample_rate)`` where ``waveform`` is a NumPy array.
        Multi-channel files are averaged down to one channel, because the
        callers in this file expect a 1-D signal.
    """
    assert target_sample_rate == 24000, "hard coding in server"
    if isinstance(wav_path, dict):
        # Accept plain lists as well as arrays.
        waveform = np.asarray(wav_path["array"])
        sample_rate = wav_path["sampling_rate"]
    else:
        waveform, sample_rate = sf.read(wav_path)
        if waveform.ndim > 1:
            # soundfile returns (frames, channels) for multi-channel files.
            waveform = waveform.mean(axis=1)
    if sample_rate != target_sample_rate:
        from scipy.signal import resample

        num_samples = int(len(waveform) * (target_sample_rate / sample_rate))
        waveform = resample(waveform, num_samples)
    return waveform, target_sample_rate
236
+
237
+
238
async def send(
    manifest_item_list: list,
    name: str,
    triton_client: "tritonclient.grpc.aio.InferenceServerClient",
    protocol_client: types.ModuleType,
    log_interval: int,
    model_name: str,
    padding_duration: int = None,
    audio_save_dir: str = "./",
    save_sample_rate: int = 24000,
):
    """Send every manifest item to the server one after another and save the audio.

    Args:
        manifest_item_list: Items with ``audio_filepath``, ``reference_text``,
            ``target_text`` and ``target_audio_path`` keys.
        name: Task name of the form ``"task-<int>"``; the integer after the
            first five characters is used to build request ids.
        triton_client: Client whose ``infer`` coroutine is awaited per item.
        protocol_client: Module providing ``InferInput`` and
            ``InferRequestedOutput``.
        log_interval: Print progress every ``log_interval`` items; ``0``
            disables progress output.
        model_name: Name of the model to request.
        padding_duration: When truthy, the reference audio is zero-padded to a
            multiple of this many seconds that covers the reference plus the
            estimated target duration.
        audio_save_dir: Directory where ``<target_audio_path>.wav`` is written.
        save_sample_rate: Sample rate used when writing the returned audio.

    Returns:
        ``(total_duration, latency_data)``: total seconds of returned audio and
        a list of ``(latency_seconds, audio_seconds)`` tuples, one per item.
    """
    total_duration = 0.0
    latency_data = []
    task_id = int(name[5:])
    num_items = len(manifest_item_list)

    print(f"manifest_item_list: {manifest_item_list}")
    for i, item in enumerate(manifest_item_list):
        if log_interval and i % log_interval == 0:
            print(f"{name}: {i}/{num_items}")
        waveform, sample_rate = load_audio(item["audio_filepath"], target_sample_rate=24000)
        duration = len(waveform) / sample_rate
        lengths = np.array([[len(waveform)]], dtype=np.int32)

        reference_text, target_text = item["reference_text"], item["target_text"]

        if padding_duration:
            # Estimate target duration from the reference speaking rate. With
            # an empty reference text no rate is available, so fall back to 0
            # and let the padding cover only the reference audio.
            if reference_text:
                estimated_target_duration = duration / len(reference_text) * len(target_text)
            else:
                estimated_target_duration = 0.0
            # Pad up to the next multiple of `padding_duration` seconds.
            padded_seconds = padding_duration * (
                (int(estimated_target_duration + duration) // padding_duration) + 1
            )
            samples = np.zeros((1, padded_seconds * sample_rate), dtype=np.float32)
            samples[0, : len(waveform)] = waveform
        else:
            samples = np.asarray(waveform)

        samples = samples.reshape(1, -1).astype(np.float32)

        inputs = [
            protocol_client.InferInput("reference_wav", samples.shape, np_to_triton_dtype(samples.dtype)),
            protocol_client.InferInput("reference_wav_len", lengths.shape, np_to_triton_dtype(lengths.dtype)),
            protocol_client.InferInput("reference_text", [1, 1], "BYTES"),
            protocol_client.InferInput("target_text", [1, 1], "BYTES"),
        ]
        inputs[0].set_data_from_numpy(samples)
        inputs[1].set_data_from_numpy(lengths)

        reference_text_array = np.array([reference_text], dtype=object).reshape((1, 1))
        inputs[2].set_data_from_numpy(reference_text_array)

        target_text_array = np.array([target_text], dtype=object).reshape((1, 1))
        inputs[3].set_data_from_numpy(target_text_array)

        outputs = [protocol_client.InferRequestedOutput("waveform")]

        # A stride of 10 per task made ids collide as soon as a task handled
        # more than 10 items; use a stride far larger than any realistic share.
        sequence_id = 100000000 + task_id * 1000000 + i
        # perf_counter is monotonic, so latency is unaffected by clock changes.
        start = time.perf_counter()
        response = await triton_client.infer(model_name, inputs, request_id=str(sequence_id), outputs=outputs)

        audio = response.as_numpy("waveform").reshape(-1)

        latency = time.perf_counter() - start

        audio_save_path = os.path.join(audio_save_dir, f"{item['target_audio_path']}.wav")
        sf.write(audio_save_path, audio, save_sample_rate, "PCM_16")

        actual_duration = len(audio) / save_sample_rate
        latency_data.append((latency, actual_duration))
        total_duration += actual_duration

    return total_duration, latency_data
318
+
319
+
320
def load_manifests(manifest_path):
    """Read a ``|``-separated manifest file into a list of item dicts.

    Each non-empty line must have exactly four fields:
    ``utt|prompt_text|prompt_wav|target_text``. A relative ``prompt_wav`` is
    resolved against the directory containing the manifest.

    Args:
        manifest_path: Path to the manifest file (read as UTF-8).

    Returns:
        list[dict]: One dict per entry with ``audio_filepath``,
        ``reference_text``, ``target_text`` and ``target_audio_path`` (the
        stem of ``utt``).

    Raises:
        ValueError: If a non-empty line does not have exactly four fields.
    """
    manifest_dir = os.path.dirname(manifest_path)
    manifest_list = []
    # Manifests carry transcripts, so decode explicitly as UTF-8 instead of
    # relying on the platform default encoding.
    with open(manifest_path, "r", encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            fields = line.split("|")
            if len(fields) != 4:
                raise ValueError(
                    f"{manifest_path}:{line_no}: expected 4 '|'-separated fields, got {len(fields)}"
                )
            utt, prompt_text, prompt_wav, gt_text = fields
            utt = Path(utt).stem
            if not os.path.isabs(prompt_wav):
                prompt_wav = os.path.join(manifest_dir, prompt_wav)
            manifest_list.append(
                {
                    "audio_filepath": prompt_wav,
                    "reference_text": prompt_text,
                    "target_text": gt_text,
                    "target_audio_path": utt,
                }
            )
    return manifest_list
339
+
340
+
341
def split_data(data, k):
    """Split ``data`` into ``k`` contiguous chunks of near-equal size.

    The first ``len(data) % k`` chunks receive one extra element. If ``data``
    has fewer than ``k`` elements, a warning is printed and one chunk per
    element is returned.

    Args:
        data: Sliceable sequence to split.
        k: Requested number of chunks.

    Returns:
        list: The chunks in order; an empty list when ``data`` is empty.

    Raises:
        ValueError: If ``data`` is non-empty and ``k`` is smaller than 1.
    """
    n = len(data)
    if n == 0:
        # Nothing to distribute; previously this path divided by zero.
        return []
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    if n < k:
        print(f"Warning: the length of the input list ({n}) is less than k ({k}). Setting k to {n}.")
        k = n

    quotient, remainder = divmod(n, k)

    result = []
    start = 0
    for i in range(k):
        end = start + quotient + (1 if i < remainder else 0)
        result.append(data[start:end])
        start = end

    return result
362
+
363
+
364
async def main():
    """Run the benchmark: load inputs, fan requests out to tasks, write reports.

    The input source is chosen in this order: ``--reference-audio`` (a single
    item), ``--manifest-path`` (a manifest file), then
    ``--huggingface-dataset``. The manifest is checked before the dataset
    because the dataset option has a non-empty default and would otherwise
    always win, making ``--manifest-path`` unreachable.

    Writes ``rtf-<name>.txt``, ``stats_summary-<name>.txt`` and
    ``model_config-<name>.json`` into ``--log-dir``.

    Raises:
        ValueError: If no input source is configured or it yields no items.
    """
    args = get_args()
    url = f"{args.server_addr}:{args.server_port}"

    if args.reference_audio:
        args.num_tasks = 1
        args.log_interval = 1
        manifest_item_list = [
            {
                "reference_text": args.reference_text,
                "target_text": args.target_text,
                "audio_filepath": args.reference_audio,
                "target_audio_path": "test",
            }
        ]
    elif args.manifest_path:
        manifest_item_list = load_manifests(args.manifest_path)
    elif args.huggingface_dataset:
        import datasets

        # NOTE(review): trust_remote_code=True lets the dataset repository run
        # its own loading code locally; only use it with trusted datasets.
        dataset = datasets.load_dataset(
            args.huggingface_dataset,
            split=args.split_name,
            trust_remote_code=True,
        )
        # Iterate rows once instead of indexing the dataset four times per row.
        manifest_item_list = [
            {
                "audio_filepath": row["prompt_audio"],
                "reference_text": row["prompt_text"],
                "target_audio_path": row["id"],
                "target_text": row["target_text"],
            }
            for row in dataset
        ]
    else:
        raise ValueError("One of --reference-audio, --manifest-path or --huggingface-dataset is required")

    if not manifest_item_list:
        raise ValueError("No items to synthesize: the selected input source is empty")

    # At least one task, and never more tasks than items.
    args.num_tasks = max(1, min(args.num_tasks, len(manifest_item_list)))
    manifest_item_list = split_data(manifest_item_list, args.num_tasks)

    os.makedirs(args.log_dir, exist_ok=True)

    triton_client = grpcclient.InferenceServerClient(url=url, verbose=False)
    protocol_client = grpcclient
    try:
        start_time = time.time()
        tasks = [
            asyncio.create_task(
                send(
                    chunk,
                    name=f"task-{i}",
                    triton_client=triton_client,
                    protocol_client=protocol_client,
                    log_interval=args.log_interval,
                    model_name=args.model_name,
                    audio_save_dir=args.log_dir,
                    padding_duration=1,
                    save_sample_rate=24000,
                )
            )
            for i, chunk in enumerate(manifest_item_list)
        ]

        ans_list = await asyncio.gather(*tasks)

        elapsed = time.time() - start_time

        total_duration = 0.0
        latency_data = []
        for task_duration, task_latencies in ans_list:
            total_duration += task_duration
            latency_data += task_latencies

        # Guard against a run in which the server returned no audio at all.
        rtf = elapsed / total_duration if total_duration > 0 else float("inf")

        s = f"RTF: {rtf:.4f}\n"
        s += f"total_duration: {total_duration:.3f} seconds\n"
        s += f"({total_duration / 3600:.2f} hours)\n"
        s += f"processing time: {elapsed:.3f} seconds ({elapsed / 3600:.2f} hours)\n"

        latency_list = [chunk_end for (chunk_end, chunk_duration) in latency_data]
        latency_ms = sum(latency_list) / float(len(latency_list)) * 1000.0
        # NOTE(review): this is the variance of latencies in seconds scaled by
        # 1000, which is neither s^2 nor ms^2 — confirm the intended unit.
        latency_variance = np.var(latency_list, dtype=np.float64) * 1000.0
        s += f"latency_variance: {latency_variance:.2f}\n"
        s += f"latency_50_percentile_ms: {np.percentile(latency_list, 50) * 1000.0:.2f}\n"
        s += f"latency_90_percentile_ms: {np.percentile(latency_list, 90) * 1000.0:.2f}\n"
        s += f"latency_95_percentile_ms: {np.percentile(latency_list, 95) * 1000.0:.2f}\n"
        s += f"latency_99_percentile_ms: {np.percentile(latency_list, 99) * 1000.0:.2f}\n"
        s += f"average_latency_ms: {latency_ms:.2f}\n"

        print(s)
        # Report files are named after the manifest when one is given,
        # otherwise after the dataset split.
        name = Path(args.manifest_path).stem if args.manifest_path else args.split_name
        with open(f"{args.log_dir}/rtf-{name}.txt", "w", encoding="utf-8") as f:
            f.write(s)

        stats = await triton_client.get_inference_statistics(model_name="", as_json=True)
        write_triton_stats(stats, f"{args.log_dir}/stats_summary-{name}.txt")

        metadata = await triton_client.get_model_config(model_name=args.model_name, as_json=True)
        with open(f"{args.log_dir}/model_config-{name}.json", "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=4)
    finally:
        # Release the gRPC channel even when a request fails.
        await triton_client.close()


if __name__ == "__main__":
    asyncio.run(main())
F5-TTS/src/f5_tts/runtime/triton_trtllm/client_http.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ import argparse
27
+
28
+ import numpy as np
29
+ import requests
30
+ import soundfile as sf
31
+
32
+
33
def get_args():
    """Parse the command-line options of the HTTP example client.

    Returns:
        argparse.Namespace: Parsed options. Option names, types, defaults and
        choices are unchanged; only the help texts were corrected.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        "--server-url",
        type=str,
        default="localhost:8000",
        help="Address of the server",
    )

    parser.add_argument(
        "--reference-audio",
        type=str,
        default="../../infer/examples/basic/basic_ref_en.wav",
        help="Path to a single reference audio file",
    )

    parser.add_argument(
        "--reference-text",
        type=str,
        default="Some call me nature, others call me mother nature.",
        help="Transcript of the reference audio",
    )

    parser.add_argument(
        "--target-text",
        type=str,
        default="I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring.",
        help="Text to synthesize",
    )

    parser.add_argument(
        "--model-name",
        type=str,
        default="f5_tts",
        choices=["f5_tts", "spark_tts"],
        help="triton model_repo module name to request",
    )

    parser.add_argument(
        "--output-audio",
        type=str,
        default="output.wav",
        help="Path to save the output audio",
    )
    return parser.parse_args()
79
+
80
+
81
def prepare_request(
    samples,
    reference_text,
    target_text,
    sample_rate=24000,
    audio_save_dir: str = "./",
):
    """Build the JSON-serialisable payload for an inference request.

    Args:
        samples: 1-D array holding the reference waveform.
        reference_text: Transcript of the reference waveform.
        target_text: Text to synthesize.
        sample_rate: Not used by this function.
        audio_save_dir: Not used by this function.

    Returns:
        dict: ``{"inputs": [...]}`` with four tensor descriptions
        (``reference_wav``, ``reference_wav_len``, ``reference_text``,
        ``target_text``), each with ``name``, ``shape``, ``datatype`` and
        ``data`` entries.
    """
    assert len(samples.shape) == 1, "samples should be 1D"

    def _tensor(name, shape, datatype, data):
        # One tensor description, with keys in a fixed order.
        return {"name": name, "shape": shape, "datatype": datatype, "data": data}

    num_samples = len(samples)
    wav = samples.reshape(1, -1).astype(np.float32)
    wav_len = np.array([[num_samples]], dtype=np.int32)

    return {
        "inputs": [
            _tensor("reference_wav", wav.shape, "FP32", wav.tolist()),
            _tensor("reference_wav_len", wav_len.shape, "INT32", wav_len.tolist()),
            _tensor("reference_text", [1, 1], "BYTES", [reference_text]),
            _tensor("target_text", [1, 1], "BYTES", [target_text]),
        ]
    }
107
+
108
+
109
def load_audio(wav_path, target_sample_rate=24000):
    """Load a reference waveform and resample it to ``target_sample_rate``.

    Args:
        wav_path: Either a path readable by ``soundfile`` or a dict with
            ``"array"`` and ``"sampling_rate"`` keys.
        target_sample_rate: Must be 24000; any other value fails the assert.

    Returns:
        ``(samples, target_sample_rate)`` where ``samples`` is a NumPy array.
        Multi-channel files are averaged down to one channel, because
        ``prepare_request`` requires a 1-D signal.
    """
    assert target_sample_rate == 24000, "hard coding in server"
    if isinstance(wav_path, dict):
        # Accept plain lists as well as arrays.
        samples = np.asarray(wav_path["array"])
        sample_rate = wav_path["sampling_rate"]
    else:
        samples, sample_rate = sf.read(wav_path)
        if samples.ndim > 1:
            # soundfile returns (frames, channels) for multi-channel files.
            samples = samples.mean(axis=1)
    if sample_rate != target_sample_rate:
        from scipy.signal import resample

        num_samples = int(len(samples) * (target_sample_rate / sample_rate))
        samples = resample(samples, num_samples)
    return samples, target_sample_rate
122
+
123
+
124
if __name__ == "__main__":
    # Send one synthesis request over HTTP and save the returned waveform.
    args = get_args()
    server_url = args.server_url
    if not server_url.startswith(("http://", "https://")):
        # Bare "host:port" values default to plain HTTP.
        server_url = f"http://{server_url}"

    url = f"{server_url}/v2/models/{args.model_name}/infer"
    samples, sr = load_audio(args.reference_audio)
    assert sr == 24000, "sample rate hardcoded in server"

    samples = np.array(samples, dtype=np.float32)
    data = prepare_request(samples, args.reference_text, args.target_text)

    # NOTE(review): verify=False disables TLS certificate verification for
    # https:// URLs; kept as in the original, confirm that this is intended.
    rsp = requests.post(
        url, headers={"Content-Type": "application/json"}, json=data, verify=False, params={"request_id": "0"}
    )
    if not rsp.ok:
        # Report the server's message instead of failing later with an
        # opaque KeyError on the missing "outputs" field.
        raise RuntimeError(f"Inference request failed with HTTP {rsp.status_code}: {rsp.text}")
    result = rsp.json()
    audio = result["outputs"][0]["data"]
    audio = np.array(audio, dtype=np.float32)
    sf.write(args.output_audio, audio, 24000, "PCM_16")
F5-TTS/src/f5_tts/runtime/triton_trtllm/docker-compose.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ tts:
3
+ image: soar97/triton-f5-tts:24.12
4
+ shm_size: '1gb'
5
+ ports:
6
+ - "8000:8000"
7
+ - "8001:8001"
8
+ - "8002:8002"
9
+ environment:
10
+ - PYTHONIOENCODING=utf-8
11
+ - MODEL_ID=${MODEL_ID}
12
+ deploy:
13
+ resources:
14
+ reservations:
15
+ devices:
16
+ - driver: nvidia
17
+ device_ids: ['0']
18
+ capabilities: [gpu]
19
+ command: >
20
+ /bin/bash -c "pip install vocos && rm -rf F5-TTS && git clone https://github.com/SWivid/F5-TTS.git && cd F5-TTS/src/f5_tts/runtime/triton_trtllm/ && bash run.sh 0 4 $MODEL"