aifeifei798 committed
Commit 7f42fd2 · verified · Parent(s): 23a9924

Upload 67 files

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.

Files changed (50)
  1. LICENSE +202 -0
  2. README.md +236 -4
  3. apg_guidance.py +95 -0
  4. app.py +45 -0
  5. config/zh_rap_lora_config.json +15 -0
  6. data_sampler.py +30 -0
  7. examples/default/input_params/output_20250426071706_0_input_params.json +25 -0
  8. examples/default/input_params/output_20250426071812_0_input_params.json +25 -0
  9. examples/default/input_params/output_20250426072346_0_input_params.json +25 -0
  10. examples/default/input_params/output_20250426072508_0_input_params.json +25 -0
  11. examples/default/input_params/output_20250426073829_0_input_params.json +25 -0
  12. examples/default/input_params/output_20250426074037_0_input_params.json +25 -0
  13. examples/default/input_params/output_20250426074214_0_input_params.json +25 -0
  14. examples/default/input_params/output_20250426074413_0_input_params.json +25 -0
  15. examples/default/input_params/output_20250426075107_0_input_params.json +25 -0
  16. examples/default/input_params/output_20250426075537_0_input_params.json +25 -0
  17. examples/default/input_params/output_20250426075843_0_input_params.json +25 -0
  18. examples/default/input_params/output_20250426080234_0_input_params.json +25 -0
  19. examples/default/input_params/output_20250426080407_0_input_params.json +25 -0
  20. examples/default/input_params/output_20250426080601_0_input_params.json +25 -0
  21. examples/default/input_params/output_20250426081134_0_input_params.json +25 -0
  22. examples/default/input_params/output_20250426091716_0_input_params.json +25 -0
  23. examples/default/input_params/output_20250426092025_0_input_params.json +25 -0
  24. examples/default/input_params/output_20250426093007_0_input_params.json +25 -0
  25. examples/default/input_params/output_20250426093146_0_input_params.json +25 -0
  26. examples/zh_rap_lora/input_params/output_20250512101839_0_input_params.json +45 -0
  27. examples/zh_rap_lora/input_params/output_20250512114703_0_input_params.json +45 -0
  28. examples/zh_rap_lora/input_params/output_20250512115409_0_input_params.json +45 -0
  29. examples/zh_rap_lora/input_params/output_20250512120348_0_input_params.json +45 -0
  30. examples/zh_rap_lora/input_params/output_20250512143242_0_input_params.json +45 -0
  31. examples/zh_rap_lora/input_params/output_20250512145057_0_input_params.json +45 -0
  32. examples/zh_rap_lora/input_params/output_20250512152217_0_input_params.json +45 -0
  33. examples/zh_rap_lora/input_params/output_20250512153616_0_input_params.json +45 -0
  34. examples/zh_rap_lora/input_params/output_20250512154907_0_input_params.json +45 -0
  35. examples/zh_rap_lora/input_params/output_20250512161832_0_input_params.json +45 -0
  36. examples/zh_rap_lora/input_params/output_20250512164224_0_input_params.json +45 -0
  37. examples/zh_rap_lora/input_params/output_20250512171227_0_input_params.json +45 -0
  38. examples/zh_rap_lora/input_params/output_20250512171809_0_input_params.json +45 -0
  39. examples/zh_rap_lora/input_params/output_20250512172941_0_input_params.json +45 -0
  40. examples/zh_rap_lora/input_params/output_20250513044511_0_input_params.json +45 -0
  41. examples/zh_rap_lora/input_params/output_20250513050200_0_input_params.json +45 -0
  42. examples/zh_rap_lora/input_params/output_20250513055451_0_input_params.json +45 -0
  43. examples/zh_rap_lora/input_params/output_20250513060150_0_input_params.json +45 -0
  44. language_segmentation/LangSegment.py +866 -0
  45. language_segmentation/__init__.py +9 -0
  46. language_segmentation/utils/__init__.py +0 -0
  47. language_segmentation/utils/num.py +327 -0
  48. models/ace_step_transformer.py +475 -0
  49. models/attention.py +319 -0
  50. models/config.json +23 -0
LICENSE ADDED
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [2025] Timedomain Inc. and stepfun
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md CHANGED
@@ -1,13 +1,245 @@
 ---
 title: ACE Step
 emoji: 😻
- colorFrom: gray
+ colorFrom: blue
- colorTo: purple
+ colorTo: pink
 sdk: gradio
- sdk_version: 5.33.0
+ sdk_version: 5.27.0
 app_file: app.py
 pinned: false
- license: mit
+ license: apache-2.0
+ short_description: A Step Towards Music Generation Foundation Model
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ <h1 align="center">✨ ACE-Step ✨</h1>
+ <h1 align="center">🎵 A Step Towards Music Generation Foundation Model 🎵</h1>
+ <p align="center">
+     <a href="https://ace-step.github.io/">Project</a> |
+     <a href="https://github.com/ace-step/ACE-Step">Code</a> |
+     <a href="https://huggingface.co/ACE-Step/ACE-Step-v1-3.5B">Checkpoints</a> |
+     <a href="https://huggingface.co/spaces/ACE-Step/ACE-Step">Space Demo</a>
+ </p>
+
+ ---
+ <p align="center">
+     <img src="./fig/orgnization_logos.png" width="100%" alt="Org Logo">
+ </p>
+
+ ## Table of Contents
+
+ - [Features](#-features)
+ - [Installation](#-installation)
+ - [Usage](#-user-interface-guide)
+
+ ## 📢 News and Updates
+
+ - 🚀 2025.05.06: Open-sourced demo code and model
+
+ ## 📋 TODOs
+ - [ ] 🔁 Release training code
+ - [ ] 🔄 Release LoRA training code & 🎤 RapMachine LoRA
+ - [ ] 🎮 Release ControlNet training code & 🎤 Singing2Accompaniment ControlNet
+
+ ## 🏗️ Architecture
+
+ <p align="center">
+     <img src="./fig/ACE-Step_framework.png" width="100%" alt="ACE-Step Framework">
+ </p>
+
+ ## 📝 Abstract
+
+ We introduce ACE-Step, a novel open-source foundation model for music generation that overcomes key limitations of existing approaches and achieves state-of-the-art performance through a holistic architectural design. Current methods face inherent trade-offs between generation speed, musical coherence, and controllability. For instance, LLM-based models (e.g., Yue, SongGen) excel at lyric alignment but suffer from slow inference and structural artifacts. Diffusion models (e.g., DiffRhythm), on the other hand, enable faster synthesis but often lack long-range structural coherence.
+
+ ACE-Step bridges this gap by integrating diffusion-based generation with Sana's Deep Compression AutoEncoder (DCAE) and a lightweight linear transformer. It further leverages MERT and m-hubert to align semantic representations (REPA) during training, enabling rapid convergence. As a result, our model synthesizes up to 4 minutes of music in just 20 seconds on an A100 GPU—15× faster than LLM-based baselines—while achieving superior musical coherence and lyric alignment across melody, harmony, and rhythm metrics. Moreover, ACE-Step preserves fine-grained acoustic details, enabling advanced control mechanisms such as voice cloning, lyric editing, remixing, and track generation (e.g., lyric2vocal, singing2accompaniment).
+
+ Rather than building yet another end-to-end text-to-music pipeline, our vision is to establish a foundation model for music AI: a fast, general-purpose, efficient yet flexible architecture that makes it easy to train sub-tasks on top of it. This paves the way for developing powerful tools that seamlessly integrate into the creative workflows of music artists, producers, and content creators. In short, we aim to build the Stable Diffusion moment for music.
+
+ ## ✨ Features
+
+ <p align="center">
+     <img src="./fig/application_map.png" width="100%" alt="Application Map">
+ </p>
+
+ ### 🎯 Baseline Quality
+
+ #### 🌈 Diverse Styles & Genres
+ - 🎸 Supports all mainstream music styles, with description formats ranging from short tags to descriptive text and use-case scenarios
+ - 🎷 Generates music across genres with appropriate instrumentation and style
+
+ #### 🌍 Multiple Languages
+ - 🗣️ Supports 19 languages; the 10 best-performing are:
+     - 🇺🇸 English, 🇨🇳 Chinese, 🇷🇺 Russian, 🇪🇸 Spanish, 🇯🇵 Japanese, 🇩🇪 German, 🇫🇷 French, 🇵🇹 Portuguese, 🇮🇹 Italian, 🇰🇷 Korean
+ - ⚠️ Due to data imbalance, less common languages may underperform
+
+ #### 🎻 Instrumental Styles
+ - 🎹 Supports instrumental music generation across genres and styles
+ - 🎺 Produces realistic instrumental tracks with appropriate timbre and expression for each instrument
+ - 🎼 Can generate complex arrangements with multiple instruments while maintaining musical coherence
+
+ #### 🎤 Vocal Techniques
+ - 🎙️ Renders a range of vocal styles and techniques with good quality
+ - 🗣️ Supports different vocal expressions, including various singing techniques and styles
+
+ ### 🎛️ Controllability
+
+ #### 🔄 Variations Generation
+ - ⚙️ Implemented with training-free, inference-time optimization
+ - 🌊 The flow-matching model generates the initial noise, then trigFlow's noise formula is used to add extra Gaussian noise
+ - 🎚️ The mixing ratio between the original initial noise and the new Gaussian noise controls the degree of variation (see the sketch below)
+
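+ A minimal illustrative sketch of this mixing (assuming a simple variance-preserving blend; the exact trigFlow formula used by the pipeline may differ):
+
+ ```python
+ import torch
+
+ def make_variation_noise(orig_noise: torch.Tensor, variance: float) -> torch.Tensor:
+     # variance=0 keeps the original initial noise (same output);
+     # variance=1 draws entirely fresh noise (a brand-new sample).
+     # Square-root weights keep the blend unit-variance.
+     fresh_noise = torch.randn_like(orig_noise)
+     return (1.0 - variance) ** 0.5 * orig_noise + variance ** 0.5 * fresh_noise
+ ```
+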
+ #### 🎨 Repainting
+ - 🖌️ Implemented by adding noise to the target audio input and applying mask constraints during the ODE process
+ - 🔍 When the input conditions differ from the original generation, only specific aspects are modified while the rest is preserved
+ - 🔀 Can be combined with Variations Generation to create localized variations in style, lyrics, or vocals
+
+ #### ✏️ Lyric Editing
+ - 💡 Innovatively applies flow-edit technology to enable localized lyric modifications while preserving melody, vocals, and accompaniment
+ - 🔄 Works with both generated content and uploaded audio, greatly expanding creative possibilities
+ - ℹ️ Current limitation: only small segments of lyrics can be modified at once to avoid distortion, but multiple edits can be applied sequentially
+
+ ### 🚀 Applications
+
+ #### 🎤 Lyric2Vocal (LoRA)
+ - 🔊 Based on a LoRA fine-tuned on pure vocal data, allowing direct generation of vocal samples from lyrics
+ - 🛠️ Useful for vocal demos, guide tracks, songwriting assistance, and vocal arrangement experiments
+ - ⏱️ Provides a quick way to hear how lyrics might sound when sung, helping songwriters iterate faster
+
+ #### 📝 Text2Samples (LoRA)
+ - 🎛️ Similar to Lyric2Vocal, but fine-tuned on pure instrumental and sample data
+ - 🎵 Generates conceptual music-production samples from text descriptions
+ - 🧰 Useful for quickly creating instrument loops, sound effects, and musical elements for production
+
+ ### 🔮 Coming Soon
+
+ #### 🎤 RapMachine
+ - 🔥 Fine-tuned on pure rap data to create an AI system specialized in rap generation
+ - 🏆 Expected capabilities include AI rap battles and narrative expression through rap
+ - 📚 Rap's exceptional storytelling and expressive capabilities offer extraordinary application potential
+
+ #### 🎛️ StemGen
+ - 🎚️ A ControlNet-LoRA trained on multi-track data to generate individual instrument stems
+ - 🎯 Takes a reference track and a specified instrument (or instrument reference audio) as input
+ - 🎹 Outputs an instrument stem that complements the reference track, such as a piano accompaniment for a flute melody or jazz drums behind a lead guitar
+
+ #### 🎤 Singing2Accompaniment
+ - 🔄 The reverse of StemGen: generates a mixed master track from a single vocal track
+ - 🎵 Takes a vocal track and a specified style as input to produce complete accompaniment
+ - 🎸 Creates full instrumental backing that complements the input vocals, making it easy to add professional-sounding accompaniment to any vocal recording
+
+ ## 💻 Installation
+
+ ```bash
+ conda create -n ace_step python==3.10
+ conda activate ace_step
+ pip install -r requirements.txt
+ conda install ffmpeg
+ ```
+
+ ## 🖥️ Hardware Performance
+
+ We've tested ACE-Step on several hardware configurations, with the following throughput:
+
+ | Device          | 27 Steps | 60 Steps |
+ |-----------------|----------|----------|
+ | NVIDIA A100     | 0.036675 | 0.0815   |
+ | MacBook M2 Max  | 0.44     | 0.97     |
+ | NVIDIA RTX 4090 | 0.029    | 0.064    |
+
+ Values are seconds of compute per second of generated audio (seconds/audio). To estimate total generation time, multiply the song length by the value for your device and step count: for example, a 180-second song on an A100 at 27 steps takes roughly 180 × 0.036675 ≈ 6.6 seconds.
+
+ ## 🚀 Usage
+
+ ![Demo Interface](fig/demo_interface.png)
+
+ ### 🔍 Basic Usage
+
+ ```bash
+ python app.py
+ ```
+
+ ### ⚙️ Advanced Usage
+
+ ```bash
+ python app.py --checkpoint_path /path/to/checkpoint --port 7865 --device_id 0 --share --bf16
+ ```
+
+ #### 🛠️ Command Line Arguments
+
+ - `--checkpoint_path`: Path to the model checkpoint (default: downloads automatically)
+ - `--port`: Port to run the Gradio server on (default: 7860)
+ - `--device_id`: GPU device ID to use (default: 0)
+ - `--share`: Enable Gradio sharing link (default: False)
+ - `--bf16`: Use bfloat16 precision for faster inference (default: True)
+
+ ## 📱 User Interface Guide
+
+ The ACE-Step interface provides several tabs for different music generation and editing tasks:
+
+ ### 📝 Text2Music Tab
+
+ 1. **📋 Input Fields**:
+    - **🏷️ Tags**: Enter descriptive tags, genres, or scene descriptions separated by commas
+    - **📜 Lyrics**: Enter lyrics with structure tags like [verse], [chorus], and [bridge]
+    - **⏱️ Audio Duration**: Set the desired duration of the generated audio (-1 for random)
+
+ 2. **⚙️ Settings**:
+    - **🔧 Basic Settings**: Adjust inference steps, guidance scale, and seeds
+    - **🔬 Advanced Settings**: Fine-tune scheduler type, CFG type, ERG settings, and more
+
+ 3. **🚀 Generation**: Click "Generate" to create music based on your inputs
+
+ ### 🔄 Retake Tab
+
+ - 🎲 Regenerate music with slight variations using different seeds
+ - 🎚️ Adjust the variance to control how much the retake differs from the original
+
+ ### 🎨 Repainting Tab
+
+ - 🖌️ Selectively regenerate specific sections of the music
+ - ⏱️ Specify start and end times for the section to repaint
+ - 🔍 Choose the source audio (text2music output, last repaint, or upload)
+
+ ### ✏️ Edit Tab
+
+ - 🔄 Modify existing music by changing tags or lyrics
+ - 🎛️ Choose between "only_lyrics" mode (preserves melody) and "remix" mode (changes melody)
+ - 🎚️ Adjust edit parameters to control how much of the original is preserved
+
+ ### 📏 Extend Tab
+
+ - ➕ Add music to the beginning or end of an existing piece
+ - 📐 Specify left and right extension lengths
+ - 🔍 Choose the source audio to extend
+
+ ## Examples
+
+ The `examples/default/input_params` directory contains sample input parameters that can be used as references for generating music.
+
+ ## 📜 License & Disclaimer
+
+ This project is licensed under the [Apache License 2.0](./LICENSE).
+
+ ACE-Step enables original music generation across diverse genres, with applications in creative production, education, and entertainment. While designed to support positive and artistic use cases, we acknowledge potential risks such as unintentional copyright infringement due to stylistic similarity, inappropriate blending of cultural elements, and misuse for generating harmful content. To ensure responsible use, we encourage users to verify the originality of generated works, clearly disclose AI involvement, and obtain appropriate permissions when adapting protected styles or materials. By using ACE-Step, you agree to uphold these principles and respect artistic integrity, cultural diversity, and legal compliance. The authors are not responsible for any misuse of the model, including but not limited to copyright violations, cultural insensitivity, or the generation of harmful content.
+
+ ## 🙏 Acknowledgements
+
+ This project is co-led by ACE Studio and StepFun.
+
+ ## 📖 Citation
+
+ If you find this project useful for your research, please consider citing:
+
+ ```BibTeX
+ @misc{gong2025acestep,
+     title={ACE-Step: A Step Towards Music Generation Foundation Model},
+     author={Junmin Gong and Wenxiao Zhao and Sen Wang and Shengyuan Xu and Jing Guo},
+     howpublished={\url{https://github.com/ace-step/ACE-Step}},
+     year={2025},
+     note={GitHub repository}
+ }
+ ```
apg_guidance.py ADDED
@@ -0,0 +1,95 @@
+ import torch
+
+
+ class MomentumBuffer:
+     # Exponential accumulator (negative momentum by default) used to
+     # smooth the guidance difference across denoising steps.
+     def __init__(self, momentum: float = -0.75):
+         self.momentum = momentum
+         self.running_average = 0
+
+     def update(self, update_value: torch.Tensor):
+         new_average = self.momentum * self.running_average
+         self.running_average = update_value + new_average
+
+
+ def project(
+     v0: torch.Tensor,  # [B, C, H, W]
+     v1: torch.Tensor,  # [B, C, H, W]
+     dims=[-1, -2],
+ ):
+     # Decompose v0 into components parallel and orthogonal to v1.
+     dtype = v0.dtype
+     device_type = v0.device.type
+     if device_type == "mps":
+         # MPS has no float64 support, so run the projection on CPU.
+         v0, v1 = v0.cpu(), v1.cpu()
+
+     v0, v1 = v0.double(), v1.double()
+     v1 = torch.nn.functional.normalize(v1, dim=dims)
+     v0_parallel = (v0 * v1).sum(dim=dims, keepdim=True) * v1
+     v0_orthogonal = v0 - v0_parallel
+     return v0_parallel.to(dtype).to(device_type), v0_orthogonal.to(dtype).to(device_type)
+
+
+ def apg_forward(
+     pred_cond: torch.Tensor,  # [B, C, H, W]
+     pred_uncond: torch.Tensor,  # [B, C, H, W]
+     guidance_scale: float,
+     momentum_buffer: MomentumBuffer = None,
+     eta: float = 0.0,
+     norm_threshold: float = 2.5,
+     dims=[-1, -2],
+ ):
+     # APG (adaptive projected guidance): clip the norm of the guidance
+     # difference, then apply mostly its component orthogonal to pred_cond.
+     diff = pred_cond - pred_uncond
+     if momentum_buffer is not None:
+         momentum_buffer.update(diff)
+         diff = momentum_buffer.running_average
+
+     if norm_threshold > 0:
+         ones = torch.ones_like(diff)
+         diff_norm = diff.norm(p=2, dim=dims, keepdim=True)
+         scale_factor = torch.minimum(ones, norm_threshold / diff_norm)
+         diff = diff * scale_factor
+
+     diff_parallel, diff_orthogonal = project(diff, pred_cond, dims)
+     normalized_update = diff_orthogonal + eta * diff_parallel
+     pred_guided = pred_cond + (guidance_scale - 1) * normalized_update
+     return pred_guided
+
+
+ def cfg_forward(cond_output, uncond_output, cfg_strength):
+     # Standard classifier-free guidance.
+     return uncond_output + cfg_strength * (cond_output - uncond_output)
+
+
+ def cfg_double_condition_forward(
+     cond_output,
+     uncond_output,
+     only_text_cond_output,
+     guidance_scale_text,
+     guidance_scale_lyric,
+ ):
+     # Two-condition CFG: blend unconditional, text-only, and text+lyric
+     # predictions with separate guidance scales.
+     return (
+         (1 - guidance_scale_text) * uncond_output
+         + (guidance_scale_text - guidance_scale_lyric) * only_text_cond_output
+         + guidance_scale_lyric * cond_output
+     )
+
+
+ def optimized_scale(positive_flat, negative_flat):
+     # st_star = v_cond^T * v_uncond / ||v_uncond||^2
+     dot_product = torch.sum(positive_flat * negative_flat, dim=1, keepdim=True)
+     squared_norm = torch.sum(negative_flat ** 2, dim=1, keepdim=True) + 1e-8
+     return dot_product / squared_norm
+
+
+ def cfg_zero_star(noise_pred_with_cond, noise_pred_uncond, guidance_scale, i, zero_steps=1, use_zero_init=True):
+     # CFG with an optimized rescaling of the unconditional branch; the
+     # first few steps are zeroed out when use_zero_init is set.
+     bsz = noise_pred_with_cond.shape[0]
+     positive_flat = noise_pred_with_cond.view(bsz, -1)
+     negative_flat = noise_pred_uncond.view(bsz, -1)
+     alpha = optimized_scale(positive_flat, negative_flat)
+     alpha = alpha.view(bsz, 1, 1, 1)
+     if (i <= zero_steps) and use_zero_init:
+         noise_pred = noise_pred_with_cond * 0.0
+     else:
+         noise_pred = noise_pred_uncond * alpha + guidance_scale * (noise_pred_with_cond - noise_pred_uncond * alpha)
+     return noise_pred
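For orientation, here is a minimal sketch of how these helpers could sit in a denoising loop. The `toy_denoiser` and the final update line are stand-ins, not ACE-Step's real transformer or scheduler (those live in `models/` and `pipeline_ace_step.py`):

```python
import torch

from apg_guidance import MomentumBuffer, apg_forward

def toy_denoiser(latents: torch.Tensor, strength: float) -> torch.Tensor:
    # Placeholder for the conditional/unconditional transformer passes.
    return latents * strength

latents = torch.randn(1, 8, 16, 256)  # [B, C, H, W]-shaped latent
momentum_buffer = MomentumBuffer()    # smooths the guidance difference

for step in range(3):
    pred_cond = toy_denoiser(latents, 0.9)    # "conditional" prediction
    pred_uncond = toy_denoiser(latents, 0.5)  # "unconditional" prediction
    guided = apg_forward(
        pred_cond=pred_cond,
        pred_uncond=pred_uncond,
        guidance_scale=15.0,  # same scale the example JSONs below use
        momentum_buffer=momentum_buffer,
    )
    latents = latents - 0.1 * guided  # toy Euler-style update
```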
app.py ADDED
@@ -0,0 +1,45 @@
+ import argparse
+ import os
+
+ from ui.components import create_main_demo_ui
+ from pipeline_ace_step import ACEStepPipeline
+ from data_sampler import DataSampler
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--checkpoint_path", type=str, default=None)
+ parser.add_argument("--server_name", type=str, default="0.0.0.0")
+ parser.add_argument("--port", type=int, default=7860)
+ parser.add_argument("--device_id", type=int, default=0)
+ parser.add_argument("--share", action="store_true", default=False)
+ parser.add_argument("--bf16", action="store_true", default=True)
+ # note: argparse's type=bool treats any non-empty string as True
+ parser.add_argument("--torch_compile", type=bool, default=False)
+
+ args = parser.parse_args()
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(args.device_id)
+
+ # Persistent storage mount point on Hugging Face Spaces.
+ persistent_storage_path = "/data"
+
+
+ def main(args):
+     model_demo = ACEStepPipeline(
+         checkpoint_dir=args.checkpoint_path,
+         dtype="bfloat16" if args.bf16 else "float32",
+         persistent_storage_path=persistent_storage_path,
+         torch_compile=args.torch_compile,
+     )
+     data_sampler = DataSampler()
+
+     demo = create_main_demo_ui(
+         text2music_process_func=model_demo.__call__,
+         sample_data_func=data_sampler.sample,
+         load_data_func=data_sampler.load_json,
+     )
+     demo.queue(default_concurrency_limit=8).launch()
+
+
+ if __name__ == "__main__":
+     main(args)
config/zh_rap_lora_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "r": 256,
+     "lora_alpha": 32,
+     "target_modules": [
+         "speaker_embedder",
+         "linear_q",
+         "linear_k",
+         "linear_v",
+         "to_q",
+         "to_k",
+         "to_v",
+         "to_out.0"
+     ],
+     "use_rslora": true
+ }
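For reference, these fields map directly onto the standard `peft` library's `LoraConfig`. A sketch of one plausible way to consume the file (ACE-Step's own LoRA loading path may differ):

```python
import json

from peft import LoraConfig

with open("config/zh_rap_lora_config.json") as f:
    cfg = json.load(f)

lora_config = LoraConfig(
    r=cfg["r"],                            # LoRA rank (256)
    lora_alpha=cfg["lora_alpha"],          # scaling numerator (32)
    target_modules=cfg["target_modules"],  # attention projections + speaker embedder
    use_rslora=cfg["use_rslora"],          # rank-stabilized scaling, alpha / sqrt(r)
)
```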
data_sampler.py ADDED
@@ -0,0 +1,30 @@
+ import json
+ import random
+ from pathlib import Path
+
+ DEFAULT_ROOT_DIR = "examples/default/input_params"
+ ZH_RAP_LORA_ROOT_DIR = "examples/zh_rap_lora/input_params"
+
+
+ class DataSampler:
+     def __init__(self, root_dir=DEFAULT_ROOT_DIR):
+         self.root_dir = root_dir
+         self.input_params_files = list(Path(self.root_dir).glob("*.json"))
+         self.zh_rap_lora_input_params_files = list(Path(ZH_RAP_LORA_ROOT_DIR).glob("*.json"))
+
+     def load_json(self, file_path):
+         with open(file_path, "r", encoding="utf-8") as f:
+             return json.load(f)
+
+     def sample(self, lora_name_or_path=None):
+         if lora_name_or_path is None or lora_name_or_path == "none":
+             json_path = random.choice(self.input_params_files)
+             json_data = self.load_json(json_path)
+         else:
+             json_path = random.choice(self.zh_rap_lora_input_params_files)
+             json_data = self.load_json(json_path)
+             # Tag the sampled example with the requested LoRA.
+             json_data["lora_name_or_path"] = lora_name_or_path
+
+         return json_data
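A short usage sketch ("my_rap_lora" below is a placeholder, not a real checkpoint name):

```python
from data_sampler import DataSampler

sampler = DataSampler()

# Random default example: a full set of generation parameters.
params = sampler.sample()
print(params["prompt"], params["infer_step"], params["actual_seeds"])

# Random zh_rap_lora example, retagged with the requested LoRA.
rap_params = sampler.sample(lora_name_or_path="my_rap_lora")
```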
examples/default/input_params/output_20250426071706_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+ {
+     "prompt": "pop, rap, electronic, blues, hip-house, rhythm and blues",
+     "lyrics": "[verse]\n我走过深夜的街道\n冷风吹乱思念的漂亮外套\n你的微笑像星光很炫耀\n照亮了我孤独的每分每秒\n\n[chorus]\n愿你是风吹过我的脸\n带我飞过最远最遥远的山间\n愿你是风轻触我的梦\n停在心头不再飘散无迹无踪\n\n[verse]\n一起在喧哗避开世俗的骚动\n独自在天台探望月色的朦胧\n你说爱像音乐带点重节奏\n一拍一跳让我忘了心的温度多空洞\n\n[bridge]\n唱起对你的想念不隐藏\n像诗又像画写满藏不了的渴望\n你的影子挥不掉像风的倔强\n追着你飞扬穿越云海一样泛光\n\n[chorus]\n愿你是风吹过我的手\n暖暖的触碰像春日细雨温柔\n愿你是风盘绕我的身\n深情万万重不会有一天走远走\n\n[verse]\n深夜的钢琴弹起动人的旋律\n低音鼓砸进心底的每一次呼吸\n要是能将爱化作歌声传递\n你是否会听见我心里的真心实意",
+     "audio_duration": 170.63997916666668,
+     "infer_step": 60,
+     "guidance_scale": 15,
+     "scheduler_type": "euler",
+     "cfg_type": "apg",
+     "omega_scale": 10,
+     "guidance_interval": 0.5,
+     "guidance_interval_decay": 0,
+     "min_guidance_scale": 3,
+     "use_erg_tag": true,
+     "use_erg_lyric": true,
+     "use_erg_diffusion": true,
+     "oss_steps": [],
+     "timecosts": {
+         "preprocess": 3.191075086593628,
+         "diffusion": 17.459356784820557,
+         "latent2audio": 1.7095518112182617
+     },
+     "actual_seeds": [
+         3299954530
+     ]
+ }
examples/default/input_params/output_20250426071812_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+ {
+     "prompt": "country rock, folk rock, southern rock, bluegrass, country pop",
+     "lyrics": "[verse]\nWoke up to the sunrise glow\nTook my heart and hit the road\nWheels hummin' the only tune I know\nStraight to where the wildflowers grow\n\n[verse]\nGot that old map all wrinkled and torn\nDestination unknown but I'm reborn\nWith a smile that the wind has worn\nChasin' dreams that can't be sworn\n\n[chorus]\nRidin' on a highway to sunshine\nGot my shades and my radio on fine\nLeave the shadows in the rearview rhyme\nHeart's racing as we chase the time\n\n[verse]\nMet a girl with a heart of gold\nTold stories that never get old\nHer laugh like a tale that's been told\nA melody so bold yet uncontrolled\n\n[bridge]\nClouds roll by like silent ghosts\nAs we drive along the coast\nWe toast to the days we love the most\nFreedom's song is what we post\n\n[chorus]\nRidin' on a highway to sunshine\nGot my shades and my radio on fine\nLeave the shadows in the rearview rhyme\nHeart's racing as we chase the time",
+     "audio_duration": 224.23997916666667,
+     "infer_step": 60,
+     "guidance_scale": 15,
+     "scheduler_type": "euler",
+     "cfg_type": "apg",
+     "omega_scale": 10,
+     "guidance_interval": 0.5,
+     "guidance_interval_decay": 0,
+     "min_guidance_scale": 3,
+     "use_erg_tag": true,
+     "use_erg_lyric": true,
+     "use_erg_diffusion": true,
+     "oss_steps": [],
+     "timecosts": {
+         "preprocess": 4.262240648269653,
+         "diffusion": 15.380569219589233,
+         "latent2audio": 2.3227272033691406
+     },
+     "actual_seeds": [
+         401640
+     ]
+ }
examples/default/input_params/output_20250426072346_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+ {
+     "prompt": "hip-house, funk",
+     "lyrics": "[verse]\n哎呀跳起来,脚尖踩节拍 (oo-yeah!)\n灯光闪烁像星星盛开 (uh-huh!)\n人人都醒来,把烦恼踹开 (get it!)\n热血沸腾,汗水自己安排\n\n[chorus]\n嘿,你还等啥?快抓住节拍 (come on!)\n光芒指引,让心都不存在 (whoa!)\n点燃热火,我们一起飙high (let’s go!)\n跳入午夜的狂欢时代\n\n[bridge]\n咚咚鼓声啊,让你的灵魂起飞 (woo!)\n手心拍一拍,能量翻倍 (ah-hah!)\n键盘响起来,如宇宙的交汇 (oh yeah!)\n就是这感觉,兄弟姐妹都陶醉\n\n[verse]\n灵魂从不睡,只想继续燃烧 (woo!)\n节奏像热浪,席卷这街道 (ow!)\n大伙儿涌上楼台,满面微笑 (yeah!)\n这一刻属于我们,无可替代\n\n[chorus]\n嘿,你还等啥?快抓住节拍 (come on!)\n光芒指引,让心都不存在 (whoa!)\n点燃热火,我们一起飙high (let’s go!)\n跳入午夜的狂欢时代\n\n[verse]\n世界多精彩,握紧把它打开 (alright!)\n每一步都像星球在摇摆 (uh-huh!)\n无边无际的律动像大海 (oo-yeah!)\n跟着光芒之舞,一起澎湃",
+     "audio_duration": 204.19997916666668,
+     "infer_step": 60,
+     "guidance_scale": 15,
+     "scheduler_type": "euler",
+     "cfg_type": "apg",
+     "omega_scale": 10,
+     "guidance_interval": 0.5,
+     "guidance_interval_decay": 0,
+     "min_guidance_scale": 3,
+     "use_erg_tag": true,
+     "use_erg_lyric": true,
+     "use_erg_diffusion": true,
+     "oss_steps": [],
+     "timecosts": {
+         "preprocess": 0.05196118354797363,
+         "diffusion": 15.530808210372925,
+         "latent2audio": 2.5604095458984375
+     },
+     "actual_seeds": [
+         401640
+     ]
+ }
examples/default/input_params/output_20250426072508_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+ {
+     "prompt": "funk, pop, soul, rock, melodic, guitar, drums, bass, keyboard, percussion, 105 BPM, energetic, upbeat, groovy, vibrant, dynamic",
+     "lyrics": "[verse]\nNeon lights they flicker bright\nCity hums in dead of night\nRhythms pulse through concrete veins\nLost in echoes of refrains\n\n[verse]\nBassline groovin' in my chest\nHeartbeats match the city's zest\nElectric whispers fill the air\nSynthesized dreams everywhere\n\n[chorus]\nTurn it up and let it flow\nFeel the fire let it grow\nIn this rhythm we belong\nHear the night sing out our song\n\n[verse]\nGuitar strings they start to weep\nWake the soul from silent sleep\nEvery note a story told\nIn this night we’re bold and gold\n\n[bridge]\nVoices blend in harmony\nLost in pure cacophony\nTimeless echoes timeless cries\nSoulful shouts beneath the skies\n\n[verse]\nKeyboard dances on the keys\nMelodies on evening breeze\nCatch the tune and hold it tight\nIn this moment we take flight",
+     "audio_duration": 178.87997916666666,
+     "infer_step": 60,
+     "guidance_scale": 15,
+     "scheduler_type": "euler",
+     "cfg_type": "apg",
+     "omega_scale": 10,
+     "guidance_interval": 0.5,
+     "guidance_interval_decay": 0,
+     "min_guidance_scale": 3,
+     "use_erg_tag": true,
+     "use_erg_lyric": true,
+     "use_erg_diffusion": true,
+     "oss_steps": [],
+     "timecosts": {
+         "preprocess": 0.02882218360900879,
+         "diffusion": 16.91233205795288,
+         "latent2audio": 1.7794082164764404
+     },
+     "actual_seeds": [
+         401640
+     ]
+ }
examples/default/input_params/output_20250426073829_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+ {
+     "prompt": "electronic rap",
+     "lyrics": "[verse]\nWaves on the bass, pulsing in the speakers,\nTurn the dial up, we chasing six-figure features,\nGrinding on the beats, codes in the creases,\nDigital hustler, midnight in sneakers.\n\n[chorus]\nElectro vibes, hearts beat with the hum,\nUrban legends ride, we ain't ever numb,\nCircuits sparking live, tapping on the drum,\nLiving on the edge, never succumb.\n\n[verse]\nSynthesizers blaze, city lights a glow,\nRhythm in the haze, moving with the flow,\nSwagger on stage, energy to blow,\nFrom the blocks to the booth, you already know.\n\n[bridge]\nNight's electric, streets full of dreams,\nBass hits collective, bursting at seams,\nHustle perspective, all in the schemes,\nRise and reflective, ain't no in-betweens.\n\n[verse]\nVibin' with the crew, sync in the wire,\nGot the dance moves, fire in the attire,\nRhythm and blues, soul's our supplier,\nRun the digital zoo, higher and higher.\n\n[chorus]\nElectro vibes, hearts beat with the hum,\nUrban legends ride, we ain't ever numb,\nCircuits sparking live, tapping on the drum,\nLiving on the edge, never succumb.",
+     "audio_duration": 221.42547916666666,
+     "infer_step": 60,
+     "guidance_scale": 15,
+     "scheduler_type": "euler",
+     "cfg_type": "apg",
+     "omega_scale": 10,
+     "guidance_interval": 0.5,
+     "guidance_interval_decay": 0,
+     "min_guidance_scale": 3,
+     "use_erg_tag": true,
+     "use_erg_lyric": true,
+     "use_erg_diffusion": true,
+     "oss_steps": [],
+     "timecosts": {
+         "preprocess": 0.024875164031982422,
+         "diffusion": 20.566852569580078,
+         "latent2audio": 2.2281734943389893
+     },
+     "actual_seeds": [
+         401640
+     ]
+ }
examples/default/input_params/output_20250426074037_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+ {
+     "prompt": "electronic, house, electro house, synthesizer, drums, bass, percussion, fast, energetic, uplifting, exciting",
+     "lyrics": "[verse]\n霓虹灯下我们追逐\n人群跃动像潮水满布\n热浪袭来吹散孤独\n跳进节奏不如停下脚步\n\n[pre-chorus]\n脚尖触电快点感受\n迎着风声释放自由\n心跳节拍配合节奏\n一切烦恼请靠边游\n\n[chorus]\n夏夜狂奔没有尽头\n星光闪烁舞池不朽\n尽情挥洒所有节奏\n无边热情把你包裹哦\n\n[verse]\n天空翻滚黑云入夜\n每颗星星像音乐律贴\n耳边回响那低音线\n环绕耳际如梦境般甜\n\n[pre-chorus]\n脚尖触电快点感受\n迎着风声释放自由\n心跳节拍配合节奏\n一切烦恼请靠边游\n\n[chorus]\n夏夜狂奔没有尽头\n星光闪烁舞池不朽\n尽情挥洒所有节奏\n无边热情把你包裹哦",
+     "audio_duration": 221.47997916666668,
+     "infer_step": 60,
+     "guidance_scale": 15,
+     "scheduler_type": "euler",
+     "cfg_type": "apg",
+     "omega_scale": 10,
+     "guidance_interval": 0.5,
+     "guidance_interval_decay": 0,
+     "min_guidance_scale": 3,
+     "use_erg_tag": true,
+     "use_erg_lyric": true,
+     "use_erg_diffusion": true,
+     "oss_steps": [],
+     "timecosts": {
+         "preprocess": 0.028400182723999023,
+         "diffusion": 13.195815324783325,
+         "latent2audio": 2.1679723262786865
+     },
+     "actual_seeds": [
+         3440445703
+     ]
+ }
examples/default/input_params/output_20250426074214_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+ {
+     "prompt": "synth-pop, electronic, pop, synthesizer, drums, bass, piano, 128 BPM, energetic, uplifting, modern",
+     "lyrics": "[verse]\nWoke up in a city that's always alive\nNeon lights they shimmer they thrive\nElectric pulses beat they drive\nMy heart races just to survive\n\n[chorus]\nOh electric dreams they keep me high\nThrough the wires I soar and fly\nMidnight rhythms in the sky\nElectric dreams together we’ll defy\n\n[verse]\nLost in the labyrinth of screens\nVirtual love or so it seems\nIn the night the city gleams\nDigital faces haunted by memes\n\n[chorus]\nOh electric dreams they keep me high\nThrough the wires I soar and fly\nMidnight rhythms in the sky\nElectric dreams together we’ll defy\n\n[bridge]\nSilent whispers in my ear\nPixelated love serene and clear\nThrough the chaos find you near\nIn electric dreams no fear\n\n[verse]\nBound by circuits intertwined\nLove like ours is hard to find\nIn this world we’re truly blind\nBut electric dreams free the mind",
+     "audio_duration": 221.27997916666666,
+     "infer_step": 60,
+     "guidance_scale": 15,
+     "scheduler_type": "euler",
+     "cfg_type": "apg",
+     "omega_scale": 10,
+     "guidance_interval": 0.5,
+     "guidance_interval_decay": 0,
+     "min_guidance_scale": 3,
+     "use_erg_tag": true,
+     "use_erg_lyric": true,
+     "use_erg_diffusion": true,
+     "oss_steps": [],
+     "timecosts": {
+         "preprocess": 0.025463581085205078,
+         "diffusion": 15.243804454803467,
+         "latent2audio": 2.170398473739624
+     },
+     "actual_seeds": [
+         3400270027
+     ]
+ }
examples/default/input_params/output_20250426074413_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+ {
+     "prompt": "Cuban music, salsa, son, Afro-Cuban, traditional Cuban",
+     "lyrics": "[verse]\nSun dips low the night ignites\nBassline hums with gleaming lights\nElectric guitar singing tales so fine\nIn the rhythm we all intertwine\n\n[verse]\nDrums beat steady calling out\nPercussion guides no room for doubt\nElectric pulse through every vein\nDance away every ounce of pain\n\n[chorus]\nFeel the rhythm feel the flow\nLet the music take control\nBassline deep electric hum\nIn this night we're never numb\n\n[bridge]\nStars above they start to glow\nEchoes of the night's soft glow\nElectric strings weave through the air\nIn this moment none compare\n\n[verse]\nHeartbeats sync with every tone\nLost in music never alone\nElectric tales of love and peace\nIn this groove we find release\n\n[chorus]\nFeel the rhythm feel the flow\nLet the music take control\nBassline deep electric hum\nIn this night we're never numb",
+     "audio_duration": 208.27997916666666,
+     "infer_step": 60,
+     "guidance_scale": 15,
+     "scheduler_type": "euler",
+     "cfg_type": "apg",
+     "omega_scale": 10,
+     "guidance_interval": 0.5,
+     "guidance_interval_decay": 0,
+     "min_guidance_scale": 3,
+     "use_erg_tag": true,
+     "use_erg_lyric": true,
+     "use_erg_diffusion": true,
+     "oss_steps": [],
+     "timecosts": {
+         "preprocess": 0.026132583618164062,
+         "diffusion": 15.139378070831299,
+         "latent2audio": 2.2071540355682373
+     },
+     "actual_seeds": [
+         3358899399
+     ]
+ }
examples/default/input_params/output_20250426075107_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+ {
+     "prompt": "pop, piano, rap, dark, atmospheric",
+     "lyrics": "[verse]\n月光爬上窗 染白冷的床\n心跳的方向 带我入迷惘\n黑夜吞噬光 命运的纸张\n爱是血色霜 邪恶又芬芳\n\n[chorus]\n你是猎人的欲望 我是迷途的小羊\n深陷你眼眸的荒 唐突献出心脏\n我在夜里回荡 是谁给我希望\n黑暗风中飘荡 假装不再受伤\n\n[verse]\n心锁在门外 谁会解开关怀\n温柔的手拍 藏着冷酷杀害\n思绪如尘埃 撞击爱的霹雳\n灵魂的独白 为你沾满血迹\n\n[bridge]\n你是噩梦的歌唱 是灵魂的捆绑\n绝望中带着光 悬崖边的渴望\n心跳被你鼓掌 恶魔也痴痴想\n渐渐没了抵抗 古老诡计流淌\n\n[chorus]\n你是猎人的欲望 我是迷途的小羊\n深陷你眼眸的荒 唐突献出心脏\n我在夜里回荡 是谁给我希望\n黑暗风中飘荡 假装不再受伤\n\n[outro]\n爱如月黑无光 渗进梦的战场\n逃入无声的场 放手或心嚷嚷\n隐秘的极端 爱是极致风浪\n灵魂彻底交偿 你是终极虚妄",
+     "audio_duration": 146.91997916666668,
+     "infer_step": 60,
+     "guidance_scale": 15,
+     "scheduler_type": "euler",
+     "cfg_type": "apg",
+     "omega_scale": 10,
+     "guidance_interval": 0.5,
+     "guidance_interval_decay": 0,
+     "min_guidance_scale": 3,
+     "use_erg_tag": true,
+     "use_erg_lyric": true,
+     "use_erg_diffusion": true,
+     "oss_steps": [],
+     "timecosts": {
+         "preprocess": 0.03876018524169922,
+         "diffusion": 15.962624549865723,
+         "latent2audio": 1.4594337940216064
+     },
+     "actual_seeds": [
+         2065110378
+     ]
+ }
examples/default/input_params/output_20250426075537_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+ {
+     "prompt": "surf music",
+     "lyrics": "[verse]\nSunshine on the boulevard the beach is calling loud\nWaves are dancing golden sand under a cotton cloud\nElectric heartbeat pounding fast the tide is on our side\nCatch a wave and feel alive we’ll take it for a ride\n\n[verse]\nPalm trees swaying left to right they know where we belong\nFeel the rhythm of the night it keeps us moving strong\nSea spray kisses salty air we’re flying with the breeze\nChampagne states of mind we ride we do just as we please\n\n[chorus]\nWe’re riding waves of life together hand in hand\nWith every beat we chase the beat it’s our own wonderland\nFeel the music take you higher as the shorelines blur\nThis is our world our endless summer as we live and learn\n\n[bridge]\nMoonlight paints the ocean blue reflections in our eyes\nStars align to light our path we’re surfing through the skies\nEvery moment like a song we sing it loud and clear\nEvery day’s a new adventure with you always near\n\n[verse]\nNeon lights and city sounds they blend with ocean views\nWe’re unstoppable tonight no way that we can lose\nDreams are written in the sand they sparkle in the sun\nTogether we’re a masterpiece our story’s just begun\n\n[chorus]\nWe’re riding waves of life together hand in hand\nWith every beat we chase the beat it’s our own wonderland\nFeel the music take you higher as the shorelines blur\nThis is our world our endless summer as we live and learn",
+     "audio_duration": 236.55997916666666,
+     "infer_step": 60,
+     "guidance_scale": 15,
+     "scheduler_type": "euler",
+     "cfg_type": "apg",
+     "omega_scale": 10,
+     "guidance_interval": 0.5,
+     "guidance_interval_decay": 0,
+     "min_guidance_scale": 3,
+     "use_erg_tag": true,
+     "use_erg_lyric": true,
+     "use_erg_diffusion": true,
+     "oss_steps": [],
+     "timecosts": {
+         "preprocess": 0.033666133880615234,
+         "diffusion": 16.291455507278442,
+         "latent2audio": 2.3726775646209717
+     },
+     "actual_seeds": [
+         508630535
+     ]
+ }
examples/default/input_params/output_20250426075843_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+ {
+     "prompt": "alternative rock, pop, rock",
+     "lyrics": "[verse]\nBright lights flashing in the city sky\nRunning fast and we don't know why\nElectric nights got our hearts on fire\nChasing dreams we'll never tire\n\n[verse]\nGrit in our eyes wind in our hair\nBreaking rules we don't even care\nShouting loud above the crowd\nLiving life like we're unbowed\n\n[chorus]\nRunning wild in the night so free\nFeel the beat pumping endlessly\nHearts collide in the midnight air\nWe belong we don't have a care\n\n[verse]\nPiercing through like a lightning strike\nEvery moment feels like a hike\nDaring bold never backing down\nKings and queens without a crown\n\n[chorus]\nRunning wild in the night so free\nFeel the beat pumping endlessly\nHearts collide in the midnight air\nWe belong we don't have a care\n\n[bridge]\nClose your eyes let your spirit soar\nWe are the ones who wanted more\nBreaking chains of the mundane\nIn this world we'll make our claim",
+     "audio_duration": 202.19997916666668,
+     "infer_step": 60,
+     "guidance_scale": 15,
+     "scheduler_type": "euler",
+     "cfg_type": "apg",
+     "omega_scale": 10,
+     "guidance_interval": 0.5,
+     "guidance_interval_decay": 0,
+     "min_guidance_scale": 3,
+     "use_erg_tag": true,
+     "use_erg_lyric": true,
+     "use_erg_diffusion": true,
+     "oss_steps": [],
+     "timecosts": {
+         "preprocess": 0.02512216567993164,
+         "diffusion": 18.860822677612305,
+         "latent2audio": 2.0361969470977783
+     },
+     "actual_seeds": [
+         1255121549
+     ]
+ }
examples/default/input_params/output_20250426080234_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+ {
+     "prompt": "rock, hip - hop, orchestral, bass, drums, electric guitar, piano, synthesizer, violin, viola, cello, fast, energetic, motivational, inspirational, empowering",
+     "lyrics": "### **[Intro – Spoken]** \n*\"The streets whisper, their echoes never fade. \nEvery step I take leaves a mark—this ain't just a game.\"* \n\n### **[Hook/Chorus]** \nBorn in the chaos, I weather the storm, \nRising from ashes where warriors are born. \nChains couldn't hold me, the system’s a maze, \nI rewrite the rules, set the city ablaze! \n\n### **[Verse 1]** \nCold nights, empty pockets, dreams laced with fight, \nEvery loss made me sharper, cut deep like a knife. \nThey said I wouldn’t make it, now they watch in despair, \nFrom the curb to the throne, took the pain, made it rare. \nEvery siren’s a melody, every alley holds a tale, \nRose from the shadows, left my name on the trail. \nStreetlights flicker like warnings in the haze, \nBut I move like a phantom, unfazed by the blaze. \n\n### **[Hook/Chorus]** \nBorn in the chaos, I weather the storm, \nRising from ashes where warriors are born. \nChains couldn't hold me, the system’s a maze, \nI rewrite the rules, set the city ablaze! \n\n### **[Verse 2]** \nBarbed wire fences couldn't lock in my mind, \nEvery cage they designed, I left broken behind. \nThey want control, but I’m destined to roam, \nWhere the lost find their voice, where the heart sets the tone. \nSteel and concrete, where the lessons run deep, \nEvery crack in the pavement tells a story of heat. \nBut I rise, undefeated, like a king with no throne, \nWriting scripts in the struggle, my legacy’s stone. \n\n### **[Bridge]** \nFeel the rhythm of the underground roar, \nEvery wound tells a story of the battles before. \nBlood, sweat, and echoes fill the cold midnight, \nBut we move with the fire—unshaken, upright. \n\n### **[Verse 3]** \nNo regrets, no retreat, this game has no pause, \nEvery step that I take is a win for the lost. \nI took lessons from hustlers, wisdom from pain, \nNow the echoes of struggle carve power in my name. \nThey built walls, but I walk through the cracks, \nTurned dirt into gold, never looked back. \nThrough the struggle we rise, through the fire we claim, \nThis is more than just music—it's life in the frame. \n\n### **[Hook/Chorus – Reprise]** \nBorn in the chaos, I weather the storm, \nRising from ashes where warriors are born. \nChains couldn't hold me, the system’s a maze, \nI rewrite the rules, set the city ablaze! \n\n### **[Outro – Spoken]** \n*\"The scars, the struggle, the grind—it’s all part of the rhythm. \nWe never break, we never fold. We rise.\"*",
+     "audio_duration": 153.95997916666667,
+     "infer_step": 60,
+     "guidance_scale": 15,
+     "scheduler_type": "euler",
+     "cfg_type": "apg",
+     "omega_scale": 10,
+     "guidance_interval": 0.5,
+     "guidance_interval_decay": 0,
+     "min_guidance_scale": 3,
+     "use_erg_tag": true,
+     "use_erg_lyric": true,
+     "use_erg_diffusion": true,
+     "oss_steps": [],
+     "timecosts": {
+         "preprocess": 0.04368758201599121,
+         "diffusion": 17.16369390487671,
+         "latent2audio": 1.5405471324920654
+     },
+     "actual_seeds": [
+         2659225017
+     ]
+ }
examples/default/input_params/output_20250426080407_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+ {
+     "prompt": "tango finlandés, campanas, disco, dark pop, electro, guitarra clásica, corridos tumba",
+     "lyrics": "[inst]",
+     "audio_duration": 162.79997916666667,
+     "infer_step": 60,
+     "guidance_scale": 15,
+     "scheduler_type": "euler",
+     "cfg_type": "apg",
+     "omega_scale": 10,
+     "guidance_interval": 0.5,
+     "guidance_interval_decay": 0,
+     "min_guidance_scale": 3,
+     "use_erg_tag": true,
+     "use_erg_lyric": true,
+     "use_erg_diffusion": true,
+     "oss_steps": [],
+     "timecosts": {
+         "preprocess": 0.011058568954467773,
+         "diffusion": 9.924944400787354,
+         "latent2audio": 1.6034839153289795
+     },
+     "actual_seeds": [
+         780297686
+     ]
+ }
examples/default/input_params/output_20250426080601_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+{
+    "prompt": "Nightclubs, dance parties, workout playlists, radio broadcasts",
+    "lyrics": "Burning in motion, set me alight!\nEvery heartbeat turns into a fight!\nCaged in rhythm, chained in time!\nLove’s a battle— You're Mine! You're Mine!",
+    "audio_duration": 221.83997916666667,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": true,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.012485980987548828,
+        "diffusion": 14.345409154891968,
+        "latent2audio": 2.174558639526367
+    },
+    "actual_seeds": [
+        1318394052
+    ]
+}
examples/default/input_params/output_20250426081134_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+{
+    "prompt": "melancholic, world, sad, medieval, soulful",
+    "lyrics": "[Verse]\nIn a world so grand he roams the skies alone\nHis heart a heavy stone a tale untold\nWhispers of his past echo through the night\nA lonely dragon searching for the light\n\n[Verse 2]\nOnce a mighty force now he drifts in pain\nHis scales once shimmered now they're dark with shame\nCast out by his kin in shadows he does hide\nA haunting sorrow burns deep inside\n\n[Chorus]\nRoaming endless fields with no friend in sight\nHis roar a mournful cry beneath the moon's pale light\nTears fall like stars as he flies on his way\nA lonely dragon yearning for the break of day\n\n[Bridge]\nThe world turns cold the nights grow long\nIn his heart he carries an ancient song\nOf battles fought and love long gone\nA legend now but his soul is torn\n\n[Verse 3]\nHoping for a day he'll find a kindred soul\nTo share his pain and make him whole\nTill then he drifts a shadow in the sky\nA lonely dragon with tears in his eye\n\n[Chorus]\nRoaming endless fields with no friend in sight\nHis roar a mournful cry beneath the moon's pale light\nTears fall like stars as he flies on his way\nA lonely dragon yearning for the break of day",
+    "audio_duration": 239.99997916666666,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": true,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.029100656509399414,
+        "diffusion": 22.503791570663452,
+        "latent2audio": 2.3603708744049072
+    },
+    "actual_seeds": [
+        2166832218
+    ]
+}
examples/default/input_params/output_20250426091716_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+{
+    "prompt": "anime, cute female vocals, kawaii pop, j-pop, childish, piano, guitar, synthesizer, fast, happy, cheerful, lighthearted",
+    "lyrics": "[Chorus]\nねぇ、顔が赤いよ?\nどうしたの? 熱があるの?\nそれとも怒ってるの?\nねぇ、言ってよ!\n\nどうしてそんな目で見るの?\n私、悪いことした?\n何か間違えたの?\nお願い、やめて… 怖いから…\nだから、やめてよ…\n\n[Bridge]\n目を閉じて、くるっと背を向けて、\n何も見なかったフリするから、\n怒らないで… 許してよ…\n\n[Chorus]\nねぇ、顔が赤いよ?\nどうしたの? 熱があるの?\nそれとも怒ってるの?\nねぇ、言ってよ!\n\nどうしてそんな目で見るの?\n私、悪いことした?\n何か間違えたの?\nお願い、やめて… 怖いから…\nだから、やめてよ…\n\n[Bridge 2]\n待って、もし私が悪いなら、\nごめんなさいって言うから、\nアイスクリームあげるから、\nもう怒らないで?\n\nOoooh… 言ってよ!",
+    "audio_duration": 160,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": true,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.0282442569732666,
+        "diffusion": 12.104875326156616,
+        "latent2audio": 1.587641954421997
+    },
+    "actual_seeds": [
+        4028738662
+    ]
+}
examples/default/input_params/output_20250426092025_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+{
+    "prompt": "dark, death rock, metal, hardcore, electric guitar, powerful, bass, drums, 110 bpm, G major",
+    "lyrics": "[Verse]\nMy lovers betray me\nThe snake in my garden is hissing\nIn the air is the sweetness of roses\nAnd under my skin\nThere's a thorn\n\n[Verse 2]\nI should have known\nThat God sends his angel in shadows\nWith blood in his veins\nI watch the enemy\nGivin' me the hand of my savior\n\n[Chorus]\nAnd I can't love again\nWith the echo of your name in my head\nWith the demons in my bed\nWith the memories\nYour ghost\nI see it\n'Cause it comes to haunt me\nJust to taunt me\nIt comes to haunt me\nJust to taunt me\n\n[Verse 3]\nWith sugar and spice\nIt's hard to ignore the nostalgia\nWith the men on their knees\nAt the gates of my heart\nHow they beg me\n\n[Verse 4]\nThey say\n\"No one will ever love you\nThe way that I do\nNo one will ever touch you\nThe way that I do\"\n\n[Chorus]\nAnd I can't love again\nWith the echo of your name in my head\nWith the demons in my bed\nWith the memories\nYour ghost\nI see it\n'Cause it comes to haunt me\nJust to taunt me\nIt comes to haunt me\nJust to taunt me",
+    "audio_duration": 174.27997916666666,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": true,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 3.8372838497161865,
+        "diffusion": 13.039669275283813,
+        "latent2audio": 1.7923030853271484
+    },
+    "actual_seeds": [
+        4064916393
+    ]
+}
examples/default/input_params/output_20250426093007_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+{
+    "prompt": "aggressive, Heavy Riffs, Blast Beats, Satanic Black Metal",
+    "lyrics": "[verse]\nFloating through the galaxy on a midnight ride\nStars are dancing all around in cosmic tides\nFeel the pulse of space and time beneath our feet\nEvery beat a heartbeat in this endless suite\n\n[chorus]\nGalactic dreams under neon lights\nSailing through the velvet nights\nWe are echoes in a cosmic sea\nIn a universe where we are free\n\n[verse]\nPlanetary whispers in the sky tonight\nEvery constellation's got a secret sight\nDistant worlds and moons we have yet to see\nIn the void of space where we can just be\n\n[bridge]\nAsteroids and comets in a ballet they spin\nLost in the rhythm of where our dreams begin\nClose your eyes and let the synths take flight\nWe're voyagers on an electric night\n\n[verse]\nLet the piano keys unlock the stars above\nEvery chord a memory every note is love\nIn this synth symphony we find our grace\nDrifting forever in this boundless space\n\n[chorus]\nGalactic dreams under neon lights\nSailing through the velvet nights\nWe are echoes in a cosmic sea\nIn a universe where we are free",
+    "audio_duration": 181.99997916666666,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": true,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.025065898895263672,
+        "diffusion": 17.176705837249756,
+        "latent2audio": 1.8225171566009521
+    },
+    "actual_seeds": [
+        1132623236
+    ]
+}
examples/default/input_params/output_20250426093146_0_input_params.json ADDED
@@ -0,0 +1,25 @@
+{
+    "prompt": "r&b, soul, funk/soul",
+    "lyrics": "[verse]\nDancing through electric fires\nHeart is buzzing like live wires\nIn your arms I find desire\nFeel the beat as we get higher\n\n[chorus]\nElectric love in the night sky\nWe’re gonna soar baby you and I\nDrop the bass let the rhythm fly\nFeel the heat and don't ask why\n\n[verse]\nWhisper secrets that make me blush\nUnder the neon city hush\nYour touch gives me such a rush\nTurn it up we're feeling lush\n\n[chorus]\nElectric love in the night sky\nWe’re gonna soar baby you and I\nDrop the bass let the rhythm fly\nFeel the heat and don't ask why\n\n[bridge]\nThrough the lights and the smoky haze\nI see you in a thousand ways\nLove's a script and we’re the play\nTurn the page stay till we sway\n\n[chorus]\nElectric love in the night sky\nWe’re gonna soar baby you and I\nDrop the bass let the rhythm fly\nFeel the heat and don't ask why",
+    "audio_duration": 195.15997916666666,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": true,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.025553464889526367,
+        "diffusion": 18.250118494033813,
+        "latent2audio": 1.9400627613067627
+    },
+    "actual_seeds": [
+        2853131993
+    ]
+}
examples/zh_rap_lora/input_params/output_20250512101839_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora",
+    "task": "text2music",
+    "prompt": "Rap, adult, male, spoken word, singing, bright, energetic, clear",
+    "lyrics": "[Intro]\n他们说我来自阴影里\n说我的肤色是原罪的印记\n\n[Verse]\n眼神像刀子刮过 穿透我的皮肤\n带着审判和偏见 让我无处可逃处\n你没听过我的故事 没走过我的路\n凭什么就下一个判决 把我划出你的版图\n你说我威胁到你 抢走了你的机会\n可你可知我付出的 是你不敢想象的血泪\n被贴上标签 被区别对待\n呼吸都是错的 只因我生来就不一样态\n\n[Chorus]\n看不见的墙 把我阻隔在外面\n听不见的声音 屏蔽了我的呼唤\n他们制造偏见 他们散播谎言\n只因为我的存在 让他们觉得不安\n\n[Verse]\n每一次努力争取 都会被审视被放大\n每一个细微的错误 都变成攻击的靶\n他们选择性失明 看不见我的汗水\n只看见他们想看的 带着恶意的定位\n系统性的歧视 像一张无形的网\n把我困在原地 无法自由地翱翔\n他们在享受特权 却指责我的贫困\n嘲笑我的口音 我的名字 我的出身\n\n[Chorus]\n看不见的墙 把我阻隔在外面\n听不见的声音 屏蔽了我的呼唤\n他们制造偏见 他们散播谎言\n只因为我的存在 让他们觉得不安\n\n[Bridge]\n我不想寻求同情 只想被公平对待\n不想被定义被束缚 有选择自己未来的权利\n什么时候 才能放下心中的成见\n看到真正的我 而不是你脑海里的画面\n\n[Outro]\n画面... 不安...\n偏见... 歧视...\n什么时候能停止...",
+    "audio_duration": 134.64,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.3,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.032018184661865234,
+        "diffusion": 13.275121927261353,
+        "latent2audio": 1.291429042816162
+    },
+    "actual_seeds": [
+        3826585269
+    ],
+    "retake_seeds": [
+        2907904223
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512101839_0.wav"
+}
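The `zh_rap_lora` records extend the default 25-line schema with LoRA and retake/repaint/edit controls. A quick standard-library check of which fields are specific to the LoRA runs, using two files from this upload:

```python
import json
from pathlib import Path

def keys(path: str) -> set:
    """Top-level field names of one input_params record."""
    return set(json.loads(Path(path).read_text(encoding="utf-8")))

default = keys("examples/default/input_params/output_20250426080234_0_input_params.json")
lora = keys("examples/zh_rap_lora/input_params/output_20250512101839_0_input_params.json")

# Fields present only in the LoRA records: lora_name_or_path, task,
# retake_*, repaint_*, edit_*, the audio2audio/ref_audio options, audio_path.
print(sorted(lora - default))
```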
examples/zh_rap_lora/input_params/output_20250512114703_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora",
+    "task": "text2music",
+    "prompt": "Chorus Hook, Melodic Rap, Ambient Synth Pads, adult, rap, Very Fast, Storytelling, Chinese Rap, male, spoken word, bright, energetic, Melodic Flow, clear, clarity, 130 bpm",
+    "lyrics": "[Intro]\n舌 头 打 结 了... 快 念 快 念...\n\n[Verse 1]\n这 个 赌 鬼 蹲 在 柜 台 啃 着 苦 瓜 干 快 很 干\n赌 桌 堆 满 骨 牌 古 怪 股 票 和 五 块 钢 镚 儿 钢 镚\n他 甩 出 扑 克 牌 啪 啪 啪 拍 扁 螃 蟹 壳 哦 壳 扁\n又 摸 摸 麻 将 摸 出 幺 鸡 摸 出 发 财 摸 出 一 条 蛇 蛇 蛇\n庄 家 咳 嗽 咳 破 锣 嗓 子 喊 开 开 开 快 开 开\n赌 鬼 咕 嘟 咕 嘟 灌 咖 啡 灌 到 筷 子 戳 穿 碗 快 戳 穿\n空 气 里 飘 着 锅 巴 味 混 合 隔 夜 的 酸 奶 罐 哦 酸\n输 光 裤 带 还 想 翻 盘 翻 成 煎 饼 摊 老 板 快 翻 盘\n\n[Chorus]\n赌 鬼 赌 鬼 哦 赌 鬼 赌 鬼 快 很 快\n舌 头 打 结 着 念 这 段 哦 这 段 绕 口 令 牌\n若 念 错 一 字 就 罚 你 哦 罚 你 吞 十 斤 海 带\n赌 场 规 矩 就 是 绕 晕 你 哦 绕 晕 你 快 很 快\n\n[Verse 2]\n他 掏 出 铜 板 抠 出 口 袋 最 后 一 颗 快 很 颗\n庄 家 哗 啦 哗 啦 摇 骰 子 摇 出 三 点 又 三 点 哦 三 点\n赌 鬼 急 得 咬 牙 切 齿 咬 到 舌 头 打 蝴 蝶 结 快 打 结\n还 想 押 上 祖 传 的 拖 鞋 拖 把 铁 锅 和 半 包 盐 盐 盐\n突 然 警 笛 嘀 嘟 嘀 嘟 吓 得 他 钻 进 垃 圾 罐 哦 垃 圾\n警 察 咔 嚓 咔 嚓 拍 照 拍 到 他 头 顶 菠 菜 叶 快 拍 照\n最 后 赌 鬼 蹲 监 狱 天 天 背 这 首 绕 口 令 哦 背 不 完\n若 背 错 一 句 就 加 刑 十 年 再 加 十 年 快 加 刑\n\n[Outro]\n舌 头 打 结 了... 赌 鬼 哭 了 哦...\n这 首 歌... 绕 死 人 了 哦...",
+    "audio_duration": 186.59997916666666,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.7,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.03011012077331543,
+        "diffusion": 21.696259260177612,
+        "latent2audio": 1.7648537158966064
+    },
+    "actual_seeds": [
+        3776541388
+    ],
+    "retake_seeds": [
+        4274500599
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512114703_0.wav"
+}
examples/zh_rap_lora/input_params/output_20250512115409_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora",
+    "task": "text2music",
+    "prompt": "electronic, hip-hop, rap, synthesizer, drums, vocals, fast, energetic, modern, uplifting, young adult, male, spoken word, singing, bright, energetic, clear, 140 bpm, female",
+    "lyrics": "[Verse 1]\n红鲤鱼绿鲤鱼,驴在河里追鲤鱼,\n驴追鲤鱼鱼躲驴,气得驴子直喘气。\n扁担长板凳宽,扁担绑在板凳边,\n扁担要绑板凳不让绑,扁担偏要绑上板凳面!\n\n[Chorus]\n绕口令,练嘴皮,\n说快说慢别迟疑,\n红鲤鱼驴扁担板凳,\n一口气念完算你赢!\n\n[Verse 2]\n四是四十是十,十四是十四四十是四十,\n谁说四十是十四,舌头打结别放肆。\n黑化肥会挥发,灰化肥也发黑,\n化肥混一起,黑灰不分嘴发废!\n\n[Chorus]\n绕口令,练嘴皮,\n说快说慢别迟疑,\n四十十四化肥灰,\n念错罚你唱十回!\n\n[Bridge]\n坡上立着一只鹅,坡下流着一条河,\n鹅要过河河渡鹅,河要渡鹅鹅笑河——\n到底谁更啰嗦?!\n\n[Outro]\n嘴皮子功夫别小瞧,\n绕口令rap我最飙,\n下次挑战准备好,\n舌头打结别求饶!",
+    "audio_duration": 123.2,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.7,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.026150941848754883,
+        "diffusion": 12.212433099746704,
+        "latent2audio": 1.1857895851135254
+    },
+    "actual_seeds": [
+        1415752189
+    ],
+    "retake_seeds": [
+        685932970
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512115409_0.wav"
+}
examples/zh_rap_lora/input_params/output_20250512120348_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora",
+    "task": "text2music",
+    "prompt": "singing, bright, slightly nasal, energetic, spoken word, young adult, male, rap music",
+    "lyrics": "[Intro]\nYo, check it—speed demon, lyrical heat, uh!\nRatatat like a drum when the beat bumps, uh!\n\n[Verse 1]\nRapatapa tap tap, flash like a snap,\nRap tap tap, I don’t chat, I clap clap clap!\nFingers snap, flow don’t slack, rapataptaptap,\nSpit it fast, hit the gas, rap tap tap rap!\n\n[Pre-Chorus]\nBoom-bap, zoom past, leave ’em flat,\nRap taptaprapataptaptap—where ya at?\n\n[Chorus]\nRapatapa tap tap, yeah, I go brrrr,\nRap tap tap, make the crowd stir!\nRapataptaptap, no lag, just spit,\nRap taptaprapataptaptap—I’m lit!\n\n[Verse 2]\nTongue-twist, quick wrist, rapatapa boom,\nTap tap rap, leave ya stuck like glue-gum!\nNo slow-mo, turbo, rapataptaptap,\nRap tap rap, yeah, I clap clap clap!\n\n[Outro]\nRapatapa—TAP! Mic drop—that’s that.",
+    "audio_duration": 60,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.018491744995117188,
+        "diffusion": 8.084580898284912,
+        "latent2audio": 0.5694489479064941
+    },
+    "actual_seeds": [
+        226581098
+    ],
+    "retake_seeds": [
+        1603201617
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512120348_0.wav"
+}
examples/zh_rap_lora/input_params/output_20250512143242_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "ACE-Step/ACE-Step-v1-chinese-rap-LoRA",
+    "task": "text2music",
+    "prompt": "G-Funk, Hip Hop, Rap, Female Vocals, Melodic Rap, Summer, Laid-back Groove, Smooth Rhythm, Synthesizer Lead, Heavy Bassline, Groovy, West Coast Hip Hop",
+    "lyrics": "(Intro)\nOh yeah... \n\n(Verse 1)\n阳光下,沙滩排球场,一个身影跳跃\n小麦色,运动背心,闪耀活力四射\n她跳起扣杀,动作利落又巧妙\n汗水浸湿发梢,笑容比阳光更美好\n摇摆的节奏,是她的背景配乐\n每一次移动,都踩在鼓点上那么和谐\n我不由自主地停下脚步\n目光被她紧紧锁住\n\n(Chorus)\n沙滩排球女孩, 摇摆节拍下的身材\n无忧无虑的笑容,把我的心都填满\n想走上前去搭讪,嫌自己笨拙呆板\n这青春的气息,耀眼,灿烂!\n\n(Verse 3)\n她和队友击掌庆祝,笑声清脆悦耳\n拿起毛巾擦汗,不经意间瞥我一眼\n鼓起勇气走上前,假装问问时间\n她友好地回答,笑容灿烂没有敷衍\n聊了几句,发现彼此爱这摇摆音乐\n她眼中也闪过惊喜和亲切\n这共同点,让气氛变得融洽又热烈!\n夏天的故事,就这样开始了感觉真切!\n\n(Chorus)\n沙滩排球女孩, 摇摆节拍下的身材\n无忧无虑的笑容,把我的心都填满\n不再犹豫和等待,勇敢把脚步迈开\n这夏天的感觉,心跳,不断!",
+    "audio_duration": 93.93038,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.03020024299621582,
+        "diffusion": 9.942127704620361,
+        "latent2audio": 0.9470341205596924
+    },
+    "actual_seeds": [
+        3826585299
+    ],
+    "retake_seeds": [
+        2519711205
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512143242_0.wav"
+}
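Note that `lora_name_or_path` appears in two forms across these records: a local training path such as `/root/sag_train/data/ace_step_v1_chinese_rap_lora` and the published Hub id `ACE-Step/ACE-Step-v1-chinese-rap-LoRA` used in the file above. A small sketch of resolving either form to a local directory; `snapshot_download` is the real `huggingface_hub` API, but treating every non-directory value as a Hub id is an assumption made for illustration:

```python
import os
from huggingface_hub import snapshot_download

def resolve_lora(lora_name_or_path: str) -> str:
    """Return a local directory for a LoRA given a filesystem path or a Hub repo id."""
    if os.path.isdir(lora_name_or_path):
        return lora_name_or_path
    # Assumed fallback: anything else is taken to be a Hub repo id.
    return snapshot_download(repo_id=lora_name_or_path)

print(resolve_lora("ACE-Step/ACE-Step-v1-chinese-rap-LoRA"))
```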
examples/zh_rap_lora/input_params/output_20250512145057_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k",
+    "task": "text2music",
+    "prompt": "lyrical rap, young adult, female, rap flow, spoken word, ad-libs, bright, energetic, eat, Fast, Engaging, Energetic",
+    "lyrics": "[Intro]\n扁擔寬 板凳長 扁擔想綁在板凳上\n扁擔寬 板凳長 扁擔想綁在板凳上\n\n[Verse]\n倫敦 瑪莉蓮 買了 件 旗袍 送 媽媽\n莫斯科 的 夫司基 愛上 牛肉 麵 疙瘩\n各種 顏色 的 皮膚 各種 顏色 的 頭髮\n嘴裡念的 說的 開始 流行 中國話 (中國話)\n\n[Bridge]\n多少年 我們 苦練 英文 發音 和 文法 (yeah)\n這幾年 換他們 捲著 舌頭 學 平上去入 的 變化\n平平 仄仄 平平 仄\n好聰明 的 中國人 好優美 的 中國話\n\n[Verse]\n扁擔寬 板凳長 扁擔想綁在板凳上\n板凳不讓扁擔綁在板凳上 扁擔偏要綁在板凳上\n板凳偏偏不讓扁擔綁在那板凳上\n到底扁擔寬 還是板凳長?\n\n[Verse]\n哥哥弟弟坡前坐\n坡上臥著一隻鵝 坡下流著一條河\n哥哥說 寬寬的河 弟弟說 白白的鵝\n鵝要過河 河要渡鵝\n不知是那鵝過河 還是河渡鵝\n\n[Chorus]\n全世界都在學中國話\n孔夫子的話 越來越國際化\n全世界都在講中國話\n我們說的話 讓世界都認真聽話\n\n[Verse]\n紐約蘇珊娜開了間禪風 lounge bar\n柏林來的沃夫岡拿胡琴配著電吉他\n各種顏色的皮膚 各種顏色的頭髮\n嘴裡念的 說的 開始流行中國話 (中國話)\n\n[Bridge]\n多少年我們苦練英文發音和文法 (yeah)\n這幾年換他們捲著舌頭學平上去入的變化\n仄仄平平仄仄平\n好聰明的中國人 好優美的中國話\n\n[Verse]\n有個小孩叫小杜 上街打醋又買布\n買了布 打了醋 回頭看見鷹抓兔\n放下布 擱下醋 上前去追鷹和兔\n飛了鷹 跑了兔 灑了醋 濕了布\n\n[Verse]\n嘴說腿 腿說嘴\n嘴說腿 愛跑腿\n腿說嘴 愛賣嘴\n光動嘴 不動腿\n光動腿 不動嘴\n不如不長腿和嘴\n到底是那嘴說腿 還是腿說嘴?\n\n[Chorus]\n全世界都在學中國話\n孔夫子的話 越來越國際化\n全世界都在講中國話\n我們說的話 讓世界都認真聽話\n\n[outro]\n全世界都在學中國話 (在學中國話)\n孔夫子的話 越來越國際化\n全世界都在講中國話\n我們說的話 (讓他) 讓世界 (認真) 都認真聽話",
+    "audio_duration": 239.8355625,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.04363536834716797,
+        "diffusion": 18.706920385360718,
+        "latent2audio": 2.1645781993865967
+    },
+    "actual_seeds": [
+        2364345905
+    ],
+    "retake_seeds": [
+        2100914041
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512145057_0.wav"
+}
examples/zh_rap_lora/input_params/output_20250512152217_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k",
+    "task": "text2music",
+    "prompt": "articulate, spoken word, young adult, warm, rap music, male, clear, street, dark, rap flow, hardcore rap",
+    "lyrics": "[verse]\n球场 的 橡胶味 弥漫 隔壁 是 健身房\n场 边上 的 老教练 战术 有 三套\n教 交叉 运球 的 大叔 会 欧洲步 耍 背后 传\n硬 身板 对抗 最 擅长 还 会 急停跳 后仰 投\n他们 徒弟 我 习惯 从小 就 耳濡目染\n什么 胯下 跟 变向 我 都 玩 的 有模有样\n什么 招式 最 喜欢 转身 过 人 柔中 带 刚\n想要 去 纽约 街头 斗 洛克 公园 场\n\n[chorus]\n看什么 看什么\n变速 突破 心 自在\n看什么 看什么\n假动作 晃 开 防守 来\n看什么 看什么\n每日 训练 绑 沙袋\n空中拉杆 莫 奇怪\n唰唰 入袋\n\n[verse]\n一个 试探 步后 一记 左 变向 右 变向\n一句 挑衅 我 的 人 别 嚣张\n一再 重演 一颗 我 不 投 的 球\n悬在 篮筐 上 它 一直 在 摇晃\n\n[chorus]\n看什么 看什么\n我 激活 小宇宙 来\n看什么 看什么\n菜鸟 新人 的 名号\n看什么 看什么\n已 被 我 一球 击倒\n\n[chorus]\n快 秀出 指尖 转球 砰砰 啪嗒\n快 秀出 指尖 转球 砰砰 啪嗒\n篮球 之 人 切记 勇者 无惧\n是 谁 在 玩 花式 引爆 空气\n快 秀出 指尖 转球 砰砰 啪嗒\n快 秀出 指尖 转球 砰砰 啪嗒\n如果 我 有 滞空 逆天 补扣\n为人 热血 不怂 一生 傲骨 吼\n\n[verse]\n他们 徒弟 我 习惯 从小 就 耳濡目染\n什么 胯下 跟 变向 我 都 玩 的 有模有样\n什么 招式 最 喜欢 转身 过 人 柔中 带 刚\n想要 去 纽约 街头 斗 洛克 公园 场\n\n[outro]\n快 秀出 指尖 转球 砰\n快 秀出 指尖 转球 砰\n如果 我 有 滞空 吼\n为人 热血 不怂 一生 傲骨 吼\n快 秀出 指尖 转球 砰\n我 用 背传 助攻 吼\n压哨 的 三分 球",
+    "audio_duration": 239.8355625,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.05357813835144043,
+        "diffusion": 25.644447326660156,
+        "latent2audio": 2.1787476539611816
+    },
+    "actual_seeds": [
+        3246571430
+    ],
+    "retake_seeds": [
+        1352325167
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512152217_0.wav"
+}
examples/zh_rap_lora/input_params/output_20250512153616_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k",
+    "task": "text2music",
+    "prompt": "articulate, spoken word, young adult, warm, rap music, male, clear, street, dark, rap flow, hardcore rap, fast",
+    "lyrics": "[verse]\n球场 的 橡胶味 弥漫 隔壁 是 健身房\n场 边上 的 老教练 战术 有 三套\n教 交叉 运球 的 大叔 会 欧洲步 耍 背后 传\n硬 身板 对抗 最 擅长 还 会 急停跳 后仰 投\n他们 徒弟 我 习惯 从小 就 耳濡目染\n什么 胯下 跟 变向 我 都 玩 的 有模有样\n什么 招式 最 喜欢 转身 过 人 柔中 带 刚\n想要 去 纽约 街头 斗 洛克 公园 场\n\n[chorus]\n看什么 看什么\n变速 突破 心 自在\n看什么 看什么\n假动作 晃 开 防守 来\n看什么 看什么\n每日 训练 绑 沙袋\n空中拉杆 莫 奇怪\n唰唰 入袋\n\n[verse]\n一个 试探 步后 一记 左 变向 右 变向\n一句 挑衅 我 的 人 别 嚣张\n一再 重演 一颗 我 不 投 的 球\n悬在 篮筐 上 它 一直 在 摇晃\n\n[chorus]\n看什么 看什么\n我 激活 小宇宙 来\n看什么 看什么\n菜鸟 新人 的 名号\n看什么 看什么\n已 被 我 一球 击倒\n\n[chorus]\n快 秀出 指尖 转球 砰砰 啪嗒\n快 秀出 指尖 转球 砰砰 啪嗒\n篮球 之 人 切记 勇者 无惧\n是 谁 在 玩 花式 引爆 空气\n快 秀出 指尖 转球 砰砰 啪嗒\n快 秀出 指尖 转球 砰砰 啪嗒\n如果 我 有 滞空 逆天 补扣\n为人 热血 不怂 一生 傲骨 吼\n\n[verse]\n他们 徒弟 我 习惯 从小 就 耳濡目染\n什么 胯下 跟 变向 我 都 玩 的 有模有样\n什么 招式 最 喜欢 转身 过 人 柔中 带 刚\n想要 去 纽约 街头 斗 洛克 公园 场\n\n[outro]\n快 秀出 指尖 转球 砰\n快 秀出 指尖 转球 砰\n如果 我 有 滞空 吼\n为人 热血 不怂 一生 傲骨 吼\n快 秀出 指尖 转球 砰\n我 用 背传 助攻 吼\n压哨 的 三分 球",
+    "audio_duration": 183.23,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.046170711517333984,
+        "diffusion": 14.21678113937378,
+        "latent2audio": 2.685957193374634
+    },
+    "actual_seeds": [
+        3072005931
+    ],
+    "retake_seeds": [
+        562842491
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512153616_0.wav"
+}
examples/zh_rap_lora/input_params/output_20250512154907_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k",
+    "task": "text2music",
+    "prompt": "articulate, spoken word, young adult, rap music, female, clear, energetic, warm",
+    "lyrics": "[Intro]\n\"System booting... 语言 模型 loading...\"\n\n[Verse 1]\n硅谷 那个 coder 调试 neural network\n北京 的 极客 训练 A I 写 report\n不同 架构 的 chip 不同 算法 的 war\n屏幕上 跑的 全是 machine learning (learning)\n\n[Bridge]\n多少年 我们 chase 摩尔 定律 的 trend (yeah)\n这两年 换他们 study 中文 N L P\nConvolution L S T M\n好烧脑 的 backprop 好暴力 的 big data\n\n[Verse 2]\nPython 强 say加加 刚 Python 调用 C++ 的 A P I\nsay加加 嫌 Python 太 slow Python 笑 C++ 太 hardcore\nL L V M 默默 generate 中间 code\n到底 interpreter 还是 compiler 屌?\n\n[Verse 3]\nP M 和 engineer\n白板 画满 flow chart 服务器 闪着 red light\nP M 说 add feature engineer 说 no way\n需求 变更 code 重构\n不知 是 P M 太 fly 还是 deadline 太 high\n\n[Chorus]\n全世界 都在 train neural network\nTransformer 的 paper 越来越 难 go through\n全世界 都在 tune 超参数\n我们 写的 bug 让 G P U 都 say no\n\n[Verse 4]\n柏林 hackathon demo blockchain contract\n上海 的 dev 用 federated learning 破 data wall\n各种 语言 的 error 各种 框架 的 doc\nterminal 里 滚的 全是 dependency 冲突\n\n[Bridge]\n曾以为 English 才是 coding 的 language (yeah)\n直到见 G P T 用 文言文 generate 正则 expression\nGradient explode\n好硬核 的 prompt 好头秃 的 debug road\n\n[Verse 5]\n有个 bug 叫 quantum\n测试 环境 run perfect 上线 立即就 crash\n查 log 看 monitor 发现是 thread 不同步\n改 sync 加 lock 慢 deadlock 更难办\n量子 computer 也解不开 这 chaos chain\n\n[Verse 6]\n你说 996 我说 007\n你说 福报 我说 burnout\nProduct 要 agile Boss 要 KPI\nCode 要 elegant deadline 是 tomorrow\n不如 直接 script 自动 submit 离职信\n\n[Outro]\n\"Warning: 内存 leak...core dumping...\"\n全世界 都在 train neural network (neural network)\nLoss 还没 converge 天已经亮\n全世界 都在 tune 超参数\n我们 写的 code (让它) 让 world (reboot) 都 reboot 无效",
+    "audio_duration": 179.12,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.062120914459228516,
+        "diffusion": 13.499217987060547,
+        "latent2audio": 1.6430137157440186
+    },
+    "actual_seeds": [
+        1637990575
+    ],
+    "retake_seeds": [
+        101283039
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512154907_0.wav"
+}
examples/zh_rap_lora/input_params/output_20250512161832_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k",
+    "task": "text2music",
+    "prompt": "articulate, spoken word, young adult, rap music, male, clear, energetic, warm, relaxed, breathy, night club, auto-tune, mumble rap, trap",
+    "lyrics": "[verse]\n这 这 谁 又 在 派 对 喝 多\n我 的 脑 袋\n像 被 驴 踢 过\n不 对 劲\n舌 头 打 结 不 会 说\n你 来 挑 战 我 就 跪\n开 局 直 接 崩 溃\n\n[chorus]\n就 咪 乱 咪 念 咪 错 咪\n嘴 咪 瓢 咪 成 咪 狗 咪\n脑 咪 袋 咪 像 咪 浆 咪 糊 咪\n跟 咪 着 咪 节 咪 奏 咪\n把 咪 歌 咪 词 咪 全 咪 忘 咪\n一 咪 张 咪 嘴 咪 就 咪 废 咪\n只 咪 剩 咪 下 咪 尴 咪 尬 咪 回 咪 忆\n草!\n\n[verse]\n错 错 错 错 了\n一 口 气 全 念 错\n错 错 错 错 了\n舌 头 打 结 甩 锅\n甩 甩 甩 甩 锅\n甩 锅 甩 锅\n拍 子 全 部 乱 套\n观 众 笑 到 吐 血\n\n[verse]\n你 的 歌 词 我 的 噩 梦\n唱 完 直 接 社 死\n调 跑 到 外 太 空\n观 众 表 情 裂 开\n你 笑 我 菜\n我 笑 你 不 懂\n这 叫 艺 术 表 演\n不 服 你 来!\n\n[verse]\n这 这 谁 又 在 派 对 丢 人\n我 的 世 界\n已 经 彻 底 崩 溃\n没 有 完 美\n只 有 翻 车 现 场\n以 及 观 众 的 嘲 讽\n\n[chorus]\n就 咪 乱 咪 念 咪 错 咪\n嘴 咪 瓢 咪 成 咪 狗 咪\n脑 咪 袋 咪 像 咪 浆 咪 糊 咪\n跟 咪 着 咪 节 咪 奏 咪\n把 咪 歌 咪 词 咪 全 咪 忘 咪\n一 咪 张 咪 嘴 咪 就 咪 废 咪\n只 咪 剩 咪 下 咪 尴 咪 尬 咪 回 咪 忆\n草!\n\n[verse]\n错 错 错 错 了\n一 口 气 全 念 错\n错 错 错 错 了\n舌 头 打 结 甩 锅\n甩 甩 甩 甩 锅\n甩 锅 甩 锅\n拍 子 全 部 乱 套\n观 众 笑 到 吐 血\n\n[verse]\n你 的 歌 词 我 的 噩 梦\n唱 完 直 接 社 死\n调 跑 到 外 太 空\n观 众 表 情 裂 开\n你 笑 我 菜\n我 笑 你 不 懂\n这 叫 艺 术 表 演\n不 服 你 来!",
+    "audio_duration": 169.12,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.04321885108947754,
+        "diffusion": 14.026689767837524,
+        "latent2audio": 1.5587565898895264
+    },
+    "actual_seeds": [
+        1905941472
+    ],
+    "retake_seeds": [
+        3018484796
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512161832_0.wav"
+}
examples/zh_rap_lora/input_params/output_20250512164224_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k",
+    "task": "text2music",
+    "prompt": "四川话, spoken word, male, Tempo - Fast, Elements - Chorus Hook, Subgenre-Satirical Hip Hop, Rap, Chinese-language music, energetic, slightly nasal, Instrument - Live Bass Guitar, adult, Vocals - Syncopated Flow, Genre - Hip-Hop, rapping, bright",
+    "lyrics": "[chorus]\n黑 墨镜 金 链子 越 低调 越 霸气\n玩 街机 泡 吧里 再 野的 场子 都 不 怯气\n上海 滩 老 江湖 外滩 钟声 敲 胜负\n陆家嘴 黄浦江 财路 宽 给 你 开 扇窗\n\n[verse]\n老子 在 弄堂 斜起 走 想 拦路 的 先 报 名号\n我 早看透 你们 手抖 脚软\n只敢 网上 吠 现实 怂成 猫\n看 你们 混的 真 可怜 整天 蹲在 网吧 蹭 烟\n钱 赚不到 架 不敢打 还 学人 摆 大哥 脸\n\n[verse]\n叫 我 沪上 老 克勒 不是 拉菲 我 不 碰杯\n规矩 我 懒得 讲 太多 钞票 直接 拍 你 脸上 飞\n老子 耐心 差 门槛 高 你 找茬 等于 自 寻 烦恼\n要么 跪 要么 爬 最后 警告 只 说 一 遭\n\n[chorus]\n黑 墨镜 金 链子 越 低调 越 霸气\n玩 街机 泡 吧里 再 野的 场子 都 不 怯气\n上海 滩 老 江湖 外滩 钟声 敲 胜负\n陆家嘴 黄浦江 财路 宽 给 你 开 扇窗\n\n[verse]\n古巴 雪茄 在 指间 绕 代表 魔都 格调 必须 顶\nOG 在 你 够不到 的 高度 My bro 永远 在 顶层 盯\nCheck my vibe 不靠 大 金劳 留声机 放 周璇 和 白光\n爹妈 太 宠你 养出 巨婴 症 早晚 社会 教你 做人 经\n\n[verse]\n玩 说唱 小囡 太 年轻 要 比 flow 先去 练 气功\n廿年 磨 枪 才 亮 锋芒 我 三十六 招 收 你 入 瓮\n老子 存在 就是 打假 标\n多少 人 眼红 又 不敢 挑\n键盘 侠 的 狠话 像 棉花 糖\n见 真人 秒变 Hello Kitty 叫\n\n[chorus]\n黑 墨镜 金 链子 越 低调 越 霸气\n玩 街机 泡 吧里 再 野的 场子 都 不 怯气\n上海 滩 老 江湖 外滩 钟声 敲 胜负\n陆家嘴 黄浦江 财路 宽 给 你 开 扇窗\n\n[chorus]\n黑 墨镜 金 链子 越 低调 越 霸气\n玩 街机 泡 吧里 再 野的 场子 都 不 怯气\n上海 滩 老 江湖 外滩 钟声 敲 胜负\n陆家嘴 黄浦江 财路 宽 给 你 开 扇窗",
+    "audio_duration": 135.92,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.038518667221069336,
+        "diffusion": 16.47420620918274,
+        "latent2audio": 2.5094873905181885
+    },
+    "actual_seeds": [
+        2159904788
+    ],
+    "retake_seeds": [
+        2403013980
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512164224_0.wav"
+}
examples/zh_rap_lora/input_params/output_20250512171227_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "ACE-Step/ACE-Step-v1-chinese-rap-LoRA",
+    "task": "text2music",
+    "prompt": "Rap, Chinese Rap, J-Pop, Anime, kawaii pop, EDM, Aggressive, Intense, Crisp Snare, Super Fast, Clear",
+    "lyrics": "(Intro)\nLet's drift away...\n\n(Verse 1)\n现实是灰色的格子间,重复的工作,枯燥的报表 \n敲打着键盘,眼神却放空,意识早已挣脱了肉体的镣铐\n飘向窗外,飞过拥挤的街道,穿过云层,到达想象的群岛\n那里色彩斑斓,形状奇异,逻辑失效,一切都随心所欲地飘摇\n迷幻的鼓点,像心跳的变奏,忽快忽慢,难以预料\n抽象的采样,扭曲的人声,构建一个超现实的音景环绕\n我变成一只鸟,一条鱼,一束光,自由地变换形态和奔跑\n在这白日梦里,我无所不能,摆脱了所有现实的烦恼, feeling the afterglow\n\n(Chorus)\n意识漫游,逃离乏味的轨道 \n迷幻嘻哈的节拍,是白日梦的引导 \n抽象的世界,逻辑被重新构造\nMind wandering free, where reality starts to fade slow\n\n(Verse 2)\n会议室里老板在讲话,声音模糊,像隔着水听不清道\n我的思绪,早已潜入深海,与发光的水母一起舞蹈\n或者飞向外太空,在星云间穿梭,探索未知的星球和轨道\n现实的规则,在这里被打破,物理定律也失去效劳\n白日梦是我的避难所,是精神的氧气罩\n在乏味的现实里,为我注入一点色彩和奇妙\n虽然短暂,虽然虚幻,但它让我能够喘息,重新把能量找到\n然后回到现实,继续扮演那个,循规蹈矩的角色,把梦藏好, keep the dream aglow\n\n(Chorus)\n意识漫游,逃离乏味的轨道\n迷幻嘻哈的节拍,是白日梦的引导\n抽象的世界,逻辑被重新构造\nMind wandering free, where reality starts to fade slow\n",
+    "audio_duration": 153.7148,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.04823446273803711,
+        "diffusion": 13.158645629882812,
+        "latent2audio": 1.493880033493042
+    },
+    "actual_seeds": [
+        2945962357
+    ],
+    "retake_seeds": [
+        2676242300
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0.7,
+    "guidance_scale_lyric": 1.5,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512171227_0.wav"
+}
examples/zh_rap_lora/input_params/output_20250512171809_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora",
+    "task": "text2music",
+    "prompt": "J-Pop, Anime, kawaii future bass, Femal vocals, EDM, Boombap, Aggressive, Intense, Crisp Snare, Super Fast, Rap",
+    "lyrics": "[Intro]\nYo, 这是来自深渊的怒吼\n\n[Verse]\n指尖飞快刷新,屏幕又亮起\n渴望那点赞,像致命的氧气\n精心修饰的脸庞,完美到诡异\n背后隐藏的疲惫,谁又会在意\n光鲜亮丽的橱窗,贩卖着焦虑\n每个人都在表演,戴着虚伪面具\n比较的游戏,让人逐渐窒息\n迷失在数据洪流,找不到自己\n\n[Chorus]\n这流量的时代,真假早已分不清\n盲目追随潮流,丢掉了初心\n为了那点虚荣,灵魂在沉沦\n看不见的锁链,捆绑每个灵魂\n\n[Verse]\n滤镜下的生活,美得不切实际\n营造虚假繁荣,掩盖内心空虚\n他人的光环下,显得自己多余\n嫉妒和自卑,交织成悲剧\n\n[Chorus]\n朋友圈里炫耀,现实中却叹气\n刷着别人的故事,忘记了呼吸\n算法推荐着你,想看的一切东西\n不知不觉间,你已不再是你\n他们说这是进步,我看是种病\n精神鸦片侵蚀,慢慢要了你的命\n\n[Bridge]\n屏幕亮了又暗,一天又过去\n究竟得到了什么,还是失去了自己\n那真实的连接,在何处寻觅\n困在这迷宫里,找不到出口的轨迹\n\n[Outro]\n我想挣脱,我想呼吸\n这虚拟的繁华,让我喘不过气\n谁能告诉我,这到底有什么意义\n一切都像泡沫,一触就破裂没余地",
+    "audio_duration": 119.44348,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.04764962196350098,
+        "diffusion": 10.94297981262207,
+        "latent2audio": 1.1815783977508545
+    },
+    "actual_seeds": [
+        3826585273
+    ],
+    "retake_seeds": [
+        2527594022
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512171809_0.wav"
+}
examples/zh_rap_lora/input_params/output_20250512172941_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k",
+    "task": "text2music",
+    "prompt": "Hip Hop, Hi-hat Rolls, spoken word, Melodic Flow, articulate, Female Rap, 120 BPM, clear, warm, female, melodic Rap, adult, super fast",
+    "lyrics": "[Verse 1]\n打南边来了个喇嘛,手里提拉着五斤鳎目,\n打北边来了个哑巴,腰里别着个喇叭。\n喇嘛想换哑巴的喇叭,哑巴摇头不说话,\n鳎目一甩像道闪电,喇叭一响震天涯!\n\n[Chorus]\n丁丁当当,乒乓乓乓,\n话赶话,舌绕梁,\n东边的钉,西边的墙,\n绕不完的弯,唱不完的慌!\n\n[Verse 2]\n墙上一根钉,钉下绳摇晃,\n绳吊着瓶,瓶碰碎了光。\n灯骂瓶,瓶怪绳,绳怨钉,\n稀里哗啦,一场荒唐!\n\n[Chorus]\n丁丁当当,乒乓乓乓,\n话赶话,舌绕梁,\n东边的钉,西边的墙,\n绕不完的弯,唱不完的慌!\n\n[Verse 3]\n板凳宽,扁担长,\n一个偏要绑,一个偏不让。\n青龙洞里龙翻身,\n千年大梦变稻香!\n\n[Bridge]\n麻婆婆的狗,咬破麻叉口,\n麻线穿针眼,补丁也风流。\n左一句,右一句,\n舌头打结心自由!\n\n[Chorus]\n丁丁当当,乒乓乓乓,\n话赶话,舌绕梁,\n东边的钉,西边的墙,\n绕不完的弯,唱不完的慌!",
+    "audio_duration": 214.12,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.031190156936645508,
+        "diffusion": 20.130417823791504,
+        "latent2audio": 1.9650826454162598
+    },
+    "actual_seeds": [
+        1946426111
+    ],
+    "retake_seeds": [
+        331383387
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512172941_0.wav"
+}
examples/zh_rap_lora/input_params/output_20250513044511_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_100k",
+    "task": "text2music",
+    "prompt": "东北话, spoken word, male, Tempo - Fast, Elements - Chorus Hook, Subgenre-Satirical Hip Hop, Rap, Chinese-language music, energetic, slightly nasal, Instrument - Live Bass Guitar, adult, Vocals - Syncopated Flow, Genre - Hip-Hop, rapping, bright",
+    "lyrics": "[verse]\n挣着 憋屈的 工资 还得 装乐呵\n猫着 怂样儿 还搁 朋友圈 嘚瑟\n扛着 傻逼的 指标 没人 搭把手\n这儿 不是 托儿所 少整 那出儿 哭唧尿嚎\n\n俺们 就像 一条条 老板的 裤衩子\n陪着 笑脸 接他 每一回 突突\n哎呦 老板 今儿个 穿我呗\n他 撅个腚 眼角 瞟你 那熊样\n\n[chorus]\n他们 骂我 打工仔 太多人 没睡醒\n寻思 抠搜 老板 一天天 穷折腾\n不想 俺的 人生 烂在 这嘎达\n不想 俺的 将来 折在 这破棚\n\n老子 不想 上班 老子 是外星人\n你都 把俺 骂急眼了 俺还 这么淡定\n现实 才是 梦 啥时候 能醒啊\n那 糟践人的 答案 在西北风 里飘\n\n[verse]\n瞅见 二愣子 同事 给老板 舔腚沟子\n瞅见 浪蹄子 女同事 在老板 胯骨轴 扭搭\n瞅见 白瞎的 光阴 耗在 没亮儿的 道儿\n瞅见 公交车上 一帮 僵尸 吐酸水\n\n瞅见 俺的 命 定在 苦逼的 坑里\n瞅见 俺的 爱情 被轮了 成了 老处女\n瞅见 好事儿 全归 高富帅\n还有 那些 臭不要脸 扭腚的 货色\n\n[chorus](重复)\n他们 骂我 打工仔 太多人 没睡醒...\n\n[bridge]\n加班 没补助 俺认了\n欠薪 揍员工 把俺 当牲口\n去你妈 的小姘头\n\n[verse]\n破逼 管理制度 净整 娱乐八卦\n撸管式 管理 也就 你自己 嗨\n出点儿 屁事儿 就往 下属 脑瓜子 扣\n挣俩 钢镚儿 立马 牛逼 不分 公母\n\n你挖个 大坑 把俺们 往里 踹\n说这 叫梦想 你当年 多能耐\n俺们 就当 听传销 洗脑课\n可怜 连骗人 你都 就会 这一套\n\n[outro]\n老子 不想 上班\n老子 不想 上班\n老子 不想 上班",
+    "audio_duration": 135.92,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.06204533576965332,
+        "diffusion": 35.75483560562134,
+        "latent2audio": 1.5193355083465576
+    },
+    "actual_seeds": [
+        4176354214
+    ],
+    "retake_seeds": [
+        601086915
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250513044511_0.wav"
+}
examples/zh_rap_lora/input_params/output_20250513050200_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_100k",
+    "task": "text2music",
+    "prompt": "Rap, J-Pop, Anime, kawaii pop, EDM, Aggressive, Intense, Crisp Snare, Super Fast, Clear",
+    "lyrics": "[Intro]\nNya.\n\n[Verse]\n我 在 五 点 二 十 早 起,十 三 点 十 四 弹 会儿 琴\n习 惯 了 坐 班,习惯了 隔夜 的 剩 饭,\n习 惯 了 没有 你\n\n[Verse]\n怕 你 想 不 开,拦 在 你 的 面 前\n那 时 候 摔 得 差 点 住 院\n东 京 的 春 天 莺 莺 燕 燕\n我 说 想 不 想 来 跟 我 玩 音乐\n\n[Verse]\n带 着 我 的 朋 友 守 在 你 的 门 口\n弹 着 我 的 钢 琴 当 伴 奏\n等 你 放 学 后,陪 你 K T V\n端 着 我 的 红 茶 跟 你 碰 杯\n\n[Pre-Chorus]\n忽然间现实淹没了远方\n万家灯火,盖住月光\n奔走,忍受,变成了人偶\n别再对我伸出你的 双 手,会 受 伤\n\n[Chorus]\n明明都向前走,方向却渐渐不同\n时间让你我越走越近,却越来越陌生\n春 天 在 滂 沱 的 大 雨 里 飘 落\n得 了 心 太 高 脸 太 薄 的病\n\n[Bridge]\n我越难过,春日影越顶\n眼泪晃得我看不清\n埋葬了懦弱还有矫情\n却还是会在半夜摸眼睛\n\n青春期大部分时间在工 作\n用微笑换来余额几个零\n戴上了面具也明白了生活\n拼的是数字和脸更是命\n\n[Verse]\n我在五点二十早起,十三点十四弹会琴\n早上要做饭,回家时满地的瓶罐\n\n师 徒 二 人 站 在 我 的 面 前\n台 词 很 熟 练,照 着 就 念\n\n背 后 的 小 睦 扭 扭 捏 捏\n我 说 我 还 有 点 事 要 不 改 天 见\n\n然 后 你 的 双手 握 住 我 的 袖 口\n开 始 哭 着 求 我 不 要 走\n\n[Verse]\n我在下班后,忙活柴米油\n你和你的姐妹住着高楼\n\n苦 来 兮 苦,早 就 没 了\n现 实 扬 鞭,赶 着 我 向 前\n没有时间跟你分辨什么对与错\n\n[Bridge]\n没有什么对错,没有罪过\n谁不曾天真,是我太早看破\n生活一片狼藉,却又不想放弃\n一 边 聚 光 灯 下 绽 放,一 边 坠 落\n故作坚强,筑起心的墙\n越是委屈的伤口,越要藏\nLet it all out, it’s all right\n\n[Outro]\n俺 是 东 京 嘞,东 京 打 工 妹\n\n从虎之门带你转到浅草\n再从新宿转到竹桥\n\n俺 是 东 京 嘞,东 京 打 工 妹\n\n带 你 转 羽田 成田 蒲田 神田\n做 你 嘞 小 甜 甜!\n\n俺 是 东 京 嘞,东 京 打 工 妹\n带 你 转 赤 坂,带 你 转 霞 关\n恁 咋 不 早 说,今 天 不 管 饭\n",
+    "audio_duration": 147.62212,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.052134037017822266,
+        "diffusion": 17.909283876419067,
+        "latent2audio": 1.4904146194458008
+    },
+    "actual_seeds": [
+        2945962357
+    ],
+    "retake_seeds": [
+        2252292438
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0.7,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250513050200_0.wav"
+}
examples/zh_rap_lora/input_params/output_20250513055451_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_100k",
+    "task": "text2music",
+    "prompt": "Rap, adult, male, spoken word, rapping, clear, warm, articulate, Lo-Fi Hip Hop, 100-120 BPM, Keyboard Chords, Male Rap, Lazy Rhythm, Melancholy, Rap",
+    "lyrics": "[Intro]\n夜色 很 淡 像 褪色 的 照片 \n但 记忆 却 像 刀锋 一样 锐利 \n\n[Verse 1]\n你 说过 的 甜言蜜语 现在 听来 像 最 恶毒 的 咒骂 \n你 刺进 我 心里 的 刀 现在 还 在 滴血 未 干 哪 \n慵懒 的 旋律 像 我 的 脚步 拖着 沉重 的 躯壳 \n脑海 里 循环 播放 那 画面 快 把 我 逼疯 了 \n键盘 和弦 低沉 又 忧伤 弹奏 着 我 的 绝望 \n我 曾经 的 信任 像 玻璃 一样 被 你 狠狠 地 摔 在 地上 \n不想 振作 不想 原谅 只 想 让 这 一切 都 停止 \n可 心底 有 个 声音 嘶吼 着 要 你 付出 该 有 的 代价 \n\n[Chorus]\n背叛 像 毒药 渗透 我 的 血液 \n复仇 的 火焰 在 我 眼中 燃起 \n哪怕 遍体鳞伤 哪怕 万劫不复 \n我 也 要 亲手 撕碎 你 的 幸福 \n这 是 我 的 哀歌 也 是 我 的 战书 \n键盘 的 音符 每 一下 都 带着 恨意 和 痛苦 \n\n[Verse 2]\n曾经 的 兄弟 现在 面目全非 像 个 陌生人 \n你 的 自私 像 癌细胞 一点点 吞噬 我 的 纯真 \n我 学着 你 的 样子 把 心 锁 起来 不再 轻易 相信 \n让 懒散 的 节奏 包裹 我 给 自己 一点 喘息 \n键盘 的 音色 变得 更加 阴冷 像 秋天 的 雨滴 \n冲刷 掉 所有 温情 只 剩下 彻骨 的 寒意 \n我 不会 大喊大叫 只是 默默 地 计划 \n每 一步 都 走向 让 你 后悔 的 那 一 刹那 \n\n[Chorus]\n背叛 像 毒药 渗透 我 的 血液 \n复仇 的 火焰 在 我 眼中 燃起 \n哪怕 遍体鳞伤 哪怕 万劫不复 \n我 也 要 亲手 撕碎 你 的 幸福 \n这 是 我 的 哀歌 也 是 我 的 战书 \n键盘 的 音符 每 一下 都 带着 恨意 和 痛苦 \n\n[Bridge]\n也许 复仇 不能 带来 平静 \n也许 只 会 让 我 更 堕落 \n但 如果 不 这样 做 \n我 连 活下去 的 勇气 都 没有 \n\n[Outro]\n复仇 复仇 复仇 \n直到 最后 一刻 \n懒散 地 复仇 着 ",
+    "audio_duration": 202.64,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.65,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.036400794982910156,
+        "diffusion": 23.055809259414673,
+        "latent2audio": 1.8787360191345215
+    },
+    "actual_seeds": [
+        3900061002
+    ],
+    "retake_seeds": [
+        3037373819
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250513055451_0.wav"
+}
examples/zh_rap_lora/input_params/output_20250513060150_0_input_params.json ADDED
@@ -0,0 +1,45 @@
+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_100k",
+    "task": "text2music",
+    "prompt": "Orchestra, Symphony, Sonata, Opera, Concerto, Rap, Beat, DJ, MC, StreetCulture",
+    "lyrics": "[verse1]\n羊皮卷轴 墨香飘 莫扎特 熬 安魂曲 通宵 \n和弦齿轮 咔哒转 比 瑞士 手表 更 精密 律动 \n八轨磁带 玩叠叠乐 披头士 炸 录音棚 天花板 \nAI 卷起 新风暴 像 灭霸 打响指 般 简单 \n\n[chorus]\n琴弦 到 代码 进化论 狂飙(skr) \n象牙塔 被 鼠标 点爆 像 泡泡(boom) \n灵感 加 算法 等于 王炸 大招 \n人类 心跳 才是 终极 混音 调料 \n\n[verse2]\n春之祭 召唤 百人 乐团 才够 燥 \n合成器 极客 玩电焊 焊出 赛博 神庙 \nDAW 解放 双手 钢琴卷帘 变 乐高 \n音色库 开挂 像 吃 金币 的 马里奥 \n\nAI 拆解 爵士乐 黑话 像 庖丁 解牛 \nCityPop 复古 滤镜 直接 参数 调油 \n神经网络 偷师 贝多芬 半夜 翻墙头 \n音乐 基因库 被 改写成 超频 万花筒 \n\n[chorus] \n琴弦 到 代码 进化论 狂飙(skr) \n象牙塔 被 鼠标 点爆 像 泡泡(boom) \n灵感 加 算法 等于 王炸 大招 \n人类 心跳 才是 终极 混音 调料 \n\n[verse3] \n电子琴 被 吐槽 塑料 味 超标 \n卧室 制作人 用 鼠标 单挑 整个 乐团 编制 \nAI 伴奏 刚上线 就被 键盘侠 集火 \n却 忘了 电吉他 曾被 说 是 魔鬼 的 副歌 \n\n现在 我 指尖 蹦迪 在 数据 炼丹炉 \n提示词 召唤 莫扎特 跨次元 碰杯 珍珠奶茶 \n当 比特 海洋 淹没 所有 物理 琴柱 \n最后 的 音轨 永远 连着 心脏 的 跳针 \n\n[bridge] \n鹅毛笔 蘸着 银河 当 墨汁(绝了) \n音浪 在 元宇宙 开 分店(疯了) \n技术 迷雾 散成 像素 烟花 \n而 我们 始终 带着 老派 的 心跳 混搭 \n\n[chorus] \n琴弦 到 代码 进化论 狂飙(skr) \n象牙塔 被 鼠标 点爆 像 泡泡(boom) \n灵感 加 算法 等于 王炸 大招 \n人类 心跳 才是 终极 混音 调料 \n\n[outro] \n从 蒸汽 到 硅基 浪潮 我 冲浪(yo) \n用 脑洞 接住 每个 技术 暴击(叮) \n当 所有 设备 没电 的 凌晨 三点钟 \n最 原始 的 旋律 在 胸腔 敲击 成 龙卷风 ",
+    "audio_duration": 172.64,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.65,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 3.648996353149414,
+        "diffusion": 16.44967818260193,
+        "latent2audio": 1.614703893661499
+    },
+    "actual_seeds": [
+        1198023141
+    ],
+    "retake_seeds": [
+        3389016134
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250513060150_0.wav"
+}
language_segmentation/LangSegment.py ADDED
@@ -0,0 +1,866 @@
+ """
+ This file bundles language identification functions.
+ 
+ Modifications (fork): Copyright (c) 2021, Adrien Barbaresi.
+ 
+ Original code: Copyright (c) 2011 Marco Lui <[email protected]>.
+ Based on research by Marco Lui and Tim Baldwin.
+ 
+ See LICENSE file for more info.
+ https://github.com/adbar/py3langid
+ 
+ Projects:
+ https://github.com/juntaosun/LangSegment
+ """
+ 
+ import os
+ import re
+ import sys
+ import numpy as np
+ from collections import Counter
+ from collections import defaultdict
+ 
+ # import langid
+ # import py3langid as langid
+ # pip install py3langid==0.2.2
+ 
+ # Enable normalization of the language-prediction probabilities, so scores are re-normalized into the 0-1 range.
+ # langid disables probability normalization by default. For command-line usage it can be enabled via a flag;
+ # for library use, the user must instantiate their own LanguageIdentifier, as done below:
+ from py3langid.langid import LanguageIdentifier, MODEL_FILE
+ 
+ # Digital processing
+ try:from .utils.num import num2str
+ except ImportError:
+     try:from utils.num import num2str
+     except ImportError as e:
+         raise e
+ 
+ # -----------------------------------
+ # Changelog: the new version of the word segmentation is more accurate.
+ # -----------------------------------
+ 
+ 
+ # Word segmentation function:
+ # automatically identify and split the words (Chinese/English/Japanese/Korean) in an article or sentence according to their languages,
+ # making the text more suitable for TTS processing.
+ # This code is designed for front-end multilingual mixed text annotation, and for mixed-language training and inference in various TTS projects.
+ # The result mainly targets (Chinese = zh, Japanese = ja, English = en, Korean = ko), and can actually support mixing of up to 97 different languages.
+ 
+ # ===========================================================================================================
+ # (1) Automatic segmentation: "韩语中的오빠读什么呢?あなたの体育の先生は誰ですか? 此次发布会带来了四款iPhone 15系列机型"
+ # (2) Manual segmentation:    "你的名字叫<ja>佐々木?<ja>吗?"
+ # ===========================================================================================================
+ 
+ 
+ # Manual word segmentation tag specification: <language tag> text content </language tag>
+ # ===========================================================================================================
+ # For manual word segmentation, tags need to appear in pairs, such as: "<ja>佐々木<ja>" or "<ja>佐々木</ja>"
+ # Error demonstration: "你的名字叫<ja>佐々木。" A single <ja> tag appearing in a sentence will be ignored and not processed.
+ # ===========================================================================================================
+ 
+ 
+ # ===========================================================================================================
+ # Speech Synthesis Markup Language (SSML): only its tags are supported here (not full XML).
+ # Want to support more SSML tags? PRs are welcome!
+ # Note: in addition to Chinese, this can also be extended to support multi-language SSML, not just Chinese.
+ # ===========================================================================================================
+ # Chinese implementation:
+ # 【SSML】<number>    = read as Chinese numerals, digit by digit (single characters)
+ # 【SSML】<telephone> = read digits as a Chinese telephone number (single characters)
+ # 【SSML】<currency>  = read as an amount of money.
+ # 【SSML】<date>      = read as a date. Supports inputs such as 2024年08月24, 2024/8/24, 2024-08, 08-24, 24.
+ # ===========================================================================================================
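+ # Worked examples for the class below (illustrative, derived from its methods; not part of the original code):
+ #   LangSSML().to_chinese_number("911")            -> "九一一"
+ #   LangSSML().to_chinese_telephone("13800138000") -> "幺三八零零幺三八零零零"
+ #   LangSSML().to_chinese_currency("9.9")          -> "九点九"
+ #   LangSSML().to_chinese_date("2024/8/24")        -> "二零二四年八月二十四日"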
+ class LangSSML:
+ 
+     def __init__(self):
+         # Plain digit characters
+         self._zh_numerals_number = {
+             '0': '零',
+             '1': '一',
+             '2': '二',
+             '3': '三',
+             '4': '四',
+             '5': '五',
+             '6': '六',
+             '7': '七',
+             '8': '八',
+             '9': '九'
+         }
+ 
+     # Standardize 2024/8/24, 2024-08, 08-24, 24 to "year-month-day"
+     def _format_chinese_data(self, date_str:str):
+         # Normalize the date format
+         input_date = date_str
+         if date_str is None or date_str.strip() == "":return ""
+         date_str = re.sub(r"[\/\._|年|月]","-",date_str)
+         date_str = re.sub(r"日",r"",date_str)
+         date_arrs = date_str.split(' ')
+         if len(date_arrs) == 1 and ":" in date_arrs[0]:
+             time_str = date_arrs[0]
+             date_arrs = []
+         else:
+             time_str = date_arrs[1] if len(date_arrs) >=2 else ""
+         def nonZero(num,cn,func=None):
+             if func is not None:num=func(num)
+             return f"{num}{cn}" if num is not None and num != "" and num != "0" else ""
+         f_number = self.to_chinese_number
+         f_currency = self.to_chinese_currency
+         # year, month, day
+         year_month_day = ""
+         if len(date_arrs) > 0:
+             year, month, day = "","",""
+             parts = date_arrs[0].split('-')
+             if len(parts) == 3: # format YYYY-MM-DD
+                 year, month, day = parts
+             elif len(parts) == 2: # format MM-DD or YYYY-MM
+                 if len(parts[0]) == 4: # year-month
+                     year, month = parts
+                 else:month, day = parts # month-day
+             elif len(parts[0]) > 0: # only a day or a year
+                 if len(parts[0]) == 4:
+                     year = parts[0]
+                 else:day = parts[0]
+             year,month,day = nonZero(year,"年",f_number),nonZero(month,"月",f_currency),nonZero(day,"日",f_currency)
+             year_month_day = re.sub(r"([年|月|日])+",r"\1",f"{year}{month}{day}")
+         # hours, minutes, seconds
+         time_str = re.sub(r"[\/\.\-:_]",":",time_str)
+         time_arrs = time_str.split(":")
+         hours, minutes, seconds = "","",""
+         if len(time_arrs) == 3: # H/M/S
+             hours, minutes, seconds = time_arrs
+         elif len(time_arrs) == 2:# H/M
+             hours, minutes = time_arrs
+         elif len(time_arrs[0]) > 0:hours = f'{time_arrs[0]}点' # H
+         if len(time_arrs) > 1:
+             hours, minutes, seconds = nonZero(hours,"点",f_currency),nonZero(minutes,"分",f_currency),nonZero(seconds,"秒",f_currency)
+         hours_minutes_seconds = re.sub(r"([点|分|秒])+",r"\1",f"{hours}{minutes}{seconds}")
+         output_date = f"{year_month_day}{hours_minutes_seconds}"
+         return output_date
+ 
+     # 【SSML】number = read as Chinese numerals, digit by digit (single characters)
+     def to_chinese_number(self, num:str):
+         pattern = r'(\d+)'
+         zh_numerals = self._zh_numerals_number
+         arrs = re.split(pattern, num)
+         output = ""
+         for item in arrs:
+             if re.match(pattern,item):
+                 output += ''.join(zh_numerals[digit] if digit in zh_numerals else "" for digit in str(item))
+             else:output += item
+         output = output.replace(".","点")
+         return output
+ 
+     # 【SSML】telephone = read digits as a Chinese telephone number (single characters)
+     def to_chinese_telephone(self, num:str):
+         output = self.to_chinese_number(num.replace("+86","")) # strip the Chinese country code +86
+         output = output.replace("一","幺")
+         return output
+ 
+     # 【SSML】currency = read as an amount of money.
+     # Digital processing from GPT_SoVITS num.py (thanks)
+     def to_chinese_currency(self, num:str):
+         pattern = r'(\d+)'
+         arrs = re.split(pattern, num)
+         output = ""
+         for item in arrs:
+             if re.match(pattern,item):
+                 output += num2str(item)
+             else:output += item
+         output = output.replace(".","点")
+         return output
+ 
+     # 【SSML】date = read as a date. Supports inputs such as 2024年08月24, 2024/8/24, 2024-08, 08-24, 24.
+     def to_chinese_date(self, num:str):
+         chinese_date = self._format_chinese_data(num)
+         return chinese_date
+ 
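+ # Worked example for _format_chinese_data / to_chinese_date (illustrative, not part of the original code):
+ #   to_chinese_date("2024-08-24 10:30") -> "二零二四年八月二十四日十点三十分"
+ # The date part reads the year digit by digit and the month/day as cardinals;
+ # the time part reads hours/minutes as cardinals, per the helpers above.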
+ class LangSegment:
+ 
+     def __init__(self):
+ 
+         self.langid = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True)
+ 
+         self._text_cache = None
+         self._text_lasts = None
+         self._text_langs = None
+         self._text_waits = []   # pending ambiguous segments (also reset in getTexts)
+         self._lang_count = None
+         self._lang_eos = None
+ 
+         # Customizable language matching tags; all of these spellings are supported:
+         # <zh>你好<zh> , <ja>佐々木</ja> , <en>OK<en> , <ko>오빠</ko>
+         self.SYMBOLS_PATTERN = r'(<([a-zA-Z|-]*)>(.*?)<\/*[a-zA-Z|-]*>)'
+ 
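+         # Illustrative sketch (not part of the original code): how the pattern captures a tag pair.
+         #   m = re.match(self.SYMBOLS_PATTERN, "<ja>佐々木</ja>")
+         #   m.group(2) -> "ja"      (language code)
+         #   m.group(3) -> "佐々木"   (tagged text)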
+         # Language filter group: you can specify which languages to keep. Languages not in the
+         # filter group will be cleared, so you can match the languages supported by your TTS engine as you like.
+         # The higher a language ranks in the list, the higher its priority.
+ 
+         # System default filter (ISO 639-1 codes given)
+         # ----------------------------------------------------------------------------------------------------------------------------------
+         # "zh" Chinese, "en" English, "ja" Japanese, "ko" Korean, "fr" French, "vi" Vietnamese, "ru" Russian, "th" Thai
+         # ----------------------------------------------------------------------------------------------------------------------------------
+         self.DEFAULT_FILTERS = ["zh", "ja", "ko", "en"]
+ 
+         # User-defined filters
+         self.Langfilters = self.DEFAULT_FILTERS[:] # create a copy
+ 
+         # Merge adjacent same-language segments
+         self.isLangMerge = True
+ 
+         # Experimental: you can add e.g. "fr" French or "vi" Vietnamese yourself.
+         # Enable via the API: self.setfilters(["zh", "en", "ja", "ko", "fr", "vi", "ru", "th"])
+ 
+         # Preview feature, automatically enabled or disabled, no settings required
+         self.EnablePreview = False
+ 
+         # In addition, abbreviated filters are supported: simply combine languages in any order.
+         # Example: you can specify any combination to filter.
+ 
+         # Chinese/Japanese priority threshold (score range 0 ~ 1): when the classifier score falls below
+         # the threshold (default 0.89), the priority order of the filters is applied instead.
+         # Only the characters shared between Chinese and Japanese are handled by this confidence/priority rule.
+         self.LangPriorityThreshold = 0.89
+ 
+         # Langfilters = ["zh"]            # Chinese only
+         # Langfilters = ["en"]            # English only
+         # Langfilters = ["ja"]            # Japanese only
+         # Langfilters = ["ko"]            # Korean only
+         # Langfilters = ["zh_ja"]         # mixed Chinese/Japanese
+         # Langfilters = ["zh_en"]         # mixed Chinese/English
+         # Langfilters = ["ja_en"]         # mixed Japanese/English
+         # Langfilters = ["zh_ko"]         # mixed Chinese/Korean
+         # Langfilters = ["ja_ko"]         # mixed Japanese/Korean
+         # Langfilters = ["en_ko"]         # mixed English/Korean
+         # Langfilters = ["zh_ja_en"]      # mixed Chinese/Japanese/English
+         # Langfilters = ["zh_ja_en_ko"]   # mixed Chinese/Japanese/English/Korean
+ 
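+         # Usage sketch (illustrative, not part of the original code): restrict recognition to Chinese + English.
+         #   segmenter = LangSegment()
+         #   segmenter.setfilters(["zh", "en"])   # setfilters() is defined below and clears the cache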
+         # More filter combinations are possible; feel free to use your own (see the usage sketch above).
+ 
+         # Optional: keep Chinese pinyin-with-digits groups, which makes it easier for the front end to
+         # edit pinyin phonemes for inference. Disabled (False) by default.
+         # When True, bracketed digit-pinyin groups are kept intact and emitted as language "zh".
+         self.keepPinyin = False
+ 
+         # DEFINITION
+         self.PARSE_TAG = re.compile(r'(⑥\$*\d+[\d]{6,}⑥)')
+ 
+         self.LangSSML = LangSSML()
+ 
+     def _clears(self):
+         self._text_cache = None
+         self._text_lasts = None
+         self._text_langs = None
+         self._text_waits = None
+         self._lang_count = None
+         self._lang_eos = None
+ 
+     def _is_english_word(self, word):
+         return bool(re.match(r'^[a-zA-Z]+$', word))
+ 
+     def _is_chinese(self, word):
+         for char in word:
+             if '\u4e00' <= char <= '\u9fff':
+                 return True
+         return False
+ 
+     def _is_japanese_kana(self, word):
+         pattern = re.compile(r'[\u3040-\u309F\u30A0-\u30FF]+')
+         matches = pattern.findall(word)
+         return len(matches) > 0
+ 
+     def _insert_english_uppercase(self, word):
+         modified_text = re.sub(r'(?<!\b)([A-Z])', r' \1', word)
+         modified_text = modified_text.strip('-')
+         return modified_text + " "
+ 
+     def _split_camel_case(self, word):
+         return re.sub(r'(?<!^)(?=[A-Z])', ' ', word)
+ 
+     def _statistics(self, language, text):
+         # Language word statistics:
+         # Chinese characters usually occupy double bytes
+         if self._lang_count is None or not isinstance(self._lang_count, defaultdict):
+             self._lang_count = defaultdict(int)
+         lang_count = self._lang_count
+         if not "|" in language:
+             lang_count[language] += int(len(text)*2) if language == "zh" else len(text)
+         self._lang_count = lang_count
+ 
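+     # Weighting note (illustrative): a 3-character Chinese segment adds 6 to its language count,
+     # while a 3-letter English word adds 3, so Chinese text is not under-counted relative to its length.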
+     def _clear_text_number(self, text):
+         if text == "\n":return text,False # Keep Line Breaks
+         clear_text = re.sub(r'([^\w\s]+)','',re.sub(r'\n+','',text)).strip()
+         is_number = len(re.sub(re.compile(r'(\d+)'),'',clear_text)) == 0
+         return clear_text,is_number
+ 
+     def _saveData(self, words,language:str,text:str,score:float,symbol=None):
+         # Pre-detection
+         clear_text , is_number = self._clear_text_number(text)
+         # Merge the same language and save the results
+         preData = words[-1] if len(words) > 0 else None
+         if symbol is not None:pass
+         elif preData is not None and preData["symbol"] is None:
+             if len(clear_text) == 0:language = preData["lang"]
+             elif is_number == True:language = preData["lang"]
+             _ , pre_is_number = self._clear_text_number(preData["text"])
+             if (preData["lang"] == language):
+                 self._statistics(preData["lang"],text)
+                 text = preData["text"] + text
+                 preData["text"] = text
+                 return preData
+             elif pre_is_number == True:
+                 text = f'{preData["text"]}{text}'
+                 words.pop()
+         elif is_number == True:
+             priority_language = self._get_filters_string()[:2]
+             if priority_language in "ja-zh-en-ko-fr-vi":language = priority_language
+         data = {"lang":language,"text": text,"score":score,"symbol":symbol}
+         filters = self.Langfilters
+         if filters is None or len(filters) == 0 or "?" in language or \
+             language in filters or language in filters[0] or \
+             filters[0] == "*" or filters[0] in "alls-mixs-autos":
+             words.append(data)
+             self._statistics(data["lang"],data["text"])
+         return data
+ 
+     def _addwords(self, words,language,text,score,symbol=None):
+         if text == "\n":pass # Keep Line Breaks
+         elif text is None or len(text.strip()) == 0:return True
+         if language is None:language = ""
+         language = language.lower()
+         if language == 'en':text = self._insert_english_uppercase(text)
+         # text = re.sub(r'[(())]', ',' , text) # Keep it.
+         text_waits = self._text_waits
+         ispre_waits = len(text_waits)>0
+         preResult = text_waits.pop() if ispre_waits else None
+         if preResult is None:preResult = words[-1] if len(words) > 0 else None
+         if preResult and ("|" in preResult["lang"]):
+             pre_lang = preResult["lang"]
+             if language in pre_lang:preResult["lang"] = language = language.split("|")[0]
+             else:preResult["lang"]=pre_lang.split("|")[0]
+             if ispre_waits:preResult = self._saveData(words,preResult["lang"],preResult["text"],preResult["score"],preResult["symbol"])
+         pre_lang = preResult["lang"] if preResult else None
+         if ("|" in language) and (pre_lang and not pre_lang in language and not "…" in language):language = language.split("|")[0]
+         if "|" in language:self._text_waits.append({"lang":language,"text": text,"score":score,"symbol":symbol})
+         else:self._saveData(words,language,text,score,symbol)
+         return False
+ 
+     def _get_prev_data(self, words):
+         data = words[-1] if words and len(words) > 0 else None
+         if data:return (data["lang"] , data["text"])
+         return (None,"")
+ 
+     def _match_ending(self, input , index):
+         if input is None or len(input) == 0:return False,None
+         input = re.sub(r'\s+', '', input)
+         if len(input) == 0 or abs(index) > len(input):return False,None
+         ending_pattern = re.compile(r'([「」“”‘’"\'::。.!!?.?])')
+         return ending_pattern.match(input[index]),input[index]
+ 
+     def _cleans_text(self, cleans_text):
+         cleans_text = re.sub(r'(.*?)([^\w]+)', r'\1 ', cleans_text)
+         cleans_text = re.sub(r'(.)\1+', r'\1', cleans_text)
+         return cleans_text.strip()
+ 
+     def _mean_processing(self, text:str):
+         if text is None or (text.strip()) == "":return None , 0.0
+         arrs = self._split_camel_case(text).split(" ")
+         langs = []
+         for t in arrs:
+             if len(t.strip()) <= 3:continue
+             language, score = self.langid.classify(t)
+             langs.append({"lang":language})
+         if len(langs) == 0:return None , 0.0
+         return Counter([item['lang'] for item in langs]).most_common(1)[0][0],1.0
+ 
+     def _lang_classify(self, cleans_text):
+         language, score = self.langid.classify(cleans_text)
+         # fix: on Huggingface the score is an np.float32
+         if score is not None and isinstance(score, np.generic) and hasattr(score,"item"):
+             score = score.item()
+         score = round(score , 3)
+         return language, score
+ 
+     def _get_filters_string(self):
+         filters = self.Langfilters
+         return "-".join(filters).lower().strip() if filters is not None else ""
+ 
+     def _parse_language(self, words , segment):
+         LANG_JA = "ja"
+         LANG_ZH = "zh"
+         LANG_ZH_JA = f'{LANG_ZH}|{LANG_JA}'
+         LANG_JA_ZH = f'{LANG_JA}|{LANG_ZH}'
+         language = LANG_ZH
+         regex_pattern = re.compile(r'([^\w\s]+)')
+         lines = regex_pattern.split(segment)
+         lines_max = len(lines)
+         LANG_EOS = self._lang_eos
+         for index, text in enumerate(lines):
+             if len(text) == 0:continue
+             EOS = index >= (lines_max - 1)
+             nextId = index + 1
+             nextText = lines[nextId] if not EOS else ""
+             nextPunc = len(re.sub(regex_pattern,'',re.sub(r'\n+','',nextText)).strip()) == 0
+             textPunc = len(re.sub(regex_pattern,'',re.sub(r'\n+','',text)).strip()) == 0
+             if not EOS and (textPunc == True or ( len(nextText.strip()) >= 0 and nextPunc == True)):
+                 lines[nextId] = f'{text}{nextText}'
+                 continue
+             number_tags = re.compile(r'(⑥\d{6,}⑥)')
+             cleans_text = re.sub(number_tags, '' ,text)
+             cleans_text = re.sub(r'\d+', '' ,cleans_text)
+             cleans_text = self._cleans_text(cleans_text)
+             # fix: langid's recognition of short sentences is inaccurate, so they are spliced into longer ones.
+             if not EOS and len(cleans_text) <= 2:
+                 lines[nextId] = f'{text}{nextText}'
+                 continue
+             language,score = self._lang_classify(cleans_text)
+             prev_language , prev_text = self._get_prev_data(words)
+             if language != LANG_ZH and all('\u4e00' <= c <= '\u9fff' for c in re.sub(r'\s','',cleans_text)):language,score = LANG_ZH,1
+             if len(cleans_text) <= 5 and self._is_chinese(cleans_text):
+                 filters_string = self._get_filters_string()
+                 if score < self.LangPriorityThreshold and len(filters_string) > 0:
+                     index_ja , index_zh = filters_string.find(LANG_JA) , filters_string.find(LANG_ZH)
+                     if index_ja != -1 and index_ja < index_zh:language = LANG_JA
+                     elif index_zh != -1 and index_zh < index_ja:language = LANG_ZH
+                 if self._is_japanese_kana(cleans_text):language = LANG_JA
+                 elif len(cleans_text) > 2 and score > 0.90:pass
+                 elif EOS and LANG_EOS:language = LANG_ZH if len(cleans_text) <= 1 else language
+                 else:
+                     LANG_UNKNOWN = LANG_ZH_JA if language == LANG_ZH or (len(cleans_text) <=2 and prev_language == LANG_ZH) else LANG_JA_ZH
+                     match_end,match_char = self._match_ending(text, -1)
+                     referen = prev_language in LANG_UNKNOWN or LANG_UNKNOWN in prev_language if prev_language else False
+                     if match_char in "。.": language = prev_language if referen and len(words) > 0 else language
+                     else:language = f"{LANG_UNKNOWN}|…"
+             text,*_ = re.subn(number_tags , self._restore_number , text )
+             self._addwords(words,language,text,score)
+ 
+     # ----------------------------------------------------------
+     # 【SSML】Chinese number processing (SSML support)
+     # The default here is Chinese, used to process the SSML Chinese tags.
+     # Any language could be supported the same way, for example:
+     # Chinese telephone number: <telephone>1234567</telephone>
+     # Chinese digit reading:    <number>1234567</number>
+     def _process_symbol_SSML(self, words,data):
+         tag , match = data
+         language = SSML = match[1]
+         text = match[2]
+         score = 1.0
+         if SSML == "telephone":
+             # Chinese telephone number
+             language = "zh"
+             text = self.LangSSML.to_chinese_telephone(text)
+         elif SSML == "number":
+             # Chinese digit reading
+             language = "zh"
+             text = self.LangSSML.to_chinese_number(text)
+         elif SSML == "currency":
+             # Chinese currency reading
+             language = "zh"
+             text = self.LangSSML.to_chinese_currency(text)
+         elif SSML == "date":
+             # Chinese date reading
+             language = "zh"
+             text = self.LangSSML.to_chinese_date(text)
+         self._addwords(words,language,text,score,SSML)
+ 
+     # ----------------------------------------------------------
+     def _restore_number(self, matche):
+         value = matche.group(0)
+         text_cache = self._text_cache
+         if value in text_cache:
+             process , data = text_cache[value]
+             tag , match = data
+             value = match
+         return value
+ 
+     def _pattern_symbols(self, item , text):
+         if text is None:return text
+         tag , pattern , process = item
+         matches = pattern.findall(text)
+         if len(matches) == 1 and "".join(matches[0]) == text:
+             return text
+         for i , match in enumerate(matches):
+             key = f"⑥{tag}{i:06d}⑥"
+             text = re.sub(pattern , key , text , count=1)
+             self._text_cache[key] = (process , (tag , match))
+         return text
+ 
+     def _process_symbol(self, words,data):
+         tag , match = data
+         language = match[1]
+         text = match[2]
+         score = 1.0
+         filters = self._get_filters_string()
+         if language not in filters:
+             self._process_symbol_SSML(words,data)
+         else:
+             self._addwords(words,language,text,score,True)
+ 
+     def _process_english(self, words,data):
+         tag , match = data
+         text = match[0]
+         filters = self._get_filters_string()
+         priority_language = filters[:2]
+         # Preview feature: segmentation handling for other languages
+         enablePreview = self.EnablePreview
+         if enablePreview == True:
+             # Experimental: other language support
+             regex_pattern = re.compile(r'(.*?[。.??!!]+[\n]{,1})')
+             lines = regex_pattern.split(text)
+             for index , text in enumerate(lines):
+                 if len(text.strip()) == 0:continue
+                 cleans_text = self._cleans_text(text)
+                 language,score = self._lang_classify(cleans_text)
+                 if language not in filters:
+                     language,score = self._mean_processing(cleans_text)
+                 if language is None or score <= 0.0:continue
+                 elif language in filters:pass # pass
+                 elif score >= 0.95:continue # High score, but not in the filter, excluded.
+                 elif score <= 0.15 and filters[:2] == "fr":language = priority_language
+                 else:language = "en"
+                 self._addwords(words,language,text,score)
+         else:
+             # Default is English
+             language, score = "en", 1.0
+             self._addwords(words,language,text,score)
+ 
+     def _process_Russian(self, words,data):
+         tag , match = data
+         text = match[0]
+         language = "ru"
+         score = 1.0
+         self._addwords(words,language,text,score)
+ 
+     def _process_Thai(self, words,data):
+         tag , match = data
+         text = match[0]
+         language = "th"
+         score = 1.0
+         self._addwords(words,language,text,score)
+ 
+     def _process_korean(self, words,data):
+         tag , match = data
+         text = match[0]
+         language = "ko"
+         score = 1.0
+         self._addwords(words,language,text,score)
+ 
+     def _process_quotes(self, words,data):
+         tag , match = data
+         text = "".join(match)
+         childs = self.PARSE_TAG.findall(text)
+         if len(childs) > 0:
+             self._process_tags(words , text , False)
+         else:
+             cleans_text = self._cleans_text(match[1])
+             if len(cleans_text) <= 5:
+                 self._parse_language(words,text)
+             else:
+                 language,score = self._lang_classify(cleans_text)
+                 self._addwords(words,language,text,score)
+ 
+     def _process_pinyin(self, words,data):
+         tag , match = data
+         text = match
+         language = "zh"
+         score = 1.0
+         self._addwords(words,language,text,score)
+ 
+     def _process_number(self, words,data): # "$0" process only
+         """
+         Numbers alone cannot accurately identify a language,
+         because numbers are universal in all languages.
+         So it won't be executed here; this channel is just for testing.
+         """
+         tag , match = data
+         language = words[0]["lang"] if len(words) > 0 else "zh"
+         text = match
+         score = 0.0
+         self._addwords(words,language,text,score)
+ 
+     def _process_tags(self, words , text , root_tag):
+         text_cache = self._text_cache
+         segments = re.split(self.PARSE_TAG, text)
+         segments_len = len(segments) - 1
+         for index , text in enumerate(segments):
+             if root_tag:self._lang_eos = index >= segments_len
+             if self.PARSE_TAG.match(text):
+                 process , data = text_cache[text]
+                 if process:process(words , data)
+             else:
+                 self._parse_language(words , text)
+         return words
+ 
+     def _merge_results(self, words):
+         new_word = []
+         for index , cur_data in enumerate(words):
+             if "symbol" in cur_data:del cur_data["symbol"]
+             if index == 0:new_word.append(cur_data)
+             else:
+                 pre_data = new_word[-1]
+                 if cur_data["lang"] == pre_data["lang"]:
+                     pre_data["text"] = f'{pre_data["text"]}{cur_data["text"]}'
+                 else:new_word.append(cur_data)
+         return new_word
+ 
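+     # Merge example (illustrative): [{'lang':'zh','text':'你好'}, {'lang':'zh','text':'世界'}]
+     # collapses into a single segment {'lang':'zh','text':'你好世界'}.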
+     def _parse_symbols(self, text):
+         TAG_NUM = "00" # "00" => default channels , "$0" => testing channel
+         TAG_S1,TAG_S2,TAG_P1,TAG_P2,TAG_EN,TAG_KO,TAG_RU,TAG_TH = "$1" ,"$2" ,"$3" ,"$4" ,"$5" ,"$6" ,"$7","$8"
+         TAG_BASE = re.compile(fr'(([【《((“‘"\']*[LANGUAGE]+[\W\s]*)+)')
+         # Get custom language filter
+         filters = self.Langfilters
+         filters = filters if filters is not None else ""
+         # =======================================================================================================
+         # Experimental: other language support.
+         # If relevant language characters are missing, contributors familiar with those languages
+         # are welcome to submit the missing pronunciation symbols.
+         # -------------------------------------------------------------------------------------------------------
+         # Preview feature, other language support
+         enablePreview = self.EnablePreview
+         if "fr" in filters or \
+             "vi" in filters:enablePreview = True
+         self.EnablePreview = enablePreview
+         # Experimental: French character support. Prise en charge des caractères français
+         RE_FR = "" if not enablePreview else "àáâãäåæçèéêëìíîïðñòóôõöùúûüýþÿ"
+         # Experimental: Vietnamese character support. Hỗ trợ ký tự tiếng Việt
+         RE_VI = "" if not enablePreview else "đơưăáàảãạắằẳẵặấầẩẫậéèẻẽẹếềểễệíìỉĩịóòỏõọốồổỗộớờởỡợúùủũụứừửữựôâêơưỷỹ"
+         # -------------------------------------------------------------------------------------------------------
+         # Basic options:
+         process_list = [
+             ( TAG_S1 , re.compile(self.SYMBOLS_PATTERN) , self._process_symbol ), # Symbol Tag
+             ( TAG_KO , re.compile(re.sub(r'LANGUAGE',f'\uac00-\ud7a3',TAG_BASE.pattern)) , self._process_korean ), # Korean words
+             ( TAG_TH , re.compile(re.sub(r'LANGUAGE',f'\u0E00-\u0E7F',TAG_BASE.pattern)) , self._process_Thai ), # Thai words support.
+             ( TAG_RU , re.compile(re.sub(r'LANGUAGE',f'А-Яа-яЁё',TAG_BASE.pattern)) , self._process_Russian ), # Russian words support.
+             ( TAG_NUM , re.compile(r'(\W*\d+\W+\d*\W*\d*)') , self._process_number ), # Number words, universal in all languages; ignored.
+             ( TAG_EN , re.compile(re.sub(r'LANGUAGE',f'a-zA-Z{RE_FR}{RE_VI}',TAG_BASE.pattern)) , self._process_english ), # English words + other language support.
+             ( TAG_P1 , re.compile(r'(["\'])(.*?)(\1)') , self._process_quotes ), # Regular quotes
+             ( TAG_P2 , re.compile(r'([\n]*[【《((“‘])([^【《((“‘’”))》】]{3,})([’”))》】][\W\s]*[\n]{,1})') , self._process_quotes ), # Special quotes with distinct left and right forms.
+         ]
+         # Extended options: default False
+         if self.keepPinyin == True:process_list.insert(1 ,
+             ( TAG_S2 , re.compile(r'([\(({](?:\s*\w*\d\w*\s*)+[})\)])') , self._process_pinyin ), # Chinese Pinyin Tag.
+         )
+         # -------------------------------------------------------------------------------------------------------
+         words = []
+         lines = re.findall(r'.*\n*', re.sub(self.PARSE_TAG, '' ,text))
+         for index , text in enumerate(lines):
+             if len(text.strip()) == 0:continue
+             self._lang_eos = False
+             self._text_cache = {}
+             for item in process_list:
+                 text = self._pattern_symbols(item , text)
+             cur_word = self._process_tags([] , text , True)
+             if len(cur_word) == 0:continue
+             cur_data = cur_word[0] if len(cur_word) > 0 else None
+             pre_data = words[-1] if len(words) > 0 else None
+             if cur_data and pre_data and cur_data["lang"] == pre_data["lang"] \
+                 and cur_data["symbol"] == False and pre_data["symbol"] :
+                 cur_data["text"] = f'{pre_data["text"]}{cur_data["text"]}'
+                 words.pop()
+             words += cur_word
+         if self.isLangMerge == True:words = self._merge_results(words)
+         lang_count = self._lang_count
+         if lang_count and len(lang_count) > 0:
+             lang_count = dict(sorted(lang_count.items(), key=lambda x: x[1], reverse=True))
+             lang_count = list(lang_count.items())
+             self._lang_count = lang_count
+         return words
+ 
+     def setfilters(self, filters):
+         # When the filter changes, clear the cache
+         if self.Langfilters != filters:
+             self._clears()
+             self.Langfilters = filters
+ 
+     def getfilters(self):
+         return self.Langfilters
+ 
+     def setPriorityThreshold(self, threshold:float):
+         self.LangPriorityThreshold = threshold
+ 
+     def getPriorityThreshold(self):
+         return self.LangPriorityThreshold
+ 
+     def getCounts(self):
+         lang_count = self._lang_count
+         if lang_count is not None:return lang_count
+         text_langs = self._text_langs
+         if text_langs is None or len(text_langs) == 0:return [("zh",0)]
+         lang_counts = defaultdict(int)
+         for d in text_langs:lang_counts[d['lang']] += int(len(d['text'])*2) if d['lang'] == "zh" else len(d['text'])
+         lang_counts = dict(sorted(lang_counts.items(), key=lambda x: x[1], reverse=True))
+         lang_counts = list(lang_counts.items())
+         self._lang_count = lang_counts
+         return lang_counts
+ 
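+     # getCounts() returns (language, weighted length) pairs sorted in descending order,
+     # e.g. [('zh', 51), ('ja', 19), ('en', 18), ('ko', 5)] for the mixed sample shown in main() below.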
+     def getTexts(self, text:str):
+         if text is None or len(text.strip()) == 0:
+             self._clears()
+             return []
+         # lasts
+         text_langs = self._text_langs
+         if self._text_lasts == text and text_langs is not None:return text_langs
+         # parse
+         self._text_waits = []
+         self._lang_count = None
+         self._text_lasts = text
+         text = self._parse_symbols(text)
+         self._text_langs = text
+         return text
+ 
+     def classify(self, text:str):
+         return self.getTexts(text)
+ 
+ 
+ def printList(langlist):
+     """
+     Function: print the array results
+     """
+     print("\n===================【打印结果】===================")
+     if langlist is None or len(langlist) == 0:
+         print("无内容结果,No content result")
+         return
+     for line in langlist:
+         print(line)
+ 
+ 
+ def main():
+ 
+     # Input example 1: (including Japanese, Chinese)
+     # text = "“昨日は雨が降った,音楽、映画。。。”你今天学习日语了吗?春は桜の季節です。语种分词是语音合成必不可少的环节。言語分詞は音声合成に欠かせない環節である!"
+ 
+     # Input example 2: (including Japanese, Chinese)
+     # text = "欢迎来玩。東京,は日本の首都です。欢迎来玩. 太好了!"
+ 
+     # Input example 3: (including Japanese, Chinese)
+     # text = "明日、私たちは海辺にバカンスに行きます。你会说日语吗:“中国語、話せますか” 你的日语真好啊!"
+ 
+     # Input example 4: (including Japanese, Chinese, Korean, English)
+     # text = "你的名字叫<ja>佐々木?<ja>吗?韩语中的안녕 오빠读什么呢?あなたの体育の先生は誰ですか? 此次发布会带来了四款iPhone 15系列机型和三款Apple Watch等一系列新品,这次的iPad Air采用了LCD屏幕"
+ 
+     # Experimental: "fr" French, "vi" Vietnamese, "ru" Russian, "th" Thai support.
+     langsegment = LangSegment()
+     langsegment.setfilters(["fr", "vi" , "ja", "zh", "ko", "en" , "ru" , "th"])
+     text = """
+     我喜欢在雨天里听音乐。
+     I enjoy listening to music on rainy days.
+     雨の日に音楽を聴くのが好きです。
+     비 오는 날에 음악을 듣는 것을 즐깁니다。
+     J'aime écouter de la musique les jours de pluie.
+     Tôi thích nghe nhạc vào những ngày mưa.
+     Мне нравится слушать музыку в дождливую погоду.
+     ฉันชอบฟังเพลงในวันที่ฝนตก
+     """
+ 
+     # Run segmentation: only one line of code is needed to integrate with a TTS project.
+     langlist = langsegment.getTexts(text)
+     printList(langlist)
+ 
+     # Language statistics:
+     print("\n===================【语种统计】===================")
+     # Get the per-language results, sorted in descending order by content length
+     langCounts = langsegment.getCounts()
+     print(langCounts , "\n")
+ 
+     # Get the main language of the content from the results (language, character count including punctuation)
+     lang , count = langCounts[0]
+     print(f"输入内容的主要语言为 = {lang} ,字数 = {count}")
+     print("==================================================\n")
+ 
+     # Segmentation output: lang = language, text = content. For input example 4 this looks like:
+     # ===================【打印结果】===================
+     # {'lang': 'zh', 'text': '你的名字叫'}
+     # {'lang': 'ja', 'text': '佐々木?'}
+     # {'lang': 'zh', 'text': '吗?韩语中的'}
+     # {'lang': 'ko', 'text': '안녕 오빠'}
+     # {'lang': 'zh', 'text': '读什么呢?'}
+     # {'lang': 'ja', 'text': 'あなたの体育の先生は誰ですか?'}
+     # {'lang': 'zh', 'text': ' 此次发布会带来了四款'}
+     # {'lang': 'en', 'text': 'i Phone '}
+     # {'lang': 'zh', 'text': '15系列机型和三款'}
+     # {'lang': 'en', 'text': 'Apple Watch '}
+     # {'lang': 'zh', 'text': '等一系列新品,这次的'}
+     # {'lang': 'en', 'text': 'i Pad Air '}
+     # {'lang': 'zh', 'text': '采用了'}
+     # {'lang': 'en', 'text': 'L C D '}
+     # {'lang': 'zh', 'text': '屏幕'}
+ 
+     # ===================【语种统计】===================
+     # [('zh', 51), ('ja', 19), ('en', 18), ('ko', 5)]
+     # The main language of the input content is = zh, character count = 51
+     # ==================================================
+ 
+ 
+ if __name__ == "__main__":
+     main()
language_segmentation/__init__.py ADDED
@@ -0,0 +1,9 @@
+ from .LangSegment import LangSegment
+ 
+ 
+ # release
+ __version__ = '0.3.5'
+ 
+ 
+ # develop
+ __develop__ = 'dev-0.0.1'
language_segmentation/utils/__init__.py ADDED
File without changes
language_segmentation/utils/num.py ADDED
@@ -0,0 +1,327 @@
+ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # Digital processing from GPT_SoVITS num.py (thanks)
+ """
+ Rules to verbalize numbers into Chinese characters.
+ https://zh.wikipedia.org/wiki/中文数字#現代中文
+ """
+ 
+ import re
+ from collections import OrderedDict
+ from typing import List
+ 
+ DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')}
+ UNITS = OrderedDict({
+     1: '十',
+     2: '百',
+     3: '千',
+     4: '万',
+     8: '亿',
+ })
+ 
+ COM_QUANTIFIERS = '(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)'
+ 
+ # Fraction expressions
+ RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
+ 
+ 
+ def replace_frac(match) -> str:
+     """
+     Args:
+         match (re.Match)
+     Returns:
+         str
+     """
+     sign = match.group(1)
+     nominator = match.group(2)
+     denominator = match.group(3)
+     sign: str = "负" if sign else ""
+     nominator: str = num2str(nominator)
+     denominator: str = num2str(denominator)
+     result = f"{sign}{denominator}分之{nominator}"
+     return result
+ 
+ 
+ # Percentage expressions
+ RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')
+ 
+ 
+ def replace_percentage(match) -> str:
+     """
+     Args:
+         match (re.Match)
+     Returns:
+         str
+     """
+     sign = match.group(1)
+     percent = match.group(2)
+     sign: str = "负" if sign else ""
+     percent: str = num2str(percent)
+     result = f"{sign}百分之{percent}"
+     return result
+ 
+ 
+ # Integer expressions
+ # negative integers, e.g. -10
+ RE_INTEGER = re.compile(r'(-)' r'(\d+)')
+ 
+ 
+ def replace_negative_num(match) -> str:
+     """
+     Args:
+         match (re.Match)
+     Returns:
+         str
+     """
+     sign = match.group(1)
+     number = match.group(2)
+     sign: str = "负" if sign else ""
+     number: str = num2str(number)
+     result = f"{sign}{number}"
+     return result
+ 
+ 
+ # Serial numbers (unsigned integers), e.g. 00078
+ RE_DEFAULT_NUM = re.compile(r'\d{3}\d*')
+ 
+ 
+ def replace_default_num(match):
+     """
+     Args:
+         match (re.Match)
+     Returns:
+         str
+     """
+     number = match.group(0)
+     return verbalize_digit(number, alt_one=True)
+ 
+ 
+ # Arithmetic: addition/subtraction/multiplication/division
+ # RE_ASMD = re.compile(
+ #     r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
+ RE_ASMD = re.compile(
+     r'((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))')
+ 
+ asmd_map = {
+     '+': '加',
+     '-': '减',
+     '×': '乘',
+     '÷': '除',
+     '=': '等于'
+ }
+ 
+ def replace_asmd(match) -> str:
+     """
+     Args:
+         match (re.Match)
+     Returns:
+         str
+     """
+     result = match.group(1) + asmd_map[match.group(8)] + match.group(9)
+     return result
+ 
+ 
+ # Superscript powers
+ RE_POWER = re.compile(r'[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+')
+ 
+ power_map = {
+     '⁰': '0',
+     '¹': '1',
+     '²': '2',
+     '³': '3',
+     '⁴': '4',
+     '⁵': '5',
+     '⁶': '6',
+     '⁷': '7',
+     '⁸': '8',
+     '⁹': '9',
+     'ˣ': 'x',
+     'ʸ': 'y',
+     'ⁿ': 'n'
+ }
+ 
+ def replace_power(match) -> str:
+     """
+     Args:
+         match (re.Match)
+     Returns:
+         str
+     """
+     power_num = ""
+     for m in match.group(0):
+         power_num += power_map[m]
+     result = "的" + power_num + "次方"
+     return result
+ 
+ 
+ # Number expressions
+ # pure decimals
+ RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))')
+ # positive integer + quantifier
+ RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
+ RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))')
+ 
+ 
+ def replace_positive_quantifier(match) -> str:
+     """
+     Args:
+         match (re.Match)
+     Returns:
+         str
+     """
+     number = match.group(1)
+     match_2 = match.group(2)
+     if match_2 == "+":
+         match_2 = "多"
+     match_2: str = match_2 if match_2 else ""
+     quantifiers: str = match.group(3)
+     number: str = num2str(number)
+     result = f"{number}{match_2}{quantifiers}"
+     return result
+ 
+ 
+ def replace_number(match) -> str:
+     """
+     Args:
+         match (re.Match)
+     Returns:
+         str
+     """
+     sign = match.group(1)
+     number = match.group(2)
+     pure_decimal = match.group(5)
+     if pure_decimal:
+         result = num2str(pure_decimal)
+     else:
+         sign: str = "负" if sign else ""
+         number: str = num2str(number)
+         result = f"{sign}{number}"
+     return result
+ 
+ 
+ # Range expressions
+ # match.group(1) and match.group(6) are copied from RE_NUMBER
+ 
+ RE_RANGE = re.compile(
+     r"""
+     (?<![\d\+\-\×÷=])      # negative lookbehind: no digit or operator immediately before the range
+     ((-?)((\d+)(\.\d+)?))  # start of the range: negative or positive number (integer or decimal)
+     [-~]                   # range separator
+     ((-?)((\d+)(\.\d+)?))  # end of the range: negative or positive number (integer or decimal)
+     (?![\d\+\-\×÷=])       # negative lookahead: no digit or operator immediately after the range
+     """, re.VERBOSE)
+ 
+ 
+ def replace_range(match) -> str:
+     """
+     Args:
+         match (re.Match)
+     Returns:
+         str
+     """
+     first, second = match.group(1), match.group(6)
+     first = RE_NUMBER.sub(replace_number, first)
+     second = RE_NUMBER.sub(replace_number, second)
+     result = f"{first}到{second}"
+     return result
+ 
+ 
+ # "~" between quantities with units is read as 至 ("to")
+ RE_TO_RANGE = re.compile(
+     r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)')
+ 
+ def replace_to_range(match) -> str:
+     """
+     Args:
+         match (re.Match)
+     Returns:
+         str
+     """
+     result = match.group(0).replace('~', '至')
+     return result
+ 
+ 
+ def _get_value(value_string: str, use_zero: bool=True) -> List[str]:
+     stripped = value_string.lstrip('0')
+     if len(stripped) == 0:
+         return []
+     elif len(stripped) == 1:
+         if use_zero and len(stripped) < len(value_string):
+             return [DIGITS['0'], DIGITS[stripped]]
+         else:
+             return [DIGITS[stripped]]
+     else:
+         largest_unit = next(
+             power for power in reversed(UNITS.keys()) if power < len(stripped))
+         first_part = value_string[:-largest_unit]
+         second_part = value_string[-largest_unit:]
+         return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(
+             second_part)
+ 
+ 
+ def verbalize_cardinal(value_string: str) -> str:
+     if not value_string:
+         return ''
+ 
+     # 000 -> '零' , 0 -> '零'
+     value_string = value_string.lstrip('0')
+     if len(value_string) == 0:
+         return DIGITS['0']
+ 
+     result_symbols = _get_value(value_string)
+     # a verbalized number starting with '一十*' is abbreviated as '十*'
+     if len(result_symbols) >= 2 and result_symbols[0] == DIGITS[
+             '1'] and result_symbols[1] == UNITS[1]:
+         result_symbols = result_symbols[1:]
+     return ''.join(result_symbols)
+ 
+ 
+ def verbalize_digit(value_string: str, alt_one=False) -> str:
+     result_symbols = [DIGITS[digit] for digit in value_string]
+     result = ''.join(result_symbols)
+     if alt_one:
+         result = result.replace("一", "幺")
+     return result
+ 
+ 
+ def num2str(value_string: str) -> str:
+     integer_decimal = value_string.split('.')
+     if len(integer_decimal) == 1:
+         integer = integer_decimal[0]
+         decimal = ''
+     elif len(integer_decimal) == 2:
+         integer, decimal = integer_decimal
+     else:
+         raise ValueError(
+             f"The value string: '{value_string}' has more than one point in it."
+         )
+ 
+     result = verbalize_cardinal(integer)
+ 
+     decimal = decimal.rstrip('0')
+     if decimal:
+         # '.22' is verbalized as '零点二二'
+         # '3.20' is verbalized as '三点二'
+         result = result if result else "零"
+         result += '点' + verbalize_digit(decimal)
+     return result
+ 
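+ # Worked examples (illustrative, derived from the rules above; not part of the original code):
+ #   num2str("10086")                       -> "一万零八十六"
+ #   num2str("3.14")                        -> "三点一四"
+ #   verbalize_digit("10086", alt_one=True) -> "幺零零八六"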
+ 
+ if __name__ == "__main__":
+     # simple smoke test
+     text = "10086"
+     print(num2str(text))  # -> 一万零八十六
models/ace_step_transformer.py ADDED
@@ -0,0 +1,475 @@
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from dataclasses import dataclass
+ from typing import Any, Dict, Optional, Tuple, List, Union
+ 
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+ 
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
+ from diffusers.utils import BaseOutput, is_torch_version
+ from diffusers.models.modeling_utils import ModelMixin
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
+ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
+ 
+ 
+ from .attention import LinearTransformerBlock, t2i_modulate
+ from .lyrics_utils.lyric_encoder import ConformerEncoder as LyricEncoder
+ 
+ 
+ def cross_norm(hidden_states, controlnet_input):
+     # input: N x T x C
+     mean_hidden_states, std_hidden_states = hidden_states.mean(dim=(1,2), keepdim=True), hidden_states.std(dim=(1,2), keepdim=True)
+     mean_controlnet_input, std_controlnet_input = controlnet_input.mean(dim=(1,2), keepdim=True), controlnet_input.std(dim=(1,2), keepdim=True)
+     controlnet_input = (controlnet_input - mean_controlnet_input) * (std_hidden_states / (std_controlnet_input + 1e-12)) + mean_hidden_states
+     return controlnet_input
+ 
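+ # Illustrative check (not part of the original code): cross_norm shifts and scales the controlnet
+ # features so they share the per-sample global statistics of the hidden states.
+ #   h  = torch.randn(2, 10, 8) * 5 + 3
+ #   c  = torch.randn(2, 10, 8)
+ #   c2 = cross_norm(h, c)
+ #   # per sample: c2.mean(dim=(1,2)) == h.mean(dim=(1,2)) and c2.std(dim=(1,2)) ≈ h.std(dim=(1,2))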
+ 
+ # Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2
+ class Qwen2RotaryEmbedding(nn.Module):
+     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+         super().__init__()
+ 
+         self.dim = dim
+         self.max_position_embeddings = max_position_embeddings
+         self.base = base
+         inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
+         self.register_buffer("inv_freq", inv_freq, persistent=False)
+ 
+         # Build here to make `torch.jit.trace` work.
+         self._set_cos_sin_cache(
+             seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+         )
+ 
+     def _set_cos_sin_cache(self, seq_len, device, dtype):
+         self.max_seq_len_cached = seq_len
+         t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
+ 
+         freqs = torch.outer(t, self.inv_freq)
+         # Different from the paper, but it uses a different permutation in order to obtain the same calculation
+         emb = torch.cat((freqs, freqs), dim=-1)
+         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+ 
+     def forward(self, x, seq_len=None):
+         # x: [bs, num_attention_heads, seq_len, head_size]
+         if seq_len > self.max_seq_len_cached:
+             self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+ 
+         return (
+             self.cos_cached[:seq_len].to(dtype=x.dtype),
+             self.sin_cached[:seq_len].to(dtype=x.dtype),
+         )
+ 
+ 
+ class T2IFinalLayer(nn.Module):
+     """
+     The final layer of Sana.
+     """
+ 
+     def __init__(self, hidden_size, patch_size=[16, 1], out_channels=256):
+         super().__init__()
+         self.norm_final = nn.RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+         self.linear = nn.Linear(hidden_size, patch_size[0] * patch_size[1] * out_channels, bias=True)
+         self.scale_shift_table = nn.Parameter(torch.randn(2, hidden_size) / hidden_size**0.5)
+         self.out_channels = out_channels
+         self.patch_size = patch_size
+ 
+     def unpatchify(
+         self,
+         hidden_states: torch.Tensor,
+         width: int,
+     ):
+         # 4. unpatchify
+         new_height, new_width = 1, hidden_states.size(1)
+         hidden_states = hidden_states.reshape(
+             shape=(hidden_states.shape[0], new_height, new_width, self.patch_size[0], self.patch_size[1], self.out_channels)
+         ).contiguous()
+         hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+         output = hidden_states.reshape(
+             shape=(hidden_states.shape[0], self.out_channels, new_height * self.patch_size[0], new_width * self.patch_size[1])
+         ).contiguous()
+         if width > new_width:
+             output = torch.nn.functional.pad(output, (0, width - new_width, 0, 0), 'constant', 0)
+         elif width < new_width:
+             output = output[:, :, :, :width]
+         return output
+ 
+     def forward(self, x, t, output_length):
+         shift, scale = (self.scale_shift_table[None] + t[:, None]).chunk(2, dim=1)
+         x = t2i_modulate(self.norm_final(x), shift, scale)
+         x = self.linear(x)
+         # unpatchify back to (N, C, H, W)
+         output = self.unpatchify(x, output_length)
+         return output
+ 
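+ # Note (assumption): t2i_modulate is imported from .attention; in PixArt/Sana-style models it is
+ # typically implemented as x * (1 + scale) + shift, i.e. an adaptive affine modulation of the
+ # normalized features conditioned on the timestep embedding. Check models/attention.py to confirm.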
+ class PatchEmbed(nn.Module):
+     """2D Image to Patch Embedding"""
+ 
+     def __init__(
+         self,
+         height=16,
+         width=4096,
+         patch_size=(16, 1),
+         in_channels=8,
+         embed_dim=1152,
+         bias=True,
+     ):
+         super().__init__()
+         patch_size_h, patch_size_w = patch_size
+         self.early_conv_layers = nn.Sequential(
+             nn.Conv2d(in_channels, in_channels*256, kernel_size=patch_size, stride=patch_size, padding=0, bias=bias),
+             torch.nn.GroupNorm(num_groups=32, num_channels=in_channels*256, eps=1e-6, affine=True),
+             nn.Conv2d(in_channels*256, embed_dim, kernel_size=1, stride=1, padding=0, bias=bias)
+         )
+         self.patch_size = patch_size
+         self.height, self.width = height // patch_size_h, width // patch_size_w
+         self.base_size = self.width
+ 
+     def forward(self, latent):
+         # early convolutions, N x C x H x W -> N x 256 * sqrt(patch_size) x H/patch_size x W/patch_size
+         latent = self.early_conv_layers(latent)
+         latent = latent.flatten(2).transpose(1, 2) # BCHW -> BNC
+         return latent
+ 
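+ # Shape walkthrough (illustrative, not part of the original code): with the defaults
+ # in_channels=8, patch_size=(16, 1), embed_dim=1152,
+ #   latent: (N, 8, 16, W)  --conv, stride (16,1)-->  (N, 2048, 1, W)  --1x1 conv-->  (N, 1152, 1, W)
+ #   flatten(2) + transpose(1, 2)  -->  (N, W, 1152)   # one token per latent frame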
148
+
149
+ @dataclass
150
+ class Transformer2DModelOutput(BaseOutput):
151
+
152
+ sample: torch.FloatTensor
153
+ proj_losses: Optional[Tuple[Tuple[str, torch.Tensor]]] = None
154
+
155
+
156
+ class ACEStepTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
157
+ _supports_gradient_checkpointing = True
158
+
159
+ @register_to_config
160
+ def __init__(
161
+ self,
162
+ in_channels: Optional[int] = 8,
163
+ num_layers: int = 28,
164
+ inner_dim: int = 1536,
165
+ attention_head_dim: int = 64,
166
+ num_attention_heads: int = 24,
167
+ mlp_ratio: float = 4.0,
168
+ out_channels: int = 8,
169
+ max_position: int = 32768,
170
+ rope_theta: float = 1000000.0,
171
+ speaker_embedding_dim: int = 512,
172
+ text_embedding_dim: int = 768,
173
+ ssl_encoder_depths: List[int] = [9, 9],
174
+ ssl_names: List[str] = ["mert", "m-hubert"],
175
+ ssl_latent_dims: List[int] = [1024, 768],
176
+ lyric_encoder_vocab_size: int = 6681,
177
+ lyric_hidden_size: int = 1024,
178
+ patch_size: List[int] = [16, 1],
179
+ max_height: int = 16,
180
+ max_width: int = 4096,
181
+ **kwargs,
182
+ ):
183
+ super().__init__()
184
+
185
+ self.num_attention_heads = num_attention_heads
186
+ self.attention_head_dim = attention_head_dim
187
+ inner_dim = num_attention_heads * attention_head_dim
188
+ self.inner_dim = inner_dim
189
+ self.out_channels = out_channels
190
+ self.max_position = max_position
191
+ self.patch_size = patch_size
192
+
193
+ self.rope_theta = rope_theta
194
+
195
+ self.rotary_emb = Qwen2RotaryEmbedding(
196
+ dim=self.attention_head_dim,
197
+ max_position_embeddings=self.max_position,
198
+ base=self.rope_theta,
199
+ )
200
+
201
+ # 2. Define input layers
202
+ self.in_channels = in_channels
203
+
204
+ # 3. Define transformers blocks
205
+ self.transformer_blocks = nn.ModuleList(
206
+ [
207
+ LinearTransformerBlock(
208
+ dim=self.inner_dim,
209
+ num_attention_heads=self.num_attention_heads,
210
+ attention_head_dim=attention_head_dim,
211
+ mlp_ratio=mlp_ratio,
212
+ add_cross_attention=True,
213
+ add_cross_attention_dim=self.inner_dim,
214
+ )
215
+ for i in range(self.config.num_layers)
216
+ ]
217
+ )
218
+ self.num_layers = num_layers
219
+
220
+ self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
221
+ self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=self.inner_dim)
222
+ self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(self.inner_dim, 6 * self.inner_dim, bias=True))
223
+
224
+ # speaker
225
+ self.speaker_embedder = nn.Linear(speaker_embedding_dim, self.inner_dim)
226
+
227
+ # genre
228
+ self.genre_embedder = nn.Linear(text_embedding_dim, self.inner_dim)
229
+
230
+ # lyric
231
+ self.lyric_embs = nn.Embedding(lyric_encoder_vocab_size, lyric_hidden_size)
232
+ self.lyric_encoder = LyricEncoder(input_size=lyric_hidden_size, static_chunk_size=0)
233
+ self.lyric_proj = nn.Linear(lyric_hidden_size, self.inner_dim)
234
+
235
+ projector_dim = 2 * self.inner_dim
236
+
237
+ self.projectors = nn.ModuleList([
238
+ nn.Sequential(
239
+ nn.Linear(self.inner_dim, projector_dim),
240
+ nn.SiLU(),
241
+ nn.Linear(projector_dim, projector_dim),
242
+ nn.SiLU(),
243
+ nn.Linear(projector_dim, ssl_dim),
244
+ ) for ssl_dim in ssl_latent_dims
245
+ ])
246
+
247
+ self.ssl_latent_dims = ssl_latent_dims
248
+ self.ssl_encoder_depths = ssl_encoder_depths
249
+
250
+ self.cosine_loss = torch.nn.CosineEmbeddingLoss(margin=0.0, reduction='mean')
251
+ self.ssl_names = ssl_names
252
+
253
+ self.proj_in = PatchEmbed(
254
+ height=max_height,
255
+ width=max_width,
256
+ patch_size=patch_size,
257
+ embed_dim=self.inner_dim,
258
+ bias=True,
259
+ )
260
+
261
+ self.final_layer = T2IFinalLayer(self.inner_dim, patch_size=patch_size, out_channels=out_channels)
262
+ self.gradient_checkpointing = False
+
+     # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
+     def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
+         """
+         Sets the attention processor to use [feed forward
+         chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).
+
+         Parameters:
+             chunk_size (`int`, *optional*):
+                 The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
+                 over each tensor of dim=`dim`.
+             dim (`int`, *optional*, defaults to `0`):
+                 The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
+                 or dim=1 (sequence length).
+         """
+         if dim not in [0, 1]:
+             raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
+
+         # By default chunk size is 1
+         chunk_size = chunk_size or 1
+
+         def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
+             if hasattr(module, "set_chunk_feed_forward"):
+                 module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+             for child in module.children():
+                 fn_recursive_feed_forward(child, chunk_size, dim)
+
+         for module in self.children():
+             fn_recursive_feed_forward(module, chunk_size, dim)
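Usage sketch (not part of the diff; `model` stands for an instance of this transformer). The method is copied verbatim from diffusers' UNet3DConditionModel, so it behaves the same way, and it only takes effect on submodules that expose `set_chunk_feed_forward`:

# trade compute for memory: run each feed-forward over one slice at a time
model.enable_forward_chunking(chunk_size=1, dim=1)  # chunk along sequence length
model.enable_forward_chunking()                     # defaults: chunk_size=1, dim=0 (batch)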
+
+     def _set_gradient_checkpointing(self, module, value=False):
+         if hasattr(module, "gradient_checkpointing"):
+             module.gradient_checkpointing = value
+
+     def forward_lyric_encoder(
+         self,
+         lyric_token_idx: Optional[torch.LongTensor] = None,
+         lyric_mask: Optional[torch.LongTensor] = None,
+     ):
+         # N x T token ids -> N x T x D embeddings, encoded, then projected to inner_dim
+         lyric_embs = self.lyric_embs(lyric_token_idx)
+         prompt_prenet_out, _mask = self.lyric_encoder(lyric_embs, lyric_mask, decoding_chunk_size=1, num_decoding_left_chunks=-1)
+         prompt_prenet_out = self.lyric_proj(prompt_prenet_out)
+         return prompt_prenet_out
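A shape-only sketch of the lyric path (not part of the diff; uses stand-in layers and assumes the real LyricEncoder is length-preserving):

import torch
from torch import nn

N, T = 2, 128
vocab, d_lyric, d_model = 6681, 1024, 1536
tokens = torch.randint(0, vocab, (N, T))
embs = nn.Embedding(vocab, d_lyric)(tokens)   # (N, T, 1024), like self.lyric_embs
# ... lyric_encoder assumed to keep the (N, T, 1024) shape ...
proj = nn.Linear(d_lyric, d_model)(embs)      # like self.lyric_proj -> (N, T, 1536)
assert proj.shape == (N, T, d_model)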
+
+     def encode(
+         self,
+         encoder_text_hidden_states: Optional[torch.Tensor] = None,
+         text_attention_mask: Optional[torch.LongTensor] = None,
+         speaker_embeds: Optional[torch.FloatTensor] = None,
+         lyric_token_idx: Optional[torch.LongTensor] = None,
+         lyric_mask: Optional[torch.LongTensor] = None,
+     ):
+         bs = encoder_text_hidden_states.shape[0]
+         device = encoder_text_hidden_states.device
+
+         # speaker embedding
+         encoder_spk_hidden_states = self.speaker_embedder(speaker_embeds).unsqueeze(1)
+         speaker_mask = torch.ones(bs, 1, device=device)
+
+         # genre embedding
+         encoder_text_hidden_states = self.genre_embedder(encoder_text_hidden_states)
+
+         # lyric
+         encoder_lyric_hidden_states = self.forward_lyric_encoder(
+             lyric_token_idx=lyric_token_idx,
+             lyric_mask=lyric_mask,
+         )
+
+         encoder_hidden_states = torch.cat([encoder_spk_hidden_states, encoder_text_hidden_states, encoder_lyric_hidden_states], dim=1)
+         encoder_hidden_mask = torch.cat([speaker_mask, text_attention_mask, lyric_mask], dim=1)
+         return encoder_hidden_states, encoder_hidden_mask
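How the three condition streams line up after `encode` (pure-torch sketch, not part of the diff; the token counts are made up):

import torch

N, D = 2, 1536
spk = torch.randn(N, 1, D)      # speaker: a single token
genre = torch.randn(N, 32, D)   # projected text/genre hidden states
lyric = torch.randn(N, 128, D)  # lyric encoder output
cond = torch.cat([spk, genre, lyric], dim=1)
mask = torch.cat([torch.ones(N, 1), torch.ones(N, 32), torch.ones(N, 128)], dim=1)
assert cond.shape == (N, 161, D) and mask.shape == (N, 161)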
+
+     def decode(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: torch.Tensor,
+         encoder_hidden_states: torch.Tensor,
+         encoder_hidden_mask: torch.Tensor,
+         timestep: Optional[torch.Tensor],
+         ssl_hidden_states: Optional[List[torch.Tensor]] = None,
+         output_length: int = 0,
+         block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None,
+         controlnet_scale: Union[float, torch.Tensor] = 1.0,
+         return_dict: bool = True,
+     ):
+         embedded_timestep = self.timestep_embedder(self.time_proj(timestep).to(dtype=hidden_states.dtype))
+         temb = self.t_block(embedded_timestep)
+
+         hidden_states = self.proj_in(hidden_states)
+
+         # controlnet logic
+         if block_controlnet_hidden_states is not None:
+             control_condi = cross_norm(hidden_states, block_controlnet_hidden_states)
+             hidden_states = hidden_states + control_condi * controlnet_scale
+
+         inner_hidden_states = []
+
+         rotary_freqs_cis = self.rotary_emb(hidden_states, seq_len=hidden_states.shape[1])
+         encoder_rotary_freqs_cis = self.rotary_emb(encoder_hidden_states, seq_len=encoder_hidden_states.shape[1])
+
+         for index_block, block in enumerate(self.transformer_blocks):
+             if self.training and self.gradient_checkpointing:
+
+                 def create_custom_forward(module, return_dict=None):
+                     def custom_forward(*inputs):
+                         if return_dict is not None:
+                             return module(*inputs, return_dict=return_dict)
+                         else:
+                             return module(*inputs)
+
+                     return custom_forward
+
+                 ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                 hidden_states = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(block),
+                     hidden_states=hidden_states,
+                     attention_mask=attention_mask,
+                     encoder_hidden_states=encoder_hidden_states,
+                     encoder_attention_mask=encoder_hidden_mask,
+                     rotary_freqs_cis=rotary_freqs_cis,
+                     rotary_freqs_cis_cross=encoder_rotary_freqs_cis,
+                     temb=temb,
+                     **ckpt_kwargs,
+                 )
+             else:
+                 hidden_states = block(
+                     hidden_states=hidden_states,
+                     attention_mask=attention_mask,
+                     encoder_hidden_states=encoder_hidden_states,
+                     encoder_attention_mask=encoder_hidden_mask,
+                     rotary_freqs_cis=rotary_freqs_cis,
+                     rotary_freqs_cis_cross=encoder_rotary_freqs_cis,
+                     temb=temb,
+                 )
+
+             for ssl_encoder_depth in self.ssl_encoder_depths:
+                 if index_block == ssl_encoder_depth:
+                     inner_hidden_states.append(hidden_states)
+
+         proj_losses = []
+         if len(inner_hidden_states) > 0 and ssl_hidden_states is not None and len(ssl_hidden_states) > 0:
+             for inner_hidden_state, projector, ssl_hidden_state, ssl_name in zip(inner_hidden_states, self.projectors, ssl_hidden_states, self.ssl_names):
+                 if ssl_hidden_state is None:
+                     continue
+                 # 1. project the intermediate states: N x T x D_model -> N x T x D_ssl
+                 est_ssl_hidden_state = projector(inner_hidden_state)
+                 bs = inner_hidden_state.shape[0]
+                 proj_loss = 0.0
+                 for i, (z, z_tilde) in enumerate(zip(ssl_hidden_state, est_ssl_hidden_state)):
+                     # 2. interpolate the estimate to the SSL feature length
+                     z_tilde = F.interpolate(z_tilde.unsqueeze(0).transpose(1, 2), size=len(z), mode='linear', align_corners=False).transpose(1, 2).squeeze(0)
+
+                     z_tilde = torch.nn.functional.normalize(z_tilde, dim=-1)
+                     z = torch.nn.functional.normalize(z, dim=-1)
+                     # 3. cosine projection loss per frame, averaged over the batch below
+                     target = torch.ones(z.shape[0], device=z.device)
+                     proj_loss += self.cosine_loss(z, z_tilde, target)
+                 proj_losses.append((ssl_name, proj_loss / bs))
+
+         output = self.final_layer(hidden_states, embedded_timestep, output_length)
+         if not return_dict:
+             return (output, proj_losses)
+
+         return Transformer2DModelOutput(sample=output, proj_losses=proj_losses)
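The SSL projection loss above, reproduced for one sample with random tensors so the mechanics are visible (sketch, not part of the diff; lengths are made up):

import torch
import torch.nn.functional as F

T_model, T_ssl, D_ssl = 100, 150, 1024
z_tilde = torch.randn(T_model, D_ssl)   # projector output for one sample
z = torch.randn(T_ssl, D_ssl)           # frozen SSL features (e.g. MERT)

# stretch the estimate to the SSL time axis, then compare directions only
z_tilde = F.interpolate(z_tilde.unsqueeze(0).transpose(1, 2), size=T_ssl,
                        mode='linear', align_corners=False).transpose(1, 2).squeeze(0)
loss = torch.nn.CosineEmbeddingLoss()(F.normalize(z, dim=-1),
                                      F.normalize(z_tilde, dim=-1),
                                      torch.ones(T_ssl))  # target +1 pulls cosine toward 1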
+
+     # @torch.compile
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: torch.Tensor,
+         encoder_text_hidden_states: Optional[torch.Tensor] = None,
+         text_attention_mask: Optional[torch.LongTensor] = None,
+         speaker_embeds: Optional[torch.FloatTensor] = None,
+         lyric_token_idx: Optional[torch.LongTensor] = None,
+         lyric_mask: Optional[torch.LongTensor] = None,
+         timestep: Optional[torch.Tensor] = None,
+         ssl_hidden_states: Optional[List[torch.Tensor]] = None,
+         block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None,
+         controlnet_scale: Union[float, torch.Tensor] = 1.0,
+         return_dict: bool = True,
+     ):
+         encoder_hidden_states, encoder_hidden_mask = self.encode(
+             encoder_text_hidden_states=encoder_text_hidden_states,
+             text_attention_mask=text_attention_mask,
+             speaker_embeds=speaker_embeds,
+             lyric_token_idx=lyric_token_idx,
+             lyric_mask=lyric_mask,
+         )
+
+         output_length = hidden_states.shape[-1]
+
+         output = self.decode(
+             hidden_states=hidden_states,
+             attention_mask=attention_mask,
+             encoder_hidden_states=encoder_hidden_states,
+             encoder_hidden_mask=encoder_hidden_mask,
+             timestep=timestep,
+             ssl_hidden_states=ssl_hidden_states,
+             output_length=output_length,
+             block_controlnet_hidden_states=block_controlnet_hidden_states,
+             controlnet_scale=controlnet_scale,
+             return_dict=return_dict,
+         )
+
+         return output
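Plausible call shapes, inferred from `PatchEmbed(height=16, ...)`, `in_channels=8`, and `output_length = hidden_states.shape[-1]` (sketch, not part of the diff; treat the layout as an assumption):

import torch

N, C, H, W = 2, 8, 16, 1024      # latent "image": 16 bins tall, W frames wide
hidden_states = torch.randn(N, C, H, W)
timestep = torch.rand(N)         # one diffusion timestep per sample
output_length = hidden_states.shape[-1]
assert output_length == W        # final_layer unpatchifies back to W frames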
models/attention.py ADDED
@@ -0,0 +1,319 @@
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from typing import Any, Tuple, Union
+
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+
+ from diffusers.utils import logging
+ from diffusers.models.normalization import RMSNorm
+
+
+ try:
+     # from .dcformer import DCMHAttention
+     from .customer_attention_processor import Attention, CustomLiteLAProcessor2_0, CustomerAttnProcessor2_0
+ except ImportError:
+     # from dcformer import DCMHAttention
+     from customer_attention_processor import Attention, CustomLiteLAProcessor2_0, CustomerAttnProcessor2_0
+
+
+ logger = logging.get_logger(__name__)
+
+
+ def val2list(x: Union[list, tuple, Any], repeat_time=1) -> list:
+     """Return x as a list: pass lists/tuples through, else repeat the value `repeat_time` times."""
+     if isinstance(x, (list, tuple)):
+         return list(x)
+     return [x for _ in range(repeat_time)]
+
+
+ def val2tuple(x: Union[list, tuple, Any], min_len: int = 1, idx_repeat: int = -1) -> tuple:
+     """Return a tuple of at least `min_len` elements by repeating the element at `idx_repeat`."""
+     # convert to list first
+     x = val2list(x)
+
+     # repeat elements if necessary
+     if len(x) > 0:
+         x[idx_repeat:idx_repeat] = [x[idx_repeat] for _ in range(min_len - len(x))]
+
+     return tuple(x)
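These helpers normalize per-layer options; their behavior at a glance (sketch, not part of the diff; assumes the two functions above are in scope):

assert val2list(7, repeat_time=3) == [7, 7, 7]
assert val2list((1, 2)) == [1, 2]
assert val2tuple(False, 3) == (False, False, False)   # scalar broadcast to length 3
assert val2tuple((1, 2), 3) == (1, 2, 2)              # element at idx_repeat=-1 repeated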
+
+
+ def t2i_modulate(x, shift, scale):
+     return x * (1 + scale) + shift
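With zero scale and shift the modulation is the identity, which is why the scale-shift tables are initialized with small values. Tiny worked example (not part of the diff; assumes `t2i_modulate` is in scope):

import torch

x = torch.tensor([1.0, 2.0])
zero = torch.tensor(0.0)
assert torch.equal(t2i_modulate(x, zero, zero), x)
# shift=1.0, scale=0.5: each element becomes x * 1.5 + 1.0
assert torch.equal(t2i_modulate(x, torch.tensor(1.0), torch.tensor(0.5)),
                   torch.tensor([2.5, 4.0]))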
+
+
+ def get_same_padding(kernel_size: Union[int, Tuple[int, ...]]) -> Union[int, Tuple[int, ...]]:
+     if isinstance(kernel_size, tuple):
+         return tuple([get_same_padding(ks) for ks in kernel_size])
+     else:
+         assert kernel_size % 2 > 0, f"kernel size {kernel_size} should be an odd number"
+         return kernel_size // 2
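`k // 2` is the usual "same" padding for odd kernels at stride 1, so sequence length is preserved (sketch, not part of the diff; assumes `get_same_padding` is in scope):

import torch
from torch import nn

x = torch.randn(1, 4, 100)
conv = nn.Conv1d(4, 4, kernel_size=3, padding=get_same_padding(3))  # padding=1
assert conv(x).shape[-1] == x.shape[-1]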
+
+
+ class ConvLayer(nn.Module):
+     def __init__(
+         self,
+         in_dim: int,
+         out_dim: int,
+         kernel_size=3,
+         stride=1,
+         dilation=1,
+         groups=1,
+         padding: Union[int, None] = None,
+         use_bias=False,
+         norm=None,
+         act=None,
+     ):
+         super().__init__()
+         if padding is None:
+             padding = get_same_padding(kernel_size)
+             padding *= dilation
+
+         self.in_dim = in_dim
+         self.out_dim = out_dim
+         self.kernel_size = kernel_size
+         self.stride = stride
+         self.dilation = dilation
+         self.groups = groups
+         self.padding = padding
+         self.use_bias = use_bias
+
+         self.conv = nn.Conv1d(
+             in_dim,
+             out_dim,
+             kernel_size=kernel_size,
+             stride=stride,
+             padding=padding,
+             dilation=dilation,
+             groups=groups,
+             bias=use_bias,
+         )
+         # NOTE: `norm` and `act` act as on/off flags here; any non-None value
+         # selects RMSNorm / SiLU respectively.
+         if norm is not None:
+             self.norm = RMSNorm(out_dim, elementwise_affine=False)
+         else:
+             self.norm = None
+         if act is not None:
+             self.act = nn.SiLU(inplace=True)
+         else:
+             self.act = None
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.conv(x)
+         if self.norm:
+             x = self.norm(x)
+         if self.act:
+             x = self.act(x)
+         return x
+
+
+ class GLUMBConv(nn.Module):
+     def __init__(
+         self,
+         in_features: int,
+         hidden_features: int,
+         out_feature=None,
+         kernel_size=3,
+         stride=1,
+         padding: Union[int, None] = None,
+         use_bias=False,
+         norm=(None, None, None),
+         act=("silu", "silu", None),
+         dilation=1,
+     ):
+         out_feature = out_feature or in_features
+         super().__init__()
+         use_bias = val2tuple(use_bias, 3)
+         norm = val2tuple(norm, 3)
+         act = val2tuple(act, 3)
+
+         self.glu_act = nn.SiLU(inplace=False)
+         self.inverted_conv = ConvLayer(
+             in_features,
+             hidden_features * 2,
+             1,
+             use_bias=use_bias[0],
+             norm=norm[0],
+             act=act[0],
+         )
+         self.depth_conv = ConvLayer(
+             hidden_features * 2,
+             hidden_features * 2,
+             kernel_size,
+             stride=stride,
+             groups=hidden_features * 2,
+             padding=padding,
+             use_bias=use_bias[1],
+             norm=norm[1],
+             act=None,
+             dilation=dilation,
+         )
+         self.point_conv = ConvLayer(
+             hidden_features,
+             out_feature,
+             1,
+             use_bias=use_bias[2],
+             norm=norm[2],
+             act=act[2],
+         )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         # (N, T, C) -> (N, C, T) for the 1-D convolutions
+         x = x.transpose(1, 2)
+         x = self.inverted_conv(x)
+         x = self.depth_conv(x)
+
+         # GLU gating: split the channels in half, gate one half with SiLU of the other
+         x, gate = torch.chunk(x, 2, dim=1)
+         gate = self.glu_act(gate)
+         x = x * gate
+
+         x = self.point_conv(x)
+         x = x.transpose(1, 2)
+
+         return x
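The gating step in isolation (plain-torch sketch, not part of the diff; the hidden width is made up):

import torch
import torch.nn.functional as F

x = torch.randn(2, 2 * 64, 100)      # (N, 2*hidden, T), as after depth_conv
h, gate = torch.chunk(x, 2, dim=1)   # two (N, 64, T) halves
out = h * F.silu(gate)               # this is what feeds point_conv
assert out.shape == (2, 64, 100)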
+
+
+ class LinearTransformerBlock(nn.Module):
+     """
+     A Sana block with global shared adaptive layer norm (adaLN-single) conditioning.
+     """
+     def __init__(
+         self,
+         dim,
+         num_attention_heads,
+         attention_head_dim,
+         use_adaln_single=True,
+         cross_attention_dim=None,
+         added_kv_proj_dim=None,
+         context_pre_only=False,
+         mlp_ratio=4.0,
+         add_cross_attention=False,
+         add_cross_attention_dim=None,
+         qk_norm=None,
+     ):
+         super().__init__()
+
+         self.norm1 = RMSNorm(dim, elementwise_affine=False, eps=1e-6)
+         self.attn = Attention(
+             query_dim=dim,
+             cross_attention_dim=cross_attention_dim,
+             added_kv_proj_dim=added_kv_proj_dim,
+             dim_head=attention_head_dim,
+             heads=num_attention_heads,
+             out_dim=dim,
+             bias=True,
+             qk_norm=qk_norm,
+             processor=CustomLiteLAProcessor2_0(),
+         )
+
+         self.add_cross_attention = add_cross_attention
+         self.context_pre_only = context_pre_only
+
+         if add_cross_attention and add_cross_attention_dim is not None:
+             self.cross_attn = Attention(
+                 query_dim=dim,
+                 cross_attention_dim=add_cross_attention_dim,
+                 added_kv_proj_dim=add_cross_attention_dim,
+                 dim_head=attention_head_dim,
+                 heads=num_attention_heads,
+                 out_dim=dim,
+                 context_pre_only=context_pre_only,
+                 bias=True,
+                 qk_norm=qk_norm,
+                 processor=CustomerAttnProcessor2_0(),
+             )
+
+         self.norm2 = RMSNorm(dim, 1e-06, elementwise_affine=False)
+
+         self.ff = GLUMBConv(
+             in_features=dim,
+             hidden_features=int(dim * mlp_ratio),
+             use_bias=(True, True, False),
+             norm=(None, None, None),
+             act=("silu", "silu", None),
+         )
+         self.use_adaln_single = use_adaln_single
+         if use_adaln_single:
+             self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
+
+     def forward(
+         self,
+         hidden_states: torch.FloatTensor,
+         encoder_hidden_states: torch.FloatTensor = None,
+         attention_mask: torch.FloatTensor = None,
+         encoder_attention_mask: torch.FloatTensor = None,
+         rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
+         rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
+         temb: torch.FloatTensor = None,
+     ):
+         N = hidden_states.shape[0]
+
+         # step 1: adaLN-single, derive the six modulation tensors from temb
+         if self.use_adaln_single:
+             shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+                 self.scale_shift_table[None] + temb.reshape(N, 6, -1)
+             ).chunk(6, dim=1)
+
+         norm_hidden_states = self.norm1(hidden_states)
+         if self.use_adaln_single:
+             norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
+
+         # step 2: attention; without a separate cross-attention module the main
+         # attention attends jointly, otherwise it runs as self-attention only
+         if not self.add_cross_attention:
+             attn_output, encoder_hidden_states = self.attn(
+                 hidden_states=norm_hidden_states,
+                 attention_mask=attention_mask,
+                 encoder_hidden_states=encoder_hidden_states,
+                 encoder_attention_mask=encoder_attention_mask,
+                 rotary_freqs_cis=rotary_freqs_cis,
+                 rotary_freqs_cis_cross=rotary_freqs_cis_cross,
+             )
+         else:
+             attn_output, _ = self.attn(
+                 hidden_states=norm_hidden_states,
+                 attention_mask=attention_mask,
+                 encoder_hidden_states=None,
+                 encoder_attention_mask=None,
+                 rotary_freqs_cis=rotary_freqs_cis,
+                 rotary_freqs_cis_cross=None,
+             )
+
+         if self.use_adaln_single:
+             attn_output = gate_msa * attn_output
+         hidden_states = attn_output + hidden_states
+
+         if self.add_cross_attention:
+             attn_output = self.cross_attn(
+                 hidden_states=hidden_states,
+                 attention_mask=attention_mask,
+                 encoder_hidden_states=encoder_hidden_states,
+                 encoder_attention_mask=encoder_attention_mask,
+                 rotary_freqs_cis=rotary_freqs_cis,
+                 rotary_freqs_cis_cross=rotary_freqs_cis_cross,
+             )
+             hidden_states = attn_output + hidden_states
+
+         # step 3: norm before the feed-forward
+         norm_hidden_states = self.norm2(hidden_states)
+         if self.use_adaln_single:
+             norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+
+         # step 4: feed-forward (gated conv MLP)
+         ff_output = self.ff(norm_hidden_states)
+         if self.use_adaln_single:
+             ff_output = gate_mlp * ff_output
+
+         hidden_states = hidden_states + ff_output
+
+         return hidden_states
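How a block slices `temb` into the six adaLN-single tensors (sketch, not part of the diff; `dim` matches the model above):

import torch

N, dim = 2, 1536
temb = torch.randn(N, 6 * dim)              # produced once by the model-level t_block
table = torch.randn(6, dim) / dim**0.5      # per-block scale_shift_table
chunks = (table[None] + temb.reshape(N, 6, -1)).chunk(6, dim=1)
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = chunks
assert shift_msa.shape == (N, 1, dim)       # broadcasts over the sequence axis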
models/config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "_class_name": "Transformer2DModel",
+   "_diffusers_version": "0.27.2",
+   "in_channels": 8,
+   "num_layers": 24,
+   "inner_dim": 2560,
+   "attention_head_dim": 128,
+   "num_attention_heads": 20,
+   "mlp_ratio": 2.5,
+   "out_channels": 8,
+   "max_position": 32768,
+   "rope_theta": 1000000.0,
+   "speaker_embedding_dim": 512,
+   "text_embedding_dim": 768,
+   "ssl_encoder_depths": [8, 8],
+   "ssl_names": ["mert", "m-hubert"],
+   "ssl_latent_dims": [1024, 768],
+   "patch_size": [16, 1],
+   "max_height": 16,
+   "max_width": 32768,
+   "lyric_encoder_vocab_size": 6693,
+   "lyric_hidden_size": 1024
+ }
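A consistency check on the shipped config (sketch, not part of the diff; the path assumes the repo root as working directory). Note it describes a wider, shallower model than the code defaults: 20 heads x 128 head_dim = 2560 wide, 24 layers.

import json

with open("models/config.json") as f:
    cfg = json.load(f)
# the effective width is always heads * head_dim, matching the stored inner_dim
assert cfg["num_attention_heads"] * cfg["attention_head_dim"] == cfg["inner_dim"] == 2560
assert cfg["num_layers"] == 24 and cfg["max_width"] == 32768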