Add files using upload-large-folder tool
This view is limited to 50 files because the commit contains too many changes.
- .gitattributes +11 -0
- 00000309-00000300.wav +3 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/.gitattributes +38 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/.msc +0 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/.mv +1 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/README.md +119 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/__init__.py +0 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/added_tokens.json +3 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/config.json +39 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/generation_config.json +11 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/hf_rwkv_tokenizer.py +279 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/modeling_rwkv7.py +4 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/rwkv_vocab_v20230424.txt +0 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/special_tokens_map.json +6 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/tokenizer_config.json +28 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/README.md +14 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/asset/dingding.png +0 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/campplus.onnx +3 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/configuration.json +1 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/cosyvoice.yaml +116 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/flow.encoder.fp16.zip +3 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/hift.pt +3 -0
- CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/spk2info.pt +3 -0
- Inference.md +98 -0
- LICENSE +201 -0
- README.md +181 -3
- Trump.wav +3 -0
- _config.yml +3 -0
- another.wav +3 -0
- badXT_71.wav +3 -0
- data/cosy/data/data_processor.py +128 -0
- data/cosy/test/test_vq.py +171 -0
- data/utils/convert_embeddings_2_pt.py +34 -0
- data/utils/create_embeddings_from_raw.py +263 -0
- data/utils/create_lm_corpus_from_raw.py +156 -0
- data/utils/llm_dataset.py +206 -0
- data/utils/test_utilities.py +31 -0
- data/utils/utilitie.py +767 -0
- eval/eval_seed_generate.py +66 -0
- gradio/tts_demo_page.py +81 -0
- mine.wav +0 -0
- new.mp3 +0 -0
- new.wav +3 -0
- run_multiple_process.sh +137 -0
- rwkvtts_requirements.txt +264 -0
- third_party/cosyvoice/dataset/processor.py +435 -0
- third_party/cosyvoice/flow/decoder.py +301 -0
- third_party/cosyvoice/flow/flow.py +239 -0
- third_party/cosyvoice/flow/flow_matching.py +217 -0
- third_party/cosyvoice/flow/length_regulator.py +69 -0
.gitattributes
CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Trump.wav filter=lfs diff=lfs merge=lfs -text
+new.wav filter=lfs diff=lfs merge=lfs -text
+badXT_71.wav filter=lfs diff=lfs merge=lfs -text
+zero_shot_prompt.wav filter=lfs diff=lfs merge=lfs -text
+00000309-00000300.wav filter=lfs diff=lfs merge=lfs -text
+another.wav filter=lfs diff=lfs merge=lfs -text
+zero_2_0.wav filter=lfs diff=lfs merge=lfs -text
+zero_shot_0.wav filter=lfs diff=lfs merge=lfs -text
+zero_1_0.wav filter=lfs diff=lfs merge=lfs -text
+zero_3_0.wav filter=lfs diff=lfs merge=lfs -text
+zero_0_0.wav filter=lfs diff=lfs merge=lfs -text
00000309-00000300.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:631608f5c8b931ece1d45adc7f40a3b3b0ae2ec056a8a08a3565b04cc5750a4b
size 243244
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/.gitattributes
ADDED
@@ -0,0 +1,38 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
flow.decoder.estimator.fp16.a10.plan filter=lfs diff=lfs merge=lfs -text
flow.decoder.estimator.fp16.l20.plan filter=lfs diff=lfs merge=lfs -text
flow.decoder.estimator.fp16.v100.plan filter=lfs diff=lfs merge=lfs -text
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/.msc
ADDED
Binary file (1.66 kB)
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/.mv
ADDED
@@ -0,0 +1 @@
Revision:master,CreatedAt:1736490687
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/README.md
ADDED
@@ -0,0 +1,119 @@
---
license: apache-2.0
language:
- en
- zh
- ja
- ko
- fr
- ar
- es
- pt
metrics:
- accuracy
base_model:
- BlinkDL/rwkv-7-world
pipeline_tag: text-generation
---

# rwkv7-1.5B-world

<!-- Provide a quick summary of what the model is/does. -->

This is an RWKV-7 model in the flash-linear-attention format.

## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

- **Developed by:** Bo Peng, Yu Zhang, Songlin Yang, Ruichong Zhang
- **Funded by:** RWKV Project (under LF AI & Data Foundation)
- **Model type:** RWKV7
- **Language(s) (NLP):** English
- **License:** Apache-2.0
- **Parameter count:** 1.52B
- **Tokenizer:** RWKV World tokenizer
- **Vocabulary size:** 65,536

### Model Sources

<!-- Provide the basic links for the model. -->

- **Repository:** https://github.com/fla-org/flash-linear-attention ; https://github.com/BlinkDL/RWKV-LM
- **Paper:** Work in progress

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
Install `flash-linear-attention` and the latest version of `transformers` before using this model:

```bash
pip install git+https://github.com/fla-org/flash-linear-attention
pip install 'transformers>=4.48.0'
```

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
You can use this model just like any other HuggingFace model:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained('fla-hub/rwkv7-1.5B-world', trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained('fla-hub/rwkv7-1.5B-world', trust_remote_code=True)

model = model.cuda()
prompt = "What is a large language model?"
messages = [
    {"role": "user", "content": "Who are you?"},
    {"role": "assistant", "content": "I am a GPT-3 based model."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=1024,
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=False)[0]
print(response)
```

## Training Details

### Training Data

This model was trained on World v3, a total of 3.119 trillion tokens.

#### Training Hyperparameters

- **Training regime:** bfloat16, lr 4e-4 to 1e-5 "delayed" cosine decay, wd 0.1 (with increasing batch sizes during the middle)
- **Final Loss:** 1.9965
- **Token Count:** 3.119 trillion

## Evaluation

#### Metrics

`lambada_openai`:

before conversion: ppl 4.13, acc 69.4%

after conversion: ppl 4.26, acc 68.8% (without applying the chat template)

## FAQ
Q: safetensors metadata is none.

A: Upgrade transformers to >=4.48.0: `pip install 'transformers>=4.48.0'`
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/__init__.py
ADDED
File without changes
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/added_tokens.json
ADDED
@@ -0,0 +1,3 @@
{
  "<|rwkv_tokenizer_end_of_text|>": 0
}
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/config.json
ADDED
@@ -0,0 +1,39 @@
{
  "_attn_implementation_autoset": true,
  "a_low_rank_dim": 96,
  "architectures": [
    "RWKV7ForCausalLM"
  ],
  "attn": null,
  "attn_mode": "fused_recurrent",
  "auto_map": {
    "AutoConfig": "modeling_rwkv7.RWKV7Config",
    "AutoModel": "modeling_rwkv7.RWKV7Model",
    "AutoModelForCausalLM": "modeling_rwkv7.RWKV7ForCausalLM"
  },
  "bos_token_id": 1,
  "decay_low_rank_dim": 96,
  "eos_token_id": 2,
  "fuse_cross_entropy": true,
  "fuse_norm": false,
  "gate_low_rank_dim": 256,
  "head_dim": 64,
  "hidden_act": "sqrelu",
  "hidden_ratio": 4.0,
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 2048,
  "model_type": "rwkv7",
  "norm_bias": true,
  "norm_eps": 1e-05,
  "norm_first": true,
  "num_heads": null,
  "num_hidden_layers": 24,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.48.1",
  "use_cache": true,
  "v_low_rank_dim": 64,
  "vocab_size": 65536
}
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/generation_config.json
ADDED
@@ -0,0 +1,11 @@
{
  "bos_token_id": 0,
  "eos_token_id": 0,
  "pad_token_id": 0,
  "max_window_size": 2147483647,
  "do_sample": true,
  "top_k": 65536,
  "top_p": 1.0,
  "temperature": 1.0,
  "transformers_version": "4.48.0"
}
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/hf_rwkv_tokenizer.py
ADDED
@@ -0,0 +1,279 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for RWKV."""

import os
import re
from typing import TYPE_CHECKING, List, Optional, Tuple

from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.utils import logging


if TYPE_CHECKING:
    pass

logger = logging.get_logger(__name__)


VOCAB_FILES_NAMES = {
    "vocab_file": "rwkv_vocab_v20230424.txt",
}


class TRIE:
    __slots__ = tuple("ch,to,values,front".split(","))
    to: list
    values: set

    def __init__(self, front=None, ch=None):
        self.ch = ch
        self.to = [None for ch in range(256)]
        self.values = set()
        self.front = front

    def __repr__(self):
        fr = self
        ret = []
        while fr != None:
            if fr.ch != None:
                ret.append(fr.ch)
            fr = fr.front
        return "<TRIE %s %s>" % (ret[::-1], self.values)

    def add(self, key: bytes, idx: int = 0, val=None):
        if idx == len(key):
            if val is None:
                val = key
            self.values.add(val)
            return self
        ch = key[idx]
        if self.to[ch] is None:
            self.to[ch] = TRIE(front=self, ch=ch)
        return self.to[ch].add(key, idx=idx + 1, val=val)

    def find_longest(self, key: bytes, idx: int = 0):
        u: TRIE = self
        ch: int = key[idx]

        while u.to[ch] is not None:
            u = u.to[ch]
            idx += 1
            if u.values:
                ret = idx, u, u.values
            if idx == len(key):
                break
            ch = key[idx]
        return ret


class RWKV_TOKENIZER:
    def __init__(self, file_name):
        self.idx2token = {}
        sorted = []  # must be already sorted
        with open(file_name, "r", encoding="utf-8") as f:
            lines = f.readlines()
        for l in lines:
            idx = int(l[: l.index(" ")])
            x = eval(l[l.index(" ") : l.rindex(" ")])
            x = x.encode("utf-8") if isinstance(x, str) else x
            assert isinstance(x, bytes)

            assert len(x) == int(l[l.rindex(" ") :])
            sorted += [x]
            self.idx2token[idx] = x

        self.token2idx = {}
        for k, v in self.idx2token.items():
            self.token2idx[v] = int(k)

        self.root = TRIE()
        for t, i in self.token2idx.items():
            _ = self.root.add(t, val=(t, i))

    def encodeBytes(self, src: bytes):
        idx: int = 0
        tokens = []
        while idx < len(src):
            _idx: int = idx
            idx, _, values = self.root.find_longest(src, idx)
            assert idx != _idx
            _, token = next(iter(values))
            tokens.append(token)
        return tokens

    def decodeBytes(self, tokens):
        return b"".join(map(lambda i: self.idx2token[i], tokens))

    def encode(self, src):
        if isinstance(src, str):
            return [self.encodeBytes(src.encode("utf-8"))]
        elif isinstance(src, list):
            return [self.encodeBytes(s.encode("utf-8")) for s in src]

    def decode(self, tokens):
        return [self.decodeBytes(batch).decode("utf-8") for batch in tokens]
        # try:
        #     return self.decodeBytes(tokens).decode('utf-8')
        # except:
        #     return '\ufffd'  # bad utf-8

    def printTokens(self, tokens):
        for i in tokens:
            s = self.idx2token[i]
            try:
                s = s.decode("utf-8")
            except:
                pass
            print(f"{repr(s)}{i}", end=" ")
        print()


class RwkvTokenizer(PreTrainedTokenizer):
    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self, vocab_file, bos_token="<|rwkv_tokenizer_end_of_text|>", eos_token="<|rwkv_tokenizer_end_of_text|>", unk_token="<|rwkv_tokenizer_end_of_text|>", **kwargs
    ):
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'."
            )

        with open(vocab_file, "r", encoding="utf-8") as reader:
            tokens = reader.readlines()

        if "add_bos_token" in kwargs:
            self.add_bos_token = kwargs["add_bos_token"]
        else:
            self.add_bos_token = False
        self.trie_tokenizer = RWKV_TOKENIZER(vocab_file)
        vocab = self.trie_tokenizer.token2idx
        self.encoder = vocab
        self.decoder = {v: k for k, v in vocab.items()}
        self._added_tokens_decoder = {0: AddedToken(str(bos_token))}
        super().__init__(
            bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
        )

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        vocab = self.encoder
        vocab.update(self.added_tokens_encoder)
        vocab = dict(sorted(vocab.items(), key=lambda item: item[1]))
        return vocab

    def _tokenize(self, text, split_special_tokens=False):
        # return self.wordpiece_tokenizer.tokenize(text.encode("utf-8"))
        return self.trie_tokenizer.encode(text)[0]

    def _convert_token_to_id(self, token):
        return token

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (byte) using the vocab."""
        token = self.decoder.get(index, self.unk_token)
        if isinstance(token, (bytes)):
            token = token.decode("utf-8", errors="replace")
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (bytes) in a single string. Additional tokens are encoded to bytes"""
        out_string = b"".join(
            [k.encode(errors="replace") if isinstance(k, str) else k for k in tokens]
        ).decode("utf-8")
        return out_string

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        index = 0
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory,
                (filename_prefix + "-" if filename_prefix else "") + "vocab.txt",
            )
        else:
            vocab_file = (
                filename_prefix + "-" if filename_prefix else ""
            ) + save_directory
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(
                self.encoder.items(), key=lambda kv: kv[1]
            ):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(str(token) + "\n")
                index += 1
        return (vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        if self.add_bos_token:
            bos_token_ids = [self.bos_token_id]
        else:
            bos_token_ids = []

        output = bos_token_ids + token_ids_0

        if token_ids_1 is None:
            return output

        return output + bos_token_ids + token_ids_1

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )

        if not self.add_bos_token:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=False,
            )

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0))
        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/modeling_rwkv7.py
ADDED
@@ -0,0 +1,4 @@
from rwkvfla.models.rwkv7 import RWKV7ForCausalLM, RWKV7Model, RWKV7Config
RWKV7ForCausalLM = RWKV7ForCausalLM
RWKV7Model = RWKV7Model
RWKV7Config = RWKV7Config
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/rwkv_vocab_v20230424.txt
ADDED
The diff for this file is too large to render.
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/special_tokens_map.json
ADDED
@@ -0,0 +1,6 @@
{
  "bos_token": "<|rwkv_tokenizer_end_of_text|>",
  "eos_token": "\n\n",
  "unk_token": "<|rwkv_tokenizer_end_of_text|>",
  "pad_token": "<|rwkv_tokenizer_end_of_text|>"
}
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN/tokenizer_config.json
ADDED
@@ -0,0 +1,28 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "0": {
      "content": "<|rwkv_tokenizer_end_of_text|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "auto_map": {
    "AutoTokenizer": [
      "hf_rwkv_tokenizer.RwkvTokenizer",
      null
    ]
  },
  "bos_token": "<|rwkv_tokenizer_end_of_text|>",
  "pad_token": "<|rwkv_tokenizer_end_of_text|>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "\n\n",
  "model_max_length": 1000000000000000019884624838656,
  "tokenizer_class": "RwkvTokenizer",
  "unk_token": "<|rwkv_tokenizer_end_of_text|>",
  "use_fast": false,
  "chat_template": "{{ '<|rwkv_tokenizer_end_of_text|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{'User: ' + message['content'] + '\n\n'}}{% elif message['role'] == 'system' %}{{'System: ' + message['content'] + '\n\n'}}{% elif message['role'] == 'assistant' %}{{'Assistant: ' + message['content'] + '\n\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
}
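For reference, a minimal sketch of what this chat template renders to for a one-turn conversation; the local path is an assumption (point it at wherever CosyVoice-BlankEN lives on disk), and the tokenizer must be loaded with `trust_remote_code` as in the model card above:

```python
from transformers import AutoTokenizer

# Illustrative only: load the tokenizer that ships with this config.
tok = AutoTokenizer.from_pretrained(
    'CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/CosyVoice-BlankEN',  # local path, assumption
    trust_remote_code=True,
)
text = tok.apply_chat_template(
    [{"role": "user", "content": "Who are you?"}],
    tokenize=False, add_generation_prompt=True,
)
# Per the template above, this should render roughly as:
# "<|rwkv_tokenizer_end_of_text|>User: Who are you?\n\nAssistant:"
print(text)
```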
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/README.md
ADDED
@@ -0,0 +1,14 @@
---
language:
- zh
- en
- ko
- ja
base_model:
- fla-hub/rwkv7-1.5B-world
pipeline_tag: text-to-speech
---
This is a TTS model that combines CosyVoice's FSQ speech tokenizer with an RWKV language model.
Please refer to
https://github.com/yynil/RWKVTTS/blob/main/Inference.md
for instructions on using this checkpoint.
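As a quick orientation, a minimal loading sketch is shown below; it assumes the checkpoint has been downloaded to a local directory (the path is a placeholder) and that the RWKVTTS code base described in Inference.md is on the PYTHONPATH. The calls mirror the example script in Inference.md.

```python
# Minimal sketch: load this checkpoint with the CosyVoice2 wrapper used in Inference.md.
# The model path below is a placeholder; point it at your local copy of this repository.
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav
import torchaudio

cosyvoice = CosyVoice2('/path/to/CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO',
                       device='cuda:0', fp16=False, load_jit=False)
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)  # any clean reference clip
for i, out in enumerate(cosyvoice.inference_zero_shot('你好,世界。', '希望你以后做的比我还好呦。',
                                                      prompt_speech_16k, stream=False, speed=1)):
    torchaudio.save(f'out_{i}.wav', out['tts_speech'], cosyvoice.sample_rate)
```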
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/asset/dingding.png
ADDED
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/campplus.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
size 28303423
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/configuration.json
ADDED
@@ -0,0 +1 @@
{"framework":"Pytorch","task":"text-to-speech"}
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/cosyvoice.yaml
ADDED
@@ -0,0 +1,116 @@
# set random seed, so that you may reproduce your result.
__set_seed1: !apply:random.seed [1986]
__set_seed2: !apply:numpy.random.seed [1986]
__set_seed3: !apply:torch.manual_seed [1986]
__set_seed4: !apply:torch.cuda.manual_seed_all [1986]

# fixed params
sample_rate: 24000
llm_input_size: 2048
llm_output_size: 2048
spk_embed_dim: 192
qwen_pretrain_path: ''

# model params
# for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
# for system/third_party class/function, we do not require this.
llm: !new:model.llm.llm.RWKV7LM
    llm_input_size: !ref <llm_input_size>
    llm_output_size: !ref <llm_output_size>
    speech_token_size: 6561
    length_normalized_loss: True
    lsm_weight: 0
    vocab_size: 65548
    llm: !ref <qwen_pretrain_path>
    sampling: !name:cosyvoice.utils.common.ras_sampling
        top_p: 0.8
        top_k: 25
        win_size: 10
        tau_r: 0.1

flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
    input_size: 512
    output_size: 80
    spk_embed_dim: !ref <spk_embed_dim>
    output_type: 'mel'
    vocab_size: 6561
    input_frame_rate: 25
    only_mask_loss: True
    token_mel_ratio: 2
    pre_lookahead_len: 3
    encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
        output_size: 512
        attention_heads: 8
        linear_units: 2048
        num_blocks: 6
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        normalize_before: True
        input_layer: 'linear'
        pos_enc_layer_type: 'rel_pos_espnet'
        selfattention_layer_type: 'rel_selfattn'
        input_size: 512
        use_cnn_module: False
        macaron_style: False
    decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
        in_channels: 240
        n_spks: 1
        spk_emb_dim: 80
        cfm_params: !new:omegaconf.DictConfig
            content:
                sigma_min: 1e-06
                solver: 'euler'
                t_scheduler: 'cosine'
                training_cfg_rate: 0.2
                inference_cfg_rate: 0.7
                reg_loss_type: 'l1'
        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
            in_channels: 320
            out_channels: 80
            causal: True
            channels: [256]
            dropout: 0.0
            attention_head_dim: 64
            n_blocks: 4
            num_mid_blocks: 12
            num_heads: 8
            act_fn: 'gelu'

hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
    in_channels: 80
    base_channels: 512
    nb_harmonics: 8
    sampling_rate: !ref <sample_rate>
    nsf_alpha: 0.1
    nsf_sigma: 0.003
    nsf_voiced_threshold: 10
    upsample_rates: [8, 5, 3]
    upsample_kernel_sizes: [16, 11, 7]
    istft_params:
        n_fft: 16
        hop_len: 4
    resblock_kernel_sizes: [3, 7, 11]
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    source_resblock_kernel_sizes: [7, 7, 11]
    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    lrelu_slope: 0.1
    audio_limit: 0.99
    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
        num_class: 1
        in_channels: 80
        cond_channels: 512

# processor functions
get_tokenizer: !name:utils.utilities.get_tokenizer
    model_dir: !ref <qwen_pretrain_path>
    allowed_special: 'all'
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
    n_fft: 1920
    num_mels: 80
    sampling_rate: !ref <sample_rate>
    hop_size: 480
    win_size: 1920
    fmin: 0
    fmax: 8000
    center: False
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/flow.encoder.fp16.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:46d2539ad8bdb90026cd50cb42e45bd389f10108111d742b912feddca105aeb6
size 116703414
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/hift.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1d4af0d661a416c69544eec83ff9c070dc80c37ee53ef44af3a37d910c95bc21
size 83364158
CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO/spk2info.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fbc8f9064db35ee8163b538c0f6ed9fe0c3e2fe0f560cca910e578138d961285
size 3281245
Inference.md
ADDED
@@ -0,0 +1,98 @@
# Install the code base and the dependencies
```bash
git clone https://github.com/yynil/RWKVTTS
```
Add these two directories to the PYTHONPATH:
```bash
export PYTHONPATH=$PYTHONPATH:/home/user/RWKVTTS:/home/user/RWKVTTS/third_party
```
# Install the dependencies
```bash
conda create -n rwkvtts-311 -y python=3.11
conda activate rwkvtts-311
conda install -y -c conda-forge pynini==2.1.6
cd RWKVTTS
pip install -r rwkvtts_requirements.txt
```

Download the pretrained model from the following link:
https://huggingface.co/yueyulin/CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO

Place CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO in a local directory, say /home/user/CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO.

Make sure the two directories above are on the PYTHONPATH.

The example code for inference is as follows:
```python
def do_tts(tts_text, prompt_texts, cosyvoice):
    import logging
    for i, (prompt_audio_file, prompt_text) in enumerate(zip(prompt_audios, prompt_texts)):
        logging.info(f'Processing {prompt_text}')
        prompt_speech_16k = load_wav(prompt_audio_file, 16000)
        with torch.no_grad():
            if prompt_text is not None:
                for j, k in enumerate(cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1)):
                    torchaudio.save('zero_{}_{}.wav'.format(i, j), k['tts_speech'], cosyvoice.sample_rate)
            else:
                for j, k in enumerate(cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=False, speed=1)):
                    torchaudio.save('zero_{}_{}.wav'.format(i, j), k['tts_speech'], cosyvoice.sample_rate)
        logging.info(f'Finished processing {prompt_text}')

if __name__ == '__main__':
    from cosyvoice.cli.cosyvoice import CosyVoice2
    import torch
    import sys
    # model_path = '/home/yueyulin/models/CosyVoice2-0.5B_RWKV_0.19B/'
    # device = 'cuda:0'
    print(sys.argv)
    model_path = sys.argv[1]
    device = sys.argv[2] if len(sys.argv) > 2 else 'cuda:0'
    is_flow_only = sys.argv[3] == 'True' if len(sys.argv) > 3 else False
    print(f'is_flow_only: {is_flow_only}')
    cosyvoice = CosyVoice2(model_path, device=device, fp16=False, load_jit=False)

    from cosyvoice.utils.file_utils import load_wav
    import torchaudio
    prompt_audios = [
        '/home/yueyulin/github/RWKVTTS/zero_shot_prompt.wav',
        '/home/yueyulin/github/RWKVTTS/mine.wav',
        '/home/yueyulin/github/RWKVTTS/new.wav',
        '/home/yueyulin/github/RWKVTTS/Trump.wav',
    ]

    if not is_flow_only:
        prompt_texts = [
            '希望你以后做的比我还好呦。',
            '少年强则中国强。',
            '我随便说一句话,我喊开始录就开始录。',
            'numbers of Latino, African American, Asian American and native American voters.'
        ]
    else:
        prompt_texts = [
            None,
            None,
            None,
            None
        ]
    do_tts('Make America great again!', prompt_texts, cosyvoice)
```
More examples can be found in the model/test directory.

[Instruct example](model/test/test_instructed.py) shows how to use the instructed voice flow to generate audio.
[Embedded ref voice example](model/test/test_speaker_adapter.py) shows how to use the speaker adapter to generate audio.

Please refer to the [Service Call URL](service/README.md) for instructions and reference voices.

If you pass prompt_texts as None, the engine only clones the voice timbre and prosody, which works well for cross-lingual voice cloning. If you pass the correct prompt texts, the engine will continue generating audio tokens following the prompt audio you provided; this is good for continuing the provided audio, but it can sound odd when you mix languages.
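For example, a minimal cross-lingual cloning call (the prompt_text=None branch of the script above) might look like the sketch below; the paths are placeholders to be replaced with your own files:

```python
# Minimal sketch of the prompt_text=None (cross-lingual) path; paths are placeholders.
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav
import torchaudio

cosyvoice = CosyVoice2('/home/user/CosyVoice2-0.5B-RWKV-7-1.5B-Instruct-CHENJPKO',
                       device='cuda:0', fp16=False, load_jit=False)
prompt_speech_16k = load_wav('/home/user/RWKVTTS/Trump.wav', 16000)  # reference voice only, no prompt text
for j, out in enumerate(cosyvoice.inference_cross_lingual('你好,很高兴认识你!', prompt_speech_16k,
                                                          stream=False, speed=1)):
    torchaudio.save(f'cross_lingual_{j}.wav', out['tts_speech'], cosyvoice.sample_rate)
```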

The test source code is [test code](model/test/test_initialize.py).

Please change the paths to the correct paths on your system.

You can also use your own prompt audio and text. Since the llm module finishes the audio tokens for you, please make sure the audio is clean and complete and the text is correct; otherwise the result may not be good.

The following table shows example results of the above code:
| Prompt Audio | Prompt Text | TTS Text | Result |
| --- | --- | --- | --- |
| https://github.com/yynil/RWKVTTS/raw/main/zero_shot_prompt.wav | 希望你以后做的比我还好呦。 | 中国在东亚,是世界上最大的国家,也是世界上人口最多的国家。 | https://github.com/yynil/RWKVTTS/raw/main/zero_0_0.wav |
| https://github.com/yynil/RWKVTTS/raw/main/mine.wav | 少年强则中国强。 | 中国在东亚,是世界上最大的国家,也是世界上人口最多的国家。 | https://github.com/yynil/RWKVTTS/raw/main/zero_1_0.wav |
| https://github.com/yynil/RWKVTTS/raw/main/new.wav | 我随便说一句话,我喊开始录就开始录。 | 中国在东亚,是世界上最大的国家,也是世界上人口最多的国家。 | https://github.com/yynil/RWKVTTS/raw/main/zero_2_0.wav |
LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md
CHANGED
@@ -1,3 +1,181 @@
# RWKVTTS
This project trains an RWKV LLM for TTS generation that is compatible with other TTS engines (like fish/cosy/chattts).

Most modern LLM-based TTS engines have two parts:
1. VQ-VAE: encodes audio into audio tokens and decodes audio tokens back into audio.
2. LLM: generates audio tokens from text tokens and prompt audio tokens. The prompt audio tokens also come from the VQ-VAE.

Typically, training an LLM-based TTS system involves both VQ-VAE training and LLM training, as in CosyTTS, ChatTTS and FishTTS. Here we focus on training an RWKV LLM to replace the LLM part of these TTS engines.

```mermaid
flowchart TB
node_1[["Input Prompt Text"]]
node_2(["Text Tokenizer"])
node_3(["Audio Tokenizer(VQ)"])
node_4[["Input Reference Audio"]]
node_5[["Text Tokens"]]
node_6[["Audio Tokens"]]
node_7(["Text Embedder"])
node_8(["Audio Embedder"])
node_9[["Text Embeddings"]]
node_10[["Audio Embeddings"]]
node_11(["Concatenate Embeddings"])
node_12[["Input Embeddings"]]
node_13{{"Language Model"}}
node_14[["Hidden States"]]
node_15(["Audio Head"])
node_16{"Continue to decode?"}
node_17(["Next Step Input"])
node_18(["Finish Decode"])
node_1 --> node_2
node_4 --> node_3
node_2 --> node_5
node_3 --> node_6
node_5 --> node_7
node_6 --> node_8
node_7 --> node_9
node_8 --> node_10
node_9 --> node_11
node_10 --> node_11
node_11 --> node_12
node_12 --> node_13
node_13 --> node_14
node_14 --> node_15
node_15 --> node_16
node_16 --"Yes"--> node_17
node_17 --> node_13
node_16 --"No"--> node_18
```
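To make the decoding loop in the diagram concrete, here is a minimal, illustrative sketch; it is not the project's actual API, and the embedder/head objects are placeholders standing in for the modules named in the flowchart:

```python
import torch

def generate_audio_tokens(lm, text_embedder, audio_embedder, audio_head,
                          text_tokens, prompt_audio_tokens, eos_id, max_steps=2048):
    # Concatenate text embeddings and prompt-audio embeddings into one input sequence.
    embs = torch.cat([text_embedder(text_tokens), audio_embedder(prompt_audio_tokens)], dim=1)
    out = []
    for _ in range(max_steps):
        hidden = lm(inputs_embeds=embs).last_hidden_state[:, -1]  # last-step hidden state
        next_token = audio_head(hidden).argmax(dim=-1)            # greedy pick of the next audio token
        if next_token.item() == eos_id:                           # "Finish Decode" branch
            break
        out.append(next_token.item())
        # "Next Step Input": feed the new audio token's embedding back into the LM.
        embs = torch.cat([embs, audio_embedder(next_token[None])], dim=1)
    return out  # audio tokens to be decoded back into a waveform by the VQ-VAE decoder
```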
49 |
+
|
50 |
+
Different TTS engines might have different data layout and different special control token, so we need to prepare different data and train a RWKV LLM model for each TTS engine.
|
51 |
+
|
52 |
+
# Process to train LLM for different TTS engine
|
53 |
+
|
54 |
+
## Cosy 2.0
|
55 |
+
|
56 |
+
### Cosy 2.0 Data Layout
|
57 |
+
|
58 |
+
The layout of Cosy 2.0 for LLM:
|
59 |
+
|
60 |
+
```mermaid
|
61 |
+
|
62 |
+
flowchart LR
|
63 |
+
node_1[["SOS Embeddings"]]
|
64 |
+
node_2[["Text Embeddings"]]
|
65 |
+
node_3[["Task ID Embedings"]]
|
66 |
+
node_4[["Audio Embeddings"]]
|
67 |
+
node_5[["Last Audio Embeddings"]]
|
68 |
+
node_1 --- node_2
|
69 |
+
node_2 --- node_3
|
70 |
+
node_3 --> node_4
|
71 |
+
node_4 --> node_5
|
72 |
+
|
73 |
+
```
|
74 |
+
|
75 |
+
The forward of LLM for cosy 2.0:
|
76 |
+
```mermaid
|
77 |
+
graph TD
|
78 |
+
A[Input: batch] --> B[Extract tokens and lengths]
|
79 |
+
B --> C1[Prepare LLM Target]
|
80 |
+
B --> C2[Encode Text Tokens]
|
81 |
+
B --> C3[Generate SOS/EOS and Task ID Embeddings]
|
82 |
+
B --> C4[Encode Speech Tokens]
|
83 |
+
|
84 |
+
C1[Prepare LLM Target] --> D1["Create target sequence for each sample<br>[IGNORE_ID, ..., speech_tokens, EOS]"]
|
85 |
+
D1 --> D2[Pad and move target to device]
|
86 |
+
|
87 |
+
C2[Encode Text Tokens] --> E1[Apply text_embedding layer]
|
88 |
+
|
89 |
+
C3[Generate SOS/EOS and Task ID Embeddings] --> F1[Get SOS/EOS embeddings from llm_embedding]
|
90 |
+
C3 --> F2[Get task_id embeddings from llm_embedding]
|
91 |
+
|
92 |
+
C4[Encode Speech Tokens] --> G1[Apply speech_embedding layer]
|
93 |
+
|
94 |
+
E1 --> H[Unpad and pad sequence]
|
95 |
+
F1 --> H
|
96 |
+
F2 --> H
|
97 |
+
G1 --> H
|
98 |
+
|
99 |
+
H --> I1[Generate LM input]
|
100 |
+
H --> I2[Create attention mask]
|
101 |
+
|
102 |
+
I1 --> J[Run LLM forward pass]
|
103 |
+
I2 --> J
|
104 |
+
|
105 |
+
J --> K[Extract hidden states]
|
106 |
+
K --> L[Generate logits through llm_decoder]
|
107 |
+
|
108 |
+
D2 --> M[Compute loss and accuracy]
|
109 |
+
L --> M
|
110 |
+
|
111 |
+
M --> N[Return loss and accuracy]
|
112 |
+
```
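The key detail is the target sequence: only the speech tokens and the final EOS are supervised, and everything before them is masked with IGNORE_ID. A rough sketch, assuming IGNORE_ID is the ignore index of the cross-entropy loss and that the logits are already aligned with the targets (the exact prefix length and shifting depend on the implementation):

```python
import torch
import torch.nn.functional as F

IGNORE_ID = -100  # assumption: the ignore index used by the loss

def build_target(prefix_len, speech_tokens, eos_id):
    # prefix_len covers the SOS + text + task-id positions, which are not supervised.
    prefix = torch.full((prefix_len,), IGNORE_ID, dtype=torch.long)
    return torch.cat([prefix, speech_tokens.long(), torch.tensor([eos_id])])

def lm_loss(logits, target):
    # logits: [T, audio_vocab]; target: [T]
    return F.cross_entropy(logits, target, ignore_index=IGNORE_ID)
```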
|
113 |
+
|
114 |
+
There are some points to note for Cosy 2.0:
|
115 |
+
1. The prompt audio tokens act as the reference audio; the LLM generates audio tokens that mimic the reference voice.
|
116 |
+
2. '<|endofprompt|>' is appended to the prompt text; this special token indicates that the prompt is an instruction.
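For example (a hypothetical illustration based on the instruct usage in the Cosy 2.0 demo scripts; the exact formatting is handled by the CosyVoice frontend):

```python
instruct_text = "用上海话说这句话"        # the instruction
tts_text = "吾今朝早上去外婆家吃饭。"      # the text to synthesize
prompt_text = instruct_text + "<|endofprompt|>"  # '<|endofprompt|>' marks the prompt as an instruction
```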
|
117 |
+
|
118 |
+
### Cosy 2.0 Data Preparation
|
119 |
+
|
120 |
+
1. Download the reference audio files from https://huggingface.co/datasets/yueyulin/TTS_Reference and put them in the folder $REF_AUDIO_DIR. These audio files are used to generate the prompt audio tokens.
|
121 |
+
2. Download the CosyVoice2-0.5B model from https://huggingface.co/FunAudioLLM/CosyVoice2-0.5B and put it in the folder $MODEL_DIR.
|
122 |
+
3. Clone the Cosy 2.0 repo from https://github.com/yynil/CosyVoice and follow its instructions to set up the environment. In this repository, I changed the code to let the user specify the CUDA device for multi-process generation. If you have installed torch 2.6, remember to force-downgrade triton to 3.1.0 (e.g. `pip install triton==3.1.0`).
|
123 |
+
4. Prepare the text data for the audio-token training dataset. Currently we support parquet and jsonl files; the text field is the only required field in the data file. I downloaded the Chinese and English parquet files from [wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia).
|
124 |
+
5. Generate the audio tokens using the following command:
|
125 |
+
```bash
|
126 |
+
bash run_multiple_process.sh --parquet_files /home/yueyulin/data/wiki/zh/train-00000-of-00006.parquet /home/yueyulin/data/wiki/zh/train-00001-of-00006.parquet /home/yueyulin/data/wiki/zh/train-00002-of-00006.parquet /home/yueyulin/data/wiki/zh/train-00003-of-00006.parquet /home/yueyulin/data/wiki/zh/train-00004-of-00006.parquet /home/yueyulin/data/wiki/zh/train-00005-of-00006.parquet --language zh --prompts_dir extract_data/prompts/zh --device cuda:0 --output_dir /home/yueyulin/data/speech_corpus
|
127 |
+
```
|
128 |
+
The prompts_dir is $REF_AUDIO_DIR, and parquet_files is the list of files downloaded from wikimedia; each file is handled by its own process. In my experience, one 4090 can process 6 files at the same time. The output_dir is the directory where the generated audio tokens are saved.
|
129 |
+
|
130 |
+
|
131 |
+
### Cosy 2.0 LLM Training
|
132 |
+
After the data is generated and saved, we get JSONL files like:
|
133 |
+
```json
|
134 |
+
{"text": "甄别重点监测企业是确保监测数据全面性和代表性的基础。首先,需要根据预警机制的覆盖范围和目标,明确监测企业的选择标准。选择标准可以包括企业规模、市场份额、行业影响力等。其次,通过企业调查、行业协会推荐等方式,初步筛选出符合条件的潜在监测企业。", "tts_speech_tokens": [2031, 4137, 6405, 6405, 6405, 6405, 6405, 6324, 6324, 6324, 6324, 6324, 6324, 4218, 1761, 4509, 2333, 4483, 5934, 6258, 1929, 3482, 314, 2300, 957, 5163, 6309, 5064, 6425, 3992, 1932, 80, 305, 734, 1479, 5650, 2472, 4778, 4487, 6175, 5667, 5373, 2187, 4851, 137, 141, 4919, 4407, 2436, 1295, 2024, 1294, 4940, 4778, 2330, 764, 1762, 2031, 1788, 5943, 5319, 5238, 5338, 3872, 1614, 4920, 6055, 6027, 3084, 5343, 4605, 2330, 218, 2172, 572, 1949, 1331, 865, 4921, 2472, 4688, 4379, 5850, 6342, 6373, 2997, 2529, 5087, 623, 3700, 6292, 6291, 5823, 5830, 2102, 1041, 6225, 6316, 3887, 889, 5487, 3813, 1626, 953, 734, 909, 4314, 4804, 4821, 4463, 23, 4683, 4678, 2724, 4832, 992, 1238, 2673, 324, 2099, 2486, 135, 2001, 4537, 5271, 2519, 957, 1699, 953, 1304, 1028, 4752, 2553, 5560, 4154, 1287, 59, 879, 4921, 2499, 5748, 5019, 240, 5889, 6264, 4293, 2186, 2105, 2005, 6405, 6405, 6324, 6324, 6324, 4137, 4218, 3651, 6048, 3132, 1433, 1457, 3962, 4515, 2482, 4490, 4561, 4669, 6054, 6270, 6316, 4615, 4781, 575, 632, 2031, 183, 4598, 4479, 6181, 5496, 4128, 3887, 1943, 1861, 6288, 5343, 6072, 3319, 2733, 322, 1187, 1727, 1807, 4921, 4677, 5668, 5019, 2427, 2976, 6066, 5332, 63, 73, 380, 4239, 6534, 6543, 5101, 1452, 213, 5921, 2273, 6453, 4347, 4537, 4459, 11, 2124, 866, 386, 485, 2511, 333, 632, 4317, 5772, 5803, 1457, 2163, 889, 5021, 2381, 5675, 5056, 5092, 1951, 3888, 3645, 4218, 6405, 6324, 4137, 1884, 1646, 2726, 377, 3992, 5529, 2481, 6054, 3822, 5340, 2330, 71, 2733, 2499, 5012, 4463, 5850, 6342, 6373, 2268, 4851, 137, 151, 4921, 4435, 4650, 528, 1295, 1295, 2023, 2753, 4850, 4570, 2243, 1047, 56, 113, 4512, 5568, 1662, 971, 5, 1480, 6387, 1045, 65, 460, 2160, 5102, 4568, 5056, 5098, 1602, 6048, 4367, 956, 59, 1524, 6405, 6405, 6324, 6324, 6324, 6324, 6324, 4137, 2031, 2706, 5325, 1653, 3887, 2219, 3667, 5664, 803, 4592, 2163, 5587, 4598, 5026, 5089, 1692, 5976, 1937, 146, 41, 1507, 1950, 2031, 0, 2349, 343, 4607, 5019, 566, 1683, 2166, 5051, 5678, 5057, 5830, 573, 2835, 2856, 5099, 707, 947, 1113, 4675, 4408, 4623, 1294, 2024, 2023, 3481, 4778, 2411, 1208, 1302, 660, 5827, 5345, 5074, 4560, 6501, 1403, 635, 716, 680, 5057, 4970, 1947, 3645, 1458, 1707, 6024, 6049, 5238, 5340, 1696, 5244, 1468, 1946, 509, 1318, 6534, 2800, 4510, 2234, 1991, 2017, 2018, 1370, 470, 2891, 4997, 1972, 1701, 5832, 1458, 1950, 4860, 5589, 1946, 1949, 509, 5369, 4966, 5019, 4849, 2411, 314, 1293, 1267, 377, 6421, 4800, 4416, 4893, 8, 1946, 1967, 1584, 4615, 5019, 2510, 867, 63, 245, 533, 1991, 4218, 6405, 6405, 6324, 6324, 6324, 6324, 6324, 4137, 1950, 4920, 4516, 276, 2024, 4777, 4194, 6373, 5643, 4851, 4448, 65, 1517, 1978, 4218, 6405, 4218, 2112, 1350, 4860, 5074, 5772, 6262, 672, 5097, 5090, 221, 1032, 4675, 4408, 285, 1295, 1294, 557, 4490, 228, 276, 4858, 4807, 2870, 1675, 6051, 1539, 4141, 1946, 4133, 6320, 4699, 982, 1950, 5832, 5835, 3645, 1947, 5589, 5589, 4136, 1946, 1235, 4642, 4993, 4857, 4598, 62, 4431, 4675, 285, 1043, 314, 2414, 2760, 2850, 5094, 3158, 1214, 1032, 2997, 2763, 5345, 5100, 402, 4677, 4857, 4543, 5, 1482, 2004, 56, 515, 1970, 2077, 6534, 3488, 5591, 5690, 5869, 5319, 2331, 5342, 1688, 1679, 1735, 4218, 6324, 6324, 6405, 4218, 2031, 5886, 6291, 6480, 2883, 5829, 5826, 2175, 5799, 5826, 2186, 2183, 5940, 5322, 120, 5918, 4571, 4687, 3813, 962, 737, 1561, 5886, 4077, 1429, 5831, 6560, 3644, 6429, 6507, 6534, 2101, 2186, 5097, 2682, 2673, 2017, 
2576, 4594, 1005, 4785, 2760, 854, 1946, 683, 4844, 2733, 4695, 4840, 2192, 1482, 72, 29, 788, 1761, 4921, 4408, 2517, 566, 35, 2192, 5934, 4209, 5652, 4537, 5920, 278, 160, 3462, 4686, 5021, 4490, 5853, 3912, 6374, 2997, 4716, 2567, 140, 3462, 4435, 2436, 1295, 1295, 2023, 3482, 4769, 4598, 89, 1736, 4218, 6405, 6405, 6324, 6324, 4137], "prompt_text": "那么就在两侧的象限同时忙碌。", "llm_prompt_speech_token": [3686, 6324, 4137, 1959, 3666, 4376, 2836, 2127, 578, 2441, 1041, 2337, 6073, 3560, 1369, 5650, 4691, 5192, 2924, 89, 1687, 1539, 4218, 1848, 160, 4760, 2825, 1463, 1946, 1223, 1313, 2067, 5648, 2997, 2268, 2277, 4842, 4763, 308, 1038, 140, 842, 2983, 4672, 4650, 4696, 5995, 5603, 1238, 1238, 4672, 4650, 4777, 2474, 8, 767, 1731, 4299, 2079, 4941, 4947, 665, 719, 4319, 6424, 5067, 5967, 6048, 5967, 5238, 1523, 3875, 3872, 4314, 661, 1946, 1217, 500, 6422, 1506, 4852, 5831, 1457, 1448]}
|
135 |
+
{"text": "Once all the Cabinet and Cabinet-level officers have been invested, the act of their investiture usually ends with a \"family photo\" of the new Administration around the new president and vice-president. For this photo, the new ministers' alignment and proximity to the president is dictated by the order of precedence, with the ministers who head older departments standing in the first row, and the heads of the newer departments standing in the back rows. Some departments, such as the Department of Defence, take precedence from prior departments now abolished.", "tts_speech_tokens": [764, 35, 1896, 4299, 6486, 4299, 4299, 4299, 4218, 651, 2112, 2131, 1403, 2792, 2207, 1725, 5401, 281, 575, 683, 4997, 3474, 4492, 195, 87, 5109, 5846, 6077, 2270, 2172, 3828, 4424, 4543, 1520, 1753, 6258, 4075, 141, 5109, 5845, 3647, 1188, 3987, 3750, 4414, 1516, 4180, 5014, 5348, 1441, 6534, 5075, 5100, 1274, 1301, 3569, 3488, 3996, 6183, 4752, 4919, 2328, 3158, 6071, 5264, 5482, 5403, 5844, 5837, 191, 2139, 1839, 2255, 831, 4508, 4576, 6255, 1857, 29, 2, 2228, 5482, 6459, 2004, 2253, 2267, 2255, 885, 2112, 1788, 5916, 5835, 5919, 5919, 5919, 4056, 4299, 2058, 2982, 1295, 305, 1463, 3647, 2383, 2112, 3054, 4603, 3043, 4272, 2260, 4841, 6029, 6062, 5329, 6256, 6465, 2386, 2921, 2204, 4429, 5647, 2085, 2490, 809, 159, 546, 5325, 5298, 917, 1688, 3863, 3872, 3884, 3481, 3480, 4130, 5993, 5979, 5322, 5257, 5634, 4691, 4533, 5100, 1277, 764, 5111, 5, 47, 3748, 4929, 2376, 3583, 2990, 6456, 2232, 2306, 6507, 6210, 4463, 5840, 2270, 4071, 5693, 4663, 5100, 5226, 6510, 6534, 2900, 2567, 137, 882, 1199, 2831, 632, 389, 4251, 4191, 73, 49, 3831, 404, 971, 4853, 4613, 4074, 4314, 2417, 3750, 4507, 4416, 4594, 3624, 5325, 962, 224, 404, 5295, 4596, 2238, 3670, 3848, 4339, 1676, 812, 2441, 6097, 3934, 2261, 3750, 1564, 3401, 6074, 5823, 1383, 4293, 3816, 3734, 2219, 4450, 5482, 2996, 150, 3063, 143, 3019, 3667, 149, 3748, 4278, 4347, 3485, 5270, 4858, 5239, 2568, 2028, 4050, 3011, 32, 2264, 4672, 2991, 888, 804, 149, 2234, 5934, 1744, 2112, 3975, 5916, 5943, 5919, 5943, 5919, 5946, 5916, 3972, 4299, 6402, 6534, 1927, 140, 1038, 2263, 4567, 4413, 5563, 4672, 3999, 6264, 4826, 2810, 2567, 228, 227, 2324, 2504, 1773, 6375, 77, 3831, 754, 3401, 4612, 6498, 4311, 2411, 831, 2255, 4414, 5320, 4920, 2328, 5345, 5169, 4752, 4763, 5014, 6449, 2687, 3413, 3647, 2276, 3670, 4069, 1883, 2330, 4499, 1525, 1762, 1490, 2921, 1639, 2166, 4050, 4304, 2837, 732, 6049, 5405, 2266, 910, 4315, 2399, 798, 4859, 4857, 1923, 4434, 4485, 5152, 4206, 4447, 1917, 2136, 3807, 3740, 5, 2264, 5166, 5409, 806, 2982, 878, 2258, 860, 1525, 1762, 3320, 5169, 2166, 546, 2994, 4526, 4056, 2112, 60, 2274, 2528, 5084, 231, 4450, 4597, 1938, 2163, 650, 5108, 2335, 4188, 4859, 1760, 2096, 2903, 4349, 1684, 873, 3872, 6059, 6058, 5976, 4299, 2136, 4050, 3740, 2, 4432, 6455, 2226, 886, 3063, 881, 71, 2234, 5937, 5650, 5238, 4296, 1422, 2342, 2139, 3462, 2261, 1641, 4314, 230, 186, 2965, 4523, 4509, 4999, 4839, 5345, 6070, 5263, 4839, 3813, 3018, 5825, 2926, 5106, 2924, 194, 147, 1433, 728, 2915, 477, 2325, 5330, 6070, 1527, 2421, 2166, 3564, 6166, 1865, 1676, 2092, 4068, 2255, 1483, 5658, 5726, 2085, 3219, 71, 35, 2219, 3828, 2210, 5047, 6100, 4526, 2934, 3909, 4511, 6453, 6534, 3367, 3863, 3146, 5241, 5323, 6054, 1872, 3881, 947, 380, 632, 2909, 2884, 4296, 5913, 5835, 5919, 5919, 5919, 5838, 3975, 2112, 3648, 2192, 831, 3906, 2222, 5118, 5111, 4487, 879, 5650, 4422, 5256, 6465, 4446, 4522, 3831, 2294, 5588, 5825, 3377, 6050, 1698, 147, 1920, 1404, 
6328, 1622, 1676, 2083, 2124, 2336, 3669, 5402, 4269, 2490, 71, 8, 113, 1563, 395, 4238, 2510, 3016, 3936, 4430, 2163, 461, 5192, 5998, 5272, 1869, 651, 4302, 1685, 221, 380, 389, 803, 5412, 4753, 2244, 2028, 3648, 3729, 5916, 5919, 5916, 3732, 3975, 2112, 3894, 5239, 5648, 2250, 2918, 4807, 6258, 879, 4600, 2166, 3483, 6327, 6239, 1652, 1757, 1881, 128, 2264, 5935, 5631, 5729, 5482, 2198, 2309, 1329, 4756, 2263, 4448, 4437, 6454, 4272, 3465, 157, 66, 954, 2166, 5598, 3980, 3836, 1838, 2064, 4069, 2371, 2938, 4565, 4356, 789, 4612, 5940, 6510, 3270, 5, 737, 8, 2234, 3747, 5650, 5482, 4269, 303, 2193, 2447, 4849, 2112, 2085, 4050, 3739, 2192, 4428, 5486, 2253, 885, 2992, 2249, 5205, 3453, 4672, 6186, 6534, 6059, 4068, 2184, 4320, 3978, 4052, 1622, 926, 3140, 231, 157, 2160, 1404, 6084, 3809, 1598, 2092, 6255, 2234, 3750, 5405, 3459, 3669, 23, 1463, 974, 2675, 2891, 2166, 712, 5030, 5023, 5080, 2741, 308, 32, 2203, 5217, 4593, 1437, 303, 2112, 3975], "prompt_text": " So I am gonna do this right now. So let's do it.", "llm_prompt_speech_token": [1822, 5727, 5000, 930, 5015, 2912, 3616, 692, 1250, 1978, 4214, 3485, 2036, 1298, 2918, 5192, 5056, 5074, 5065, 4813, 3005, 3002, 3313, 4238, 795, 4523, 4520, 3038, 4496, 859, 1887, 2490, 3309, 6235, 5264, 6074, 6047, 5339, 5474, 4291, 2915, 2666, 3759, 4056, 4299, 3975, 6159, 6186, 6186, 6186, 5838, 5109, 3732, 2112, 2139, 3945, 4534, 4569, 4575, 6453, 5405, 4461, 4338, 5572, 3809, 2411, 1214, 1205, 3805, 4526, 4379, 2189, 3890, 3242, 1418, 2876, 5828, 2799, 5133, 5563, 5481, 2325, 155, 533, 2801, 3617, 725, 56, 4385, 834, 3444, 5482, 3273, 2166, 2328, 1908, 1372, 868]}
|
136 |
+
```
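Each line is one training sample. A minimal sketch of how such a record is consumed (the file name is hypothetical; this mirrors how data/utils/llm_dataset.py concatenates prompt and target tokens):

```python
import json

with open("train-00000-of-00006_tokens.jsonl", "r", encoding="utf-8") as f:  # hypothetical file name
    for line in f:
        sample = json.loads(line)
        # Prompt text/tokens come first, followed by the target text/tokens.
        text = sample["prompt_text"] + sample["text"]
        speech = sample["llm_prompt_speech_token"] + sample["tts_speech_tokens"]
        # `text` is tokenized with the RWKV tokenizer; `speech` becomes the LLM's audio targets.
```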
|
137 |
+
|
138 |
+
We use DeepSpeed to train the model:
|
139 |
+
```bash
|
140 |
+
deepspeed --num_nodes 1 --num_gpus 4 train_scripts/train_llm.py --data_file /external_data/yueyudata/speech_corpus/ --model_name /external_data/models/rwkv7-1.5B-world/ --output_dir /external_data/yueyudata/cosy_voice_llm --max_length 2048 --wandb_project toy_cosy_llm --wandb_run_name server2_rwkv_7_1.5B --ds_param_offload True --ds_optimizer_offload True --ds_stage 2 --gradient_checkpointing True --logging_steps 10 --per_device_train_batch_size 8
|
141 |
+
```
|
142 |
+
The base model can be downloaded from https://huggingface.co/collections/fla-hub/rwkv7-6790fd37b4b6137b088a0d8a; just choose a model size that suits your training.
|
143 |
+
|
144 |
+
|
145 |
+
### Cosy 2.0 LLM Inference
|
146 |
+
|
147 |
+
### Some samples
|
148 |
+
|
149 |
+
#### Zero shot inference
|
150 |
+
prompt audio:
|
151 |
+
[prompt audio](mine.wav)
|
152 |
+
|
153 |
+
prompt text: "今天天气挺不错的。"
|
154 |
+
|
155 |
+
tts text: "收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。"
|
156 |
+
|
157 |
+
tts audio:
|
158 |
+
[tts audio](zero_shot_0.wav)
|
159 |
+
|
160 |
+
|
161 |
+
|
162 |
+
### TODO:
|
163 |
+
0. Randomly drop the prompt audio tokens during training to simulate unconditional generation (classifier-free guidance); see the sketch after this list.
|
164 |
+
1. Add the special control tokens used by Cosy 2.0 to the RWKV tokenizer and regenerate the audio tokens with them:
|
165 |
+
```python
|
166 |
+
special_tokens = {
|
167 |
+
'eos_token': '<|endoftext|>',
|
168 |
+
'pad_token': '<|endoftext|>',
|
169 |
+
'additional_special_tokens': [
|
170 |
+
'<|im_start|>', '<|im_end|>', '<|endofprompt|>',
|
171 |
+
'[breath]', '<strong>', '</strong>', '[noise]',
|
172 |
+
'[laughter]', '[cough]', '[clucking]', '[accent]',
|
173 |
+
'[quick_breath]',
|
174 |
+
"<laughter>", "</laughter>",
|
175 |
+
"[hissing]", "[sigh]", "[vocalized-noise]",
|
176 |
+
"[lipsmack]", "[mn]"
|
177 |
+
]
|
178 |
+
}
|
179 |
+
```
|
180 |
+
2. Add special control tokens such as dialect tags to RWKV7LM and generate audio tokens for training.
|
181 |
+
3. Implement streaming generation for Cosy 2.0 in RWKV7LM.
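For TODO item 0, a sketch of the intended prompt dropout during batching (the rate is only an example; it mirrors the `drop_prompt_audio_rate` idea already present in data/utils/llm_dataset.py):

```python
import random

def maybe_drop_prompt(sample, drop_rate=0.1):
    if random.random() < drop_rate:
        # Unconditional case: keep only the target text and target speech tokens.
        return sample["text"], sample["tts_speech_tokens"]
    # Conditional case: prepend the prompt text and prompt speech tokens.
    return (sample["prompt_text"] + sample["text"],
            sample["llm_prompt_speech_token"] + sample["tts_speech_tokens"])
```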
|
Trump.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:296432bb06954080b77c04a88841d61928d936077f5162947359520fa17836be
|
3 |
+
size 342108
|
_config.yml
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
markdown: kramdown
|
2 |
+
kramdown:
|
3 |
+
parse_block_html: true
|
another.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d4d103efaf538db967559861dbcf9995b60eca582360a6add5cf27c3faf3a49e
|
3 |
+
size 199724
|
badXT_71.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1c5e28420eb8c4506a1988d484fe9270b8422161d733c567abfccd74c106ceb9
|
3 |
+
size 794726
|
data/cosy/data/data_processor.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import torchaudio
|
3 |
+
from hyperpyyaml import load_hyperpyyaml
|
4 |
+
import os
|
5 |
+
from cosyvoice.cli.frontend import CosyVoiceFrontEnd
|
6 |
+
from cosyvoice.cli.cosyvoice import CosyVoice2
|
7 |
+
import json
|
8 |
+
import torch
|
9 |
+
|
10 |
+
def load_from_configuration(model_dir):
|
11 |
+
with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
|
12 |
+
configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': os.path.join(model_dir, 'CosyVoice-BlankEN')})
|
13 |
+
return configs
|
14 |
+
def init_process(model_dir,device):
|
15 |
+
cosyvoice = CosyVoice2(model_dir, load_jit=False, load_trt=False, fp16=True,device=device)
|
16 |
+
# configs = load_from_configuration(model_dir)
|
17 |
+
# frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
|
18 |
+
# configs['feat_extractor'],
|
19 |
+
# '{}/campplus.onnx'.format(model_dir),
|
20 |
+
# '{}/speech_tokenizer_v2.onnx'.format(model_dir),
|
21 |
+
# '{}/spk2info.pt'.format(model_dir),
|
22 |
+
# configs['allowed_special'],
|
23 |
+
# device)
|
24 |
+
frontend = cosyvoice.frontend
|
25 |
+
llm = cosyvoice.model.llm
|
26 |
+
return frontend,llm,cosyvoice
|
27 |
+
|
28 |
+
|
29 |
+
def preprocess_prompts(frontend,prompts_dir):
|
30 |
+
language_results = {}
|
31 |
+
final_rate = 24000
|
32 |
+
for root, dirs, files in os.walk(prompts_dir):
|
33 |
+
for file in files:
|
34 |
+
if file.endswith('.json'):
|
35 |
+
json_file = os.path.join(root, file)
|
36 |
+
print(f"处理文件 {json_file}")
|
37 |
+
language = json_file.split('/')[-2]
|
38 |
+
if language not in language_results:
|
39 |
+
language_results[language] = []
|
40 |
+
|
41 |
+
# 尝试不同的编码格式读取文件
|
42 |
+
try:
|
43 |
+
with open(json_file, 'r', encoding='utf-8') as f:
|
44 |
+
json_data = json.load(f)
|
45 |
+
except UnicodeDecodeError:
|
46 |
+
try:
|
47 |
+
# 尝试 GB2312/GBK 编码 (常用于中文)
|
48 |
+
with open(json_file, 'r', encoding='gbk') as f:
|
49 |
+
json_data = json.load(f)
|
50 |
+
except UnicodeDecodeError:
|
51 |
+
try:
|
52 |
+
# 尝试 GB18030 编码 (扩展的中文编码)
|
53 |
+
with open(json_file, 'r', encoding='gb18030') as f:
|
54 |
+
json_data = json.load(f)
|
55 |
+
except Exception as e:
|
56 |
+
print(f"无法读取文件 {json_file}: {e}")
|
57 |
+
continue
|
58 |
+
|
59 |
+
wav_file = json_file.replace('.json', '.wav')
|
60 |
+
prompt_text = json_data['text']
|
61 |
+
prompt_speech = torchaudio.load(wav_file, backend='soundfile')[0]
|
62 |
+
fake_tts_text = "a"
|
63 |
+
with torch.no_grad():
|
64 |
+
model_input = frontend.frontend_zero_shot(fake_tts_text, prompt_text, prompt_speech,final_rate)
|
65 |
+
language_results[language].append((model_input,prompt_text))
|
66 |
+
return language_results
|
67 |
+
|
68 |
+
def generate_speech_tokens(llm,frontend,tts_text,model_input,device):
|
69 |
+
tts_text = frontend.text_normalize(tts_text,split=False, text_frontend=True)
|
70 |
+
tts_text_token, tts_text_token_len = frontend._extract_text_token(tts_text)
|
71 |
+
tts_text_token_len = torch.tensor([tts_text_token.shape[1]], dtype=torch.int32).to(device)
|
72 |
+
prompt_text = model_input['prompt_text'].to(device)
|
73 |
+
prompt_text_len = torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(device)
|
74 |
+
llm_prompt_speech_token = model_input['llm_prompt_speech_token'].to(device)
|
75 |
+
prompt_speech_token_len = torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(device)
|
76 |
+
flow_prompt_speech_token = model_input['flow_prompt_speech_token'].to(device)
|
77 |
+
prompt_speech_feat = model_input['prompt_speech_feat'].to(device)
|
78 |
+
llm_embedding = model_input['llm_embedding'].to(device)
|
79 |
+
flow_embedding = model_input['flow_embedding'].to(device)
|
80 |
+
speech_tokens = []
|
81 |
+
for i in llm.inference(text = tts_text_token,
|
82 |
+
text_len = tts_text_token_len,
|
83 |
+
prompt_text = prompt_text,
|
84 |
+
prompt_text_len = prompt_text_len,
|
85 |
+
prompt_speech_token = llm_prompt_speech_token,
|
86 |
+
prompt_speech_token_len = prompt_speech_token_len,
|
87 |
+
embedding=llm_embedding
|
88 |
+
):
|
89 |
+
speech_tokens.append(i)
|
90 |
+
tts_speech_tokens = torch.tensor(speech_tokens).unsqueeze(dim=0).to(device)
|
91 |
+
return tts_speech_tokens
|
92 |
+
|
93 |
+
if __name__ == '__main__':
|
94 |
+
model_dir = '/data/yueyu/models/CosyVoice2-0.5B'
|
95 |
+
prompts_dir = 'extract_data/prompts'
|
96 |
+
|
97 |
+
device = 'cuda:0'
|
98 |
+
frontend,llm,cosyvoice = init_process(model_dir
|
99 |
+
,device)
|
100 |
+
prompts = preprocess_prompts(frontend,prompts_dir)
|
101 |
+
print(prompts)
|
102 |
+
model_input = prompts['zh'][0][0]
|
103 |
+
prompt_text = prompts['zh'][0][1]
|
104 |
+
tts_text = '扫一扫,立即体验中国银行信用卡好礼、绑卡立减等热门活动,实时掌握更多优惠信息。'
|
105 |
+
tts_text = '在中国的一个偏远山区,有一位名叫李远的年轻人,他对集群通信系统有着浓厚的兴趣。每天晚上,他都会在自己的小屋里研究各种关于集群通信系统的资料,试图弄懂其中的原理和运作机制。他对这个领域的研究不仅仅停留在理论层面,还亲手制作了一些模型,试图通过实践来加深理解。'
|
106 |
+
tts_text = "歷史(现代汉语词汇,古典文言文称之为史),指人类社会过去的事件和行动,以及对这些事件行为有系统的记录、诠释和研究。歷史可提供今人理解過去,作為未來行事的參考依據,与伦理、哲学和艺术同属人类精神文明的重要成果。历史的第二个含义,即对过去事件的记录和研究,又称历史学”,或简称“史学”。隶属于历史学或与其密切相关的学科有年代学、编纂学、家谱学、古文字学、计量历史学、考古学、社会学和新闻学等,参见历史学。记录和研究历史的人称为历史学家,简称“史学家”,中国古代称为史官。记录历史的书籍称为史书,如《史記》、《汉书》等,粗分為「官修」與「民載」兩類。"
|
107 |
+
tts_text = "### 如何提高花样游泳水平"
|
108 |
+
tts_speech_tokens = generate_speech_tokens(llm,frontend,tts_text,model_input,device)
|
109 |
+
print(tts_speech_tokens)
|
110 |
+
|
111 |
+
|
112 |
+
flow_prompt_speech_token = model_input['flow_prompt_speech_token'].to(device)
|
113 |
+
prompt_speech_feat = model_input['prompt_speech_feat'].to(device)
|
114 |
+
llm_embedding = model_input['llm_embedding'].to(device)
|
115 |
+
flow_embedding = model_input['flow_embedding'].to(device)
|
116 |
+
cosyvoice.model.hift_cache_dict['xxxx'] = None
|
117 |
+
tts_speech = cosyvoice.model.token2wav(token=tts_speech_tokens,
|
118 |
+
prompt_token=flow_prompt_speech_token,
|
119 |
+
prompt_feat=prompt_speech_feat,
|
120 |
+
embedding=flow_embedding,
|
121 |
+
uuid='xxxx',
|
122 |
+
token_offset=0,
|
123 |
+
finalize=True,
|
124 |
+
speed=1.0)
|
125 |
+
print(f'tts_speech shape:{tts_speech.shape}')
|
126 |
+
tts_speech = tts_speech.cpu()
|
127 |
+
torchaudio.save('zh_tts_S.wav', tts_speech, 24000)
|
128 |
+
print(model_input)
|
data/cosy/test/test_vq.py
ADDED
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from cosyvoice.cli.cosyvoice import CosyVoice2
|
5 |
+
print(torch.cuda.is_available())
|
6 |
+
print(torch.cuda.current_device())
|
7 |
+
print(torch.cuda.device(0))
|
8 |
+
print(torch.cuda.device_count())
|
9 |
+
model_path = '/data/yueyu/models/CosyVoice2-0.5B'
|
10 |
+
# cosyvoice = CosyVoice2(model_path, load_jit=False, load_trt=False, fp16=False)
|
11 |
+
# print(cosyvoice)
|
12 |
+
# from cosyvoice.utils.file_utils import load_wav
|
13 |
+
# import torchaudio
|
14 |
+
# prompt_speech_16k = load_wav('/home/yueyulin/github/CosyVoice/asset/zero_shot_prompt.wav', 16000)
|
15 |
+
# # prompt_speech_16k = torch.rand((1, 16000))
|
16 |
+
# for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
|
17 |
+
# torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
18 |
+
|
19 |
+
# for i, j in enumerate(cosyvoice.inference_cross_lingual('在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。', prompt_speech_16k, stream=False)):
|
20 |
+
# torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
21 |
+
# # instruct usage
|
22 |
+
# for i, j in enumerate(cosyvoice.inference_instruct2('吾今朝早上去外婆家吃饭。', '用上海话说这句话', prompt_speech_16k, stream=False)):
|
23 |
+
# torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
24 |
+
|
25 |
+
from hyperpyyaml import load_hyperpyyaml
|
26 |
+
import os
|
27 |
+
def load_from_configuration(model_dir):
|
28 |
+
with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
|
29 |
+
configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': os.path.join(model_dir, 'CosyVoice-BlankEN')})
|
30 |
+
return configs
|
31 |
+
|
32 |
+
configs = load_from_configuration(model_path)
|
33 |
+
print(configs)
|
34 |
+
|
35 |
+
import torchaudio
|
36 |
+
def load_wav(wav, target_sr):
|
37 |
+
speech, sample_rate = torchaudio.load(wav, backend='soundfile')
|
38 |
+
speech = speech.mean(dim=0, keepdim=True)
|
39 |
+
if sample_rate != target_sr:
|
40 |
+
assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
|
41 |
+
speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
|
42 |
+
return speech
|
43 |
+
|
44 |
+
zh_prompt_tar_file="/data/yueyu/data/Emilia-Dataset/Emilia/ZH/ZH-B000000.tar"
|
45 |
+
en_prompt_tar_file="/data/yueyu/data/Emilia-Dataset/Emilia/EN/EN-B000000.tar"
|
46 |
+
|
47 |
+
|
48 |
+
def load_file_list(tar_file):
|
49 |
+
#the files are FILE_NAME.mp3/FILE_NAME.json
|
50 |
+
#return all FILE_NAME as a list which has a mp3 and json
|
51 |
+
import tarfile
|
52 |
+
with tarfile.open(tar_file, 'r') as f:
|
53 |
+
file_names = f.getnames()
|
54 |
+
mp3_files = [i for i in file_names if i.endswith('.mp3')]
|
55 |
+
json_files = [i for i in file_names if i.endswith('.json')]
|
56 |
+
|
57 |
+
#filter mp3_files without corresponded json
|
58 |
+
mp3_files = [i for i in mp3_files if i.replace('.mp3', '.json') in json_files]
|
59 |
+
return mp3_files
|
60 |
+
|
61 |
+
zh_files = load_file_list(zh_prompt_tar_file)
|
62 |
+
print(zh_files[:10])
|
63 |
+
en_files = load_file_list(en_prompt_tar_file)
|
64 |
+
print(en_files[:10])
|
65 |
+
import io
|
66 |
+
|
67 |
+
def load_random_samples_from_tar(tar_file, files, num_samples,target_sr,max_duration=10):
|
68 |
+
import random
|
69 |
+
import tarfile
|
70 |
+
import json
|
71 |
+
samples = []
|
72 |
+
with tarfile.open(tar_file, 'r') as f:
|
73 |
+
for i in random.sample(files, len(files)):
|
74 |
+
mp3 = f.extractfile(i)
|
75 |
+
mp3_bytes = io.BytesIO(mp3.read())
|
76 |
+
speech, sample_rate = torchaudio.load(mp3_bytes,backend='soundfile')
|
77 |
+
json_file = f.extractfile(i.replace('.mp3', '.json'))
|
78 |
+
json_data = json.load(json_file)
|
79 |
+
duration = json_data['duration']
|
80 |
+
if duration > max_duration:
|
81 |
+
continue
|
82 |
+
speech = speech.mean(dim=0, keepdim=True)
|
83 |
+
if sample_rate != target_sr:
|
84 |
+
assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
|
85 |
+
speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
|
86 |
+
samples.append((speech, json_data,sample_rate))
|
87 |
+
if len(samples) == num_samples:
|
88 |
+
break
|
89 |
+
return samples
|
90 |
+
target_sr = 16000
|
91 |
+
zh_samples = load_random_samples_from_tar(zh_prompt_tar_file, zh_files, 10, target_sr)
|
92 |
+
|
93 |
+
one_sample,one_json,sample_rate = zh_samples[0]
|
94 |
+
print(one_json)
|
95 |
+
print(sample_rate)
|
96 |
+
torchaudio.save('zh_sample.wav', one_sample, target_sr)
|
97 |
+
print(len(zh_samples))
|
98 |
+
|
99 |
+
en_samples = load_random_samples_from_tar(en_prompt_tar_file, en_files, 10, target_sr)
|
100 |
+
one_sample,one_json,sample_rate = en_samples[0]
|
101 |
+
print(one_json)
|
102 |
+
print(sample_rate)
|
103 |
+
torchaudio.save('en_sample.wav', one_sample, target_sr)
|
104 |
+
print(len(en_samples))
|
105 |
+
|
106 |
+
def resample_audio(samples, target_sr):
|
107 |
+
resampled_samples = []
|
108 |
+
for i in samples:
|
109 |
+
speech, sample_rate = i
|
110 |
+
if sample_rate != target_sr:
|
111 |
+
assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
|
112 |
+
speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
|
113 |
+
resampled_samples.append((speech, sample_rate))
|
114 |
+
return resampled_samples
|
115 |
+
|
116 |
+
prompt_text = zh_samples[0][1]['text']
|
117 |
+
prompt_speech = zh_samples[0][0]
|
118 |
+
print(prompt_text)
|
119 |
+
print(prompt_speech)
|
120 |
+
from cosyvoice.cli.cosyvoice import CosyVoice2
|
121 |
+
cosyvoice = CosyVoice2(model_path, load_jit=False, load_trt=False, fp16=True)
|
122 |
+
from cosyvoice.cli.frontend import CosyVoiceFrontEnd
|
123 |
+
frontend = cosyvoice.frontend
|
124 |
+
prompt_text = frontend.text_normalize(prompt_text,split=False, text_frontend=True)
|
125 |
+
print(f'normalized prompt_text:{prompt_text}')
|
126 |
+
tts_text = '扫一扫,立即体验中国银行信用卡好礼、绑卡立减等热门活动,实时掌握更多优惠信息。'
|
127 |
+
tts_text = "在中国的一个偏远山区,有一位名叫李远的年轻人,他对集群通信系统有着浓厚的兴趣。每天晚上,他都会在自己的小屋里研究各种关于集群通信系统的资料,试图弄懂其中的原理和运作机制。他对这个领域的研究不仅仅停留在理论层面,还亲手制作了一些模型,试图通过实践来加深理解。"
|
128 |
+
tts_text = "歷史(现代汉语词汇,古典文言文称之为史),指人类社会过去的事件和行动,以及对这些事件行为有系统的记录、诠释和研究。歷史可提供今人理解過去,作為未來行事的參考依據,与伦理、哲学和艺术同属人类精神文明的重要成果。历史的第二个含义,即对过去事件的记录和研究,又称历史学”,或简称“史学”。隶属于历史学或与其密切相关的学科有年代学、编纂学、家谱学、古文字学、计量历史学、考古学、社会学和新闻学等,参见历史学。记录和研究历史的人称为历史学家,简称“史学家”,中国古代称为史官。记录历史的书籍称为史书,如《史記》、《汉书》等,粗分為「官修」與「民載」兩類。"
|
129 |
+
tts_text = frontend.text_normalize(tts_text,split=False, text_frontend=True)
|
130 |
+
print(f'normalized tts_text:{tts_text}')
|
131 |
+
final_rate = 24000
|
132 |
+
model_input = frontend.frontend_zero_shot(tts_text, prompt_text, prompt_speech,final_rate)
|
133 |
+
print(model_input)
|
134 |
+
llm = cosyvoice.model.llm
|
135 |
+
device = cosyvoice.model.device
|
136 |
+
text = model_input['text'].to(device)
|
137 |
+
text_len = torch.tensor([text.shape[1]], dtype=torch.int32).to(device)
|
138 |
+
prompt_text = model_input['prompt_text'].to(device)
|
139 |
+
prompt_text_len = torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(device)
|
140 |
+
llm_prompt_speech_token = model_input['llm_prompt_speech_token'].to(device)
|
141 |
+
prompt_speech_token_len = torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(device)
|
142 |
+
flow_prompt_speech_token = model_input['flow_prompt_speech_token'].to(device)
|
143 |
+
prompt_speech_feat = model_input['prompt_speech_feat'].to(device)
|
144 |
+
llm_embedding = model_input['llm_embedding'].to(device)
|
145 |
+
flow_embedding = model_input['flow_embedding'].to(device)
|
146 |
+
speech_tokens = []
|
147 |
+
for i in llm.inference(text = text,
|
148 |
+
text_len = text_len,
|
149 |
+
prompt_text = prompt_text,
|
150 |
+
prompt_text_len = prompt_text_len,
|
151 |
+
prompt_speech_token = llm_prompt_speech_token,
|
152 |
+
prompt_speech_token_len = prompt_speech_token_len,
|
153 |
+
embedding=llm_embedding
|
154 |
+
):
|
155 |
+
speech_tokens.append(i)
|
156 |
+
print(speech_tokens)
|
157 |
+
|
158 |
+
tts_speech_tokens = torch.tensor(speech_tokens).unsqueeze(dim=0).to(device)
|
159 |
+
print(f'tts_speech_tokens shape:{tts_speech_tokens.shape}')
|
160 |
+
cosyvoice.model.hift_cache_dict['xxxx'] = None
|
161 |
+
tts_speech = cosyvoice.model.token2wav(token=tts_speech_tokens,
|
162 |
+
prompt_token=flow_prompt_speech_token,
|
163 |
+
prompt_feat=prompt_speech_feat,
|
164 |
+
embedding=flow_embedding,
|
165 |
+
uuid='xxxx',
|
166 |
+
token_offset=0,
|
167 |
+
finalize=True,
|
168 |
+
speed=1.0)
|
169 |
+
print(f'tts_speech shape:{tts_speech.shape}')
|
170 |
+
tts_speech = tts_speech.cpu()
|
171 |
+
torchaudio.save('zh_tts.wav', tts_speech, final_rate)
|
data/utils/convert_embeddings_2_pt.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
import sys
|
4 |
+
import os
|
5 |
+
import json
|
6 |
+
from sklearn.cluster import KMeans
|
7 |
+
jsonl_dir = sys.argv[1]
|
8 |
+
output_file_name = sys.argv[2]
|
9 |
+
|
10 |
+
# Load the embeddings from jsonl files the key is the name of the file
|
11 |
+
embeddings = {}
|
12 |
+
for file in os.listdir(jsonl_dir):
|
13 |
+
print("Processing", file)
|
14 |
+
if file.endswith("_embeddings.json"):
|
15 |
+
with open(os.path.join(jsonl_dir, file), "r") as f:
|
16 |
+
print("Loading", file)
|
17 |
+
data = json.load(f)
|
18 |
+
key_name = os.path.basename(file).replace("_embeddings.json", "")
|
19 |
+
np_array = np.array(data)
|
20 |
+
if np_array.shape[0] == 1:
|
21 |
+
np_array = np_array[0]
|
22 |
+
else:
|
23 |
+
#find the cluster center of the embeddings using kmeans
|
24 |
+
kmeans = KMeans(n_clusters=1, random_state=0, n_init = 'auto').fit(np_array)
|
25 |
+
np_array = kmeans.cluster_centers_[0]
|
26 |
+
|
27 |
+
embeddings[key_name]= {'embedding' : torch.tensor(np_array, dtype=torch.float32).unsqueeze(0)}
|
28 |
+
torch.save(embeddings, output_file_name)
|
29 |
+
print("Embeddings saved to", output_file_name)
|
30 |
+
|
31 |
+
state_dict = torch.load(output_file_name)
|
32 |
+
print("Loaded embeddings from", output_file_name)
|
33 |
+
for key in state_dict:
|
34 |
+
print(key, state_dict[key]['embedding'].shape)
|
data/utils/create_embeddings_from_raw.py
ADDED
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import whisper
|
4 |
+
from librosa import resample
|
5 |
+
import multiprocessing
|
6 |
+
from tqdm import tqdm
|
7 |
+
import onnxruntime
|
8 |
+
from onnxruntime import InferenceSession
|
9 |
+
import torch
|
10 |
+
import pyarrow.parquet as pq
|
11 |
+
import numpy as np
|
12 |
+
import json
|
13 |
+
import io
|
14 |
+
import soundfile as sf
|
15 |
+
import torchaudio
|
16 |
+
import torchaudio.compliance.kaldi as kaldi
|
17 |
+
import mmap
|
18 |
+
import os
|
19 |
+
import pyarrow.parquet as pq
|
20 |
+
import io
|
21 |
+
import soundfile as sf
|
22 |
+
import torchaudio.compliance.kaldi as kaldi
|
23 |
+
import torch
|
24 |
+
import numpy as np
|
25 |
+
import onnxruntime
|
26 |
+
|
27 |
+
def process_file(file_info):
|
28 |
+
"""处理单个parquet文件的函数,每个进程调用一次"""
|
29 |
+
parquet_file, output_path, speaker_extractor, device = file_info
|
30 |
+
|
31 |
+
# 为每个进程创建独立的speech_tokenizer_session
|
32 |
+
option = onnxruntime.SessionOptions()
|
33 |
+
option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
|
34 |
+
option.intra_op_num_threads = 1
|
35 |
+
ort_session = onnxruntime.InferenceSession(speaker_extractor, sess_options=option,
|
36 |
+
providers=["CPUExecutionProvider"])
|
37 |
+
results = {}
|
38 |
+
try:
|
39 |
+
# 创建目标文件名
|
40 |
+
base_filename = os.path.splitext(os.path.basename(parquet_file))[0]
|
41 |
+
output_file = os.path.join(output_path, f"{base_filename}_tokens.jsonl")
|
42 |
+
|
43 |
+
# 使用PyArrow读取parquet文件的元数据,获取总行数
|
44 |
+
parquet_metadata = pq.read_metadata(parquet_file)
|
45 |
+
total_rows = parquet_metadata.num_rows
|
46 |
+
batch_size = 100
|
47 |
+
|
48 |
+
# 使用 mmap 读取 parquet 文件
|
49 |
+
with open(parquet_file, 'rb') as f:
|
50 |
+
mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
|
51 |
+
|
52 |
+
# 使用 io.BytesIO 将 mmap 对象包装成文件对象
|
53 |
+
buffer = io.BytesIO(mm)
|
54 |
+
|
55 |
+
pf = pq.ParquetFile(buffer) # 使用 mmap 包装的 buffer
|
56 |
+
|
57 |
+
progress = tqdm(total=total_rows,
|
58 |
+
desc=f"Processing {os.path.basename(parquet_file)}",
|
59 |
+
position=multiprocessing.current_process()._identity[0] % 10)
|
60 |
+
|
61 |
+
current_row = 0
|
62 |
+
idx = 0
|
63 |
+
for batch in pf.iter_batches(batch_size=batch_size):
|
64 |
+
df_batch = batch.to_pandas()
|
65 |
+
|
66 |
+
# 处理当前批次中的每一行
|
67 |
+
for _, row in df_batch.iterrows():
|
68 |
+
current_row += 1
|
69 |
+
audio_obj = row['audio']
|
70 |
+
audio_data = audio_obj['bytes']
|
71 |
+
transcription = row['transcription']
|
72 |
+
language = row['language']
|
73 |
+
speaker = row['speaker']
|
74 |
+
if speaker not in results:
|
75 |
+
results[speaker] = {}
|
76 |
+
if language not in results[speaker]:
|
77 |
+
results[speaker][language] = []
|
78 |
+
if len(results[speaker][language]) >= 10:
|
79 |
+
progress.update(1)
|
80 |
+
continue
|
81 |
+
|
82 |
+
with io.BytesIO(audio_data) as audio_buffer:
|
83 |
+
prompt_data, sample_rate = sf.read(audio_buffer)
|
84 |
+
# 确保是单声道,并转换为float32
|
85 |
+
if len(prompt_data.shape) > 1:
|
86 |
+
prompt_data = prompt_data[:, 0]
|
87 |
+
prompt_data = prompt_data.astype(np.float32)
|
88 |
+
|
89 |
+
# 重采样到16kHz (如果需要)
|
90 |
+
if sample_rate != 16000:
|
91 |
+
prompt_data = resample(prompt_data, orig_sr=sample_rate, target_sr=16000)
|
92 |
+
|
93 |
+
prompt_speech_16k = torch.tensor(prompt_data).unsqueeze(0)
|
94 |
+
|
95 |
+
feat = kaldi.fbank(prompt_speech_16k,
|
96 |
+
num_mel_bins=80,
|
97 |
+
dither=0,
|
98 |
+
sample_frequency=16000)
|
99 |
+
feat = feat - feat.mean(dim=0,keepdim=True)
|
100 |
+
embedding = ort_session.run(None, {ort_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
|
101 |
+
|
102 |
+
results[speaker][language].append(embedding)
|
103 |
+
|
104 |
+
progress.update(1)
|
105 |
+
|
106 |
+
# 关闭 mmap 对象
|
107 |
+
mm.close()
|
108 |
+
|
109 |
+
|
110 |
+
|
111 |
+
print(f'All speakers {results.keys()}')
|
112 |
+
for speaker in results:
|
113 |
+
print(f'{speaker} : All languages {results[speaker].keys()} in {os.getpid()}')
|
114 |
+
return results
|
115 |
+
except Exception as e:
|
116 |
+
import traceback
|
117 |
+
traceback.print_exc()
|
118 |
+
return f"Error processing {parquet_file}: {str(e)}"
|
119 |
+
def process_file_x(file_info):
|
120 |
+
"""处理单个parquet文件的函数,每个进程调用一次"""
|
121 |
+
parquet_file, output_path, speaker_extractor, device = file_info
|
122 |
+
|
123 |
+
# 为每个进程创���独立的speech_tokenizer_session
|
124 |
+
option = onnxruntime.SessionOptions()
|
125 |
+
option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
|
126 |
+
option.intra_op_num_threads = 1
|
127 |
+
ort_session = InferenceSession(speaker_extractor, sess_options=option,
|
128 |
+
providers=["CPUExecutionProvider"])
|
129 |
+
results = {}
|
130 |
+
try:
|
131 |
+
# 创建目标文件名
|
132 |
+
base_filename = os.path.splitext(os.path.basename(parquet_file))[0]
|
133 |
+
output_file = os.path.join(output_path, f"{base_filename}_tokens.jsonl")
|
134 |
+
|
135 |
+
# 使用PyArrow读取parquet文件的元数据,获取总行数
|
136 |
+
parquet_metadata = pq.read_metadata(parquet_file)
|
137 |
+
total_rows = parquet_metadata.num_rows
|
138 |
+
batch_size = 100
|
139 |
+
|
140 |
+
pf = pq.ParquetFile(parquet_file)
|
141 |
+
|
142 |
+
progress = tqdm(total=total_rows,
|
143 |
+
desc=f"Processing {os.path.basename(parquet_file)}",
|
144 |
+
position=multiprocessing.current_process()._identity[0] % 10)
|
145 |
+
|
146 |
+
current_row = 0
|
147 |
+
idx = 0
|
148 |
+
for batch in pf.iter_batches(batch_size=batch_size):
|
149 |
+
df_batch = batch.to_pandas()
|
150 |
+
|
151 |
+
# 处理当前批次中的每一行
|
152 |
+
for _, row in df_batch.iterrows():
|
153 |
+
current_row += 1
|
154 |
+
audio_obj = row['audio']
|
155 |
+
audio_data = audio_obj['bytes']
|
156 |
+
transcription = row['transcription']
|
157 |
+
language = row['language']
|
158 |
+
speaker = row['speaker']
|
159 |
+
if speaker not in results:
|
160 |
+
results[speaker] = {}
|
161 |
+
if language not in results[speaker]:
|
162 |
+
results[speaker][language] = []
|
163 |
+
if len(results[speaker][language]) >= 10:
|
164 |
+
progress.update(1)
|
165 |
+
continue
|
166 |
+
|
167 |
+
with io.BytesIO(audio_data) as buffer:
|
168 |
+
prompt_data, sample_rate = sf.read(buffer)
|
169 |
+
# 确保是单声道,并转换为float32
|
170 |
+
if len(prompt_data.shape) > 1:
|
171 |
+
prompt_data = prompt_data[:, 0]
|
172 |
+
prompt_data = prompt_data.astype(np.float32)
|
173 |
+
|
174 |
+
# 重采样到16kHz (如果需要)
|
175 |
+
if sample_rate != 16000:
|
176 |
+
prompt_data = resample(prompt_data, orig_sr=sample_rate, target_sr=16000)
|
177 |
+
|
178 |
+
prompt_speech_16k = torch.tensor(prompt_data).unsqueeze(0)
|
179 |
+
|
180 |
+
feat = kaldi.fbank(prompt_speech_16k,
|
181 |
+
num_mel_bins=80,
|
182 |
+
dither=0,
|
183 |
+
sample_frequency=16000)
|
184 |
+
feat = feat - feat.mean(dim=0,keepdim=True)
|
185 |
+
embedding = ort_session.run(None, {ort_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
|
186 |
+
|
187 |
+
results[speaker][language].append(embedding)
|
188 |
+
|
189 |
+
progress.update(1)
|
190 |
+
|
191 |
+
|
192 |
+
|
193 |
+
|
194 |
+
|
195 |
+
print(f'All speakers {results.keys()}')
|
196 |
+
for speaker in results:
|
197 |
+
print(f'{speaker} : All languages {results[speaker].keys()} in {os.getpid()}')
|
198 |
+
return results
|
199 |
+
except Exception as e:
|
200 |
+
import traceback
|
201 |
+
traceback.print_exc()
|
202 |
+
return f"Error processing {parquet_file}: {str(e)}"
|
203 |
+
if __name__ == '__main__':
|
204 |
+
import argparse
|
205 |
+
parser = argparse.ArgumentParser()
|
206 |
+
parser.add_argument('--data_path', type=str, default='/external_data/yueyudata/starrail-voice')
|
207 |
+
parser.add_argument('--output_path',type=str,default='/external_data/yueyudata/starrail-voice-speaker-embeddings')
|
208 |
+
parser.add_argument('--speaker_extractor',type=str,default='/external_data/models/CosyVoice2-0.5B_RWKV_1.5B/campplus.onnx')
|
209 |
+
parser.add_argument('--device',type=str,default='cuda:0')
|
210 |
+
parser.add_argument('--num_processes',type=int,default=4)
|
211 |
+
args = parser.parse_args()
|
212 |
+
|
213 |
+
print(args)
|
214 |
+
data_path = args.data_path
|
215 |
+
output_path = args.output_path
|
216 |
+
device = args.device
|
217 |
+
speaker_extractor = args.speaker_extractor
|
218 |
+
num_processes = args.num_processes
|
219 |
+
|
220 |
+
# 确保输出目录存在
|
221 |
+
os.makedirs(output_path, exist_ok=True)
|
222 |
+
|
223 |
+
# 找到所有parquet文件
|
224 |
+
parquet_files = []
|
225 |
+
for root, dirs, files in os.walk(data_path):
|
226 |
+
for file in files:
|
227 |
+
if file.endswith('.parquet'):
|
228 |
+
parquet_files.append(os.path.join(root, file))
|
229 |
+
print(f'Found {len(parquet_files)} parquet files in {data_path}')
|
230 |
+
|
231 |
+
# 准备多进程参数
|
232 |
+
file_info_list = [(file, output_path, speaker_extractor, device) for file in parquet_files]
|
233 |
+
|
234 |
+
# 使用进程池处理文件
|
235 |
+
print(f"Starting processing with {num_processes} processes")
|
236 |
+
|
237 |
+
# 使用进程池处理文件
|
238 |
+
print(f"Starting processing with {num_processes} processes")
|
239 |
+
with multiprocessing.Pool(processes=num_processes) as pool:
|
240 |
+
results = pool.map(process_file, file_info_list)
|
241 |
+
|
242 |
+
# 输出处理结果
|
243 |
+
print('Processing complete,merge results')
|
244 |
+
final_results = {}
|
245 |
+
for result in results:
|
246 |
+
if isinstance(result, dict):
|
247 |
+
for speaker in result:
|
248 |
+
if speaker not in final_results:
|
249 |
+
final_results[speaker] = {}
|
250 |
+
for language in result[speaker]:
|
251 |
+
if language not in final_results[speaker]:
|
252 |
+
final_results[speaker][language] = []
|
253 |
+
final_results[speaker][language].extend(result[speaker][language])
|
254 |
+
else:
|
255 |
+
print(result)
|
256 |
+
|
257 |
+
# 输出结果
|
258 |
+
for speaker in final_results:
|
259 |
+
for language in final_results[speaker]:
|
260 |
+
output_file = os.path.join(output_path, f"{speaker}_{language}_embeddings.json")
|
261 |
+
print(f"Writing embeddings for {speaker} ({language}) to {output_file}")
|
262 |
+
with open(output_file, 'w', encoding='utf-8') as f_out:
|
263 |
+
json.dump(final_results[speaker][language], f_out)
|
data/utils/create_lm_corpus_from_raw.py
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
import json
|
5 |
+
import io
|
6 |
+
import torch
|
7 |
+
import soundfile as sf
|
8 |
+
import pyarrow.parquet as pq
|
9 |
+
import whisper
|
10 |
+
from librosa import resample
|
11 |
+
import multiprocessing
|
12 |
+
from tqdm import tqdm
|
13 |
+
import onnxruntime
|
14 |
+
from onnxruntime import InferenceSession
|
15 |
+
|
16 |
+
def process_file(file_info):
|
17 |
+
"""处理单个parquet文件的函数,每个进程调用一次"""
|
18 |
+
parquet_file, output_path, speech_tokenizer_model, device = file_info
|
19 |
+
|
20 |
+
# 为每个进程创建独立的speech_tokenizer_session
|
21 |
+
option = onnxruntime.SessionOptions()
|
22 |
+
option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
|
23 |
+
option.intra_op_num_threads = 1
|
24 |
+
cuda_idx = int(device.split(':')[-1] if device is not None and 'cuda' in device else '0')
|
25 |
+
speech_tokenizer_session = InferenceSession(speech_tokenizer_model, sess_options=option,
|
26 |
+
providers=[("CUDAExecutionProvider", {"device_id": cuda_idx})
|
27 |
+
if torch.cuda.is_available() else "CPUExecutionProvider"])
|
28 |
+
|
29 |
+
try:
|
30 |
+
# 创建目标文件名
|
31 |
+
base_filename = os.path.splitext(os.path.basename(parquet_file))[0]
|
32 |
+
output_file = os.path.join(output_path, f"{base_filename}_tokens.jsonl")
|
33 |
+
|
34 |
+
# 使用PyArrow读取parquet文件的元数据,获取总行数
|
35 |
+
parquet_metadata = pq.read_metadata(parquet_file)
|
36 |
+
total_rows = parquet_metadata.num_rows
|
37 |
+
batch_size = 1000
|
38 |
+
|
39 |
+
# 检查是否有已经处理过的文件,计算已处理的行数
|
40 |
+
processed_rows = 0
|
41 |
+
if os.path.exists(output_file):
|
42 |
+
with open(output_file, 'r', encoding='utf-8') as f_check:
|
43 |
+
for _ in f_check:
|
44 |
+
processed_rows += 1
|
45 |
+
print(f"Found existing file {output_file} with {processed_rows} processed rows")
|
46 |
+
|
47 |
+
# 如果已经处理完所有行,跳过此文件
|
48 |
+
if processed_rows >= total_rows:
|
49 |
+
return f"Skipped {parquet_file}: all {total_rows} rows already processed"
|
50 |
+
|
51 |
+
# 逐批处理数据,以追加方式打开输出文件
|
52 |
+
with open(output_file, 'a' if processed_rows > 0 else 'w', encoding='utf-8') as f_out:
|
53 |
+
pf = pq.ParquetFile(parquet_file)
|
54 |
+
progress = tqdm(total=total_rows, initial=processed_rows,
|
55 |
+
desc=f"Processing {os.path.basename(parquet_file)}",
|
56 |
+
position=multiprocessing.current_process()._identity[0] % 10)
|
57 |
+
|
58 |
+
skip_rows = processed_rows
|
59 |
+
current_row = 0
|
60 |
+
|
61 |
+
for batch in pf.iter_batches(batch_size=batch_size):
|
62 |
+
df_batch = batch.to_pandas()
|
63 |
+
|
64 |
+
# 处理当前批次中的每一行
|
65 |
+
for _, row in df_batch.iterrows():
|
66 |
+
current_row += 1
|
67 |
+
|
68 |
+
# 跳过已处理的行
|
69 |
+
if current_row <= skip_rows:
|
70 |
+
continue
|
71 |
+
|
72 |
+
audio_obj = row['audio']
|
73 |
+
audio_data = audio_obj['bytes']
|
74 |
+
transcription = row['transcription']
|
75 |
+
language = row['language']
|
76 |
+
speaker = row['speaker']
|
77 |
+
|
78 |
+
with io.BytesIO(audio_data) as buffer:
|
79 |
+
prompt_data, sample_rate = sf.read(buffer)
|
80 |
+
# 确保是单声道,并转换为float32
|
81 |
+
if len(prompt_data.shape) > 1:
|
82 |
+
prompt_data = prompt_data[:, 0]
|
83 |
+
prompt_data = prompt_data.astype(np.float32)
|
84 |
+
|
85 |
+
# 重采样到16kHz (如果需要)
|
86 |
+
if sample_rate != 16000:
|
87 |
+
prompt_data = resample(prompt_data, orig_sr=sample_rate, target_sr=16000)
|
88 |
+
|
89 |
+
prompt_speech_16k = torch.tensor(prompt_data).unsqueeze(0)
|
90 |
+
|
91 |
+
feat = whisper.log_mel_spectrogram(prompt_speech_16k, n_mels=128)
|
92 |
+
speech_token = speech_tokenizer_session.run(None,
|
93 |
+
{speech_tokenizer_session.get_inputs()[0].name:
|
94 |
+
feat.detach().cpu().numpy(),
|
95 |
+
speech_tokenizer_session.get_inputs()[1].name:
|
96 |
+
np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
|
97 |
+
|
98 |
+
# 写入结果
|
99 |
+
f_out.write(json.dumps({'tts_speech_tokens':speech_token,
|
100 |
+
'text':transcription,
|
101 |
+
'language':language,
|
102 |
+
'speaker':speaker,
|
103 |
+
"prompt_text":"",
|
104 |
+
"llm_prompt_speech_token":[]},
|
105 |
+
ensure_ascii=False)+'\n')
|
106 |
+
progress.update(1)
|
107 |
+
|
108 |
+
# 释放内存
|
109 |
+
del df_batch
|
110 |
+
import gc
|
111 |
+
gc.collect()
|
112 |
+
|
113 |
+
return f"Successfully processed {parquet_file}: {total_rows-processed_rows} new rows processed"
|
114 |
+
except Exception as e:
|
115 |
+
return f"Error processing {parquet_file}: {str(e)}"
|
116 |
+
|
117 |
+
if __name__ == '__main__':
|
118 |
+
import argparse
|
119 |
+
parser = argparse.ArgumentParser()
|
120 |
+
parser.add_argument('--data_path', type=str, default='/external_data/yueyudata/starrail-voice')
|
121 |
+
parser.add_argument('--output_path',type=str,default='/external_data/yueyudata/starrail-voice-voice_tokens')
|
122 |
+
parser.add_argument('--speech_tokenizer_model',type=str,default='/external_data/models/CosyVoice2-0.5B_RWKV_1.5B/speech_tokenizer_v2.onnx')
|
123 |
+
parser.add_argument('--device',type=str,default='cuda:0')
|
124 |
+
parser.add_argument('--num_processes',type=int,default=4)
|
125 |
+
args = parser.parse_args()
|
126 |
+
|
127 |
+
data_path = args.data_path
|
128 |
+
output_path = args.output_path
|
129 |
+
device = args.device
|
130 |
+
speech_tokenizer_model = args.speech_tokenizer_model
|
131 |
+
num_processes = args.num_processes
|
132 |
+
|
133 |
+
# 确保输出目录存在
|
134 |
+
os.makedirs(output_path, exist_ok=True)
|
135 |
+
|
136 |
+
# 找到所有parquet文件
|
137 |
+
parquet_files = []
|
138 |
+
for root, dirs, files in os.walk(data_path):
|
139 |
+
for file in files:
|
140 |
+
if file.endswith('.parquet'):
|
141 |
+
parquet_files.append(os.path.join(root, file))
|
142 |
+
print(f'Found {len(parquet_files)} parquet files in {data_path}')
|
143 |
+
|
144 |
+
# 准备多进程参数
|
145 |
+
file_info_list = [(file, output_path, speech_tokenizer_model, device) for file in parquet_files]
|
146 |
+
|
147 |
+
# 使用进程池处理文件
|
148 |
+
print(f"Starting processing with {num_processes} processes")
|
149 |
+
with multiprocessing.Pool(processes=num_processes) as pool:
|
150 |
+
results = pool.map(process_file, file_info_list)
|
151 |
+
|
152 |
+
# 输出处理结果
|
153 |
+
for result in results:
|
154 |
+
print(result)
|
155 |
+
|
156 |
+
print("All files processed successfully!")
|
data/utils/llm_dataset.py
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datasets
|
2 |
+
import os
|
3 |
+
import json
|
4 |
+
import torch
|
5 |
+
import random
|
6 |
+
import time
|
7 |
+
random.seed(time.time())
|
8 |
+
import logging
|
9 |
+
from tqdm import tqdm
|
10 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
11 |
+
|
12 |
+
def verify_jsonl_files(data_files):
|
13 |
+
"""检查每个 jsonl 文件的有效性"""
|
14 |
+
invalid_files = []
|
15 |
+
|
16 |
+
for file_path in tqdm(data_files, desc="验证文件"):
|
17 |
+
try:
|
18 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
19 |
+
for i, line in enumerate(f):
|
20 |
+
try:
|
21 |
+
json.loads(line)
|
22 |
+
except json.JSONDecodeError:
|
23 |
+
invalid_files.append((file_path, i+1))
|
24 |
+
logging.error(f"文件 {file_path} 在第 {i+1} 行有无效的 JSON")
|
25 |
+
break
|
26 |
+
except Exception as e:
|
27 |
+
invalid_files.append((file_path, f"读取错误: {str(e)}"))
|
28 |
+
logging.error(f"无法读取文件 {file_path}: {str(e)}")
|
29 |
+
|
30 |
+
return invalid_files
|
31 |
+
def load_jsonl_dataset(directory,tokenizer):
|
32 |
+
'''
|
33 |
+
load jsonl files in a directory recursively
|
34 |
+
'''
|
35 |
+
data_files = []
|
36 |
+
for root, dirs, files in os.walk(directory):
|
37 |
+
for file in files:
|
38 |
+
if file.endswith('.jsonl'):
|
39 |
+
data_files.append(os.path.join(root, file))
|
40 |
+
|
41 |
+
logging.info(f"找到 {len(data_files)} 个 JSONL 文件")
|
42 |
+
# 验证文件
|
43 |
+
invalid_files = verify_jsonl_files(data_files)
|
44 |
+
if invalid_files:
|
45 |
+
logging.error(f"发现 {len(invalid_files)} 个无效文件:")
|
46 |
+
for file_info in invalid_files:
|
47 |
+
if isinstance(file_info[1], int):
|
48 |
+
logging.error(f" - {file_info[0]} (错误在第 {file_info[1]} 行)")
|
49 |
+
else:
|
50 |
+
logging.error(f" - {file_info[0]} ({file_info[1]})")
|
51 |
+
|
52 |
+
# 移除无效文件
|
53 |
+
valid_files = [f for f in data_files if f not in [info[0] for info in invalid_files]]
|
54 |
+
logging.info(f"继续处理剩余的 {len(valid_files)} 个有效文件")
|
55 |
+
data_files = valid_files
|
56 |
+
# 手动收集所有样本,确保特征一致性
|
57 |
+
all_samples = []
|
58 |
+
|
59 |
+
for file_path in tqdm(data_files, desc="加载数据集"):
|
60 |
+
try:
|
61 |
+
# 手动解析JSONL文件,避免datasets加载时的类型推断问题
|
62 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
63 |
+
for line in f:
|
64 |
+
try:
|
65 |
+
data = json.loads(line)
|
66 |
+
# 确保所有字段存在且类型一致
|
67 |
+
llm_prompt_speech_token = data.get('llm_prompt_speech_token', [])
|
68 |
+
tts_speech_tokens = data.get('tts_speech_tokens', [])
|
69 |
+
text = str(data.get('text', ""))
|
70 |
+
prompt_text = str(data.get('prompt_text', ""))
|
71 |
+
|
72 |
+
# 确保列表类型
|
73 |
+
if not isinstance(llm_prompt_speech_token, list):
|
74 |
+
llm_prompt_speech_token = []
|
75 |
+
if not isinstance(tts_speech_tokens, list):
|
76 |
+
tts_speech_tokens = []
|
77 |
+
|
78 |
+
# 添加处理后的样本
|
79 |
+
all_samples.append({
|
80 |
+
'llm_prompt_speech_token': llm_prompt_speech_token,
|
81 |
+
'tts_speech_tokens': tts_speech_tokens,
|
82 |
+
'text': text,
|
83 |
+
'prompt_text': prompt_text
|
84 |
+
})
|
85 |
+
except json.JSONDecodeError:
|
86 |
+
continue # 跳过无效的JSON行
|
87 |
+
except Exception as e:
|
88 |
+
logging.error(f"处理样本时出错: {str(e)}")
|
89 |
+
except Exception as e:
|
90 |
+
logging.error(f"打开文件 {file_path} 时出错: {str(e)}")
|
91 |
+
|
92 |
+
if not all_samples:
|
93 |
+
raise ValueError("没有成功加载任何样本")
|
94 |
+
|
95 |
+
# 创建数据集
|
96 |
+
logging.info(f"手动创建数据集,包含 {len(all_samples)} 个样本")
|
97 |
+
dataset = datasets.Dataset.from_list(all_samples)
|
98 |
+
|
99 |
+
logging.info(f"成功加载 {len(dataset)} 个样本")
|
100 |
+
|
101 |
+
#1. concatenate llm_prompt_speech_token and tts_speech_tokens (list of int)
|
102 |
+
#delay the concatenation to collate_fn since sometimes we want to drop the prompt
|
103 |
+
# dataset = dataset.map(lambda x: {'speech_token': x['llm_prompt_speech_token'] + x['tts_speech_tokens']},remove_columns=['tts_speech_tokens','llm_prompt_speech_token'])
|
104 |
+
#2. Filter the data either :
|
105 |
+
# 1. the length of the speech_token is less than 1
|
106 |
+
# 2. the length of the speech_token is greater than 1000
|
107 |
+
# 3. the length of the text is greater than 500
|
108 |
+
# 4. the length of the prompt_text is greater than 500
|
109 |
+
# 5. the length of the text_token is less than 1
|
110 |
+
# 6. the length of the prompt_text_token is less than 1
|
111 |
+
dataset = dataset.filter(lambda x:len(x['llm_prompt_speech_token']) < 2048 and len(x['tts_speech_tokens']) < 2048
|
112 |
+
and len(tokenizer.encode(x['text'])) < 2048 and len(tokenizer.encode(x['prompt_text'])) < 2048 )
|
113 |
+
logging.info(f"过滤后剩余 {len(dataset)} 个样本")
|
114 |
+
#2. tokenize the text to text_tokens and prompt_text to prompt_text_tokens
|
115 |
+
# dataset = dataset.map(lambda x: {'text_tokens': tokenizer.encode(x['text']), 'prompt_text_tokens': tokenizer.encode(x['prompt_text'])},remove_columns=['text','prompt_text'])
|
116 |
+
return dataset
|
117 |
+
|
118 |
+
def collate_fn(batch, tokenizer, pad_to_max_length=True, max_length=2048, drop_prompt_audio_rate=-0.1):
|
119 |
+
'''
|
120 |
+
convert the data to torch tensors
|
121 |
+
1. call tokenizer.encode('text') and tokenizer.encode('prompt_text'), concatenate them to get the text_token, record each sample's length to text_token_len
|
122 |
+
2. convert the text_tokens and text_token_len to torch tensor
|
123 |
+
3. record each sample's speech_token length to speech_token_len
|
124 |
+
4. convert the speech_token and speech_token_len to torch tensor
|
125 |
+
5. We will drop the prompt with probability drop_prompt_audio_rate so the model also learns to generate audio without guidance
|
126 |
+
By default we won't drop anything
|
127 |
+
'''
|
128 |
+
all_text_tokens = []
|
129 |
+
all_speech_tokens = []
|
130 |
+
speech_token_len = []
|
131 |
+
text_token_len = []
|
132 |
+
my_max_length = 0
|
133 |
+
is_drop_prompt = random.random() < drop_prompt_audio_rate
|
134 |
+
|
135 |
+
for sample in batch:
|
136 |
+
tts_speech_tokens = sample['tts_speech_tokens']
|
137 |
+
llm_prompt_speech_token = sample['llm_prompt_speech_token']
|
138 |
+
|
139 |
+
if is_drop_prompt:
|
140 |
+
# 只使用文本部分,不使用提示
|
141 |
+
text_tokens = tokenizer.encode(sample['text'])
|
142 |
+
all_text_tokens.append(torch.tensor(text_tokens, dtype=torch.int32))
|
143 |
+
text_token_len.append(len(text_tokens))
|
144 |
+
|
145 |
+
# 只使用语音部分,不使用提示语音
|
146 |
+
current_speech_tokens = tts_speech_tokens
|
147 |
+
all_speech_tokens.append(torch.tensor(current_speech_tokens, dtype=torch.int32))
|
148 |
+
speech_token_len.append(len(current_speech_tokens))
|
149 |
+
|
150 |
+
total_length = len(text_tokens) + len(current_speech_tokens)
|
151 |
+
else:
|
152 |
+
# 使用提示+文本
|
153 |
+
text_tokens = tokenizer.encode(sample['text'])
|
154 |
+
prompt_tokens = tokenizer.encode(sample['prompt_text'])
|
155 |
+
combined_text_tokens = prompt_tokens + text_tokens
|
156 |
+
all_text_tokens.append(torch.tensor(combined_text_tokens, dtype=torch.int32))
|
157 |
+
text_token_len.append(len(combined_text_tokens))
|
158 |
+
|
159 |
+
# 使用提示语音+语音
|
160 |
+
current_speech_tokens = llm_prompt_speech_token + tts_speech_tokens
|
161 |
+
all_speech_tokens.append(torch.tensor(current_speech_tokens, dtype=torch.int32))
|
162 |
+
speech_token_len.append(len(current_speech_tokens))
|
163 |
+
|
164 |
+
total_length = len(combined_text_tokens) + len(current_speech_tokens)
|
165 |
+
|
166 |
+
if total_length > my_max_length:
|
167 |
+
my_max_length = total_length
|
168 |
+
|
169 |
+
# 检查长度是否超出最大长度
|
170 |
+
skip = my_max_length > max_length
|
171 |
+
|
172 |
+
# 将列表转换为填充后的张量
|
173 |
+
all_text_tokens = torch.nn.utils.rnn.pad_sequence(all_text_tokens, batch_first=True, padding_value=0)
|
174 |
+
all_speech_tokens = torch.nn.utils.rnn.pad_sequence(all_speech_tokens, batch_first=True, padding_value=0)
|
175 |
+
|
176 |
+
# 如果需要填充到最大长度
|
177 |
+
if pad_to_max_length and not skip:
|
178 |
+
pad_length = max_length - my_max_length
|
179 |
+
if pad_length > 0:
|
180 |
+
all_speech_tokens = torch.nn.functional.pad(all_speech_tokens, (0, pad_length), value=0)
|
181 |
+
|
182 |
+
return {
|
183 |
+
'text_token': all_text_tokens,
|
184 |
+
'text_token_len': torch.tensor(text_token_len, dtype=torch.int32),
|
185 |
+
'speech_token': all_speech_tokens, # 确保命名一致
|
186 |
+
'speech_token_len': torch.tensor(speech_token_len, dtype=torch.int32),
|
187 |
+
'skip': skip
|
188 |
+
}
|
189 |
+
|
190 |
+
|
191 |
+
if __name__ == '__main__':
|
192 |
+
from transformers import AutoTokenizer
|
193 |
+
model_path = "/external_data/models/rwkv7-2.9B-world"
|
194 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
195 |
+
directory = '/external_data/yueyudata/speech_corpus'
|
196 |
+
dataset = load_jsonl_dataset(directory,tokenizer)
|
197 |
+
print(dataset)
|
198 |
+
print(dataset[0])
|
199 |
+
from functools import partial
|
200 |
+
collate_fn = partial(collate_fn,tokenizer=tokenizer,pad_to_max_length=False)
|
201 |
+
dataloader = torch.utils.data.DataLoader(dataset,batch_size=1,collate_fn=collate_fn)
|
202 |
+
for data in dataloader:
|
203 |
+
print(data)
|
204 |
+
print(data['speech_token'].shape)
|
205 |
+
print(data['text_token'].shape)
|
206 |
+
break
|
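Training code that consumes the batches produced by collate_fn above typically needs boolean padding masks derived from text_token_len and speech_token_len. The following is a minimal, self-contained sketch of that step; the lengths_to_mask helper and the literal batch values are illustrative assumptions and not part of this repository.

import torch

def lengths_to_mask(lengths: torch.Tensor, max_len: int) -> torch.Tensor:
    # True marks real tokens, False marks padding positions.
    return torch.arange(max_len).unsqueeze(0) < lengths.unsqueeze(1)

# A fabricated batch shaped like the output of collate_fn above.
batch = {
    'speech_token': torch.zeros(2, 7, dtype=torch.int32),
    'speech_token_len': torch.tensor([7, 4], dtype=torch.int32),
}
mask = lengths_to_mask(batch['speech_token_len'], batch['speech_token'].shape[1])
print(mask.shape)  # torch.Size([2, 7]); the second row ends with three False entries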
data/utils/test_utilities.py
ADDED
@@ -0,0 +1,31 @@
|
1 |
+
from data.utils.utilitie import generate_mixed_instructions
|
2 |
+
if __name__ == '__main__':
|
3 |
+
print(generate_mixed_instructions('我来自中国。'))
|
4 |
+
print(generate_mixed_instructions('这是一个拥有悠久历史的城市。'))
|
5 |
+
print(generate_mixed_instructions('I am from China.',language='en'))
|
6 |
+
print(generate_mixed_instructions('This is a city with a long history.',language='en'))
|
7 |
+
print(generate_mixed_instructions('我来自中国。'))
|
8 |
+
print(generate_mixed_instructions('这是一个拥有悠久历史的城市。'))
|
9 |
+
print(generate_mixed_instructions('这是一个拥有悠久历史的城市。'))
|
10 |
+
print(generate_mixed_instructions('这是一个拥有悠久历史的城市。'))
|
11 |
+
print(generate_mixed_instructions('这是一个拥有悠久历史的城市。'))
|
12 |
+
print(generate_mixed_instructions('这是一个拥有悠久历史的城市。'))
|
13 |
+
print(generate_mixed_instructions('这是一个拥有悠久历史的城市。'))
|
14 |
+
print(generate_mixed_instructions('这是一个拥有悠久历史的城市。'))
|
15 |
+
print(generate_mixed_instructions('这是一个拥有悠久历史的城市。'))
|
16 |
+
print(generate_mixed_instructions('这是一个拥有悠久历史的城市。'))
|
17 |
+
print(generate_mixed_instructions('这是一个拥有悠久历史的城市。'))
|
18 |
+
print(generate_mixed_instructions('这是一个拥有悠久历史的城市。'))
|
19 |
+
print(generate_mixed_instructions('这是一个拥有悠久历史的城市。'))
|
20 |
+
print(generate_mixed_instructions('这是一个拥有悠久历史的城市。'))
|
21 |
+
print(generate_mixed_instructions('这是一个拥有悠久历史的城市。'))
|
22 |
+
print(generate_mixed_instructions('这是一个拥有悠久历史的城市。'))
|
23 |
+
print(generate_mixed_instructions('I am from China.',language='en'))
|
24 |
+
print(generate_mixed_instructions('This is a city with a long history.',language='en'))
|
25 |
+
print(generate_mixed_instructions('This is a city with a long history.',language='en'))
|
26 |
+
print(generate_mixed_instructions('This is a city with a long history.',language='en'))
|
27 |
+
print(generate_mixed_instructions('This is a city with a long history.',language='en'))
|
28 |
+
print(generate_mixed_instructions('This is a city with a long history.',language='en'))
|
29 |
+
print(generate_mixed_instructions('This is a city with a long history.',language='en'))
|
30 |
+
print(generate_mixed_instructions('This is a city with a long history.',language='en'))
|
31 |
+
print(generate_mixed_instructions('This is a city with a long history.',language='en'))
|
data/utils/utilitie.py
ADDED
@@ -0,0 +1,767 @@
|
1 |
+
from concurrent.futures import thread
|
2 |
+
from operator import is_
|
3 |
+
from librosa import ex
|
4 |
+
from regex import P
|
5 |
+
from torch import device
|
6 |
+
from tqdm import tqdm
|
7 |
+
import tarfile
|
8 |
+
import random
|
9 |
+
import time
|
10 |
+
import io
|
11 |
+
import torchaudio
|
12 |
+
import json
|
13 |
+
import os
|
14 |
+
import multiprocessing
|
15 |
+
import torch
|
16 |
+
from data.cosy.data.data_processor import init_process, preprocess_prompts
|
17 |
+
import random
|
18 |
+
from typing import List
|
19 |
+
import torch
|
20 |
+
import torchaudio
|
21 |
+
import io
|
22 |
+
|
23 |
+
'''
|
24 |
+
Natural Language Instruction
|
25 |
+
Emotion: 高兴(Happy), 悲伤(Sad), 惊讶(Surprised), 愤怒(Angry), 恐惧(Fearful), 厌恶(Disgusted), 冷
|
26 |
+
静(Calm), 严肃(Serious)
|
27 |
+
Speaking Rate: 快速(Fast), 非常快速(Very Fast), 慢速(Slow), 非常慢速(Very Slow)
|
28 |
+
Dialect: 粤语, 四川话, 上海话, 郑州话, 长沙话, 天津话
|
29 |
+
Role-playing: 神秘(Mysterious), 凶猛(Fierce), 好奇(Curious), 优雅(Elegant), 孤独(Lonely), 机器
|
30 |
+
人(Robot), 小猪佩奇(Peppa), etc.
|
31 |
+
Fine-grained Instruction
|
32 |
+
Vocal Bursts: [laughter], [breath], etc.
|
33 |
+
Vocal Features: <laughter></laughter>, <strong></strong>
|
34 |
+
Examples
|
35 |
+
- 你能用高兴的情感说吗?< |endofprompt| >今天真是太开心了,马上要放假了!I’m so happy,
|
36 |
+
Spring Festival is coming!
|
37 |
+
- Please speaking very fast.< |endofprompt| >Today is a happy day, full of laughter and joy.
|
38 |
+
- 请问你能模仿粤语的口音吗?< |endofprompt| >多保重,早休息。
|
39 |
+
- 尝试一下以机器人的角色和我交流。< |endofprompt| >接收知识光波!
|
40 |
+
- [laughter]有时候,看着小孩子们的天真行为[laughter],我们总会会心一笑。
|
41 |
+
- She pursued her dreams with <strong>enthusiasm</strong> and <strong>grit</strong>.
|
42 |
+
'''
|
43 |
+
|
44 |
+
emotions = ['高兴', '悲伤', '惊讶', '愤怒', '恐惧', '厌恶', '冷静', '严肃']
|
45 |
+
emotions_in_english = ['Happy', 'Sad', 'Surprised', 'Angry', 'Fearful', 'Disgusted', 'Calm', 'Serious']
|
46 |
+
speaking_rates = ['快速', '非常快速', '慢速', '非常慢速']
|
47 |
+
speaking_rates_in_english = ['Fast', 'Very Fast', 'Slow', 'Very Slow']
|
48 |
+
dialects = ['普通话','粤语', '四川话', '上海话', '郑州话', '长沙话', '天津话']
|
49 |
+
dialects_in_english = ['Mandarin','Cantonese', 'Sichuanese', 'Shanghainese', 'Zhengzhou Dialect', 'Changsha Dialect', 'Tianjin Dialect']
|
50 |
+
role_playings = ['神秘', '凶猛', '好奇', '优雅', '孤独', '机器人', '小猪佩奇']
|
51 |
+
role_playings_in_english = ['Mysterious', 'Fierce', 'Curious', 'Elegant', 'Lonely', 'Robot', 'Peppa']
|
52 |
+
vocal_bursts = ['[laughter]', '[breath]']
|
53 |
+
vocal_features = ['<laughter></laughter>', '<strong></strong>']
|
54 |
+
end_of_prompt = '<|endofprompt|>'
|
55 |
+
|
56 |
+
def generate_in_emotion_in_chinese(text :str):
|
57 |
+
templates = [
|
58 |
+
'你能用{}的情感说吗?{}{}',
|
59 |
+
'请用{}的情感说。{}{}',
|
60 |
+
'请用{}的情感表达。{}{}',
|
61 |
+
'请用{}的情感说一下。{}{}',
|
62 |
+
'请用{}的情感说一句。{}{}'
|
63 |
+
]
|
64 |
+
select_emotion = random.choice(emotions)
|
65 |
+
return random.choice(templates).format(select_emotion,end_of_prompt,text)
|
66 |
+
|
67 |
+
def generate_in_emotion_in_english(text :str):
|
68 |
+
templates = [
|
69 |
+
'Can you say it with {} emotion?{}{}',
|
70 |
+
'Please say it with {} emotion.{}{}',
|
71 |
+
'Please express it with {} emotion.{}{}',
|
72 |
+
'Please say it with {} emotion.{}{}',
|
73 |
+
'Please say a sentence with {} emotion.{}{}'
|
74 |
+
]
|
75 |
+
select_emotion = random.choice(emotions_in_english)
|
76 |
+
return random.choice(templates).format(select_emotion,end_of_prompt,text)
|
77 |
+
|
78 |
+
def generate_speaking_rate_in_chinese(text :str):
|
79 |
+
templates = [
|
80 |
+
'请用{}的语速说。{}{}',
|
81 |
+
'请用{}的语速说一下。{}{}',
|
82 |
+
'请用{}的语速说一句。{}{}',
|
83 |
+
'请用{}的语速表达。{}{}',
|
84 |
+
'请用{}的语速说。{}{}',
|
85 |
+
'请{}地说。{}{}',
|
86 |
+
'请{}地说一下。{}{}',
|
87 |
+
'请{}地说一句。{}{}',
|
88 |
+
'{}的说。{}{}',
|
89 |
+
'{}的说一下。{}{}',
|
90 |
+
'{}的说一句。{}{}',
|
91 |
+
'{}的表达。{}{}'
|
92 |
+
|
93 |
+
]
|
94 |
+
select_rate = random.choice(speaking_rates)
|
95 |
+
template = random.choice(templates)
|
96 |
+
return template.format(select_rate,end_of_prompt,text)
|
97 |
+
|
98 |
+
def generate_speaking_rate_in_english(text :str):
|
99 |
+
templates = [
|
100 |
+
'Please say it with {} speaking rate.{}{}',
|
101 |
+
'Say it with {} speaking rate.{}{}',
|
102 |
+
'Please say a sentence with {} speaking rate.{}{}',
|
103 |
+
'Please express it with {} speaking rate.{}{}',
|
104 |
+
'Please speak {}ly.{}{}',
|
105 |
+
'Speak {}ly.{}{}',
|
106 |
+
'Please say it {}ly.{}{}',
|
107 |
+
'Say it {}ly.{}{}'
|
108 |
+
]
|
109 |
+
select_rate = random.choice(speaking_rates_in_english)
|
110 |
+
template = random.choice(templates)
|
111 |
+
return template.format(select_rate,end_of_prompt,text)
|
112 |
+
|
113 |
+
|
114 |
+
def load_file_list(tar_file):
|
115 |
+
#the files are FILE_NAME.mp3/FILE_NAME.json
|
116 |
+
#return all FILE_NAMEs that have both an mp3 and a json
|
117 |
+
import tarfile
|
118 |
+
with tarfile.open(tar_file, 'r') as f:
|
119 |
+
file_names = f.getnames()
|
120 |
+
mp3_files = [i for i in file_names if i.endswith('.mp3')]
|
121 |
+
json_files = [i for i in file_names if i.endswith('.json')]
|
122 |
+
|
123 |
+
#filter out mp3_files without a corresponding json
|
124 |
+
mp3_files = [i for i in mp3_files if i.replace('.mp3', '.json') in json_files]
|
125 |
+
return mp3_files
|
126 |
+
|
127 |
+
def extract_prompt(input_tar_files, input_tar_languages, max_duration=5, num_samples=10, target_sr=16000, output_dir=None):
|
128 |
+
"""
|
129 |
+
Extract prompt from tar files
|
130 |
+
Args:
|
131 |
+
input_tar_files: list of str, input tar files
|
132 |
+
input_tar_languages: list of str, input tar languages for each tar file, must be the same length as input_tar_files
|
133 |
+
max_duration: float, max duration of audio
|
134 |
+
num_samples: int, number of samples to extract
|
135 |
+
target_sr: int, target sample rate
|
136 |
+
output_dir: str, output directory
|
137 |
+
"""
|
138 |
+
for tar_file, language in zip(input_tar_files, input_tar_languages):
|
139 |
+
print(f'Extracting prompt from {tar_file}...with language {language}')
|
140 |
+
random.seed(time.time())
|
141 |
+
samples = []
|
142 |
+
mp3_files = load_file_list(tar_file)
|
143 |
+
with tarfile.open(tar_file, 'r') as f:
|
144 |
+
progress_bar = tqdm(total=num_samples,desc=f'Extracting prompt from {tar_file}')
|
145 |
+
for i in random.sample(mp3_files, len(mp3_files)):
|
146 |
+
mp3 = f.extractfile(i)
|
147 |
+
mp3_bytes = io.BytesIO(mp3.read())
|
148 |
+
speech, sample_rate = torchaudio.load(mp3_bytes,backend='soundfile')
|
149 |
+
json_file = f.extractfile(i.replace('.mp3', '.json'))
|
150 |
+
json_data = json.load(json_file)
|
151 |
+
duration = json_data['duration']
|
152 |
+
if duration > max_duration:
|
153 |
+
continue
|
154 |
+
speech = speech.mean(dim=0, keepdim=True)
|
155 |
+
if sample_rate != target_sr:
|
156 |
+
assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
|
157 |
+
speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
|
158 |
+
samples.append((speech, json_data,sample_rate))
|
159 |
+
progress_bar.update(1)
|
160 |
+
if len(samples) == num_samples:
|
161 |
+
break
|
162 |
+
if output_dir is not None:
|
163 |
+
"""
|
164 |
+
json looks like:
|
165 |
+
{'id': 'ZH_B00000_S01450_W000017', 'wav': 'ZH_B00000/ZH_B00000_S01450/mp3/ZH_B00000_S01450_W000017.mp3', 'text': '因此,我们认为流通性具有更广泛的含义。', 'duration': 4.193, 'speaker': 'ZH_B00000_S01450', 'language': 'zh', 'dnsmos': 3.3709}
|
166 |
+
"""
|
167 |
+
output_dir_lang = os.path.join(output_dir, language)
|
168 |
+
os.makedirs(output_dir_lang, exist_ok=True)
|
169 |
+
progress_bar = tqdm(total=len(samples), desc=f'Saving samples to {output_dir_lang}')
|
170 |
+
for i, (speech, json_data, sample_rate) in enumerate(samples):
|
171 |
+
id = json_data['id']
|
172 |
+
wave_file = os.path.join(output_dir_lang, f'{id}.wav')
|
173 |
+
json_file = os.path.join(output_dir_lang, f'{id}.json')
|
174 |
+
torchaudio.save(wave_file, speech, target_sr)
|
175 |
+
with open(json_file, 'w') as f:
|
176 |
+
json.dump(json_data, f,ensure_ascii=False)
|
177 |
+
progress_bar.update(1)
|
178 |
+
print(f'Extracted {len(samples)} samples from {tar_file} with language {language}')
|
179 |
+
|
180 |
+
def generate_dialect_in_chinese(text: str):
|
181 |
+
templates = [
|
182 |
+
'请问你能模仿{}的口音吗?{}{}',
|
183 |
+
'请用{}的口音说一下。{}{}',
|
184 |
+
'用{}的口音说一句。{}{}',
|
185 |
+
'能用{}的口音读一下吗?{}{}',
|
186 |
+
'请尝试用{}的口音说这段话。{}{}',
|
187 |
+
'请以{}的口音表达。{}{}',
|
188 |
+
'请用{}的语调说。{}{}',
|
189 |
+
'试试用{}的方言说。{}{}',
|
190 |
+
'能否用{}的语调读出来?{}{}',
|
191 |
+
'请说一段{}。{}{}'
|
192 |
+
]
|
193 |
+
select_dialect = random.choice(dialects)
|
194 |
+
return random.choice(templates).format(select_dialect, end_of_prompt, text)
|
195 |
+
|
196 |
+
def generate_dialect_in_english(text: str):
|
197 |
+
templates = [
|
198 |
+
'Can you mimic the {} accent?{}{}',
|
199 |
+
'Please speak with a {} accent.{}{}',
|
200 |
+
'Say it with a {} accent.{}{}',
|
201 |
+
'Could you read this with a {} accent?{}{}',
|
202 |
+
'Please try to speak this with a {} accent.{}{}',
|
203 |
+
'Please express it with a {} accent.{}{}',
|
204 |
+
'Please use {} intonation.{}{}',
|
205 |
+
'Try speaking in {}.{}{}',
|
206 |
+
'Could you read this in {}?{}{}',
|
207 |
+
'Please say a passage in {}.{}{}'
|
208 |
+
]
|
209 |
+
select_dialect = random.choice(dialects_in_english)
|
210 |
+
return random.choice(templates).format(select_dialect, end_of_prompt, text)
|
211 |
+
|
212 |
+
def generate_role_playing_in_chinese(text: str):
|
213 |
+
templates = [
|
214 |
+
'尝试一下以{}的角色和我交流。{}{}',
|
215 |
+
'请以{}的角色说这句话。{}{}',
|
216 |
+
'假装你是{},说一下这句话。{}{}',
|
217 |
+
'扮演{}来说这段话。{}{}',
|
218 |
+
'请用{}的语气说。{}{}',
|
219 |
+
'以{}的形象来表达。{}{}',
|
220 |
+
'你能用{}的方式说吗?{}{}',
|
221 |
+
'模仿{}说话。{}{}',
|
222 |
+
'请用{}的口吻说一下。{}{}',
|
223 |
+
'像{}一样说这句话。{}{}'
|
224 |
+
]
|
225 |
+
select_role = random.choice(role_playings)
|
226 |
+
return random.choice(templates).format(select_role, end_of_prompt, text)
|
227 |
+
|
228 |
+
def generate_role_playing_in_english(text: str):
|
229 |
+
templates = [
|
230 |
+
'Try to communicate with me as a {} character.{}{}',
|
231 |
+
'Please say this as a {} character.{}{}',
|
232 |
+
'Pretend you are {}, say this sentence.{}{}',
|
233 |
+
'Act as {} to say this passage.{}{}',
|
234 |
+
'Please speak with a {} tone.{}{}',
|
235 |
+
'Express this with a {} image.{}{}',
|
236 |
+
'Can you say this in a {} way?{}{}',
|
237 |
+
'Mimic {} speaking.{}{}',
|
238 |
+
'Please say this in the manner of {}.{}{}',
|
239 |
+
'Say this like {}.{}{}'
|
240 |
+
]
|
241 |
+
select_role = random.choice(role_playings_in_english)
|
242 |
+
return random.choice(templates).format(select_role, end_of_prompt, text)
|
243 |
+
|
244 |
+
def generate_vocal_bursts(text: str):
|
245 |
+
"""
|
246 |
+
在文本中随机添加声音爆发标记,如[laughter]、[breath]等
|
247 |
+
"""
|
248 |
+
templates = [
|
249 |
+
'start', # 在句首添加
|
250 |
+
'middle', # 在句中添加
|
251 |
+
'end' # 在句末添加
|
252 |
+
]
|
253 |
+
|
254 |
+
burst = random.choice(vocal_bursts)
|
255 |
+
template_choice = random.choice(templates)
|
256 |
+
|
257 |
+
if template_choice == 'start': # 句首
|
258 |
+
return burst + text
|
259 |
+
elif template_choice == 'middle': # 句中
|
260 |
+
words = text.split()
|
261 |
+
if len(words) <= 3: # 文本太短不分割
|
262 |
+
return burst + text
|
263 |
+
split_point = random.randint(1, len(words) - 1)
|
264 |
+
return ' '.join(words[:split_point]) + ' ' + burst + ' ' + ' '.join(words[split_point:])
|
265 |
+
else: # 句末
|
266 |
+
return text + ' ' + burst
|
267 |
+
|
268 |
+
def generate_vocal_features(text: str):
|
269 |
+
"""
|
270 |
+
在文本中随机添加声音特征标记,如<laughter></laughter>、<strong></strong>等
|
271 |
+
支持中文和英文文本
|
272 |
+
"""
|
273 |
+
feature = random.choice(vocal_features)
|
274 |
+
feature_start, feature_end = feature.split('><')
|
275 |
+
feature_start += '>'
|
276 |
+
feature_end = '<' + feature_end
|
277 |
+
|
278 |
+
# 检查是否为中文文本
|
279 |
+
has_chinese = any('\u4e00' <= char <= '\u9fff' for char in text)
|
280 |
+
|
281 |
+
if has_chinese:
|
282 |
+
# 处理中文文本
|
283 |
+
if len(text) <= 10: # 文本太短,整个加强
|
284 |
+
return feature_start + text + feature_end
|
285 |
+
|
286 |
+
# 对中文处理,随机选择一个字符范围
|
287 |
+
text_len = len(text)
|
288 |
+
# 随机选择一个起始位置和一个范围长度
|
289 |
+
start_pos = random.randint(1, max(1, text_len // 2)) # 避免总是从句首开始
|
290 |
+
span_length = random.randint(1, min(5, text_len - start_pos))
|
291 |
+
end_pos = start_pos + span_length - 1
|
292 |
+
|
293 |
+
# 在选定位置插入标记
|
294 |
+
result = text[:start_pos] + feature_start + text[start_pos:end_pos+1] + feature_end + text[end_pos+1:]
|
295 |
+
return result
|
296 |
+
else:
|
297 |
+
# 处理英文文本
|
298 |
+
words = text.split()
|
299 |
+
if len(words) <= 3: # 文本太短,整个加强
|
300 |
+
return feature_start + text + feature_end
|
301 |
+
|
302 |
+
# 随机选择一个词或短语来添加特征
|
303 |
+
start_idx = random.randint(0, len(words) - 1)
|
304 |
+
span_length = random.randint(1, min(3, len(words) - start_idx)) # 最多3个词
|
305 |
+
|
306 |
+
result = []
|
307 |
+
for i, word in enumerate(words):
|
308 |
+
if i == start_idx:
|
309 |
+
result.append(feature_start + word + (feature_end if span_length == 1 else ''))  # close the tag here when the span is a single word
|
310 |
+
elif i == start_idx + span_length - 1:
|
311 |
+
result.append(word + feature_end)
|
312 |
+
else:
|
313 |
+
result.append(word)
|
314 |
+
|
315 |
+
return ' '.join(result)
|
316 |
+
|
317 |
+
def generate_mixed_instructions(text: str, language="zh"):
|
318 |
+
"""
|
319 |
+
混合多种指令类型,可以同时包含情感、语速、方言、角色扮演等
|
320 |
+
"""
|
321 |
+
instruction_generators = []
|
322 |
+
|
323 |
+
if language == "zh":
|
324 |
+
instruction_generators = [
|
325 |
+
generate_in_emotion_in_chinese,
|
326 |
+
generate_speaking_rate_in_chinese,
|
327 |
+
generate_dialect_in_chinese,
|
328 |
+
generate_role_playing_in_chinese
|
329 |
+
]
|
330 |
+
else: # 英文
|
331 |
+
instruction_generators = [
|
332 |
+
generate_in_emotion_in_english,
|
333 |
+
generate_speaking_rate_in_english,
|
334 |
+
generate_dialect_in_english,
|
335 |
+
generate_role_playing_in_english
|
336 |
+
]
|
337 |
+
|
338 |
+
# 随机选择1个generator
|
339 |
+
selected_generator = random.choice(instruction_generators)
|
340 |
+
|
341 |
+
# 可能会添加声音特征
|
342 |
+
text_with_features = text
|
343 |
+
if random.random() < 0.3: # 30%的概率添加声音特征
|
344 |
+
text_with_features = generate_vocal_features(text)
|
345 |
+
|
346 |
+
# 可能会添加声音爆发
|
347 |
+
if random.random() < 0.2: # 20%的概率添加声音爆发
|
348 |
+
text_with_features = generate_vocal_bursts(text_with_features)
|
349 |
+
|
350 |
+
# 应用选择的指令生成器
|
351 |
+
result = text_with_features
|
352 |
+
result = selected_generator(result)
|
353 |
+
|
354 |
+
return result
|
355 |
+
|
356 |
+
frontend = None
|
357 |
+
llm = None
|
358 |
+
cosyvoice = None
|
359 |
+
output_fp = None
|
360 |
+
prompts = None
|
361 |
+
global_device = None
|
362 |
+
processed_count = 0
|
363 |
+
def initialize_process(model_dir,prompts_dir,output_dir,device):
|
364 |
+
current_process = multiprocessing.current_process()
|
365 |
+
file_name = f'{output_dir}/{current_process.pid}.jsonl'
|
366 |
+
global frontend,llm,cosyvoice,output_fp,prompts,global_device
|
367 |
+
global_device = device
|
368 |
+
output_fp = open(file_name, 'w')
|
369 |
+
print(f'Initializing process with device {device} and output file {file_name}')
|
370 |
+
frontend,llm,cosyvoice = init_process(model_dir,device)
|
371 |
+
prompts = preprocess_prompts(frontend,prompts_dir)
|
372 |
+
print(f'load prompts {prompts.keys()}')
|
373 |
+
return frontend,llm,cosyvoice
|
374 |
+
|
375 |
+
def generate_speech_tokens(llm,frontend,tts_text,model_input,device):
|
376 |
+
tts_text = frontend.text_normalize(tts_text,split=False, text_frontend=True)
|
377 |
+
tts_text_token, tts_text_token_len = frontend._extract_text_token(tts_text)
|
378 |
+
tts_text_token_len = torch.tensor([tts_text_token.shape[1]], dtype=torch.int32).to(device)
|
379 |
+
prompt_text = model_input['prompt_text'].to(device) if 'prompt_text' in model_input else torch.zeros(1, 0, dtype=torch.int32).to(device)
|
380 |
+
prompt_text_len = torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(device) if prompt_text is not None else torch.zeros(1, 0, dtype=torch.int32).to(device)
|
381 |
+
llm_prompt_speech_token = model_input['llm_prompt_speech_token'].to(device) if 'llm_prompt_speech_token' in model_input else torch.zeros(1, 0, dtype=torch.int32).to(device)
|
382 |
+
prompt_speech_token_len = torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(device) if llm_prompt_speech_token is not None else None
|
383 |
+
flow_prompt_speech_token = model_input['flow_prompt_speech_token'].to(device)
|
384 |
+
prompt_speech_feat = model_input['prompt_speech_feat'].to(device)
|
385 |
+
llm_embedding = model_input['llm_embedding'].to(device)
|
386 |
+
flow_embedding = model_input['flow_embedding'].to(device)
|
387 |
+
speech_tokens = []
|
388 |
+
with torch.no_grad():
|
389 |
+
for i in llm.inference(text = tts_text_token,
|
390 |
+
text_len = tts_text_token_len,
|
391 |
+
prompt_text = prompt_text,
|
392 |
+
prompt_text_len = prompt_text_len,
|
393 |
+
prompt_speech_token = llm_prompt_speech_token,
|
394 |
+
prompt_speech_token_len = prompt_speech_token_len,
|
395 |
+
embedding=llm_embedding
|
396 |
+
):
|
397 |
+
speech_tokens.append(i)
|
398 |
+
return speech_tokens
|
399 |
+
|
400 |
+
def process_text(text,language):
|
401 |
+
global frontend,llm,cosyvoice,output_fp,prompts,processed_count,global_device
|
402 |
+
processed_count += 1
|
403 |
+
if processed_count % 100 == 0:
|
404 |
+
print(f'Processed {processed_count} samples')
|
405 |
+
tts_text = text
|
406 |
+
splits_txt_by_lines = tts_text.split('\n')
|
407 |
+
#remove the sentences with length less than 10
|
408 |
+
splits_txt_by_lines = [i.strip() for i in splits_txt_by_lines if len(i.strip()) > 10]
|
409 |
+
random.seed(time.time())
|
410 |
+
model_input,prompt_text = random.choice(prompts[language])
|
411 |
+
llm_prompt_speech_token = model_input['llm_prompt_speech_token'].cpu().tolist()
|
412 |
+
for tts_text in splits_txt_by_lines:
|
413 |
+
tts_speech_tokens = generate_speech_tokens(llm,frontend,tts_text,model_input,cosyvoice.device)
|
414 |
+
output_data = {
|
415 |
+
'text': tts_text,
|
416 |
+
'tts_speech_tokens': tts_speech_tokens,
|
417 |
+
'prompt_text': prompt_text,
|
418 |
+
'llm_prompt_speech_token': llm_prompt_speech_token[0]
|
419 |
+
}
|
420 |
+
output_fp.write(json.dumps(output_data,ensure_ascii=False)+'\n')
|
421 |
+
output_fp.flush()
|
422 |
+
return processed_count
|
423 |
+
def process_jsonl_file(jsonl_file,language,process_pool):
|
424 |
+
print(f'Processing {jsonl_file}...')
|
425 |
+
count = 0
|
426 |
+
import json
|
427 |
+
with open(jsonl_file, 'r') as f:
|
428 |
+
for line in f:
|
429 |
+
line = line.strip()
|
430 |
+
if len(line) == 0:
|
431 |
+
continue
|
432 |
+
data = json.loads(line)
|
433 |
+
text = data['text']
|
434 |
+
count += 1
|
435 |
+
future = process_pool.submit(process_text,text,language)
|
436 |
+
print(f'processed {future.result()} requests')
|
437 |
+
print(f'Processed {count} samples from {jsonl_file}')
|
438 |
+
return count
|
439 |
+
|
440 |
+
def process_parquet_file(parquet_file,language,process_pool):
|
441 |
+
print(f'Processing {parquet_file}...')
|
442 |
+
import pandas as pd
|
443 |
+
df = pd.read_parquet(parquet_file)
|
444 |
+
count = 0
|
445 |
+
for i in range(len(df)):
|
446 |
+
text = df.iloc[i]['text']
|
447 |
+
count += 1
|
448 |
+
future = process_pool.submit(process_text,text,language)
|
449 |
+
print(f'processed {future.result()} requests')
|
450 |
+
print(f'Processed {count} samples from {parquet_file}')
|
451 |
+
return count
|
452 |
+
|
453 |
+
def generate_speech_tokens_single_process(cosy_model_dir, prompts_dir, output_dir, language, jsonl_files=None, parquet_files=None, device="cuda:0",is_cross_lingual=False,is_instructed=False):
|
454 |
+
"""
|
455 |
+
单进程单线程版本的语音标记生成函数
|
456 |
+
"""
|
457 |
+
import torch
|
458 |
+
import json
|
459 |
+
import os
|
460 |
+
import random
|
461 |
+
import time
|
462 |
+
import traceback
|
463 |
+
import logging
|
464 |
+
import sys
|
465 |
+
from datetime import datetime
|
466 |
+
from data.cosy.data.data_processor import init_process, preprocess_prompts
|
467 |
+
|
468 |
+
# 设置日志
|
469 |
+
output_dir_lang = os.path.join(output_dir, language)
|
470 |
+
os.makedirs(output_dir_lang, exist_ok=True)
|
471 |
+
process_id = os.getpid()
|
472 |
+
log_file = os.path.join(output_dir_lang, f'process_{process_id}_log.txt')
|
473 |
+
|
474 |
+
# 配置日志输出到文件和控制台
|
475 |
+
logging.basicConfig(
|
476 |
+
level=logging.INFO,
|
477 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
478 |
+
handlers=[
|
479 |
+
logging.FileHandler(log_file),
|
480 |
+
logging.StreamHandler(sys.stdout)
|
481 |
+
]
|
482 |
+
)
|
483 |
+
logger = logging.getLogger(f'process_{process_id}')
|
484 |
+
|
485 |
+
# 记录启动信息
|
486 |
+
logger.info(f"='='='='='='='='='='='Instructed={is_instructed}'='='='='='='='='='='='='='='='='='")
|
487 |
+
logger.info(f"启动时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
488 |
+
logger.info(f"进程ID: {process_id}")
|
489 |
+
logger.info(f"设备: {device}")
|
490 |
+
logger.info(f"模型目录: {cosy_model_dir}")
|
491 |
+
logger.info(f"提示词目录: {prompts_dir}")
|
492 |
+
logger.info(f"输出目录: {output_dir_lang}")
|
493 |
+
if jsonl_files:
|
494 |
+
logger.info(f"JSONL文件: {jsonl_files}")
|
495 |
+
if parquet_files:
|
496 |
+
logger.info(f"Parquet文件: {parquet_files}")
|
497 |
+
logger.info(f"='='='='='='='='='='='='='='='='='='='='='='='='='='='='='")
|
498 |
+
|
499 |
+
output_fp = None
|
500 |
+
frontend = None
|
501 |
+
llm = None
|
502 |
+
cosyvoice = None
|
503 |
+
total_processed = 0
|
504 |
+
|
505 |
+
try:
|
506 |
+
# 初始化模型
|
507 |
+
logger.info(f'初始化模型,使用设备: {device}')
|
508 |
+
frontend, llm, cosyvoice = init_process(cosy_model_dir, device)
|
509 |
+
|
510 |
+
# 预处理提示
|
511 |
+
logger.info(f'开始预处理提示词')
|
512 |
+
prompts = preprocess_prompts(frontend, prompts_dir)
|
513 |
+
logger.info(f'加载提示完成: {prompts.keys()}')
|
514 |
+
|
515 |
+
output_file = os.path.join(output_dir_lang, f'{process_id}.jsonl')
|
516 |
+
output_fp = open(output_file, 'w')
|
517 |
+
|
518 |
+
# 处理函数
|
519 |
+
def process_single_text(text):
|
520 |
+
try:
|
521 |
+
tts_text = text
|
522 |
+
splits_txt_by_lines = tts_text.split('\n')
|
523 |
+
# 删除长度小于10的句子
|
524 |
+
splits_txt_by_lines = [i.strip() for i in splits_txt_by_lines if len(i.strip()) > 10]
|
525 |
+
|
526 |
+
if not splits_txt_by_lines:
|
527 |
+
logger.warning(f"文本没有有效句子: '{text[:100]}...'")
|
528 |
+
return 0
|
529 |
+
|
530 |
+
random.seed(time.time())
|
531 |
+
cross_linguals_map = {
|
532 |
+
'zh': 'en',
|
533 |
+
'en': 'zh'
|
534 |
+
}
|
535 |
+
try:
|
536 |
+
model_input, prompt_text = random.choice(prompts[language if not is_cross_lingual else cross_linguals_map[language]])
|
537 |
+
except KeyError:
|
538 |
+
logger.error(f"语言 '{language}' 在提示词中不存在! 可用语言: {list(prompts.keys())}")
|
539 |
+
return 0
|
540 |
+
|
541 |
+
llm_prompt_speech_token = model_input['llm_prompt_speech_token'].cpu().tolist() if 'llm_prompt_speech_token' in model_input else []
|
542 |
+
|
543 |
+
processed_count = 0
|
544 |
+
for tts_text in splits_txt_by_lines:
|
545 |
+
try:
|
546 |
+
if is_instructed:
|
547 |
+
tts_text = generate_mixed_instructions(tts_text, language)
|
548 |
+
prompt_text = ""
|
549 |
+
llm_prompt_speech_token[0]=[]
|
550 |
+
if 'prompt_text' in model_input:
|
551 |
+
del model_input['prompt_text']
|
552 |
+
if 'prompt_text_len' in model_input:
|
553 |
+
del model_input['prompt_text_len']
|
554 |
+
if 'llm_prompt_speech_token' in model_input:
|
555 |
+
del model_input['llm_prompt_speech_token']
|
556 |
+
if 'llm_prompt_speech_token_len' in model_input:
|
557 |
+
del model_input['llm_prompt_speech_token_len']
|
558 |
+
# 生成语音标记
|
559 |
+
tts_speech_tokens = generate_speech_tokens(llm, frontend, tts_text, model_input, device)
|
560 |
+
output_data = {
|
561 |
+
'text': tts_text,
|
562 |
+
'tts_speech_tokens': tts_speech_tokens,
|
563 |
+
'prompt_text': prompt_text,
|
564 |
+
'llm_prompt_speech_token': llm_prompt_speech_token[0]
|
565 |
+
}
|
566 |
+
output_fp.write(json.dumps(output_data, ensure_ascii=False) + '\n')
|
567 |
+
output_fp.flush()
|
568 |
+
processed_count += 1
|
569 |
+
except Exception as e:
|
570 |
+
logger.error(f"处理单个句子时出错: '{tts_text[:100]}...'")
|
571 |
+
logger.error(f"错误信息: {str(e)}")
|
572 |
+
logger.error(traceback.format_exc())
|
573 |
+
|
574 |
+
return processed_count
|
575 |
+
except Exception as e:
|
576 |
+
logger.error(f"处理文本块时出错")
|
577 |
+
logger.error(f"错误信息: {str(e)}")
|
578 |
+
logger.error(traceback.format_exc())
|
579 |
+
return 0
|
580 |
+
|
581 |
+
# 收集要处理的文件
|
582 |
+
files_to_process = []
|
583 |
+
|
584 |
+
# 处理JSONL文件
|
585 |
+
if jsonl_files is not None:
|
586 |
+
logger.info(f"处理指定的JSONL文件")
|
587 |
+
for file in jsonl_files:
|
588 |
+
if file.endswith('.jsonl'):
|
589 |
+
files_to_process.append(('jsonl', file))
|
590 |
+
logger.info(f"共有 {len([f for t, f in files_to_process if t == 'jsonl'])} 个JSONL文件需要处理")
|
591 |
+
|
592 |
+
# 处理Parquet文件
|
593 |
+
if parquet_files is not None:
|
594 |
+
logger.info(f"处理指定的Parquet文件")
|
595 |
+
for file in parquet_files:
|
596 |
+
if file.endswith('.parquet'):
|
597 |
+
files_to_process.append(('parquet', file))
|
598 |
+
logger.info(f"共有 {len([f for t, f in files_to_process if t == 'parquet'])} 个Parquet文件需要处理")
|
599 |
+
|
600 |
+
# 顺序处理所有文件
|
601 |
+
for file_type, file_path in files_to_process:
|
602 |
+
logger.info(f'开始处理文件: {file_path}')
|
603 |
+
try:
|
604 |
+
if file_type == 'jsonl':
|
605 |
+
# 处理JSONL文件
|
606 |
+
# 首先计算文件总行数,用于进度条
|
607 |
+
total_lines = 0
|
608 |
+
with open(file_path, 'r') as f:
|
609 |
+
for line in f:
|
610 |
+
if line.strip(): # 只计算非空行
|
611 |
+
total_lines += 1
|
612 |
+
|
613 |
+
logger.info(f"JSONL文件 {file_path} 共有 {total_lines} 行")
|
614 |
+
# 使用进度条处理文件
|
615 |
+
with open(file_path, 'r') as f:
|
616 |
+
from tqdm import tqdm
|
617 |
+
progress_bar = tqdm(total=total_lines, desc=f'处理JSONL文件: {os.path.basename(file_path)}')
|
618 |
+
file_processed = 0
|
619 |
+
for line in f:
|
620 |
+
line = line.strip()
|
621 |
+
if len(line) == 0:
|
622 |
+
continue
|
623 |
+
try:
|
624 |
+
data = json.loads(line)
|
625 |
+
text = data['text']
|
626 |
+
processed = process_single_text(text)
|
627 |
+
total_processed += processed
|
628 |
+
file_processed += processed
|
629 |
+
progress_bar.update(1)
|
630 |
+
progress_bar.set_postfix(total=total_processed)
|
631 |
+
except Exception as e:
|
632 |
+
logger.error(f"处理JSONL行时出错: {line[:100]}...")
|
633 |
+
logger.error(f"错误信息: {str(e)}")
|
634 |
+
logger.error(traceback.format_exc())
|
635 |
+
progress_bar.close()
|
636 |
+
logger.info(f"JSONL文件 {file_path} 完成处理,成功处理 {file_processed} 条记录")
|
637 |
+
|
638 |
+
elif file_type == 'parquet':
|
639 |
+
# 处理Parquet文件
|
640 |
+
try:
|
641 |
+
import pandas as pd
|
642 |
+
logger.info(f"加载Parquet文件: {file_path}")
|
643 |
+
df = pd.read_parquet(file_path)
|
644 |
+
logger.info(f"Parquet文件 {file_path} 共有 {len(df)} 行")
|
645 |
+
|
646 |
+
from tqdm import tqdm
|
647 |
+
progress_bar = tqdm(total=len(df), desc=f'处理Parquet文件: {os.path.basename(file_path)}')
|
648 |
+
file_processed = 0
|
649 |
+
for i in range(len(df)):
|
650 |
+
try:
|
651 |
+
text = df.iloc[i]['text']
|
652 |
+
processed = process_single_text(text)
|
653 |
+
total_processed += processed
|
654 |
+
file_processed += processed
|
655 |
+
progress_bar.update(1)
|
656 |
+
progress_bar.set_postfix(total=total_processed)
|
657 |
+
except Exception as e:
|
658 |
+
logger.error(f"处理Parquet行 {i} 时出错")
|
659 |
+
logger.error(f"错误信息: {str(e)}")
|
660 |
+
logger.error(traceback.format_exc())
|
661 |
+
progress_bar.close()
|
662 |
+
logger.info(f"Parquet文件 {file_path} 完成处理,成功处理 {file_processed} 条记录")
|
663 |
+
except ImportError:
|
664 |
+
logger.error("处理Parquet文件需要pandas库,请安装: pip install pandas")
|
665 |
+
except Exception as e:
|
666 |
+
logger.error(f"处理Parquet文件 {file_path} 时出现错误")
|
667 |
+
logger.error(f"错误信息: {str(e)}")
|
668 |
+
logger.error(traceback.format_exc())
|
669 |
+
except Exception as e:
|
670 |
+
logger.error(f"处理文件 {file_path} 时出现错误")
|
671 |
+
logger.error(f"错误信息: {str(e)}")
|
672 |
+
logger.error(traceback.format_exc())
|
673 |
+
|
674 |
+
logger.info(f'总共成功处理 {total_processed} 个样本,结果保存到 {output_file}')
|
675 |
+
|
676 |
+
except Exception as e:
|
677 |
+
logger.error("处理过程中出现全局错误")
|
678 |
+
logger.error(f"错误信息: {str(e)}")
|
679 |
+
logger.error(traceback.format_exc())
|
680 |
+
|
681 |
+
finally:
|
682 |
+
# 确保资源正确关闭
|
683 |
+
logger.info("清理资源...")
|
684 |
+
if output_fp is not None:
|
685 |
+
try:
|
686 |
+
output_fp.close()
|
687 |
+
logger.info(f"关闭输出文件")
|
688 |
+
except Exception as e:
|
689 |
+
logger.error(f"关闭输出文件时出错: {str(e)}")
|
690 |
+
|
691 |
+
# 释放GPU资源
|
692 |
+
if torch.cuda.is_available():
|
693 |
+
try:
|
694 |
+
torch.cuda.empty_cache()
|
695 |
+
logger.info("已清理GPU缓存")
|
696 |
+
except Exception as e:
|
697 |
+
logger.error(f"清理GPU缓存时出错: {str(e)}")
|
698 |
+
|
699 |
+
logger.info(f"处理结束时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
700 |
+
logger.info(f"='='='='='='='='='='='='='='='='='='='='='='='='='='='='='")
|
701 |
+
|
702 |
+
if __name__ == '__main__':
|
703 |
+
import argparse
|
704 |
+
"""
|
705 |
+
Parse arguments
|
706 |
+
task: str, including 'extract_prompt'
|
707 |
+
input_tar_files: list of str, input tar files
|
708 |
+
input_tar_languages: list of str, input tar languages for each tar file, must be the same length as input_tar_files
|
709 |
+
max_duration: float, max duration of audio
|
710 |
+
num_samples: int, number of samples to extract
|
711 |
+
target_sr: int, target sample rate
|
712 |
+
output_dir: str, output directory
|
713 |
+
num_processes: int, number of processes to use
|
714 |
+
prompt_dir: str, prompt directory which contains prompt jsonl files and audio files
|
715 |
+
language: str, language, zh or en
|
716 |
+
cosy_model_dir: str, cosy model directory
|
717 |
+
device: str, cuda device used to extract speech tokens
|
718 |
+
jsonl_files: list of str, jsonl files
|
719 |
+
parquet_files: list of str, parquet files
|
720 |
+
"""
|
721 |
+
parser = argparse.ArgumentParser()
|
722 |
+
parser.add_argument('--task', type=str, help='task')
|
723 |
+
parser.add_argument('--input_tar_files', nargs='+', type=str, help='input tar files')
|
724 |
+
parser.add_argument('--input_tar_languages', nargs='+', type=str, help='input tar languages for each tar file')
|
725 |
+
parser.add_argument('--output_dir', type=str, help='output directory',required=True)
|
726 |
+
parser.add_argument('--max_duration', type=float, default=5, help='max duration of audio')
|
727 |
+
parser.add_argument('--num_samples', type=int, default=10, help='number of samples to extract')
|
728 |
+
parser.add_argument('--target_sr', type=int, default=16000, help='target sample rate')
|
729 |
+
parser.add_argument('--num_processes', type=int, default=1, help='number of processes to use')
|
730 |
+
parser.add_argument('--prompts_dir', type=str, help='prompt directory which contains prompt jsonl files and audio files')
|
731 |
+
parser.add_argument('--language', type=str, help='language')
|
732 |
+
parser.add_argument('--cosy_model_dir', type=str, help='cosy model directory')
|
733 |
+
parser.add_argument('--device', type=str, help='cuda device used to extract speech tokens')
|
734 |
+
parser.add_argument('--jsonl_files', nargs='+', type=str, help='jsonl files')
|
735 |
+
parser.add_argument('--parquet_files', nargs='+', type=str, help='parquet files')
|
736 |
+
parser.add_argument('--is_cross_lingual', action='store_true', help='is cross lingual')
|
737 |
+
parser.add_argument('--is_instructed', action='store_true', help='is instructed')
|
738 |
+
args = parser.parse_args()
|
739 |
+
task = args.task
|
740 |
+
if task == 'extract_prompt':
|
741 |
+
input_tar_files = args.input_tar_files
|
742 |
+
input_tar_languages = args.input_tar_languages
|
743 |
+
output_dir = args.output_dir
|
744 |
+
assert len(input_tar_files) == len(input_tar_languages), 'input_tar_files and input_tar_languages must have the same length'
|
745 |
+
extract_prompt(input_tar_files, input_tar_languages, args.max_duration, args.num_samples, args.target_sr, output_dir)
|
746 |
+
elif task == 'generate_speech_tokens':
|
747 |
+
prompts_dir = args.prompts_dir
|
748 |
+
language = args.language
|
749 |
+
cosy_model_dir = args.cosy_model_dir
|
750 |
+
jsonl_files = args.jsonl_files
|
751 |
+
parquet_files = args.parquet_files
|
752 |
+
device = args.device
|
753 |
+
is_cross_lingual = args.is_cross_lingual
|
754 |
+
is_instructed = args.is_instructed
|
755 |
+
# 使用单进程单线程版本替代多进程版本
|
756 |
+
generate_speech_tokens_single_process(
|
757 |
+
cosy_model_dir=cosy_model_dir,
|
758 |
+
prompts_dir=prompts_dir,
|
759 |
+
output_dir=args.output_dir,
|
760 |
+
language=language,
|
761 |
+
jsonl_files=jsonl_files,
|
762 |
+
parquet_files=parquet_files,
|
763 |
+
device=device,
|
764 |
+
is_cross_lingual=is_cross_lingual,
|
765 |
+
is_instructed=is_instructed,
|
766 |
+
)
|
767 |
+
|
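To make the natural-language instruction format above concrete, this small sketch shows how one of the Chinese emotion templates from utilitie.py composes with the <|endofprompt|> marker. The template, emotion and text are fixed here for illustration; generate_mixed_instructions samples them with random.choice instead.

end_of_prompt = '<|endofprompt|>'
template = '你能用{}的情感说吗?{}{}'   # one of the templates used by generate_in_emotion_in_chinese
emotion = '高兴'
tts_text = '今天真是太开心了,马上要放假了!'
print(template.format(emotion, end_of_prompt, tts_text))
# 你能用高兴的情感说吗?<|endofprompt|>今天真是太开心了,马上要放假了!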
eval/eval_seed_generate.py
ADDED
@@ -0,0 +1,66 @@
|
1 |
+
#Download the evaluation file from:https://drive.google.com/file/d/1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP/edit
|
2 |
+
import os
|
3 |
+
voice_engine = None
|
4 |
+
def init_process_func(model_path,device):
|
5 |
+
global voice_engine
|
6 |
+
from cosyvoice.cli.cosyvoice import CosyVoice2
|
7 |
+
voice_engine = CosyVoice2(model_path,device=device,fp16=False,load_jit=False)
|
8 |
+
print(f'Finish loading cosyvoice model from {model_path} in process {os.getpid()}')
|
9 |
+
def do_tts(ID,tts_text,prompt_text,prompt_audio_file,output_dir):
|
10 |
+
from cosyvoice.utils.file_utils import load_wav
|
11 |
+
import torchaudio
|
12 |
+
global voice_engine
|
13 |
+
try:
|
14 |
+
final_output_file = os.path.join(output_dir,f'{ID}.wav')
|
15 |
+
prompt_speech_16k = load_wav(prompt_audio_file, 16000)
|
16 |
+
for output in voice_engine.inference_zero_shot(tts_text,prompt_text, prompt_speech_16k, stream=False,speed=1):
|
17 |
+
torchaudio.save(final_output_file, output['tts_speech'], voice_engine.sample_rate)
|
18 |
+
break # only save the first output
|
19 |
+
print(f'TTS {tts_text} and Save to {final_output_file} at process {os.getpid()}')
|
20 |
+
except Exception as e:
|
21 |
+
print(f'Error: {e}')
|
22 |
+
print(f'Error processing {ID} at process {os.getpid()}')
|
23 |
+
import traceback
|
24 |
+
traceback.print_exc()
|
25 |
+
return
|
26 |
+
if __name__ == '__main__':
|
27 |
+
import argparse
|
28 |
+
parser = argparse.ArgumentParser()
|
29 |
+
parser.add_argument("--eval_dir", type=str, default='eval_data/seedtts_testset')
|
30 |
+
parser.add_argument("--language", type=str, default='zh',choices=['zh','en'])
|
31 |
+
parser.add_argument("--model_path", type=str, default='/home/yueyulin/models/CosyVoice2-0.5B_RWKV_1.5B/')
|
32 |
+
parser.add_argument("--device", type=str, default='cuda:0')
|
33 |
+
parser.add_argument("--num_processes", type=int, default=2)
|
34 |
+
parser.add_argument("--output_dir", type=str, default='generated')
|
35 |
+
parser.add_argument("--list_file", type=str, default='meta.lst')
|
36 |
+
|
37 |
+
|
38 |
+
args = parser.parse_args()
|
39 |
+
print(args)
|
40 |
+
output_dir = os.path.join(args.eval_dir,args.language,args.output_dir)
|
41 |
+
#first delete the output_dir
|
42 |
+
if os.path.exists(output_dir):
|
43 |
+
import shutil
|
44 |
+
shutil.rmtree(output_dir)
|
45 |
+
os.makedirs(output_dir)
|
46 |
+
list_file = os.path.join(args.eval_dir,args.language,args.list_file)
|
47 |
+
with open(list_file) as f:
|
48 |
+
lines = f.readlines()
|
49 |
+
lines = [line.strip() for line in lines]
|
50 |
+
print(f'Processing {len(lines)} lines')
|
51 |
+
|
52 |
+
from multiprocessing import Pool
|
53 |
+
from functools import partial
|
54 |
+
import time
|
55 |
+
with Pool(args.num_processes,init_process_func,(args.model_path,args.device)) as p:
|
56 |
+
for line in lines:
|
57 |
+
# 10002287-00000095|在此奉劝大家别乱打美白针。|prompt-wavs/10002287-00000094.wav|简单地说,这相当于惠普把消费领域市场拱手相让了。
|
58 |
+
parts = line.split('|')
|
59 |
+
ID = parts[0]
|
60 |
+
tts_text = parts[3]
|
61 |
+
prompt_text = parts[1]
|
62 |
+
prompt_audio_file = os.path.join(args.eval_dir,args.language,parts[2])
|
63 |
+
p.apply_async(do_tts,(ID,tts_text,prompt_text,prompt_audio_file,output_dir))
|
64 |
+
p.close()
|
65 |
+
p.join()
|
66 |
+
print('All done')
|
gradio/tts_demo_page.py
ADDED
@@ -0,0 +1,81 @@
|
1 |
+
import os
|
2 |
+
import tempfile
|
3 |
+
import torch
|
4 |
+
import torchaudio
|
5 |
+
import gradio as gr
|
6 |
+
from cosyvoice.cli.cosyvoice import CosyVoice2
|
7 |
+
from cosyvoice.utils.file_utils import load_wav
|
8 |
+
|
9 |
+
# 全局变量
|
10 |
+
model_path = '/external_data/models/CosyVoice2-0.5B_RWKV_0.19B/'
|
11 |
+
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
|
12 |
+
|
13 |
+
# 在应用启动时初始化模型(全局共享)
|
14 |
+
print("正在初始化 CosyVoice2 模型...")
|
15 |
+
cosyvoice = CosyVoice2(model_path, device=device, fp16=True)
|
16 |
+
# 预热模型
|
17 |
+
cosyvoice.model.llm.dummy_forward()
|
18 |
+
print("模型初始化完成!")
|
19 |
+
|
20 |
+
def synthesize_speech(audio_file, prompt_text, tts_text):
|
21 |
+
"""合成语音"""
|
22 |
+
global cosyvoice
|
23 |
+
|
24 |
+
if not audio_file or not prompt_text or not tts_text:
|
25 |
+
return None, "请提供所有必需的输入(提示音频、提示文本和要合成的文本)"
|
26 |
+
|
27 |
+
try:
|
28 |
+
# 加载提示音频
|
29 |
+
prompt_speech_16k = load_wav(audio_file, 16000)
|
30 |
+
|
31 |
+
# 执行推理
|
32 |
+
result = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=False)
|
33 |
+
|
34 |
+
# 获取合成的语音
|
35 |
+
output_speech = result[0]['tts_speech']
|
36 |
+
|
37 |
+
# 保存临时文件
|
38 |
+
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
|
39 |
+
temp_file.close()
|
40 |
+
torchaudio.save(temp_file.name, output_speech, cosyvoice.sample_rate)
|
41 |
+
|
42 |
+
return temp_file.name, f"语音合成成功!"
|
43 |
+
except Exception as e:
|
44 |
+
return None, f"合成过程中出错:{str(e)}"
|
45 |
+
|
46 |
+
# 创建 Gradio 界面
|
47 |
+
with gr.Blocks(title="RWKV TTS 演示") as demo:
|
48 |
+
gr.Markdown("# RWKV 语音合成演示")
|
49 |
+
gr.Markdown("### 语音合成系统已准备就绪,可直接使用")
|
50 |
+
|
51 |
+
with gr.Row():
|
52 |
+
with gr.Column():
|
53 |
+
audio_input = gr.Audio(type="filepath", label="上传提示音频文件(WAV 格式)")
|
54 |
+
prompt_text = gr.Textbox(label="提示文本(与提示音频对应的文字内容)", placeholder="例如:今天天气挺不错的。")
|
55 |
+
tts_text = gr.Textbox(label="要合成的文本", placeholder="例如:收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。")
|
56 |
+
synthesize_button = gr.Button("生成语音")
|
57 |
+
|
58 |
+
with gr.Column():
|
59 |
+
audio_output = gr.Audio(label="合成的语音")
|
60 |
+
output_message = gr.Textbox(label="状态信息")
|
61 |
+
|
62 |
+
synthesize_button.click(
|
63 |
+
fn=synthesize_speech,
|
64 |
+
inputs=[audio_input, prompt_text, tts_text],
|
65 |
+
outputs=[audio_output, output_message]
|
66 |
+
)
|
67 |
+
|
68 |
+
gr.Markdown("""
|
69 |
+
## 使用说明
|
70 |
+
|
71 |
+
1. 上传一个WAV格式的提示音频文件
|
72 |
+
2. 输入与提示音频对应的文本内容
|
73 |
+
3. 输入希望合成的文本
|
74 |
+
4. 点击"生成语音"按钮进行语音合成
|
75 |
+
|
76 |
+
注意:模型已在服务启动时预加载,所有用户共享同一个模型实例。
|
77 |
+
""")
|
78 |
+
|
79 |
+
# 启动应用
|
80 |
+
if __name__ == "__main__":
|
81 |
+
demo.launch()
|
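By default demo.launch() only binds to localhost. If the demo needs to be reachable from other machines, Gradio's standard server_name and server_port arguments can be passed; the values below are illustrative and not part of the original script.

# Hypothetical alternative to the plain demo.launch() above.
demo.launch(server_name="0.0.0.0", server_port=7860)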
mine.wav
ADDED
Binary file (97 kB).
|
|
new.mp3
ADDED
Binary file (25.7 kB).
|
|
new.wav
ADDED
@@ -0,0 +1,3 @@
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7e62a130a15a7560ebf8c1bd73212a9d6410a50e595de9a809bc64775a4a6f07
|
3 |
+
size 141964
|
run_multiple_process.sh
ADDED
@@ -0,0 +1,137 @@
|
1 |
+
export PYTHONPATH=/home/yueyulin/github/CosyVoice:/home/yueyulin/github/CosyVoice/third_party/Matcha-TTS/:/home/yueyulin/github/RWKVTTS
|
2 |
+
|
3 |
+
# 设置默认参数
|
4 |
+
LANGUAGE="zh"
|
5 |
+
OUTPUT_DIR="/home/yueyulin/data/speech_corpus"
|
6 |
+
COSY_MODEL_DIR="/home/yueyulin/models/CosyVoice2-0.5B/"
|
7 |
+
PROMPTS_DIR="extract_data/prompts/zh"
|
8 |
+
DEVICE="cuda:0"
|
9 |
+
PARQUET_FILES=()
|
10 |
+
JSONL_FILES=()
|
11 |
+
FILE_TYPE="" # 用于标记文件类型
|
12 |
+
is_cross_lingual=""
|
13 |
+
is_instructed=""
|
14 |
+
|
15 |
+
# 解析命令行参数
|
16 |
+
while [[ $# -gt 0 ]]; do
|
17 |
+
case $1 in
|
18 |
+
--language)
|
19 |
+
LANGUAGE="$2"
|
20 |
+
shift 2
|
21 |
+
;;
|
22 |
+
--output_dir)
|
23 |
+
OUTPUT_DIR="$2"
|
24 |
+
shift 2
|
25 |
+
;;
|
26 |
+
--cosy_model_dir)
|
27 |
+
COSY_MODEL_DIR="$2"
|
28 |
+
shift 2
|
29 |
+
;;
|
30 |
+
--prompts_dir)
|
31 |
+
PROMPTS_DIR="$2"
|
32 |
+
shift 2
|
33 |
+
;;
|
34 |
+
--parquet_files)
|
35 |
+
# 接收多个parquet文件路径
|
36 |
+
shift
|
37 |
+
while [[ $# -gt 0 && ! $1 =~ ^-- ]]; do
|
38 |
+
PARQUET_FILES+=("$1")
|
39 |
+
shift
|
40 |
+
done
|
41 |
+
FILE_TYPE="parquet"
|
42 |
+
;;
|
43 |
+
--jsonl_files)
|
44 |
+
# 接收多个jsonl文件路径
|
45 |
+
shift
|
46 |
+
while [[ $# -gt 0 && ! $1 =~ ^-- ]]; do
|
47 |
+
JSONL_FILES+=("$1")
|
48 |
+
shift
|
49 |
+
done
|
50 |
+
FILE_TYPE="jsonl"
|
51 |
+
;;
|
52 |
+
--device)
|
53 |
+
DEVICE="$2"
|
54 |
+
shift 2
|
55 |
+
;;
|
56 |
+
--cross_lingual)
|
57 |
+
is_cross_lingual="--is_cross_lingual"
|
58 |
+
shift
|
59 |
+
;;
|
60 |
+
--instructed)
|
61 |
+
is_instructed="--is_instructed"
|
62 |
+
shift
|
63 |
+
;;
|
64 |
+
*)
|
65 |
+
echo "未知参数: $1"
|
66 |
+
exit 1
|
67 |
+
;;
|
68 |
+
esac
|
69 |
+
done
|
70 |
+
|
71 |
+
# 检查是否提供了文件
|
72 |
+
if [ "$FILE_TYPE" == "parquet" ]; then
|
73 |
+
if [ ${#PARQUET_FILES[@]} -eq 0 ]; then
|
74 |
+
echo "错误: 未指定parquet文件,请使用 --parquet_files 参数"
|
75 |
+
exit 1
|
76 |
+
fi
|
77 |
+
FILES=("${PARQUET_FILES[@]}")
|
78 |
+
FILE_ARG="--parquet_files"
|
79 |
+
echo "将处理 ${#FILES[@]} 个parquet文件"
|
80 |
+
elif [ "$FILE_TYPE" == "jsonl" ]; then
|
81 |
+
if [ ${#JSONL_FILES[@]} -eq 0 ]; then
|
82 |
+
echo "错误: 未指定jsonl文件,请使用 --jsonl_files 参数"
|
83 |
+
exit 1
|
84 |
+
fi
|
85 |
+
FILES=("${JSONL_FILES[@]}")
|
86 |
+
FILE_ARG="--jsonl_files"
|
87 |
+
echo "将处理 ${#FILES[@]} 个jsonl文件"
|
88 |
+
else
|
89 |
+
echo "错误: 请使用 --parquet_files 或 --jsonl_files 参数指定输入文件"
|
90 |
+
exit 1
|
91 |
+
fi
|
92 |
+
|
93 |
+
echo "运行参数:"
|
94 |
+
echo "语言: $LANGUAGE"
|
95 |
+
echo "输出目录: $OUTPUT_DIR"
|
96 |
+
echo "模型目录: $COSY_MODEL_DIR"
|
97 |
+
echo "提示词目录: $PROMPTS_DIR"
|
98 |
+
echo "设备: $DEVICE"
|
99 |
+
echo "文件类型: $FILE_TYPE"
|
100 |
+
|
101 |
+
# 确保输出目录存在
|
102 |
+
mkdir -p $OUTPUT_DIR
|
103 |
+
|
104 |
+
# 启动处理进程,每个文件一个进程
|
105 |
+
for ((i=0; i<${#FILES[@]}; i++)); do
|
106 |
+
FILE="${FILES[$i]}"
|
107 |
+
FILENAME=$(basename "$FILE")
|
108 |
+
|
109 |
+
echo "处理文件 $FILENAME 使用 $DEVICE"
|
110 |
+
|
111 |
+
# 在后台启动进程
|
112 |
+
nohup python data/utils/utilitie.py \
|
113 |
+
--task generate_speech_tokens \
|
114 |
+
--language $LANGUAGE \
|
115 |
+
$is_cross_lingual \
|
116 |
+
$FILE_ARG "$FILE" \
|
117 |
+
--output_dir $OUTPUT_DIR \
|
118 |
+
--cosy_model_dir $COSY_MODEL_DIR \
|
119 |
+
--prompts_dir $PROMPTS_DIR \
|
120 |
+
$is_instructed \
|
121 |
+
--device "$DEVICE" > "$OUTPUT_DIR/log_${FILENAME%.*}.log" 2>&1 &
|
122 |
+
|
123 |
+
# 记录进程ID
|
124 |
+
PID=$!
|
125 |
+
echo "启动进程 PID: $PID 处理文件: $FILENAME 使用 $DEVICE"
|
126 |
+
|
127 |
+
# 等待一点时间确保进程启动
|
128 |
+
sleep 5
|
129 |
+
done
|
130 |
+
|
131 |
+
echo "所有处理进程已启动,日志文件保存在 $OUTPUT_DIR 目录"
|
132 |
+
echo "使用 'ps aux | grep utilitie.py' 命令查看运行状态"
|
133 |
+
echo "使用 'nvidia-smi' 命令监控GPU使用情况"
|
134 |
+
|
135 |
+
# 等待所有后台进程完成
|
136 |
+
wait
|
137 |
+
echo "所有处理已完成"
|
rwkvtts_requirements.txt
ADDED
@@ -0,0 +1,264 @@
|
+absl-py==2.1.0
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.8
+aiohttp==3.11.13
+aiosignal==1.3.2
+alembic==1.15.1
+altair==5.5.0
+annotated-types==0.7.0
+antlr4-python3-runtime==4.9.3
+anyio==4.8.0
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==3.0.0
+async-lru==2.0.4
+attrs==25.1.0
+audioread==3.0.1
+autopage==0.5.2
+babel==2.17.0
+beautifulsoup4==4.13.3
+bleach==6.2.0
+certifi==2025.1.31
+cffi==1.17.1
+cfgv==3.4.0
+charset-normalizer==3.4.1
+click==8.1.8
+cliff==4.9.1
+cmaes==0.11.1
+cmd2==2.5.11
+colorama==0.4.6
+coloredlogs==15.0.1
+colorlog==6.9.0
+comm==0.2.2
+conformer==0.3.2
+contourpy==1.3.1
+csvw==3.5.1
+cycler==0.12.1
+Cython==3.0.12
+datasets==3.3.2
+debugpy==1.8.13
+decorator==5.2.1
+deepspeed==0.16.4
+defusedxml==0.7.1
+diffusers==0.32.2
+dill==0.3.8
+distlib==0.3.9
+dlinfo==2.0.0
+einops==0.8.1
+executing==2.2.0
+fastapi==0.115.11
+fastjsonschema==2.21.1
+ffmpy==0.5.0
+filelock==3.17.0
+flatbuffers==25.2.10
+fonttools==4.56.0
+fqdn==1.5.1
+frozenlist==1.5.0
+fsspec==2024.12.0
+gdown==5.2.0
+gradio==3.43.2
+gradio_client==0.5.0
+greenlet==3.1.1
+grpcio==1.70.0
+h11==0.14.0
+hjson==3.1.0
+httpcore==1.0.7
+httpx==0.28.1
+huggingface-hub==0.29.1
+humanfriendly==10.0
+hydra-colorlog==1.2.0
+hydra-core==1.3.2
+hydra-optuna-sweeper==1.2.0
+HyperPyYAML==1.2.2
+identify==2.6.8
+idna==3.10
+importlib_metadata==8.6.1
+importlib_resources==6.5.2
+inflect==7.5.0
+iniconfig==2.0.0
+ipykernel==6.29.5
+ipython==9.0.1
+ipython_pygments_lexers==1.1.1
+ipywidgets==8.1.5
+isodate==0.7.2
+isoduration==20.11.0
+jedi==0.19.2
+Jinja2==3.1.5
+joblib==1.4.2
+json5==0.10.0
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jupyter-events==0.12.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.15.0
+jupyter_server_terminals==0.5.3
+jupyterlab==4.3.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+kiwisolver==1.4.8
+language-tags==1.2.0
+lazy_loader==0.4
+librosa==0.10.2.post1
+lightning==2.5.0.post0
+lightning-utilities==0.13.1
+llvmlite==0.44.0
+Mako==1.3.9
+Markdown==3.7
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matcha-tts==0.0.7.2
+matplotlib==3.10.1
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mistune==3.1.2
+modelscope==1.23.2
+more-itertools==10.6.0
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+multiprocess==0.70.16
+narwhals==1.29.0
+nbclient==0.10.2
+nbconvert==7.16.6
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.2
+ninja==1.11.1.3
+nodeenv==1.9.1
+notebook==7.3.2
+notebook_shim==0.2.4
+numba==0.61.0
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cusparselt-cu12==0.6.2
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+omegaconf==2.3.0
+onnx==1.17.0
+onnxruntime-gpu==1.20.1
+openai-whisper==20240930
+optuna==2.10.1
+orjson==3.10.15
+overrides==7.7.0
+packaging==24.2
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+pbr==6.1.1
+pexpect==4.9.0
+phonemizer==3.3.0
+pillow==10.4.0
+platformdirs==4.3.6
+pluggy==1.5.0
+pooch==1.8.2
+pre_commit==4.1.0
+prettytable==3.15.1
+prometheus_client==0.21.1
+prompt_toolkit==3.0.50
+propcache==0.3.0
+protobuf==6.30.0
+psutil==7.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+py-cpuinfo==9.0.0
+pyarrow==19.0.1
+pycparser==2.22
+pydantic==2.10.6
+pydantic_core==2.27.2
+pydub==0.25.1
+Pygments==2.19.1
+pyparsing==3.2.1
+pyperclip==1.9.0
+PySocks==1.7.1
+pytest==8.3.5
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-json-logger==3.2.1
+python-multipart==0.0.20
+pytorch-lightning==2.5.0.post0
+pytz==2025.1
+pyworld==0.3.5
+PyYAML==6.0.2
+pyzmq==26.2.1
+rdflib==7.1.3
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986==1.5.0
+rfc3986-validator==0.1.1
+rich==13.9.4
+rootutils==1.0.7
+rpds-py==0.23.1
+ruamel.yaml==0.18.10
+ruamel.yaml.clib==0.2.12
+rwkv-fla==0.7.202503020902
+safetensors==0.5.3
+scikit-learn==1.6.1
+scipy==1.15.2
+seaborn==0.13.2
+segments==2.3.0
+semantic-version==2.10.0
+Send2Trash==1.8.3
+six==1.17.0
+sniffio==1.3.1
+soundfile==0.13.1
+soupsieve==2.6
+soxr==0.5.0.post1
+SQLAlchemy==2.0.38
+stack-data==0.6.3
+starlette==0.46.0
+stevedore==5.4.1
+sympy==1.13.1
+tensorboard==2.19.0
+tensorboard-data-server==0.7.2
+terminado==0.18.1
+threadpoolctl==3.5.0
+tiktoken==0.9.0
+tinycss2==1.4.0
+tokenizers==0.21.0
+torch==2.6.0
+torchaudio==2.6.0
+torchmetrics==1.6.2
+torchvision==0.21.0
+tornado==6.4.2
+tqdm==4.67.1
+traitlets==5.14.3
+transformers==4.49.0
+triton==3.2.0
+typeguard==4.4.2
+types-python-dateutil==2.9.0.20241206
+typing_extensions==4.12.2
+tzdata==2025.1
+Unidecode==1.3.8
+uri-template==1.3.0
+uritemplate==4.1.1
+urllib3==2.3.0
+uvicorn==0.34.0
+virtualenv==20.29.2
+wcwidth==0.2.13
+webcolors==24.11.1
+webencodings==0.5.1
+websocket-client==1.8.0
+websockets==11.0.3
+Werkzeug==3.1.3
+WeTextProcessing==1.0.4.1
+wget==3.2
+widgetsnbextension==4.0.13
+xxhash==3.5.0
+yarl==1.18.3
+zipp==3.21.0
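A quick sanity check that the key pins above resolved as expected in a freshly built environment (a minimal sketch; it only assumes rwkvtts_requirements.txt was installed with pip into the current interpreter):

    # Sketch: verify a few of the pinned packages are the expected versions.
    import torch, torchaudio, transformers
    assert torch.__version__.startswith("2.6.0"), torch.__version__             # pinned torch==2.6.0
    assert torchaudio.__version__.startswith("2.6.0"), torchaudio.__version__   # pinned torchaudio==2.6.0
    assert transformers.__version__ == "4.49.0", transformers.__version__       # pinned transformers==4.49.0
    print("environment matches the pinned requirements")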
third_party/cosyvoice/dataset/processor.py
ADDED
@@ -0,0 +1,435 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import random
+
+import pyarrow.parquet as pq
+from io import BytesIO
+import torch
+import torchaudio
+from torch.nn.utils.rnn import pad_sequence
+import torch.nn.functional as F
+import pyworld as pw
+
+
+AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}
+
+
+def parquet_opener(data, mode='train', tts_data={}):
+    """ Give url or local file, return file descriptor
+        Inplace operation.
+
+        Args:
+            data(Iterable[str]): url or local file list
+
+        Returns:
+            Iterable[{src, stream}]
+    """
+    for sample in data:
+        assert 'src' in sample
+        url = sample['src']
+        try:
+            for df in pq.ParquetFile(url).iter_batches(batch_size=64):
+                df = df.to_pandas()
+                for i in range(len(df)):
+                    if mode == 'inference' and df.loc[i, 'utt'] not in tts_data:
+                        continue
+                    sample.update(dict(df.loc[i]))
+                    if mode == 'train':
+                        # NOTE do not return sample directly, must initialize a new dict
+                        yield {**sample}
+                    else:
+                        for index, text in enumerate(tts_data[df.loc[i, 'utt']]):
+                            yield {**sample, 'tts_index': index, 'tts_text': text}
+        except Exception as ex:
+            logging.warning('Failed to open {}, ex info {}'.format(url, ex))
+
+
+def filter(data,
+           max_length=10240,
+           min_length=10,
+           token_max_length=200,
+           token_min_length=1,
+           min_output_input_ratio=0.0005,
+           max_output_input_ratio=1,
+           mode='train'):
+    """ Filter sample according to feature and label length
+        Inplace operation.
+
+        Args::
+            data: Iterable[{key, wav, label, sample_rate}]
+            max_length: drop utterance which is greater than max_length(10ms)
+            min_length: drop utterance which is less than min_length(10ms)
+            token_max_length: drop utterance which is greater than
+                token_max_length, especially when use char unit for
+                english modeling
+            token_min_length: drop utterance which is
+                less than token_max_length
+            min_output_input_ratio: minimal ration of
+                token_length / feats_length(10ms)
+            max_output_input_ratio: maximum ration of
+                token_length / feats_length(10ms)
+
+        Returns:
+            Iterable[{key, wav, label, sample_rate}]
+    """
+    for sample in data:
+        sample['speech'], sample['sample_rate'] = torchaudio.load(BytesIO(sample['audio_data']))
+        sample['speech'] = sample['speech'].mean(dim=0, keepdim=True)
+        del sample['audio_data']
+        # sample['wav'] is torch.Tensor, we have 100 frames every second
+        num_frames = sample['speech'].size(1) / sample['sample_rate'] * 100
+        if num_frames < min_length:
+            continue
+        if num_frames > max_length:
+            continue
+        if len(sample['text_token']) < token_min_length:
+            continue
+        if len(sample['text_token']) > token_max_length:
+            continue
+        if len(sample['speech_token']) == 0:
+            continue
+        if num_frames != 0:
+            if len(sample['text_token']) / num_frames < min_output_input_ratio:
+                continue
+            if len(sample['text_token']) / num_frames > max_output_input_ratio:
+                continue
+        yield sample
+
+
+def resample(data, resample_rate=22050, min_sample_rate=16000, mode='train'):
+    """ Resample data.
+        Inplace operation.
+
+        Args:
+            data: Iterable[{key, wav, label, sample_rate}]
+            resample_rate: target resample rate
+
+        Returns:
+            Iterable[{key, wav, label, sample_rate}]
+    """
+    for sample in data:
+        assert 'sample_rate' in sample
+        assert 'speech' in sample
+        sample_rate = sample['sample_rate']
+        waveform = sample['speech']
+        if sample_rate != resample_rate:
+            if sample_rate < min_sample_rate:
+                continue
+            sample['sample_rate'] = resample_rate
+            sample['speech'] = torchaudio.transforms.Resample(
+                orig_freq=sample_rate, new_freq=resample_rate)(waveform)
+        max_val = sample['speech'].abs().max()
+        if max_val > 1:
+            sample['speech'] /= max_val
+        yield sample
+
+
+def truncate(data, truncate_length=24576, mode='train'):
+    """ Truncate data.
+
+        Args:
+            data: Iterable[{key, wav, label, sample_rate}]
+            truncate_length: truncate length
+
+        Returns:
+            Iterable[{key, wav, label, sample_rate}]
+    """
+    for sample in data:
+        waveform = sample['speech']
+        if waveform.shape[1] > truncate_length:
+            start = random.randint(0, waveform.shape[1] - truncate_length)
+            waveform = waveform[:, start: start + truncate_length]
+        else:
+            waveform = torch.concat([waveform, torch.zeros(1, truncate_length - waveform.shape[1])], dim=1)
+        sample['speech'] = waveform
+        yield sample
+
+
+def compute_fbank(data,
+                  feat_extractor,
+                  mode='train'):
+    """ Extract fbank
+
+        Args:
+            data: Iterable[{key, wav, label, sample_rate}]
+
+        Returns:
+            Iterable[{key, feat, label}]
+    """
+    for sample in data:
+        assert 'sample_rate' in sample
+        assert 'speech' in sample
+        assert 'utt' in sample
+        assert 'text_token' in sample
+        waveform = sample['speech']
+        mat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
+        sample['speech_feat'] = mat
+        yield sample
+
+
+def compute_f0(data, sample_rate, hop_size, mode='train'):
+    """ Extract f0
+
+        Args:
+            data: Iterable[{key, wav, label, sample_rate}]
+
+        Returns:
+            Iterable[{key, feat, label}]
+    """
+    frame_period = hop_size * 1000 / sample_rate
+    for sample in data:
+        assert 'sample_rate' in sample
+        assert 'speech' in sample
+        assert 'utt' in sample
+        assert 'text_token' in sample
+        waveform = sample['speech']
+        _f0, t = pw.harvest(waveform.squeeze(dim=0).numpy().astype('double'), sample_rate, frame_period=frame_period)
+        if sum(_f0 != 0) < 5:  # this happens when the algorithm fails
+            _f0, t = pw.dio(waveform.squeeze(dim=0).numpy().astype('double'), sample_rate, frame_period=frame_period)  # if harvest fails, try dio
+        f0 = pw.stonemask(waveform.squeeze(dim=0).numpy().astype('double'), _f0, t, sample_rate)
+        f0 = F.interpolate(torch.from_numpy(f0).view(1, 1, -1), size=sample['speech_feat'].shape[0], mode='linear').view(-1)
+        sample['pitch_feat'] = f0
+        yield sample
+
+
+def parse_embedding(data, normalize, mode='train'):
+    """ Parse utt_embedding/spk_embedding
+
+        Args:
+            data: Iterable[{key, wav, label, sample_rate}]
+
+        Returns:
+            Iterable[{key, feat, label}]
+    """
+    for sample in data:
+        sample['utt_embedding'] = torch.tensor(sample['utt_embedding'], dtype=torch.float32)
+        sample['spk_embedding'] = torch.tensor(sample['spk_embedding'], dtype=torch.float32)
+        if normalize:
+            sample['utt_embedding'] = F.normalize(sample['utt_embedding'], dim=0)
+            sample['spk_embedding'] = F.normalize(sample['spk_embedding'], dim=0)
+        yield sample
+
+
+def tokenize(data, get_tokenizer, allowed_special, mode='train'):
+    """ Decode text to chars or BPE
+        Inplace operation
+
+        Args:
+            data: Iterable[{key, wav, txt, sample_rate}]
+
+        Returns:
+            Iterable[{key, wav, txt, tokens, label, sample_rate}]
+    """
+    tokenizer = get_tokenizer()
+    for sample in data:
+        assert 'text' in sample
+        sample['text_token'] = tokenizer.encode(sample['text'], allowed_special=allowed_special)
+        if mode == 'inference':
+            sample['tts_text_token'] = tokenizer.encode(sample['tts_text'], allowed_special=allowed_special)
+        yield sample
+
+
+def shuffle(data, shuffle_size=10000, mode='train'):
+    """ Local shuffle the data
+
+        Args:
+            data: Iterable[{key, feat, label}]
+            shuffle_size: buffer size for shuffle
+
+        Returns:
+            Iterable[{key, feat, label}]
+    """
+    buf = []
+    for sample in data:
+        buf.append(sample)
+        if len(buf) >= shuffle_size:
+            random.shuffle(buf)
+            for x in buf:
+                yield x
+            buf = []
+    # The sample left over
+    random.shuffle(buf)
+    for x in buf:
+        yield x
+
+
+def sort(data, sort_size=500, mode='train'):
+    """ Sort the data by feature length.
+        Sort is used after shuffle and before batch, so we can group
+        utts with similar lengths into a batch, and `sort_size` should
+        be less than `shuffle_size`
+
+        Args:
+            data: Iterable[{key, feat, label}]
+            sort_size: buffer size for sort
+
+        Returns:
+            Iterable[{key, feat, label}]
+    """
+
+    buf = []
+    for sample in data:
+        buf.append(sample)
+        if len(buf) >= sort_size:
+            buf.sort(key=lambda x: x['speech_feat'].size(0))
+            for x in buf:
+                yield x
+            buf = []
+    # The sample left over
+    buf.sort(key=lambda x: x['speech_feat'].size(0))
+    for x in buf:
+        yield x
+
+
+def static_batch(data, batch_size=16):
+    """ Static batch the data by `batch_size`
+
+        Args:
+            data: Iterable[{key, feat, label}]
+            batch_size: batch size
+
+        Returns:
+            Iterable[List[{key, feat, label}]]
+    """
+    buf = []
+    for sample in data:
+        buf.append(sample)
+        if len(buf) >= batch_size:
+            yield buf
+            buf = []
+    if len(buf) > 0:
+        yield buf
+
+
+def dynamic_batch(data, max_frames_in_batch=12000, mode='train'):
+    """ Dynamic batch the data until the total frames in batch
+        reach `max_frames_in_batch`
+
+        Args:
+            data: Iterable[{key, feat, label}]
+            max_frames_in_batch: max_frames in one batch
+
+        Returns:
+            Iterable[List[{key, feat, label}]]
+    """
+    buf = []
+    longest_frames = 0
+    for sample in data:
+        assert 'speech_feat' in sample
+        assert isinstance(sample['speech_feat'], torch.Tensor)
+        new_sample_frames = sample['speech_feat'].size(0)
+        longest_frames = max(longest_frames, new_sample_frames)
+        frames_after_padding = longest_frames * (len(buf) + 1)
+        if frames_after_padding > max_frames_in_batch:
+            yield buf
+            buf = [sample]
+            longest_frames = new_sample_frames
+        else:
+            buf.append(sample)
+    if len(buf) > 0:
+        yield buf
+
+
+def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000, mode='train'):
+    """ Wrapper for static/dynamic batch
+    """
+    if mode == 'inference':
+        return static_batch(data, 1)
+    else:
+        if batch_type == 'static':
+            return static_batch(data, batch_size)
+        elif batch_type == 'dynamic':
+            return dynamic_batch(data, max_frames_in_batch)
+        else:
+            logging.fatal('Unsupported batch type {}'.format(batch_type))
+
+
+def padding(data, use_spk_embedding, mode='train', gan=False):
+    """ Padding the data into training data
+
+        Args:
+            data: Iterable[List[{key, feat, label}]]
+
+        Returns:
+            Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)]
+    """
+    for sample in data:
+        assert isinstance(sample, list)
+        speech_feat_len = torch.tensor([x['speech_feat'].size(1) for x in sample],
+                                       dtype=torch.int32)
+        order = torch.argsort(speech_feat_len, descending=True)
+
+        utts = [sample[i]['utt'] for i in order]
+        speech = [sample[i]['speech'].squeeze(dim=0) for i in order]
+        speech_len = torch.tensor([i.size(0) for i in speech], dtype=torch.int32)
+        speech = pad_sequence(speech, batch_first=True, padding_value=0)
+        speech_token = [torch.tensor(sample[i]['speech_token']) for i in order]
+        speech_token_len = torch.tensor([i.size(0) for i in speech_token], dtype=torch.int32)
+        speech_token = pad_sequence(speech_token,
+                                    batch_first=True,
+                                    padding_value=0)
+        speech_feat = [sample[i]['speech_feat'] for i in order]
+        speech_feat_len = torch.tensor([i.size(0) for i in speech_feat], dtype=torch.int32)
+        speech_feat = pad_sequence(speech_feat,
+                                   batch_first=True,
+                                   padding_value=0)
+        text = [sample[i]['text'] for i in order]
+        text_token = [torch.tensor(sample[i]['text_token']) for i in order]
+        text_token_len = torch.tensor([i.size(0) for i in text_token], dtype=torch.int32)
+        text_token = pad_sequence(text_token, batch_first=True, padding_value=0)
+        utt_embedding = torch.stack([sample[i]['utt_embedding'] for i in order], dim=0)
+        spk_embedding = torch.stack([sample[i]['spk_embedding'] for i in order], dim=0)
+        batch = {
+            "utts": utts,
+            "speech": speech,
+            "speech_len": speech_len,
+            "speech_token": speech_token,
+            "speech_token_len": speech_token_len,
+            "speech_feat": speech_feat,
+            "speech_feat_len": speech_feat_len,
+            "text": text,
+            "text_token": text_token,
+            "text_token_len": text_token_len,
+            "utt_embedding": utt_embedding,
+            "spk_embedding": spk_embedding,
+        }
+        if gan is True:
+            # in gan train, we need pitch_feat
+            pitch_feat = [sample[i]['pitch_feat'] for i in order]
+            pitch_feat_len = torch.tensor([i.size(0) for i in pitch_feat], dtype=torch.int32)
+            pitch_feat = pad_sequence(pitch_feat,
+                                      batch_first=True,
+                                      padding_value=0)
+            batch["pitch_feat"] = pitch_feat
+            batch["pitch_feat_len"] = pitch_feat_len
+        else:
+            # only gan train needs speech, delete it to save memory
+            del batch["speech"]
+            del batch["speech_len"]
+        if mode == 'inference':
+            tts_text = [sample[i]['tts_text'] for i in order]
+            tts_index = [sample[i]['tts_index'] for i in order]
+            tts_text_token = [torch.tensor(sample[i]['tts_text_token']) for i in order]
+            tts_text_token_len = torch.tensor([i.size(0) for i in tts_text_token], dtype=torch.int32)
+            tts_text_token = pad_sequence(tts_text_token, batch_first=True, padding_value=-1)
+            batch.update({'tts_text': tts_text,
+                          'tts_index': tts_index,
+                          'tts_text_token': tts_text_token,
+                          'tts_text_token_len': tts_text_token_len})
+        if use_spk_embedding is True:
+            batch["embedding"] = batch["spk_embedding"]
+        else:
+            batch["embedding"] = batch["utt_embedding"]
+        yield batch
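The functions in processor.py are generator stages intended to be chained into one lazy data pipeline. Below is a minimal sketch of how they compose; it assumes third_party/ is on PYTHONPATH so the module imports as cosyvoice.dataset.processor, and the parquet list, tokenizer factory, and fbank extractor passed in are hypothetical placeholders, with a stage order chosen to match the docstrings rather than any specific training config.

    from cosyvoice.dataset import processor

    def build_pipeline(parquet_files, get_tokenizer, feat_extractor):
        # Each stage consumes and yields dict samples, so the whole chain stays lazy.
        data = [{'src': f} for f in parquet_files]
        data = processor.parquet_opener(data)
        data = processor.tokenize(data, get_tokenizer, allowed_special='all')
        data = processor.filter(data)
        data = processor.resample(data, resample_rate=22050)
        data = processor.compute_fbank(data, feat_extractor)
        data = processor.parse_embedding(data, normalize=True)
        data = processor.shuffle(data, shuffle_size=1000)
        data = processor.sort(data, sort_size=500)
        data = processor.batch(data, batch_type='dynamic', max_frames_in_batch=12000)
        data = processor.padding(data, use_spk_embedding=False)
        return data  # iterate to obtain padded training batches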
third_party/cosyvoice/flow/decoder.py
ADDED
@@ -0,0 +1,301 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import pack, rearrange, repeat
+from cosyvoice.utils.common import mask_to_bias
+from cosyvoice.utils.mask import add_optional_chunk_mask
+from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D
+from matcha.models.components.transformer import BasicTransformerBlock
+
+
+class Transpose(torch.nn.Module):
+    def __init__(self, dim0: int, dim1: int):
+        super().__init__()
+        self.dim0 = dim0
+        self.dim1 = dim1
+
+    def forward(self, x: torch.Tensor):
+        x = torch.transpose(x, self.dim0, self.dim1)
+        return x
+
+
+class CausalBlock1D(Block1D):
+    def __init__(self, dim: int, dim_out: int):
+        super(CausalBlock1D, self).__init__(dim, dim_out)
+        self.block = torch.nn.Sequential(
+            CausalConv1d(dim, dim_out, 3),
+            Transpose(1, 2),
+            nn.LayerNorm(dim_out),
+            Transpose(1, 2),
+            nn.Mish(),
+        )
+
+    def forward(self, x: torch.Tensor, mask: torch.Tensor):
+        output = self.block(x * mask)
+        return output * mask
+
+
+class CausalResnetBlock1D(ResnetBlock1D):
+    def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8):
+        super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups)
+        self.block1 = CausalBlock1D(dim, dim_out)
+        self.block2 = CausalBlock1D(dim_out, dim_out)
+
+
+class CausalConv1d(torch.nn.Conv1d):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = 'zeros',
+        device=None,
+        dtype=None
+    ) -> None:
+        super(CausalConv1d, self).__init__(in_channels, out_channels,
+                                           kernel_size, stride,
+                                           padding=0, dilation=dilation,
+                                           groups=groups, bias=bias,
+                                           padding_mode=padding_mode,
+                                           device=device, dtype=dtype)
+        assert stride == 1
+        self.causal_padding = (kernel_size - 1, 0)
+
+    def forward(self, x: torch.Tensor):
+        x = F.pad(x, self.causal_padding)
+        x = super(CausalConv1d, self).forward(x)
+        return x
+
+
+class ConditionalDecoder(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        causal=False,
+        channels=(256, 256),
+        dropout=0.05,
+        attention_head_dim=64,
+        n_blocks=1,
+        num_mid_blocks=2,
+        num_heads=4,
+        act_fn="snake",
+    ):
+        """
+        This decoder requires an input with the same shape of the target. So, if your text content
+        is shorter or longer than the outputs, please re-sampling it before feeding to the decoder.
+        """
+        super().__init__()
+        channels = tuple(channels)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.causal = causal
+        self.time_embeddings = SinusoidalPosEmb(in_channels)
+        time_embed_dim = channels[0] * 4
+        self.time_mlp = TimestepEmbedding(
+            in_channels=in_channels,
+            time_embed_dim=time_embed_dim,
+            act_fn="silu",
+        )
+        self.down_blocks = nn.ModuleList([])
+        self.mid_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+
+        output_channel = in_channels
+        for i in range(len(channels)):  # pylint: disable=consider-using-enumerate
+            input_channel = output_channel
+            output_channel = channels[i]
+            is_last = i == len(channels) - 1
+            resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
+                ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+            transformer_blocks = nn.ModuleList(
+                [
+                    BasicTransformerBlock(
+                        dim=output_channel,
+                        num_attention_heads=num_heads,
+                        attention_head_dim=attention_head_dim,
+                        dropout=dropout,
+                        activation_fn=act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+            downsample = (
+                Downsample1D(output_channel) if not is_last else
+                CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
+            )
+            self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
+
+        for _ in range(num_mid_blocks):
+            input_channel = channels[-1]
+            out_channels = channels[-1]
+            resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
+                ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+
+            transformer_blocks = nn.ModuleList(
+                [
+                    BasicTransformerBlock(
+                        dim=output_channel,
+                        num_attention_heads=num_heads,
+                        attention_head_dim=attention_head_dim,
+                        dropout=dropout,
+                        activation_fn=act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+
+            self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
+
+        channels = channels[::-1] + (channels[0],)
+        for i in range(len(channels) - 1):
+            input_channel = channels[i] * 2
+            output_channel = channels[i + 1]
+            is_last = i == len(channels) - 2
+            resnet = CausalResnetBlock1D(
+                dim=input_channel,
+                dim_out=output_channel,
+                time_emb_dim=time_embed_dim,
+            ) if self.causal else ResnetBlock1D(
+                dim=input_channel,
+                dim_out=output_channel,
+                time_emb_dim=time_embed_dim,
+            )
+            transformer_blocks = nn.ModuleList(
+                [
+                    BasicTransformerBlock(
+                        dim=output_channel,
+                        num_attention_heads=num_heads,
+                        attention_head_dim=attention_head_dim,
+                        dropout=dropout,
+                        activation_fn=act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+            upsample = (
+                Upsample1D(output_channel, use_conv_transpose=True)
+                if not is_last
+                else CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
+            )
+            self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
+        self.final_block = CausalBlock1D(channels[-1], channels[-1]) if self.causal else Block1D(channels[-1], channels[-1])
+        self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
+        self.initialize_weights()
+
+    def initialize_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv1d):
+                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.GroupNorm):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.Linear):
+                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+
+    def forward(self, x, mask, mu, t, spks=None, cond=None):
+        """Forward pass of the UNet1DConditional model.
+
+        Args:
+            x (torch.Tensor): shape (batch_size, in_channels, time)
+            mask (_type_): shape (batch_size, 1, time)
+            t (_type_): shape (batch_size)
+            spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
+            cond (_type_, optional): placeholder for future use. Defaults to None.
+
+        Raises:
+            ValueError: _description_
+            ValueError: _description_
+
+        Returns:
+            _type_: _description_
+        """
+
+        t = self.time_embeddings(t).to(t.dtype)
+        t = self.time_mlp(t)
+
+        x = pack([x, mu], "b * t")[0]
+
+        if spks is not None:
+            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
+            x = pack([x, spks], "b * t")[0]
+        if cond is not None:
+            x = pack([x, cond], "b * t")[0]
+
+        hiddens = []
+        masks = [mask]
+        for resnet, transformer_blocks, downsample in self.down_blocks:
+            mask_down = masks[-1]
+            x = resnet(x, mask_down, t)
+            x = rearrange(x, "b c t -> b t c").contiguous()
+            # attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
+            attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1)
+            attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=attn_mask,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t").contiguous()
+            hiddens.append(x)  # Save hidden states for skip connections
+            x = downsample(x * mask_down)
+            masks.append(mask_down[:, :, ::2])
+        masks = masks[:-1]
+        mask_mid = masks[-1]
+
+        for resnet, transformer_blocks in self.mid_blocks:
+            x = resnet(x, mask_mid, t)
+            x = rearrange(x, "b c t -> b t c").contiguous()
+            # attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
+            attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1)
+            attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=attn_mask,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t").contiguous()
+
+        for resnet, transformer_blocks, upsample in self.up_blocks:
+            mask_up = masks.pop()
+            skip = hiddens.pop()
+            x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
+            x = resnet(x, mask_up, t)
+            x = rearrange(x, "b c t -> b t c").contiguous()
+            # attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
+            attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1)
+            attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=attn_mask,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t").contiguous()
+            x = upsample(x * mask_up)
+        x = self.final_block(x, mask_up)
+        output = self.final_proj(x * mask_up)
+        return output * mask
third_party/cosyvoice/flow/flow.py
ADDED
@@ -0,0 +1,239 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import random
+from typing import Dict, Optional
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from omegaconf import DictConfig
+from cosyvoice.utils.mask import make_pad_mask
+
+
+class MaskedDiffWithXvec(torch.nn.Module):
+    def __init__(self,
+                 input_size: int = 512,
+                 output_size: int = 80,
+                 spk_embed_dim: int = 192,
+                 output_type: str = "mel",
+                 vocab_size: int = 4096,
+                 input_frame_rate: int = 50,
+                 only_mask_loss: bool = True,
+                 encoder: torch.nn.Module = None,
+                 length_regulator: torch.nn.Module = None,
+                 decoder: torch.nn.Module = None,
+                 decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1,
+                                       'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
+                                                                 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}),
+                                       'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64,
+                                                          'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
+                 mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050,
+                                        'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}):
+        super().__init__()
+        self.input_size = input_size
+        self.output_size = output_size
+        self.decoder_conf = decoder_conf
+        self.mel_feat_conf = mel_feat_conf
+        self.vocab_size = vocab_size
+        self.output_type = output_type
+        self.input_frame_rate = input_frame_rate
+        logging.info(f"input frame rate={self.input_frame_rate}")
+        self.input_embedding = nn.Embedding(vocab_size, input_size)
+        self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
+        self.encoder = encoder
+        self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
+        self.decoder = decoder
+        self.length_regulator = length_regulator
+        self.only_mask_loss = only_mask_loss
+
+    def forward(
+            self,
+            batch: dict,
+            device: torch.device,
+    ) -> Dict[str, Optional[torch.Tensor]]:
+        token = batch['speech_token'].to(device)
+        token_len = batch['speech_token_len'].to(device)
+        feat = batch['speech_feat'].to(device)
+        feat_len = batch['speech_feat_len'].to(device)
+        embedding = batch['embedding'].to(device)
+
+        # xvec projection
+        embedding = F.normalize(embedding, dim=1)
+        embedding = self.spk_embed_affine_layer(embedding)
+
+        # concat text and prompt_text
+        mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
+        token = self.input_embedding(torch.clamp(token, min=0)) * mask
+
+        # text encode
+        h, h_lengths = self.encoder(token, token_len)
+        h = self.encoder_proj(h)
+        h, h_lengths = self.length_regulator(h, feat_len)
+
+        # get conditions
+        conds = torch.zeros(feat.shape, device=token.device)
+        for i, j in enumerate(feat_len):
+            if random.random() < 0.5:
+                continue
+            index = random.randint(0, int(0.3 * j))
+            conds[i, :index] = feat[i, :index]
+        conds = conds.transpose(1, 2)
+
+        mask = (~make_pad_mask(feat_len)).to(h)
+        feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1)
+        loss, _ = self.decoder.compute_loss(
+            feat.transpose(1, 2).contiguous(),
+            mask.unsqueeze(1),
+            h.transpose(1, 2).contiguous(),
+            embedding,
+            cond=conds
+        )
+        return {'loss': loss}
+
+    @torch.inference_mode()
+    def inference(self,
+                  token,
+                  token_len,
+                  prompt_token,
+                  prompt_token_len,
+                  prompt_feat,
+                  prompt_feat_len,
+                  embedding,
+                  flow_cache):
+        if self.fp16 is True:
+            prompt_feat = prompt_feat.half()
+            embedding = embedding.half()
+
+        assert token.shape[0] == 1
+        # xvec projection
+        embedding = F.normalize(embedding, dim=1)
+        embedding = self.spk_embed_affine_layer(embedding)
+
+        # concat text and prompt_text
+        token_len1, token_len2 = prompt_token.shape[1], token.shape[1]
+        token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
+        mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
+        token = self.input_embedding(torch.clamp(token, min=0)) * mask
+
+        # text encode
+        h, h_lengths = self.encoder(token, token_len)
+        h = self.encoder_proj(h)
+        mel_len1, mel_len2 = prompt_feat.shape[1], int(token_len2 / self.input_frame_rate * 22050 / 256)
+        h, h_lengths = self.length_regulator.inference(h[:, :token_len1], h[:, token_len1:], mel_len1, mel_len2, self.input_frame_rate)
+
+        # get conditions
+        conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device).to(h.dtype)
+        conds[:, :mel_len1] = prompt_feat
+        conds = conds.transpose(1, 2)
+
+        mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
+        feat, flow_cache = self.decoder(
+            mu=h.transpose(1, 2).contiguous(),
+            mask=mask.unsqueeze(1),
+            spks=embedding,
+            cond=conds,
+            n_timesteps=10,
+            prompt_len=mel_len1,
+            flow_cache=flow_cache
+        )
+        feat = feat[:, :, mel_len1:]
+        assert feat.shape[2] == mel_len2
+        return feat.float(), flow_cache
+
+
+class CausalMaskedDiffWithXvec(torch.nn.Module):
+    def __init__(self,
+                 input_size: int = 512,
+                 output_size: int = 80,
+                 spk_embed_dim: int = 192,
+                 output_type: str = "mel",
+                 vocab_size: int = 4096,
+                 input_frame_rate: int = 50,
+                 only_mask_loss: bool = True,
+                 token_mel_ratio: int = 2,
+                 pre_lookahead_len: int = 3,
+                 encoder: torch.nn.Module = None,
+                 decoder: torch.nn.Module = None,
+                 decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1,
+                                       'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
+                                                                 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}),
+                                       'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64,
+                                                          'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
+                 mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050,
+                                        'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}):
+        super().__init__()
+        self.input_size = input_size
+        self.output_size = output_size
+        self.decoder_conf = decoder_conf
+        self.mel_feat_conf = mel_feat_conf
+        self.vocab_size = vocab_size
+        self.output_type = output_type
+        self.input_frame_rate = input_frame_rate
+        logging.info(f"input frame rate={self.input_frame_rate}")
+        self.input_embedding = nn.Embedding(vocab_size, input_size)
+        self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
+        self.encoder = encoder
+        self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
+        self.decoder = decoder
+        self.only_mask_loss = only_mask_loss
+        self.token_mel_ratio = token_mel_ratio
+        self.pre_lookahead_len = pre_lookahead_len
+
+    @torch.inference_mode()
+    def inference(self,
+                  token,
+                  token_len,
+                  prompt_token,
+                  prompt_token_len,
+                  prompt_feat,
+                  prompt_feat_len,
+                  embedding,
+                  finalize):
+        if self.fp16 is True:
+            prompt_feat = prompt_feat.half()
+            embedding = embedding.half()
+
+        assert token.shape[0] == 1
+        # xvec projection
+        embedding = F.normalize(embedding, dim=1)
+        embedding = self.spk_embed_affine_layer(embedding)
+
+        # concat text and prompt_text
+        token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
+        mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
+        token = self.input_embedding(torch.clamp(token, min=0)) * mask
+
+        # text encode
+        h, h_lengths = self.encoder(token, token_len)
+        if finalize is False:
+            h = h[:, :-self.pre_lookahead_len * self.token_mel_ratio]
+        mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
+        h = self.encoder_proj(h)
+
+        # get conditions
+        conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device).to(h.dtype)
+        conds[:, :mel_len1] = prompt_feat
+        conds = conds.transpose(1, 2)
+
+        mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
+        feat, _ = self.decoder(
+            mu=h.transpose(1, 2).contiguous(),
+            mask=mask.unsqueeze(1),
+            spks=embedding,
+            cond=conds,
+            n_timesteps=10
+        )
+        feat = feat[:, :, mel_len1:]
+        assert feat.shape[2] == mel_len2
+        return feat.float(), None
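MaskedDiffWithXvec.forward reads a padded batch of the kind produced by processor.padding above. A minimal sketch of the keys and shapes it expects (toy sizes, purely illustrative; no values here come from any shipped config):

    import torch

    toy_batch = {
        'speech_token':     torch.randint(0, 4096, (2, 120)),            # (B, T_token) discrete speech tokens
        'speech_token_len': torch.tensor([120, 90], dtype=torch.int32),
        'speech_feat':      torch.randn(2, 240, 80),                     # (B, T_mel, n_mels) target mel features
        'speech_feat_len':  torch.tensor([240, 180], dtype=torch.int32),
        'embedding':        torch.randn(2, 192),                         # (B, spk_embed_dim) x-vectors
    }
    # A fully wired model (encoder, length_regulator, decoder) would then return
    # flow_model.forward(toy_batch, device) == {'loss': <conditional flow-matching loss>}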
third_party/cosyvoice/flow/flow_matching.py
ADDED
@@ -0,0 +1,217 @@
1 |
+
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import threading
|
15 |
+
import torch
|
16 |
+
import torch.nn.functional as F
|
17 |
+
from matcha.models.components.flow_matching import BASECFM
|
18 |
+
|
19 |
+
|
20 |
+
class ConditionalCFM(BASECFM):
|
21 |
+
def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):
|
22 |
+
super().__init__(
|
23 |
+
n_feats=in_channels,
|
24 |
+
cfm_params=cfm_params,
|
25 |
+
n_spks=n_spks,
|
26 |
+
spk_emb_dim=spk_emb_dim,
|
27 |
+
)
|
28 |
+
self.t_scheduler = cfm_params.t_scheduler
|
29 |
+
self.training_cfg_rate = cfm_params.training_cfg_rate
|
30 |
+
self.inference_cfg_rate = cfm_params.inference_cfg_rate
|
31 |
+
in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0)
|
32 |
+
# Just change the architecture of the estimator here
|
33 |
+
self.estimator = estimator
|
34 |
+
self.lock = threading.Lock()
|
35 |
+
|
36 |
+
@torch.inference_mode()
|
37 |
+
def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, prompt_len=0, flow_cache=torch.zeros(1, 80, 0, 2)):
|
38 |
+
"""Forward diffusion
|
39 |
+
|
40 |
+
Args:
|
41 |
+
mu (torch.Tensor): output of encoder
|
42 |
+
shape: (batch_size, n_feats, mel_timesteps)
|
43 |
+
mask (torch.Tensor): output_mask
|
44 |
+
shape: (batch_size, 1, mel_timesteps)
|
45 |
+
n_timesteps (int): number of diffusion steps
|
46 |
+
temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
|
47 |
+
spks (torch.Tensor, optional): speaker ids. Defaults to None.
|
48 |
+
shape: (batch_size, spk_emb_dim)
|
49 |
+
cond: Not used but kept for future purposes
|
50 |
+
|
51 |
+
Returns:
|
52 |
+
sample: generated mel-spectrogram
|
53 |
+
shape: (batch_size, n_feats, mel_timesteps)
|
54 |
+
"""
|
55 |
+
|
56 |
+
z = torch.randn_like(mu).to(mu.device).to(mu.dtype) * temperature
|
57 |
+
cache_size = flow_cache.shape[2]
|
58 |
+
# fix prompt and overlap part mu and z
|
59 |
+
if cache_size != 0:
|
60 |
+
z[:, :, :cache_size] = flow_cache[:, :, :, 0]
|
61 |
+
mu[:, :, :cache_size] = flow_cache[:, :, :, 1]
|
62 |
+
z_cache = torch.concat([z[:, :, :prompt_len], z[:, :, -34:]], dim=2)
|
63 |
+
mu_cache = torch.concat([mu[:, :, :prompt_len], mu[:, :, -34:]], dim=2)
|
64 |
+
flow_cache = torch.stack([z_cache, mu_cache], dim=-1)
|
65 |
+
|
66 |
+
t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
|
67 |
+
if self.t_scheduler == 'cosine':
|
68 |
+
t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
|
69 |
+
        return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), flow_cache

    def solve_euler(self, x, t_span, mu, mask, spks, cond):
        """
        Fixed euler solver for ODEs.
        Args:
            x (torch.Tensor): random noise
            t_span (torch.Tensor): n_timesteps interpolated
                shape: (n_timesteps + 1,)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            spks (torch.Tensor, optional): speaker ids. Defaults to None.
                shape: (batch_size, spk_emb_dim)
            cond: Not used but kept for future purposes
        """
        t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
        t = t.unsqueeze(dim=0)

        # I am storing this because I can later plot it by putting a debugger here and saving it to a file
        # Or in future might add like a return_all_steps flag
        sol = []

        # Do not use concat, it may cause memory format changed and trt infer with wrong results!
        x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
        mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=x.dtype)
        mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
        t_in = torch.zeros([2], device=x.device, dtype=x.dtype)
        spks_in = torch.zeros([2, 80], device=x.device, dtype=x.dtype)
        cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
        for step in range(1, len(t_span)):
            # Classifier-Free Guidance inference introduced in VoiceBox
            x_in[:] = x
            mask_in[:] = mask
            mu_in[0] = mu
            t_in[:] = t.unsqueeze(0)
            spks_in[0] = spks
            cond_in[0] = cond
            dphi_dt = self.forward_estimator(
                x_in, mask_in,
                mu_in, t_in,
                spks_in,
                cond_in
            )
            dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
            dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
            x = x + dt * dphi_dt
            t = t + dt
            sol.append(x)
            if step < len(t_span) - 1:
                dt = t_span[step + 1] - t

        return sol[-1].float()

    def forward_estimator(self, x, mask, mu, t, spks, cond):
        if isinstance(self.estimator, torch.nn.Module):
            return self.estimator.forward(x, mask, mu, t, spks, cond)
        else:
            with self.lock:
                self.estimator.set_input_shape('x', (2, 80, x.size(2)))
                self.estimator.set_input_shape('mask', (2, 1, x.size(2)))
                self.estimator.set_input_shape('mu', (2, 80, x.size(2)))
                self.estimator.set_input_shape('t', (2,))
                self.estimator.set_input_shape('spks', (2, 80))
                self.estimator.set_input_shape('cond', (2, 80, x.size(2)))
                # run trt engine
                self.estimator.execute_v2([x.contiguous().data_ptr(),
                                           mask.contiguous().data_ptr(),
                                           mu.contiguous().data_ptr(),
                                           t.contiguous().data_ptr(),
                                           spks.contiguous().data_ptr(),
                                           cond.contiguous().data_ptr(),
                                           x.data_ptr()])
            return x

    def compute_loss(self, x1, mask, mu, spks=None, cond=None):
        """Computes diffusion loss

        Args:
            x1 (torch.Tensor): Target
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): target mask
                shape: (batch_size, 1, mel_timesteps)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
                shape: (batch_size, spk_emb_dim)

        Returns:
            loss: conditional flow matching loss
            y: conditional flow
                shape: (batch_size, n_feats, mel_timesteps)
        """
        b, _, t = mu.shape

        # random timestep
        t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
        if self.t_scheduler == 'cosine':
            t = 1 - torch.cos(t * 0.5 * torch.pi)
        # sample noise p(x_0)
        z = torch.randn_like(x1)

        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
        u = x1 - (1 - self.sigma_min) * z

        # during training, we randomly drop condition to trade off mode coverage and sample fidelity
        if self.training_cfg_rate > 0:
            cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
            mu = mu * cfg_mask.view(-1, 1, 1)
            spks = spks * cfg_mask.view(-1, 1)
            cond = cond * cfg_mask.view(-1, 1, 1)

        pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
        loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
        return loss, y


class CausalConditionalCFM(ConditionalCFM):
    def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):
        super().__init__(in_channels, cfm_params, n_spks, spk_emb_dim, estimator)
        self.rand_noise = torch.randn([1, 80, 50 * 300])

    @torch.inference_mode()
    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
        """Forward diffusion

        Args:
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            n_timesteps (int): number of diffusion steps
            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
            spks (torch.Tensor, optional): speaker ids. Defaults to None.
                shape: (batch_size, spk_emb_dim)
            cond: Not used but kept for future purposes

        Returns:
            sample: generated mel-spectrogram
                shape: (batch_size, n_feats, mel_timesteps)
        """

        z = self.rand_noise[:, :, :mu.size(2)].to(mu.device).to(mu.dtype) * temperature
        # fix prompt and overlap part mu and z
        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
        if self.t_scheduler == 'cosine':
            t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
        return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), None
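Note: taken together, compute_loss and solve_euler implement optimal-transport conditional flow matching. Training regresses the estimator onto the straight-line velocity u = x1 - (1 - sigma_min) * z at the interpolated point y = (1 - (1 - sigma_min) * t) * z + t * x1, and inference integrates that learned velocity field with a fixed-step Euler loop plus classifier-free guidance. Below is a minimal, self-contained sketch of the same math on toy tensors; toy_estimator, sigma_min = 1e-6, and cfg_rate = 0.7 are illustrative stand-ins, not values read from this repository.

import torch
import torch.nn.functional as F

sigma_min = 1e-6   # assumed value; the real one comes from cfm_params
cfg_rate = 0.7     # assumed guidance strength; the real one is inference_cfg_rate

def toy_estimator(y, mu, t):
    # stand-in for the real velocity estimator network
    return 0.6 * (mu - y) + t.view(-1, 1, 1)

# Training side (mirrors compute_loss): regress onto the straight-line velocity.
x1 = torch.randn(2, 80, 100)                    # target mel
mu = torch.randn_like(x1)                       # encoder output
z = torch.randn_like(x1)                        # noise sample x_0
t = torch.rand(2, 1, 1)                         # random timestep per item
y = (1 - (1 - sigma_min) * t) * z + t * x1      # interpolated point x_t
u = x1 - (1 - sigma_min) * z                    # flow-matching target velocity
loss = F.mse_loss(toy_estimator(y, mu, t.squeeze()), u)

# Inference side (mirrors solve_euler): fixed-step Euler with classifier-free guidance.
x = torch.randn(1, 80, 100)
mu_infer = torch.randn(1, 80, 100)
t_span = torch.linspace(0, 1, 11)               # 10 Euler steps
for step in range(1, len(t_span)):
    dt = t_span[step] - t_span[step - 1]
    t_cur = t_span[step - 1].reshape(1)
    v_cond = toy_estimator(x, mu_infer, t_cur)                      # conditional branch
    v_uncond = toy_estimator(x, torch.zeros_like(mu_infer), t_cur)  # condition dropped
    v = (1 + cfg_rate) * v_cond - cfg_rate * v_uncond               # CFG combination
    x = x + dt * v                                                  # Euler update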
third_party/cosyvoice/flow/length_regulator.py
ADDED
@@ -0,0 +1,69 @@
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Tuple
import torch.nn as nn
import torch
from torch.nn import functional as F
from cosyvoice.utils.mask import make_pad_mask


class InterpolateRegulator(nn.Module):
    def __init__(
            self,
            channels: int,
            sampling_ratios: Tuple,
            out_channels: int = None,
            groups: int = 1,
    ):
        super().__init__()
        self.sampling_ratios = sampling_ratios
        out_channels = out_channels or channels
        model = nn.ModuleList([])
        if len(sampling_ratios) > 0:
            for _ in sampling_ratios:
                module = nn.Conv1d(channels, channels, 3, 1, 1)
                norm = nn.GroupNorm(groups, channels)
                act = nn.Mish()
                model.extend([module, norm, act])
        model.append(
            nn.Conv1d(channels, out_channels, 1, 1)
        )
        self.model = nn.Sequential(*model)

    def forward(self, x, ylens=None):
        # x in (B, T, D)
        mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1)
        x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='linear')
        out = self.model(x).transpose(1, 2).contiguous()
        olens = ylens
        return out * mask, olens

    def inference(self, x1, x2, mel_len1, mel_len2, input_frame_rate=50):
        # in inference mode, interpolate prompt token and token (head/mid/tail) separately, so we can get a clear separation point of mel
        # x in (B, T, D)
        if x2.shape[1] > 40:
            x2_head = F.interpolate(x2[:, :20].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
            x2_mid = F.interpolate(x2[:, 20:-20].transpose(1, 2).contiguous(), size=mel_len2 - int(20 / input_frame_rate * 22050 / 256) * 2,
                                   mode='linear')
            x2_tail = F.interpolate(x2[:, -20:].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
            x2 = torch.concat([x2_head, x2_mid, x2_tail], dim=2)
        else:
            x2 = F.interpolate(x2.transpose(1, 2).contiguous(), size=mel_len2, mode='linear')
        if x1.shape[1] != 0:
            x1 = F.interpolate(x1.transpose(1, 2).contiguous(), size=mel_len1, mode='linear')
            x = torch.concat([x1, x2], dim=2)
        else:
            x = x2
        out = self.model(x).transpose(1, 2).contiguous()
        return out, mel_len1 + mel_len2
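Note: a minimal usage sketch for InterpolateRegulator, assuming third_party/ is on PYTHONPATH so the cosyvoice package resolves; the channel count and lengths below are illustrative, not values read from cosyvoice.yaml.

import torch
from cosyvoice.flow.length_regulator import InterpolateRegulator

# illustrative hyper-parameters; the real ones are set by the flow config
regulator = InterpolateRegulator(channels=80, sampling_ratios=[1, 1, 1, 1])

tokens = torch.randn(1, 50, 80)   # (B, T_in, D): 50 upstream frames
ylens = torch.tensor([120])       # target mel length per batch item

out, olens = regulator(tokens, ylens)
print(out.shape)                  # torch.Size([1, 120, 80])
print(olens)                      # tensor([120])

forward linearly interpolates the (B, T, D) sequence along time to the requested length and refines it with the small Conv1d/GroupNorm/Mish stack, while inference stretches prompt tokens and generated tokens separately so the boundary between prompt mel and generated mel stays clean.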