tiamojames committed (verified)
Commit bcff6dc · Parent: a9b613d

Upload folder using huggingface_hub

Files changed (32)
  1. app.py +1 -1
  2. soulxpodcast/__pycache__/config.cpython-311.pyc +0 -0
  3. soulxpodcast/config.py +10 -9
  4. soulxpodcast/engine/__pycache__/__init__.cpython-311.pyc +0 -0
  5. soulxpodcast/engine/__pycache__/llm_engine.cpython-311.pyc +0 -0
  6. soulxpodcast/engine/llm_engine.py +4 -6
  7. soulxpodcast/models/__pycache__/soulxpodcast.cpython-311.pyc +0 -0
  8. soulxpodcast/models/modules/__pycache__/__init__.cpython-311.pyc +0 -0
  9. soulxpodcast/models/modules/__pycache__/flow.cpython-311.pyc +0 -0
  10. soulxpodcast/models/modules/__pycache__/hifigan.cpython-311.pyc +0 -0
  11. soulxpodcast/models/modules/__pycache__/sampler.cpython-311.pyc +0 -0
  12. soulxpodcast/models/modules/flow_components/__pycache__/__init__.cpython-311.pyc +0 -0
  13. soulxpodcast/models/modules/flow_components/__pycache__/estimator.cpython-311.pyc +0 -0
  14. soulxpodcast/models/modules/flow_components/__pycache__/upsample_encoder.cpython-311.pyc +0 -0
  15. soulxpodcast/models/modules/flow_components/upsample_encoder.py +1 -2
  16. soulxpodcast/models/modules/hifigan.py +1 -1
  17. soulxpodcast/models/modules/hifigan_components/__pycache__/__init__.cpython-311.pyc +0 -0
  18. soulxpodcast/models/modules/hifigan_components/__pycache__/layers.cpython-311.pyc +0 -0
  19. soulxpodcast/models/soulxpodcast.py +13 -37
  20. soulxpodcast/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  21. soulxpodcast/utils/__pycache__/audio.cpython-311.pyc +0 -0
  22. soulxpodcast/utils/__pycache__/commons.cpython-311.pyc +0 -0
  23. soulxpodcast/utils/__pycache__/dataloader.cpython-311.pyc +0 -0
  24. soulxpodcast/utils/__pycache__/infer_utils.cpython-311.pyc +0 -0
  25. soulxpodcast/utils/__pycache__/parser.cpython-311.pyc +0 -0
  26. soulxpodcast/utils/__pycache__/text.cpython-311.pyc +0 -0
  27. soulxpodcast/utils/audio.py +47 -1
  28. soulxpodcast/utils/commons.py +10 -0
  29. soulxpodcast/utils/dataloader.py +32 -21
  30. soulxpodcast/utils/infer_utils.py +95 -0
  31. soulxpodcast/utils/parser.py +87 -0
  32. soulxpodcast/utils/text.py +40 -1
app.py CHANGED
@@ -248,7 +248,7 @@ def process_single(target_text_list, prompt_wav_list, prompt_text_list, use_dial
         text, spk = match.group(2), int(match.group(1)[2])-1
         spks.append(spk)
         texts.append(text)
-
+    import pdb;pdb.set_trace()
     global dataset
     dataitem = {"key": "001", "prompt_text": prompt_text_list, "prompt_wav": prompt_wav_list,
                 "text": texts, "spk": spks, }
soulxpodcast/__pycache__/config.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/__pycache__/config.cpython-311.pyc and b/soulxpodcast/__pycache__/config.cpython-311.pyc differ
 
soulxpodcast/config.py CHANGED
@@ -8,6 +8,7 @@ import torch
 from transformers import AutoConfig
 from transformers import PretrainedConfig
 
+
 @dataclass
 class SoulXPodcastLLMConfig:
     architectures: list[str] = field(default_factory=lambda: ["Qwen3ForCausalLM"])
@@ -47,11 +48,11 @@ class SoulXPodcastLLMConfig:
         json_file: Optional[str] = None
     ):
         """
-        Create instance from initial values and JSON data
+        Create an instance from initial values and JSON data.
 
         Args:
-            initial_values: Initial key-value dict, which will overrides all other configurations
-            json_file: JSON file path
+            initial_values: Dictionary of initial values (highest priority)
+            json_file: Path to JSON file
 
         Returns:
             SoulXPodcastLLMConfig instance
@@ -76,7 +77,7 @@ class SoulXPodcastLLMConfig:
 
     @staticmethod
     def _load_json_file(file_path: str) -> Dict[str, Any]:
-        """JSON文件加载数据"""
+        """Load data from a JSON file"""
         path = Path(file_path)
         if not path.exists():
             return {}
@@ -94,7 +95,7 @@ class AutoPretrainedConfig(PretrainedConfig):
 
     @classmethod
     def from_dataclass(cls, dataclass_config):
-        """Dynamically generate config from dataclass"""
+        """Automatically create configuration from any dataclass"""
        if not is_dataclass(dataclass_config):
            raise ValueError("Input must be a dataclass instance")
 
@@ -108,8 +109,8 @@
     repetition_penalty: float = 1.25
     top_k: int = 100
     top_p: float = 0.9
-    max_tokens: int = 3000
     min_tokens: int = 8
+    max_tokens: int = 3000
     stop_token_ids: list[int] = field(default_factory=lambda: [151675])
 
     use_ras: bool = True
@@ -127,12 +128,12 @@
     hf_config: SoulXPodcastLLMConfig | AutoConfig = field(default_factory=SoulXPodcastLLMConfig)
     eos: int = -1
     llm_engine: str = "hf"
-    max_turn_size: int = 14
+    max_turn_size: int = 10
     turn_tokens_threshold: int = 6192
 
     prompt_context: int = 2
-    history_context: int = 4
-    history_text_context: int = 4
+    history_context: int = 2
+    history_text_context: int = 2
 
     def __post_init__(self):
         assert os.path.isdir(self.model)
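
A minimal sketch of how these configs are consumed downstream (the JSON path is a placeholder; `from_initial_and_json` and the `SamplingParams` call mirror their uses in `soulxpodcast/utils/infer_utils.py`):

```python
from soulxpodcast.config import SoulXPodcastLLMConfig, SamplingParams

# initial_values takes highest priority over the JSON file, per the docstring
hf_config = SoulXPodcastLLMConfig.from_initial_and_json(
    initial_values={"fp16_flow": False},
    json_file="pretrained_models/soulxpodcast_config.json",  # placeholder path
)

# RAS sampling as configured in infer_utils.py; min_tokens=8 and
# max_tokens=3000 come from the SamplingParams defaults above
sampling_params = SamplingParams(use_ras=True, win_size=25, tau_r=0.2)
```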
soulxpodcast/engine/__pycache__/__init__.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/engine/__pycache__/__init__.cpython-311.pyc and b/soulxpodcast/engine/__pycache__/__init__.cpython-311.pyc differ
 
soulxpodcast/engine/__pycache__/llm_engine.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/engine/__pycache__/llm_engine.cpython-311.pyc and b/soulxpodcast/engine/__pycache__/llm_engine.cpython-311.pyc differ
 
soulxpodcast/engine/llm_engine.py CHANGED
@@ -1,9 +1,9 @@
+import os
 import types
 import atexit
-from dataclasses import fields, asdict
 from time import perf_counter
-import os
 from functools import partial
+from dataclasses import fields, asdict
 
 import torch
 import torch.multiprocessing as mp
@@ -17,8 +17,8 @@ try:
 except ImportError:
     SUPPORT_VLLM = False
 
-from soulxpodcast.models.modules.sampler import _ras_sample_hf_engine
 from soulxpodcast.config import Config, SamplingParams
+from soulxpodcast.models.modules.sampler import _ras_sample_hf_engine
 
 class HFLLMEngine:
 
@@ -41,7 +41,6 @@ class HFLLMEngine:
         past_key_values=None,
     ) -> dict:
 
-
         stopping_criteria = StoppingCriteriaList([EosTokenCriteria(eos_token_id=self.config.hf_config.eos_token_id)])
         if sampling_param.use_ras:
             sample_hf_engine_handler = partial(_ras_sample_hf_engine,
@@ -63,7 +62,6 @@ class HFLLMEngine:
             min_new_tokens=sampling_param.min_tokens,
             max_new_tokens=sampling_param.max_tokens,
             temperature=sampling_param.temperature,
-            repetition_penalty=sampling_param.repetition_penalty,
             stopping_criteria=stopping_criteria,
             past_key_values=past_key_values,
             custom_generate=sample_hf_engine_handler,
@@ -90,7 +88,7 @@ class VLLMEngine:
         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
         os.environ["VLLM_USE_V1"] = "0"
         if SUPPORT_VLLM:
-            self.model = LLM(model=model, enforce_eager=True, dtype="bfloat16", max_model_len=8192, enable_prefix_caching=True)
+            self.model = LLM(model=model, enforce_eager=True, dtype="bfloat16", max_model_len=8192)
         else:
             raise ImportError("Not Support VLLM now!!!")
         self.config = config
soulxpodcast/models/__pycache__/soulxpodcast.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/models/__pycache__/soulxpodcast.cpython-311.pyc and b/soulxpodcast/models/__pycache__/soulxpodcast.cpython-311.pyc differ
 
soulxpodcast/models/modules/__pycache__/__init__.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/models/modules/__pycache__/__init__.cpython-311.pyc and b/soulxpodcast/models/modules/__pycache__/__init__.cpython-311.pyc differ
 
soulxpodcast/models/modules/__pycache__/flow.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/models/modules/__pycache__/flow.cpython-311.pyc and b/soulxpodcast/models/modules/__pycache__/flow.cpython-311.pyc differ
 
soulxpodcast/models/modules/__pycache__/hifigan.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/models/modules/__pycache__/hifigan.cpython-311.pyc and b/soulxpodcast/models/modules/__pycache__/hifigan.cpython-311.pyc differ
 
soulxpodcast/models/modules/__pycache__/sampler.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/models/modules/__pycache__/sampler.cpython-311.pyc and b/soulxpodcast/models/modules/__pycache__/sampler.cpython-311.pyc differ
 
soulxpodcast/models/modules/flow_components/__pycache__/__init__.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/models/modules/flow_components/__pycache__/__init__.cpython-311.pyc and b/soulxpodcast/models/modules/flow_components/__pycache__/__init__.cpython-311.pyc differ
 
soulxpodcast/models/modules/flow_components/__pycache__/estimator.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/models/modules/flow_components/__pycache__/estimator.cpython-311.pyc and b/soulxpodcast/models/modules/flow_components/__pycache__/estimator.cpython-311.pyc differ
 
soulxpodcast/models/modules/flow_components/__pycache__/upsample_encoder.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/models/modules/flow_components/__pycache__/upsample_encoder.cpython-311.pyc and b/soulxpodcast/models/modules/flow_components/__pycache__/upsample_encoder.cpython-311.pyc differ
 
soulxpodcast/models/modules/flow_components/upsample_encoder.py CHANGED
@@ -493,8 +493,7 @@ class MultiHeadedAttention(nn.Module):
             the mask is in (#batch, L, L) shape.
             4.If the different position in decoder see different block
             of the encoder, such as Mocha, the passed in mask could be
-            in (#batch, L, T) shape. But there is no such case in current
-            CosyVoice.
+            in (#batch, L, T) shape.
         cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
             where `cache_t == chunk_size * num_decoding_left_chunks`
             and `head * d_k == size`
soulxpodcast/models/modules/hifigan.py CHANGED
@@ -231,7 +231,7 @@ class HiFTGenerator(nn.Module):
         phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :])
 
         x = self._istft(magnitude, phase)
-        x = torch.clamp(x*0.98, -self.audio_limit, self.audio_limit)
+        x = torch.clamp(x, -self.audio_limit, self.audio_limit)
         return x
 
     @torch.inference_mode()
soulxpodcast/models/modules/hifigan_components/__pycache__/__init__.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/models/modules/hifigan_components/__pycache__/__init__.cpython-311.pyc and b/soulxpodcast/models/modules/hifigan_components/__pycache__/__init__.cpython-311.pyc differ
 
soulxpodcast/models/modules/hifigan_components/__pycache__/layers.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/models/modules/hifigan_components/__pycache__/layers.cpython-311.pyc and b/soulxpodcast/models/modules/hifigan_components/__pycache__/layers.cpython-311.pyc differ
 
soulxpodcast/models/soulxpodcast.py CHANGED
@@ -1,7 +1,8 @@
 import time
 from datetime import datetime
-from itertools import chain
+
 from tqdm import tqdm
+from itertools import chain
 from copy import deepcopy
 
 import numpy as np
@@ -55,11 +56,12 @@ class SoulXPodcast(torch.nn.Module):
         spk_emb_for_flow: torch.Tensor,
         sampling_params: SamplingParams | list[SamplingParams],
         spk_ids: list[list[int]],
-        use_prompt_cot: bool = False,
-        prompt_cot_text_tokens_for_llm: list[list[int]] = None,
-        prompt_cot_prefix: list[list[int]] = None,
+        use_dialect_prompt: bool = False,
+        dialect_prompt_text_tokens_for_llm: list[list[int]] = None,
+        dialect_prefix: list[list[int]] = None,
         **kwargs,
     ):
+
         prompt_size, turn_size = len(prompt_mels_for_llm), len(text_tokens_for_llm)
 
 
@@ -91,37 +93,16 @@ class SoulXPodcast(torch.nn.Module):
         prompt_inputs = []
         history_inputs = []
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
         for i in range(prompt_size):
             speech_tokens_i = [token+self.config.hf_config.speech_token_offset for token in prompt_speech_tokens[i].tolist()]
             speech_tokens_i += [self.config.hf_config.eos_token_id]
-            if use_prompt_cot and len(prompt_cot_text_tokens_for_llm[i])>0:
-                prompt_cot_input = prompt_text_tokens_for_llm[i] + speech_tokens_i + prompt_cot_text_tokens_for_llm[i]
+            if use_dialect_prompt and len(dialect_prompt_text_tokens_for_llm[i])>0:
+                dialect_prompt_input = prompt_text_tokens_for_llm[i] + speech_tokens_i + dialect_prompt_text_tokens_for_llm[i]
                 if i>0:
-                    prompt_cot_input = prompt_cot_prefix[0] + prompt_cot_input
-                cot_input = self.llm.generate(prompt_cot_input, sampling_params, past_key_values=None)['token_ids']
-                prompt_inputs.append(prompt_cot_prefix[i+1]+prompt_cot_text_tokens_for_llm[i] + cot_input)
-                history_inputs.append(prompt_cot_prefix[i+1]+prompt_cot_text_tokens_for_llm[i] + cot_input)
+                    dialect_prompt_input = dialect_prefix[0] + dialect_prompt_input
+                prompt_input = self.llm.generate(dialect_prompt_input, sampling_params, past_key_values=None)['token_ids']
+                prompt_inputs.append(dialect_prefix[i+1]+dialect_prompt_text_tokens_for_llm[i] + prompt_input)
+                history_inputs.append(dialect_prefix[i+1]+dialect_prompt_text_tokens_for_llm[i] + prompt_input)
             else:
                 prompt_inputs.append(prompt_text_tokens_for_llm[i] + speech_tokens_i )
                 history_inputs.append(prompt_text_tokens_for_llm[i] + speech_tokens_i )
@@ -164,7 +145,6 @@ class SoulXPodcast(torch.nn.Module):
             flow_inputs_len = torch.tensor([len(prompt_speech_token) + len(generated_speech_tokens)])
 
 
-
             start_idx = spk_ids[i]
             prompt_mels = prompt_mels_for_flow[start_idx][None]
             prompt_mels_lens = prompt_mels_lens_for_flow[start_idx][None]
@@ -180,11 +160,7 @@ class SoulXPodcast(torch.nn.Module):
 
 
             mel = generated_mels[:, :, prompt_mels_lens[0].item():generated_mels_lens[0].item()]
-            try:
-                wav, _ = self.hift(speech_feat=mel)
-            except Exception as e:
-                import pdb;pdb.set_trace()
-                print(e)
+            wav, _ = self.hift(speech_feat=mel)
             generated_wavs.append(wav)
 
 
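To see how the new dialect-prompt branch assembles the LLM input, here is a pure-data sketch using plain integer lists as stand-in token ids (all values are hypothetical; in the real code `self.llm.generate(...)` produces the `generated` tokens):

```python
dialect_prefix = [[101], [102], [103]]             # [0] = shared task prefix, [i+1] = per-speaker prefix
prompt_text_tokens_for_llm = [[1, 2], [3, 4]]      # tokenized prompt transcripts
dialect_prompt_text_tokens_for_llm = [[5, 6], []]  # tokenized dialect prompts (empty list = no dialect)
speech_tokens = [[900, 901], [902, 903]]           # offset speech tokens + EOS, per speaker

prompt_inputs = []
for i in range(2):
    if dialect_prompt_text_tokens_for_llm[i]:
        dialect_prompt_input = prompt_text_tokens_for_llm[i] + speech_tokens[i] + dialect_prompt_text_tokens_for_llm[i]
        if i > 0:
            dialect_prompt_input = dialect_prefix[0] + dialect_prompt_input
        generated = [777, 778]  # placeholder for self.llm.generate(...)['token_ids']
        prompt_inputs.append(dialect_prefix[i + 1] + dialect_prompt_text_tokens_for_llm[i] + generated)
    else:
        prompt_inputs.append(prompt_text_tokens_for_llm[i] + speech_tokens[i])
print(prompt_inputs)
```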
soulxpodcast/utils/__pycache__/__init__.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/utils/__pycache__/__init__.cpython-311.pyc and b/soulxpodcast/utils/__pycache__/__init__.cpython-311.pyc differ
 
soulxpodcast/utils/__pycache__/audio.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/utils/__pycache__/audio.cpython-311.pyc and b/soulxpodcast/utils/__pycache__/audio.cpython-311.pyc differ
 
soulxpodcast/utils/__pycache__/commons.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/utils/__pycache__/commons.cpython-311.pyc and b/soulxpodcast/utils/__pycache__/commons.cpython-311.pyc differ
 
soulxpodcast/utils/__pycache__/dataloader.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/utils/__pycache__/dataloader.cpython-311.pyc and b/soulxpodcast/utils/__pycache__/dataloader.cpython-311.pyc differ
 
soulxpodcast/utils/__pycache__/infer_utils.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/utils/__pycache__/infer_utils.cpython-311.pyc and b/soulxpodcast/utils/__pycache__/infer_utils.cpython-311.pyc differ
 
soulxpodcast/utils/__pycache__/parser.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/utils/__pycache__/parser.cpython-311.pyc and b/soulxpodcast/utils/__pycache__/parser.cpython-311.pyc differ
 
soulxpodcast/utils/__pycache__/text.cpython-311.pyc CHANGED
Binary files a/soulxpodcast/utils/__pycache__/text.cpython-311.pyc and b/soulxpodcast/utils/__pycache__/text.cpython-311.pyc differ
 
soulxpodcast/utils/audio.py CHANGED
@@ -1,5 +1,5 @@
-import numpy as np
 import torch
+import numpy as np
 from librosa.filters import mel as librosa_mel_fn
 from scipy.io.wavfile import read
 
@@ -75,3 +75,49 @@ def mel_spectrogram(y, n_fft=1920, num_mels=80, sampling_rate=24000, hop_size=48
     spec = spectral_normalize_torch(spec)
 
     return spec
+
+
+def audio_volume_normalize(audio: torch.Tensor, coeff=0.1):
+    """
+    Normalize the volume of an audio signal.
+
+    Parameters:
+        audio (torch tensor): Input audio signal array.
+        coeff (float): Target coefficient for normalization, default is 0.1.
+
+    Returns:
+        torch tensor: The volume-normalized audio signal.
+    """
+
+    device = audio.device
+    audio = audio.cpu().numpy()
+    temp = np.sort(np.abs(audio))
+
+
+    if temp[-1] < 0.1:
+        scaling_factor = max(
+            temp[-1], 1e-3
+        )
+        audio = audio / scaling_factor * 0.1
+
+
+    temp = temp[temp > 0.01]
+    L = temp.shape[0]
+
+
+    if L <= 10:
+        return audio
+
+
+    volume = np.mean(temp[int(0.9 * L) : int(0.99 * L)])
+
+
+    audio = audio * np.clip(coeff / volume, a_min=0.1, a_max=10)
+
+
+    max_value = np.max(np.abs(audio))
+    if max_value > 1:
+        audio = audio / max_value
+
+    audio = torch.from_numpy(audio).to(device)
+    return audio
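
A quick usage sketch for the new `audio_volume_normalize` (the waveform here is synthetic):

```python
import torch
from soulxpodcast.utils.audio import audio_volume_normalize

# a quiet 1-second mono waveform at 16 kHz (synthetic 440 Hz tone)
wav = 0.02 * torch.sin(torch.linspace(0, 2 * 3.14159 * 440, 16000))
normalized = audio_volume_normalize(wav, coeff=0.1)
print(wav.abs().max().item(), normalized.abs().max().item())
```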
soulxpodcast/utils/commons.py ADDED
@@ -0,0 +1,10 @@
+import random
+import numpy as np
+import torch
+
+
+def set_all_random_seed(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
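
Usage is a one-liner before model construction, as `initiate_model` in `soulxpodcast/utils/infer_utils.py` does:

```python
from soulxpodcast.utils.commons import set_all_random_seed

set_all_random_seed(42)  # seeds python, numpy, and torch (CPU + all CUDA devices)
```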
soulxpodcast/utils/dataloader.py CHANGED
@@ -11,8 +11,8 @@ import torchaudio.compliance.kaldi as kaldi
 
 import s3tokenizer
 
-from soulxpodcast.utils.audio import mel_spectrogram
 from soulxpodcast.utils.text import normalize_text
+from soulxpodcast.utils.audio import mel_spectrogram, audio_volume_normalize
 from soulxpodcast.config import Config, SamplingParams
 
 
@@ -29,13 +29,13 @@ class PodcastDataset(Dataset):
 
     """Example data_list:
     ```
-    {"key": "uttid_1", "prompt_text": ["prompt_text1", "prompt_text2"], "prompt_cot_text": ["prompt_cot_text1", "prompt_cot_text2"],
+    {"key": "uttid_1", "prompt_text": ["prompt_text1", "prompt_text2"], "dialect_prompt_text": ["dialect_prompt_text1", "dialect_prompt_text2"],
     "text": ["text1", "text2], "spk": [0, 1], "prompt_wav": ["/mnt/data/audio/00000000.wav", "/mnt/data/audio/00000001.wav"], "wav": "/mnt/data/audio_synthetic/uttid_1.wav"}
     ```
     Note:
        - `key` is the key of this sample.
        - `prompt_text` is the text used for prompt.
-       - `prompt_cot_text` is the reshot text used for prompt.
+       - `dialect_prompt_text` is the reshot text used for prompt.
        - `text` is the text used for generating real audio.
        - `spk` is the target speaker id to synthesize, corresponds to the prompt order. Default SPEAKER_0.
        - `prompt_wav` is the audio used for prompt.
@@ -82,16 +82,18 @@ class PodcastDataset(Dataset):
     def __getitem__(self, idx):
         data = self.datas[idx]
         try:
-            prompt_text_ids_list, prompt_cot_text_ids_list, spk_emb_list, mel_list, mel_len_list, log_mel_list = (
+            prompt_text_ids_list, dialect_prompt_text_ids_list, spk_emb_list, mel_list, mel_len_list, log_mel_list = (
                 [], [], [], [], [], []
             )
 
-            use_prompt_cot = "prompt_cot_text" in data
-            prompt_cot_prefix_list = []
-            prompt_cot_prefix_list.append(self.text_tokenizer.encode(f"{TASK_PODCAST}"))
+            use_dialect_prompt = "dialect_prompt_text" in data
+            dialect_prefix_list = []
+            dialect_prefix_list.append(self.text_tokenizer.encode(f"{TASK_PODCAST}"))
             for spk_idx, (prompt_text, prompt_wav) in enumerate(zip(data["prompt_text"], data["prompt_wav"])):
 
                 audio = s3tokenizer.load_audio(prompt_wav, sr=16000)
+                audio = audio_volume_normalize(audio)
+
                 log_mel = s3tokenizer.log_mel_spectrogram(audio)
 
 
@@ -103,7 +105,9 @@
 
 
                 audio, sample_rate = torchaudio.load(prompt_wav, backend='soundfile')
-                audio = audio.mean(dim=0, keepdim=True)
+                audio = audio[0]
+                audio = audio_volume_normalize(audio).unsqueeze(0)
+
                 if sample_rate != 24000:
                     audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=24000)(audio)
                 mel = mel_spectrogram(audio).transpose(1, 2).squeeze(0)
@@ -118,15 +122,19 @@
                 prompt_text = f"{TASK_PODCAST}{prompt_text}"
                 prompt_text_ids = self.text_tokenizer.encode(prompt_text)
                 prompt_text_ids_list.append(prompt_text_ids)
-                if use_prompt_cot:
-                    prompt_cot_text = normalize_text(data["prompt_cot_text"][spk_idx])
-                    prompt_cot_text = f"{SPK_DICT[spk_idx]}{TEXT_START}{prompt_cot_text}{TEXT_END}{AUDIO_START}"
-                    prompt_cot_text_ids = self.text_tokenizer.encode(prompt_cot_text)
-                    prompt_cot_text_ids_list.append(prompt_cot_text_ids)
+                if use_dialect_prompt:
+
+                    dialect_prompt_text = normalize_text(data["dialect_prompt_text"][spk_idx])
+                    dialect_prompt_text = f"{SPK_DICT[spk_idx]}{TEXT_START}{dialect_prompt_text}{TEXT_END}{AUDIO_START}"
+                    dialect_prompt_text_ids = self.text_tokenizer.encode(dialect_prompt_text)
+                    # import pdb; pdb.set_trace()
+                    print(f"dialect_prompt_text_ids: {dialect_prompt_text_ids}")
+                    print(f"dialect_prompt_text: {dialect_prompt_text}")
+                    dialect_prompt_text_ids_list.append(dialect_prompt_text_ids)
                 if spk_idx == 0:
-                    prompt_cot_prefix_list.append(self.text_tokenizer.encode(f"{TASK_PODCAST}"))
+                    dialect_prefix_list.append(self.text_tokenizer.encode(f"{TASK_PODCAST}"))
                 else:
-                    prompt_cot_prefix_list.append([])
+                    dialect_prefix_list.append([])
                 log_mel_list.append(log_mel)
                 spk_emb_list.append(spk_emb)
                 mel_list.append(mel); mel_len_list.append(mel_len)
@@ -134,11 +142,14 @@
                 "prompt_text_tokens": prompt_text_ids_list,
                 "spk_emb": spk_emb_list, "mel": mel_list, "mel_len": mel_len_list, "log_mel": log_mel_list, "info": data,
             }
-            if use_prompt_cot:
+            if use_dialect_prompt:
+                import pdb; pdb.set_trace()
+                print(f"dialect_prompt_text_ids: {dialect_prompt_text_ids}")
+                print(f"dialect_prompt_text: {dialect_prompt_text}")
                 item.update({
-                    "use_prompt_cot": True,
-                    "prompt_cot_text_tokens": prompt_cot_text_ids_list,
-                    "prompt_cot_prefix": prompt_cot_prefix_list,
+                    "use_dialect_prompt": True,
+                    "dialect_prompt_text_tokens": dialect_prompt_text_ids_list,
+                    "dialect_prefix": dialect_prefix_list,
                 })
             text_ids_list, spks_list = [], []
             if "spk" not in data:
@@ -170,12 +181,12 @@ class PodcastInferHandler(PodcastDataset):
 
     """Example data_list:
     ```
-    {"key": "uttid_1", "prompt_text": ["prompt_text1", "prompt_text2"], "prompt_cot_text": ["prompt_cot_text1", "prompt_cot_text2"], "text": ["text1", "text2], "spk": [0, 1], "prompt_wav": ["/mnt/data/audio/00000000.wav", "/mnt/data/audio/00000001.wav"], "wav": "/mnt/data/audio_synthetic/uttid_1.wav"}
+    {"key": "uttid_1", "prompt_text": ["prompt_text1", "prompt_text2"], "dialect_prompt_text": ["dialect_prompt_text1", "dialect_prompt_text2"], "text": ["text1", "text2], "spk": [0, 1], "prompt_wav": ["/mnt/data/audio/00000000.wav", "/mnt/data/audio/00000001.wav"], "wav": "/mnt/data/audio_synthetic/uttid_1.wav"}
    ```
    Note:
        - `key` is the key of this sample.
        - `prompt_text` is the text used for prompt.
-       - `prompt_cot_text` is the cot text used for prompt as to activate specific ability.
+       - `dialect_prompt_text` is the cot text used for prompt as to activate specific ability.
        - `text` is the text used for generating real audio.
        - `spk` is the target speaker id to synthesize, corresponds to the prompt order. Default SPEAKER_0.
        - `prompt_wav` is the audio used for prompt.
soulxpodcast/utils/infer_utils.py ADDED
@@ -0,0 +1,95 @@
+import re
+import json
+import torch
+import argparse
+from tqdm import tqdm
+from datetime import datetime
+
+import s3tokenizer
+
+from soulxpodcast.models.soulxpodcast import SoulXPodcast
+from soulxpodcast.utils.dataloader import PodcastInferHandler
+from soulxpodcast.utils.commons import set_all_random_seed
+from soulxpodcast.config import Config, SoulXPodcastLLMConfig, SamplingParams
+
+
+def initiate_model(seed, model_path, llm_engine, fp16_flow):
+    set_all_random_seed(seed)
+
+    hf_config = SoulXPodcastLLMConfig.from_initial_and_json(
+        initial_values={"fp16_flow": fp16_flow},
+        json_file=f"{model_path}/soulxpodcast_config.json"
+    )
+    if llm_engine == "vllm":
+        import importlib.util
+        if not importlib.util.find_spec("vllm"):
+            llm_engine = "hf"
+            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
+            tqdm.write(f"[{timestamp}] - [WARNING]: No install VLLM, switch to hf engine.")
+
+    config = Config(model=model_path, enforce_eager=True, llm_engine=llm_engine, hf_config=hf_config)
+    model = SoulXPodcast(config)
+
+    dataset = PodcastInferHandler(model.llm.tokenizer, None, config)
+
+    return model, dataset
+
+
+def process_single_input(dataset, target_text_list, prompt_wav_list, prompt_text_list, use_dialect_prompt, dialect_prompt_text_list):
+    spks, texts = [], []
+    for target_text in target_text_list:
+        pattern = r'(\[S[1-9]\])(.+)'
+        match = re.match(pattern, target_text)
+        text, spk = match.group(2), int(match.group(1)[2])-1
+        spks.append(spk)
+        texts.append(text)
+
+    dataitem = {"key": "001", "prompt_text": prompt_text_list, "prompt_wav": prompt_wav_list,
+                "text": texts, "spk": spks, }
+    if use_dialect_prompt:
+        dataitem.update({
+            "dialect_prompt_text": dialect_prompt_text_list
+        })
+    dataset.update_datasource(
+        [
+            dataitem
+        ]
+    )
+
+
+    data = dataset[0]
+    prompt_mels_for_llm, prompt_mels_lens_for_llm = s3tokenizer.padding(data["log_mel"])
+    spk_emb_for_flow = torch.tensor(data["spk_emb"])
+    prompt_mels_for_flow = torch.nn.utils.rnn.pad_sequence(data["mel"], batch_first=True, padding_value=0)
+    prompt_mels_lens_for_flow = torch.tensor(data['mel_len'])
+    text_tokens_for_llm = data["text_tokens"]
+    prompt_text_tokens_for_llm = data["prompt_text_tokens"]
+    spk_ids = data["spks_list"]
+    sampling_params = SamplingParams(use_ras=True,win_size=25,tau_r=0.2)
+    infos = [data["info"]]
+    processed_data = {
+        "prompt_mels_for_llm": prompt_mels_for_llm,
+        "prompt_mels_lens_for_llm": prompt_mels_lens_for_llm,
+        "prompt_text_tokens_for_llm": prompt_text_tokens_for_llm,
+        "text_tokens_for_llm": text_tokens_for_llm,
+        "prompt_mels_for_flow_ori": prompt_mels_for_flow,
+        "prompt_mels_lens_for_flow": prompt_mels_lens_for_flow,
+        "spk_emb_for_flow": spk_emb_for_flow,
+        "sampling_params": sampling_params,
+        "spk_ids": spk_ids,
+        "infos": infos,
+        "use_dialect_prompt": use_dialect_prompt,
+    }
+    if use_dialect_prompt:
+        processed_data.update({
+            "dialect_prompt_text_tokens_for_llm": data["dialect_prompt_text_tokens"],
+            "dialect_prefix": data["dialect_prefix"],
+        })
+    return processed_data
+
+
+def check_models(model_path, inputs):
+    if inputs['use_dialect_prompt']:
+        assert 'dialect' in model_path, "Dialect prompt is used, you should use a dialect model."
+
+    return True
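
Putting the new helpers together, a hedged end-to-end sketch (the model path, prompt wavs, and script lines below are placeholders, not shipped assets):

```python
from soulxpodcast.utils.infer_utils import initiate_model, process_single_input, check_models

model_path = "pretrained_models/SoulX-Podcast-dialect"  # placeholder path
model, dataset = initiate_model(seed=42, model_path=model_path, llm_engine="hf", fp16_flow=False)

inputs = process_single_input(
    dataset,
    target_text_list=["[S1]大家好。", "[S2]你好你好。"],        # placeholder script
    prompt_wav_list=["prompts/s1.wav", "prompts/s2.wav"],     # placeholder prompt audio
    prompt_text_list=["prompt one", "prompt two"],
    use_dialect_prompt=True,
    dialect_prompt_text_list=["<|Henan|>中不中?", "<|Henan|>中!"],  # must carry a known dialect prefix
)
check_models(model_path, inputs)  # asserts a dialect model when dialect prompts are used
```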
soulxpodcast/utils/parser.py ADDED
@@ -0,0 +1,87 @@
+import json
+import datetime
+import os
+
+
+def generate_time_index():
+    """Generate a time-based unique key, e.g. '20251023-001'."""
+    now = datetime.datetime.now()
+    return now.strftime("%Y%m%d-%H%M%S")
+
+def check_prefix(text):
+    prefixes = ["<|Henan|>", "<|Sichuan|>", "<|Yue|>"]
+
+    for prefix in prefixes:
+        if text.startswith(prefix):
+            return True
+    return False
+
+
+def podcast_format_parser(data, output_dir="outputs"):
+    """
+    Parse the original multi-speaker podcast JSON to the target flattened format.
+    The key will be a time-based unique ID.
+    Args:
+        data (dict): input JSON data with 'speakers' and 'text' fields
+        output_dir (str): directory for output wav file path
+    Returns:
+        dict: converted format
+    """
+    speakers = data.get("speakers", {})
+    text_entries = data.get("text", [])
+
+
+    spk2id = {name: idx for idx, name in enumerate(speakers.keys())}
+
+
+    prompt_text = []
+    prompt_wav = []
+    dialect_prompt_text = []
+
+    for name in speakers:
+        prompt_text.append(speakers[name].get("prompt_text", ""))
+        prompt_wav.append(speakers[name].get("prompt_audio", ""))
+        dialect_prompt_text.append(speakers[name].get("dialect_prompt", ""))
+
+
+    text_list = []
+    spk_list = []
+    for turn in text_entries:
+        if len(turn) == 2:
+            spk_name, utt_text = turn
+            text = f'[{spk_name}]{utt_text}'
+            text_list.append(text)
+            spk_list.append(spk2id.get(spk_name, -1))
+
+
+    key = generate_time_index()
+    wav_path = os.path.join(output_dir, f"{key}.wav")
+
+    use_dialect_prompt = False
+    for dialect_text in dialect_prompt_text:
+        if len(dialect_text) > 0:
+            assert check_prefix(dialect_text), f"Unknown dialect prefix: {dialect_text} \
+                \n Prefix should be one of: <|Henan|>, <|Sichuan|>, <|Yue|>"
+            use_dialect_prompt = True
+
+    result = {
+        "key": key,
+        "prompt_text": prompt_text,
+        "prompt_wav": prompt_wav,
+        "text": text_list,
+        "spk": spk_list,
+        "wav": wav_path,
+        "use_dialect_prompt": use_dialect_prompt,
+        "dialect_prompt_text": dialect_prompt_text
+    }
+
+    return result
+
+
+
+if __name__ == "__main__":
+    with open("example/podcast_script/script_henan.json", "r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    converted = podcast_format_parser(data)
+    print(json.dumps(converted, ensure_ascii=False, indent=2))
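
Based on the `.get` accesses above, an input of the shape `podcast_format_parser` expects looks like the following (all values are made up):

```python
from soulxpodcast.utils.parser import podcast_format_parser

script = {
    "speakers": {
        "S1": {"prompt_text": "hello there", "prompt_audio": "audio/s1.wav",
               "dialect_prompt": "<|Henan|>中原官话样例"},
        "S2": {"prompt_text": "hi", "prompt_audio": "audio/s2.wav", "dialect_prompt": ""},
    },
    "text": [["S1", "咱们今天聊点啥?"], ["S2", "聊聊播客合成吧。"]],
}
item = podcast_format_parser(script, output_dir="outputs")
# item["text"] == ["[S1]咱们今天聊点啥?", "[S2]聊聊播客合成吧。"], item["spk"] == [0, 1]
```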
soulxpodcast/utils/text.py CHANGED
@@ -1,5 +1,5 @@
 import re
-
+from typing import List
 
 def remove_space_between_chinese(text):
 
@@ -41,3 +41,42 @@ def normalize_text(current_text):
         current_text += "."
 
     return current_text
+
+
+def check_monologue_text(text: str, prefix: str = None) -> bool:
+    text = text.strip()
+
+    if prefix is not None and (not text.startswith(prefix)):
+        return False
+
+    if prefix is not None:
+        text = text.removeprefix(prefix)
+        text = text.strip()
+
+    if len(text) == 0:
+        return False
+    return True
+
+def check_dialect_prompt_text(text: str, prefix: str = None) -> bool:
+    text = text.strip()
+
+    if prefix is not None and (not text.startswith(prefix)):
+        return False
+    text = text.strip()
+
+    if len(text) == 0:
+        return False
+    return True
+
+def check_dialogue_text(text_list: List[str]) -> bool:
+    if len(text_list) == 0:
+        return False
+    for text in text_list:
+        if not (
+            check_monologue_text(text, "[S1]")
+            or check_monologue_text(text, "[S2]")
+            or check_monologue_text(text, "[S3]")
+            or check_monologue_text(text, "[S4]")
+        ):
+            return False
+    return True
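
A quick check of the new validators (sample lines are illustrative):

```python
from soulxpodcast.utils.text import check_monologue_text, check_dialogue_text

print(check_monologue_text("[S1] hello", prefix="[S1]"))  # True
print(check_monologue_text("[S1]", prefix="[S1]"))        # False: empty after prefix
print(check_dialogue_text(["[S1]hello", "[S2]hi"]))       # True
print(check_dialogue_text(["[S5]hello"]))                 # False: only [S1]-[S4] are accepted
```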