# Source snapshot: c102e8c (2,608 bytes)
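# Training configuration for a semantic-token-conditioned flow-matching (CFM)
# mel-spectrogram decoder with a DiT backbone (experiment
# "glm4_semantic_cfm_v2_debug"). Trainer-level keys (max_epochs, precision,
# strategy, val_check_interval, ...) appear to follow PyTorch Lightning
# Trainer conventions. The annotation comments below are editorial notes
# inferred from the key names; they are not read by the training code.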
accumulate_grad_batches: 1
base_config: config/config_base.yaml
batch_max_tokens: 12000
batch_size: 2
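# cfg_init / cfg_scale / cfg_schedule: classifier-free guidance settings,
# presumably applied at sampling time (starting scale, final scale, and the
# schedule used to move between them; see also dynamic_cfg and use_cfg below).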
cfg_init: 1.0
cfg_scale: 4.0
cfg_schedule: linear
check_val_every_n_epoch: 10
clip_grad_norm: 0
data_dir: ''
debug: false
deep_speed_strategy_stage: 2
drop_last: true
dynamic_cfg: false
endless_ds: false
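# filter_args: dataset filtering criteria; apparently keeps zh/en utterances
# with at most 6 speakers and a speech ratio of at least 0.6.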
filter_args:
  lang:
  - zh
  - en
  max_spk_num: 6
  speech_ratio: 0.6
gradient_clip_val: 1.0
indexed_ds: true
infer: false
infer_exp_name: ''
infer_json_path: ''
inference_ckpt: ''
inference_mode: nonstreaming
learning_rate: 1e-4
limit_val_batches: 100
load_opt: false
log_interval: 10
logger_type: tensorboard
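# loss: objective weights; lambda_fm presumably scales the flow-matching loss,
# lambda_phone the auxiliary phone loss (disabled here), and mel_loss selects
# the mel reconstruction criterion (l1).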
loss:
  lambda_fm: 1.0
  lambda_phone: 0.0
  mel_loss: l1
max_epochs: 1000
max_eval_sentences: -1
max_eval_tokens: -1
max_prompt_ratio: 0.5
max_segment_cnt: 20000
max_sentences: -1
max_speech_duration: 20
max_tokens: 31250
max_training_steps: 100000
max_updates: 160000
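# mel_mean / mel_std: global mel-spectrogram statistics, presumably used to
# standardize mel targets when normalize_mel is true.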
mel_mean: -4.479605
mel_std: 3.4584913
meta_dir: null
min_prompt_duration: 0.5
min_speech_duration: -1
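# model: the acoustic decoder. A 16-layer DiT (hidden size 2304, 18 heads)
# conditioned on discrete semantic codes (vocab 16384), with RoPE positions,
# chunk-based attention settings, and separate CFG dropout rates for the
# prompt and semantic conditions.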
model:
  condition_prenet_depth: 6
  dit:
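    # chunk_params: chunking granularity, apparently expressed at a 50 Hz
    # frame rate with chunk lengths between 0.5 s and 3.0 s.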
    chunk_params:
      hz: 50
      max_chunk: 3.0
      max_chunk_history: 50000000
      min_chunk: 0.5
      need_block_shift: false
    condition_input_dim: 1280
    condition_type: discrete_codes
    depth: 16
    ffn_act_layer: gleu_tanh
    ffn_conv_kernel_size: 5
    ffn_gated_glu: false
    ffn_type: vanilla_mlp
    hidden_size: 2304
    input_size: 80
    max_seq_len: 4096
    mlp_ratio: 4.0
    num_heads: 18
    position_embedding_type: skip
    prompt_cfg_dropout: 0.2
    rope_params:
      max_position_embeddings: 4096
      rope_base: 10000.0
      rope_interpolation_factor: 1.0
    semantic_cfg_dropout: 0.2
    semantic_vocab_size: 16384
    use_chunk_setting: true
    use_rope: true
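  # phone_predictor: auxiliary phone-prediction head; the blank_id suggests a
  # CTC-style objective over a 5000-entry phone vocabulary.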
  phone_predictor:
    blank_id: 4
    phone_vocab_size: 5000
  position_id_start_from: 0
  random_position_start: true
  restart_position_ids: false
  use_condition_prenet: false
need_merge_same_speaker: true
need_precise_phones: false
no_verlap: true
normalize_mel: true
num_nodes: 1
num_sanity_val_steps: 0
num_workers: 1
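# ode_steps: number of ODE solver steps used when sampling from the
# flow-matching model (inferred from the key name).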
ode_steps: 150
optimizer_adam_beta1: 0.9
optimizer_adam_beta2: 0.98
optimizer_class: adamw
pin_memory: true
precision: bf16-mixed
save_interval: 2000
save_topk: 10
seed: 1234
shuffle: true
sort_by_len: true
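# src_sample_rate / tgt_sample_rate: source audio is handled at 16 kHz while
# the synthesis target is 24 kHz (inferred from the key names).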
src_sample_rate: 16000
strategy: ddp
tensorboard_dir: tb_logs
test_num: 100
tgt_sample_rate: 24000
timescale: 80000
use_cfg: false
use_cfg_rescale: false
use_distributed_sampler: false
use_uncondition: false
val_check_interval: 2000000
vocoder_ckpt: ''
wandb_name: glm4_semantic_cfm_v2_debug
warmup_updates: 100
weight_decay: 0.0001