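# PPO trainer configuration (field layout matches verl's ppo_trainer.yaml).
# Actor and critic are both initialized from DeepSeek-R1-Distill-Qwen-1.5B;
# advantages use GAE, the KL penalty is folded into the reward, rollouts run
# on vLLM, and training uses 1 node x 8 GPUs.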
data:
  tokenizer: null
  train_files: dapo_ds_train_sample.parquet
  val_files: matheval.parquet
  prompt_key: prompt
  reward_fn_key: data_source
  max_prompt_length: 768
  max_response_length: 13312
  train_batch_size: 1024
  val_batch_size: null
  return_raw_input_ids: false
  return_raw_chat: false
  shuffle: true
  filter_overlong_prompts: true
  filter_overlong_prompts_workers: 1
  truncation: error
  image_key: images
  custom_cls:
    path: null
    name: null
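# Actor, reference policy, and rollout engine are colocated on the same workers
# (hybrid_engine: true). Sub-sections: model (HF checkpoint), actor (PPO update),
# ref (log-probs for the KL penalty), rollout (vLLM generation).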
actor_rollout_ref:
  hybrid_engine: true
  model:
    path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
    external_lib: null
    override_config: {}
    enable_gradient_checkpointing: true
    use_remove_padding: true
    use_liger: false
    save_hf_repo_id: RyanYr/ppo-dapo-r1qwen1.5B-base-lr-mbs256-gpu8_actor
    tokenizer_chat_template: null
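  # Batch-size chain (assuming verl's semantics): each train_batch_size of 1024
  # prompts is split into PPO mini-batches of 256; with
  # ppo_micro_batch_size_per_gpu: 1 on 8 GPUs, one mini-batch update accumulates
  # gradients over 256 / (1 * 8) = 32 micro-steps.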
  actor:
    strategy: fsdp
    ppo_mini_batch_size: 256
    ppo_micro_batch_size: null
    ppo_micro_batch_size_per_gpu: 1
    use_dynamic_bsz: false
    ppo_max_token_len_per_gpu: 16384
    grad_clip: 1.0
    clip_ratio: 0.2
    clip_ratio_low: 0.2
    clip_ratio_high: 0.2
    clip_ratio_c: 3.0
    loss_agg_mode: token-mean
    entropy_coeff: 0.001
    use_kl_loss: false
    use_torch_compile: true
    kl_loss_coef: 0.001
    kl_loss_type: low_var_kl
    ppo_epochs: 1
    shuffle: false
    ulysses_sequence_parallel_size: 1
    checkpoint:
      contents:
      - model
      - optimizer
      - extra
    optim:
      lr: 1.0e-06
      lr_warmup_steps: -1
      lr_warmup_steps_ratio: 0.0
      min_lr_ratio: null
      warmup_style: constant
      total_training_steps: 100
      weight_decay: 0.01
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      optimizer_offload: false
      fsdp_size: -1
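  # Frozen reference policy; its log-probs feed the KL-in-reward penalty
  # (algorithm.use_kl_in_reward: true), since use_kl_loss is false above.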
  ref:
    strategy: fsdp
    fsdp_config:
      param_offload: false
      wrap_policy:
        min_num_params: 0
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 1
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    ulysses_sequence_parallel_size: 1
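  # vLLM generation settings. tensor_model_parallel_size: 4 over 8 GPUs implies
  # two TP groups sampling in parallel; max_num_batched_tokens: 14080 matches
  # prompt_length 768 + response_length 13312, so one full-length sequence fits
  # within the chunked-prefill token budget.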
  rollout:
    name: vllm
    temperature: 1.0
    top_k: -1
    top_p: 1.0
    use_fire_sampling: false
    prompt_length: 768
    response_length: 13312
    dtype: bfloat16
    gpu_memory_utilization: 0.7
    ignore_eos: false
    enforce_eager: false
    free_cache_engine: false
    load_format: dummy_dtensor
    tensor_model_parallel_size: 4
    max_num_batched_tokens: 14080
    max_model_len: null
    max_num_seqs: 1024
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 1
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    disable_log_stats: true
    enable_chunked_prefill: true
    do_sample: true
    'n': 1
    engine_kwargs:
      swap_space: null
    val_kwargs:
      top_k: -1
      top_p: 1.0
      temperature: 0
      'n': 1
      do_sample: false
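# Separate value model for GAE, initialized from the same 1.5B checkpoint.
# Note the critic learning rate (1.0e-05) is 10x the actor's (1.0e-06), a
# common PPO choice to let the value function converge faster than the policy.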
critic:
  rollout_n: 1
  strategy: fsdp
  optim:
    lr: 1.0e-05
    lr_warmup_steps_ratio: 0.0
    min_lr_ratio: null
    warmup_style: constant
    total_training_steps: 100
    weight_decay: 0.01
  model:
    path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
    tokenizer_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
    override_config: {}
    external_lib: null
    enable_gradient_checkpointing: false
    use_remove_padding: true
    fsdp_config:
      param_offload: false
      optimizer_offload: false
      wrap_policy:
        min_num_params: 0
      fsdp_size: -1
    save_hf_repo_id: RyanYr/ppo-dapo-r1qwen1.5B-base-lr-mbs256-gpu8_critic
  ppo_mini_batch_size: 256
  ppo_micro_batch_size: null
  ppo_micro_batch_size_per_gpu: 1
  forward_micro_batch_size: null
  forward_micro_batch_size_per_gpu: 1
  use_dynamic_bsz: false
  ppo_max_token_len_per_gpu: 32768
  forward_max_token_len_per_gpu: 32768
  ulysses_sequence_parallel_size: 1
  ppo_epochs: 1
  shuffle: false
  grad_clip: 1.0
  cliprange_value: 0.5
  checkpoint:
    contents:
    - model
    - optimizer
    - extra
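# Model-based reward is disabled (enable: false); scoring is done by the
# rule-based 'prime' reward manager, dispatched per data_source
# (data.reward_fn_key above).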
reward_model:
  enable: false
  strategy: fsdp
  model:
    input_tokenizer: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: null
    use_remove_padding: false
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      fsdp_size: -1
  micro_batch_size: null
  micro_batch_size_per_gpu: null
  max_length: null
  ulysses_sequence_parallel_size: 1
  use_dynamic_bsz: false
  forward_max_token_len_per_gpu: 32768
  reward_manager: prime
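# Optional user-supplied scorer; with path: null the reward manager presumably
# falls back to its built-in compute_score for each data_source.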
custom_reward_function:
  path: null
  name: compute_score
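# Advantage estimation and KL control: GAE with gamma = lam = 1.0 (undiscounted),
# and a fixed KL coefficient of 0.001 applied inside the reward rather than as
# an auxiliary loss term.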
algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  use_kl_in_reward: true
  kl_penalty: kl
  kl_ctrl:
    type: fixed
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
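# Orchestration: 100 steps on 1 node x 8 GPUs, checkpointing every 5 steps and
# keeping one actor and one critic checkpoint, validation disabled
# (val_before_train: false, test_freq: -1), logging to console and wandb.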
trainer:
  balance_batch: true
  total_epochs: 100
  total_training_steps: 100
  project_name: value-LLM
  experiment_name: ppo-dapo-r1qwen1.5B-base_lr-mbs256-gpu8
  logger:
  - console
  - wandb
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: 5
  resume_mode: auto
  resume_from_path: null
  val_before_train: false
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  del_local_ckpt_after_load: false
  default_local_dir: checkpoints/value-LLM/ppo-dapo-r1qwen1.5B-base_lr-mbs256-gpu8
  max_actor_ckpt_to_keep: 1
  max_critic_ckpt_to_keep: 1
  ray_wait_register_center_timeout: 300
  hf_token: null
  resume_from_hf:
    enable: false
    actor_hf_repo_id: null
    actor_revision: main
    critic_hf_repo_id: null
    critic_revision: main
    hf_token: null
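# Usage sketch (assumes verl's documented entry point; file names and model path
# are taken from the values above):
#   python3 -m verl.trainer.main_ppo \
#     data.train_files=dapo_ds_train_sample.parquet \
#     data.val_files=matheval.parquet \
#     actor_rollout_ref.model.path=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
#     trainer.nnodes=1 trainer.n_gpus_per_node=8
# Any key in this file can be overridden the same way (Hydra-style dotted paths).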