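# Resolved PPO training configuration; the section layout (data / actor_rollout_ref /
# critic / reward_model / algorithm / trainer) matches verl's PPO trainer config.

# Dataset parquet files, prompt/response length limits, and batch sizes.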
data:
  tokenizer: null
  train_files: dapo_ds_train_sample.parquet
  val_files: matheval.parquet
  prompt_key: prompt
  reward_fn_key: data_source
  max_prompt_length: 768
  max_response_length: 13312
  train_batch_size: 1024
  val_batch_size: null
  return_raw_input_ids: false
  return_raw_chat: false
  shuffle: true
  filter_overlong_prompts: true
  filter_overlong_prompts_workers: 1
  truncation: error
  image_key: images
  custom_cls:
    path: null
    name: null
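# Actor policy, frozen reference policy, and vLLM rollout workers, colocated in a
# single hybrid engine (hybrid_engine: true).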
actor_rollout_ref:
  hybrid_engine: true
  model:
    path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
    external_lib: null
    override_config: {}
    enable_gradient_checkpointing: true
    use_remove_padding: true
    use_liger: false
    save_hf_repo_id: RyanYr/ppo-dapo-r1qwen1.5B-base-lr-mbs256-gpu8_actor
    tokenizer_chat_template: null
  actor:
    strategy: fsdp
    ppo_mini_batch_size: 256
    ppo_micro_batch_size: null
    ppo_micro_batch_size_per_gpu: 1
    use_dynamic_bsz: false
    ppo_max_token_len_per_gpu: 16384
    grad_clip: 1.0
    clip_ratio: 0.2
    clip_ratio_low: 0.2
    clip_ratio_high: 0.2
    clip_ratio_c: 3.0
    loss_agg_mode: token-mean
    entropy_coeff: 0.001
    use_kl_loss: false
    use_torch_compile: true
    kl_loss_coef: 0.001
    kl_loss_type: low_var_kl
    ppo_epochs: 1
    shuffle: false
    ulysses_sequence_parallel_size: 1
    checkpoint:
      contents:
      - model
      - optimizer
      - extra
    optim:
      lr: 1.0e-06
      lr_warmup_steps: -1
      lr_warmup_steps_ratio: 0.0
      min_lr_ratio: null
      warmup_style: constant
      total_training_steps: 100
      weight_decay: 0.01
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      optimizer_offload: false
      fsdp_size: -1
  ref:
    strategy: fsdp
    fsdp_config:
      param_offload: false
      wrap_policy:
        min_num_params: 0
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 1
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    ulysses_sequence_parallel_size: 1
  rollout:
    name: vllm
    temperature: 1.0
    top_k: -1
    top_p: 1
    use_fire_sampling: false
    prompt_length: 768
    response_length: 13312
    dtype: bfloat16
    gpu_memory_utilization: 0.7
    ignore_eos: false
    enforce_eager: false
    free_cache_engine: false
    load_format: dummy_dtensor
    tensor_model_parallel_size: 4
    max_num_batched_tokens: 14080
    max_model_len: null
    max_num_seqs: 1024
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 1
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    disable_log_stats: true
    enable_chunked_prefill: true
    do_sample: true
    'n': 1
    engine_kwargs:
      swap_space: null
    val_kwargs:
      top_k: -1
      top_p: 1.0
      temperature: 0
      'n': 1
      do_sample: false
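# Value-function (critic) worker; used because algorithm.adv_estimator is gae.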
critic:
  rollout_n: 1
  strategy: fsdp
  optim:
    lr: 1.0e-05
    lr_warmup_steps_ratio: 0.0
    min_lr_ratio: null
    warmup_style: constant
    total_training_steps: 100
    weight_decay: 0.01
  model:
    path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
    tokenizer_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
    override_config: {}
    external_lib: null
    enable_gradient_checkpointing: false
    use_remove_padding: true
    fsdp_config:
      param_offload: false
      optimizer_offload: false
      wrap_policy:
        min_num_params: 0
      fsdp_size: -1
    save_hf_repo_id: RyanYr/ppo-dapo-r1qwen1.5B-base-lr-mbs256-gpu8_critic
  ppo_mini_batch_size: 256
  ppo_micro_batch_size: null
  ppo_micro_batch_size_per_gpu: 1
  forward_micro_batch_size: null
  forward_micro_batch_size_per_gpu: 1
  use_dynamic_bsz: false
  ppo_max_token_len_per_gpu: 32768
  forward_max_token_len_per_gpu: 32768
  ulysses_sequence_parallel_size: 1
  ppo_epochs: 1
  shuffle: false
  grad_clip: 1.0
  cliprange_value: 0.5
  checkpoint:
    contents:
    - model
    - optimizer
    - extra
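# Model-based reward worker is disabled (enable: false); responses are scored by the
# reward_manager named below instead.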
reward_model:
  enable: false
  strategy: fsdp
  model:
    input_tokenizer: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: null
    use_remove_padding: false
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      fsdp_size: -1
  micro_batch_size: null
  micro_batch_size_per_gpu: null
  max_length: null
  ulysses_sequence_parallel_size: 1
  use_dynamic_bsz: false
  forward_max_token_len_per_gpu: 32768
  reward_manager: prime
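# Optional user-defined reward function; path: null means no custom scorer is loaded.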
custom_reward_function:
  path: null
  name: compute_score
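# Advantage estimation (GAE with gamma=1.0, lam=1.0) and a fixed-coefficient KL penalty
# applied inside the reward (use_kl_in_reward: true).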
algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  use_kl_in_reward: true
  kl_penalty: kl
  kl_ctrl:
    type: fixed
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
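# Run orchestration: 8 GPUs on a single node, console + wandb logging, checkpoints
# every 5 steps, automatic resume.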
trainer:
  balance_batch: true
  total_epochs: 100
  total_training_steps: 100
  project_name: value-LLM
  experiment_name: ppo-dapo-r1qwen1.5B-base_lr-mbs256-gpu8
  logger:
  - console
  - wandb
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: 5
  resume_mode: auto
  resume_from_path: null
  val_before_train: false
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  del_local_ckpt_after_load: false
  default_local_dir: checkpoints/value-LLM/ppo-dapo-r1qwen1.5B-base_lr-mbs256-gpu8
  max_actor_ckpt_to_keep: 1
  max_critic_ckpt_to_keep: 1
  ray_wait_register_center_timeout: 300
  hf_token: null
  resume_from_hf:
    enable: false
    actor_hf_repo_id: null
    actor_revision: main
    critic_hf_repo_id: null
    critic_revision: main
    hf_token: null