# Saved at global step 5 (commit 12b9ec8, verified).
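# PPO trainer configuration snapshot. The field names match verl's
# ppo_trainer schema (data / actor_rollout_ref / critic / reward_model /
# algorithm / trainer); the nesting below follows that schema.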
data:
  tokenizer: null
  train_files: dapo_ds_train_sample.parquet
  val_files: matheval.parquet
  prompt_key: prompt
  reward_fn_key: data_source
  max_prompt_length: 768
  max_response_length: 13312
  train_batch_size: 1024
  val_batch_size: null
  return_raw_input_ids: false
  return_raw_chat: false
  shuffle: true
  filter_overlong_prompts: true
  filter_overlong_prompts_workers: 1
  truncation: error
  image_key: images
  custom_cls:
    path: null
    name: null
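# actor_rollout_ref bundles the trainable policy (actor), the frozen
# reference policy (ref), and the vLLM generation engine (rollout) behind
# one hybrid engine (hybrid_engine: true) that shares weights between
# training and generation.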
actor_rollout_ref:
  hybrid_engine: true
  model:
    path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
    external_lib: null
    override_config: {}
    enable_gradient_checkpointing: true
    use_remove_padding: true
    use_liger: false
    save_hf_repo_id: RyanYr/ppo-dapo-r1qwen1.5B-base-lr-mbs256-gpu8_actor
    tokenizer_chat_template: null
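  # Batch-size bookkeeping (assuming the trainer's 8 GPUs below): each PPO
  # update consumes a ppo_mini_batch_size of 256 samples; with
  # ppo_micro_batch_size_per_gpu: 1 that is 256 / (1 * 8) = 32
  # gradient-accumulation steps per mini-batch. clip_ratio_low ==
  # clip_ratio_high == 0.2, so clipping is effectively symmetric here
  # despite the DAPO-style asymmetric-clip knobs being present.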
  actor:
    strategy: fsdp
    ppo_mini_batch_size: 256
    ppo_micro_batch_size: null
    ppo_micro_batch_size_per_gpu: 1
    use_dynamic_bsz: false
    ppo_max_token_len_per_gpu: 16384
    grad_clip: 1.0
    clip_ratio: 0.2
    clip_ratio_low: 0.2
    clip_ratio_high: 0.2
    clip_ratio_c: 3.0
    loss_agg_mode: token-mean
    entropy_coeff: 0.001
    use_kl_loss: false
    use_torch_compile: true
    kl_loss_coef: 0.001
    kl_loss_type: low_var_kl
    ppo_epochs: 1
    shuffle: false
    ulysses_sequence_parallel_size: 1
    checkpoint:
      contents:
      - model
      - optimizer
      - extra
    optim:
      lr: 1.0e-06
      lr_warmup_steps: -1
      lr_warmup_steps_ratio: 0.0
      min_lr_ratio: null
      warmup_style: constant
      total_training_steps: 100
      weight_decay: 0.01
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      optimizer_offload: false
      fsdp_size: -1
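  # The reference policy is used only for KL measurement. Note that
  # actor.use_kl_loss is false, so the KL term enters through the reward
  # (algorithm.use_kl_in_reward: true) rather than as an extra loss, and
  # kl_loss_coef / kl_loss_type above are effectively unused.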
  ref:
    strategy: fsdp
    fsdp_config:
      param_offload: false
      wrap_policy:
        min_num_params: 0
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 1
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    ulysses_sequence_parallel_size: 1
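  # vLLM rollout settings. max_num_batched_tokens (14080) equals
  # prompt_length + response_length (768 + 13312), i.e. one maximal
  # sequence. With tensor_model_parallel_size: 4 on 8 GPUs, this
  # presumably yields two vLLM engine replicas.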
  rollout:
    name: vllm
    temperature: 1.0
    top_k: -1
    top_p: 1
    use_fire_sampling: false
    prompt_length: 768
    response_length: 13312
    dtype: bfloat16
    gpu_memory_utilization: 0.7
    ignore_eos: false
    enforce_eager: false
    free_cache_engine: false
    load_format: dummy_dtensor
    tensor_model_parallel_size: 4
    max_num_batched_tokens: 14080
    max_model_len: null
    max_num_seqs: 1024
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 1
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    disable_log_stats: true
    enable_chunked_prefill: true
    do_sample: true
    'n': 1
    engine_kwargs:
      swap_space: null
    val_kwargs:
      top_k: -1
      top_p: 1.0
      temperature: 0
      'n': 1
      do_sample: false
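# Separate value model for GAE (adv_estimator: gae below requires one).
# It is initialized from the same base checkpoint as the actor but trained
# with a 10x higher learning rate (1.0e-05 vs the actor's 1.0e-06).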
critic:
  rollout_n: 1
  strategy: fsdp
  optim:
    lr: 1.0e-05
    lr_warmup_steps_ratio: 0.0
    min_lr_ratio: null
    warmup_style: constant
    total_training_steps: 100
    weight_decay: 0.01
  model:
    path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
    tokenizer_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
    override_config: {}
    external_lib: null
    enable_gradient_checkpointing: false
    use_remove_padding: true
    fsdp_config:
      param_offload: false
      optimizer_offload: false
      wrap_policy:
        min_num_params: 0
      fsdp_size: -1
    save_hf_repo_id: RyanYr/ppo-dapo-r1qwen1.5B-base-lr-mbs256-gpu8_critic
  ppo_mini_batch_size: 256
  ppo_micro_batch_size: null
  ppo_micro_batch_size_per_gpu: 1
  forward_micro_batch_size: null
  forward_micro_batch_size_per_gpu: 1
  use_dynamic_bsz: false
  ppo_max_token_len_per_gpu: 32768
  forward_max_token_len_per_gpu: 32768
  ulysses_sequence_parallel_size: 1
  ppo_epochs: 1
  shuffle: false
  grad_clip: 1.0
  cliprange_value: 0.5
  checkpoint:
    contents:
    - model
    - optimizer
    - extra
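# The model-based reward model is disabled (enable: false); scoring is
# handled by the prime reward manager, which in verl computes rule-based,
# verifiable rewards (e.g. checking math answers) rather than querying
# the RM checkpoint configured below.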
reward_model:
  enable: false
  strategy: fsdp
  model:
    input_tokenizer: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: null
    use_remove_padding: false
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      fsdp_size: -1
  micro_batch_size: null
  micro_batch_size_per_gpu: null
  max_length: null
  ulysses_sequence_parallel_size: 1
  use_dynamic_bsz: false
  forward_max_token_len_per_gpu: 32768
  reward_manager: prime
custom_reward_function:
  path: null
  name: compute_score
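# Advantage estimation and KL control: GAE with gamma = lam = 1.0 (no
# discounting), and a fixed KL coefficient of 0.001 folded into the
# reward (use_kl_in_reward: true).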
algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  use_kl_in_reward: true
  kl_penalty: kl
  kl_ctrl:
    type: fixed
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
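# Run control: 100 total training steps on 1 node x 8 GPUs, a checkpoint
# every 5 steps (this snapshot is the first save, at global step 5), and
# at most one actor and one critic checkpoint kept locally.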
trainer:
  balance_batch: true
  total_epochs: 100
  total_training_steps: 100
  project_name: value-LLM
  experiment_name: ppo-dapo-r1qwen1.5B-base_lr-mbs256-gpu8
  logger:
  - console
  - wandb
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: 5
  resume_mode: auto
  resume_from_path: null
  val_before_train: false
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  del_local_ckpt_after_load: false
  default_local_dir: checkpoints/value-LLM/ppo-dapo-r1qwen1.5B-base_lr-mbs256-gpu8
  max_actor_ckpt_to_keep: 1
  max_critic_ckpt_to_keep: 1
  ray_wait_register_center_timeout: 300
  hf_token: null
  resume_from_hf:
    enable: false
    actor_hf_repo_id: null
    actor_revision: main
    critic_hf_repo_id: null
    critic_revision: main
    hf_token: null
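# Illustrative usage (not part of this snapshot): a config like this is
# normally produced by verl's Hydra-based PPO entry point, which accepts
# command-line overrides, e.g.:
#   python3 -m verl.trainer.main_ppo \
#     data.train_files=dapo_ds_train_sample.parquet \
#     trainer.nnodes=1 trainer.n_gpus_per_node=8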