# RyanYr's picture
# Save model at global step 1235
# a738e05 verified
# Dataset section: input files, prompt/response length limits and batch size.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been stripped -- confirm against the consumer's schema.
data:
  tokenizer: null
  train_files: aime24_ds_train_sample.parquet
  val_files: matheval.parquet
  prompt_key: prompt
  reward_fn_key: data_source
  max_prompt_length: 1024
  max_response_length: 3072
  train_batch_size: 256
  val_batch_size: null
  return_raw_input_ids: false
  return_raw_chat: false
  shuffle: true
  filter_overlong_prompts: true
  filter_overlong_prompts_workers: 1
  truncation: error
  image_key: images
  # Both fields are null here, i.e. no custom class is configured.
  custom_cls:
    path: null
    name: null
# Actor / reference / rollout section.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been stripped. Keys such as `strategy`, `fsdp_config`,
# `top_k` and 'n' occur more than once in this section, so they must live in
# separate child mappings -- confirm the grouping against the consumer's schema.
actor_rollout_ref:
  hybrid_engine: true
  model:
    path: Qwen/Qwen2.5-Math-1.5B
    external_lib: null
    override_config: {}
    enable_gradient_checkpointing: true
    use_remove_padding: true
    use_liger: false
    save_hf_repo_id: RyanYr/grpo_neg0.001-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d_actor
    tokenizer_chat_template: null
  actor:
    strategy: fsdp
    ppo_mini_batch_size: 128
    ppo_micro_batch_size: null
    ppo_micro_batch_size_per_gpu: 16
    use_dynamic_bsz: false
    ppo_max_token_len_per_gpu: 16384
    grad_clip: 1.0
    clip_ratio: 0.2
    clip_ratio_low: 0.2
    clip_ratio_high: 0.2
    clip_ratio_c: 3.0
    loss_agg_mode: token-mean
    entropy_coeff: 0
    use_kl_loss: true
    use_torch_compile: true
    kl_loss_coef: 0.001
    kl_loss_type: low_var_kl
    ppo_epochs: 1
    shuffle: false
    ulysses_sequence_parallel_size: 1
    checkpoint:
      contents:
        - model
        - optimizer
        - extra
    optim:
      lr: 1.0e-06
      lr_warmup_steps: -1
      lr_warmup_steps_ratio: 0.0
      min_lr_ratio: null
      warmup_style: constant
      # Same value as trainer.total_training_steps in this file.
      total_training_steps: 2000
      weight_decay: 0.01
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      optimizer_offload: false
      fsdp_size: -1
  ref:
    ref_model_path: RyanYr/grpo-aime24-qwen2.5math-1.5B-base-mbs128-n4_actor_1230-975b46d
    strategy: fsdp
    fsdp_config:
      param_offload: false
      wrap_policy:
        min_num_params: 0
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 64
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    ulysses_sequence_parallel_size: 1
  rollout:
    name: vllm
    temperature: 1.0
    top_k: -1
    top_p: 1
    use_fire_sampling: false
    # Same values as data.max_prompt_length / data.max_response_length.
    prompt_length: 1024
    response_length: 3072
    dtype: bfloat16
    gpu_memory_utilization: 0.75
    ignore_eos: false
    enforce_eager: false
    free_cache_engine: false
    load_format: dummy_dtensor
    tensor_model_parallel_size: 4
    max_num_batched_tokens: 4096
    max_model_len: null
    max_num_seqs: 1024
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 64
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    disable_log_stats: true
    enable_chunked_prefill: true
    do_sample: true
    # The key is quoted because a bare `n` is read as a boolean by YAML 1.1
    # parsers.
    'n': 4
    engine_kwargs:
      swap_space: null
    val_kwargs:
      top_k: -1
      top_p: 1.0
      temperature: 0
      'n': 1
      do_sample: false
# Critic section.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been stripped -- confirm against the consumer's schema.
critic:
  # Same value as actor_rollout_ref.rollout 'n' in this file.
  rollout_n: 4
  strategy: fsdp
  optim:
    lr: 1.0e-05
    lr_warmup_steps_ratio: 0.0
    min_lr_ratio: null
    warmup_style: constant
    total_training_steps: 2000
    weight_decay: 0.01
  model:
    # NOTE(review): `path` names a different model than `tokenizer_path`;
    # presumably an unused default -- confirm.
    path: ~/models/deepseek-llm-7b-chat
    tokenizer_path: Qwen/Qwen2.5-Math-1.5B
    override_config: {}
    external_lib: null
    enable_gradient_checkpointing: true
    use_remove_padding: false
    fsdp_config:
      param_offload: false
      optimizer_offload: false
      wrap_policy:
        min_num_params: 0
      fsdp_size: -1
    # NOTE(review): placed under `model` by analogy with
    # actor_rollout_ref.model.save_hf_repo_id; it could instead belong directly
    # under `critic` -- verify against the code that reads it.
    save_hf_repo_id: null
  ppo_mini_batch_size: 128
  ppo_micro_batch_size: null
  ppo_micro_batch_size_per_gpu: null
  forward_micro_batch_size: null
  forward_micro_batch_size_per_gpu: null
  use_dynamic_bsz: false
  ppo_max_token_len_per_gpu: 32768
  forward_max_token_len_per_gpu: 32768
  ulysses_sequence_parallel_size: 1
  ppo_epochs: 1
  shuffle: false
  grad_clip: 1.0
  cliprange_value: 0.5
  checkpoint:
    contents:
      - model
      - optimizer
      - extra
# Reward-model section; `enable` is false in this configuration.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been stripped -- confirm against the consumer's schema.
reward_model:
  enable: false
  strategy: fsdp
  model:
    input_tokenizer: Qwen/Qwen2.5-Math-1.5B
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: null
    use_remove_padding: false
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      fsdp_size: -1
  micro_batch_size: null
  micro_batch_size_per_gpu: null
  max_length: null
  ulysses_sequence_parallel_size: 1
  use_dynamic_bsz: false
  forward_max_token_len_per_gpu: 32768
  reward_manager: prime
# Custom reward function reference; `path` is null here, so no external file
# is named.
custom_reward_function:
  path: null
  name: compute_score
# Algorithm section: advantage estimator selection and KL settings.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been stripped -- confirm against the consumer's schema.
algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: grpo_neg
  use_kl_in_reward: false
  # Sub-mapping named after the selected `adv_estimator` value.
  grpo_neg:
    mean_penalty: 0.001
  kl_penalty: kl
  kl_ctrl:
    type: fixed
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
# Trainer section: run naming, logging, checkpointing and resume settings.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been stripped -- confirm against the consumer's schema.
trainer:
  balance_batch: true
  total_epochs: 1000000000000
  total_training_steps: 2000
  project_name: value-LLM
  experiment_name: grpo_neg0.001-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d
  logger:
    - console
    - wandb
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 4
  save_freq: 5
  resume_mode: auto
  resume_from_path: null
  val_before_train: false
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  del_local_ckpt_after_load: false
  default_local_dir: checkpoints/value-LLM/grpo_neg0.001-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d
  max_actor_ckpt_to_keep: 1
  max_critic_ckpt_to_keep: 1
  ray_wait_register_center_timeout: 300
  hf_token: null
  # `hf_token` appears twice in this section; the second occurrence is nested
  # here so the two do not collide as duplicate keys of `trainer`.
  resume_from_hf:
    enable: true
    actor_hf_repo_id: RyanYr/grpo-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref895-82bb89a_actor
    # Quoted so the hex revision id is always loaded as a string.
    actor_revision: '975b46d1ee3ee658c46b85220a34a95c384f4078'
    critic_hf_repo_id: null
    critic_revision: main
    hf_token: null