|
# --- Data loading / preprocessing configuration ---
data:
  tokenizer: null                    # null -> presumably derived from the actor model path; verify
  train_files: aime24_ds_train_sample.parquet
  val_files: matheval.parquet
  prompt_key: prompt                 # parquet column holding the prompt
  reward_fn_key: data_source         # column used to pick the reward function per sample (presumably)
  max_prompt_length: 1024
  max_response_length: 3072
  train_batch_size: 256
  val_batch_size: null
  return_raw_input_ids: false
  return_raw_chat: false
  shuffle: true
  filter_overlong_prompts: true      # presumably drops prompts exceeding max_prompt_length — verify
  filter_overlong_prompts_workers: 1
  truncation: error                  # error out rather than silently truncate over-length input
  image_key: images                  # only relevant for multimodal datasets
  custom_cls:                        # optional custom dataset class; unused here (both null)
    path: null
    name: null
|
# --- Actor policy, frozen reference policy, and rollout (generation) engine ---
actor_rollout_ref:
  hybrid_engine: true                # share resources between training and generation (presumably)
  model:
    path: Qwen/Qwen2.5-Math-1.5B     # base model for the actor policy
    external_lib: null
    override_config: {}
    enable_gradient_checkpointing: true
    use_remove_padding: true
    use_liger: false
    # Hugging Face repo the trained actor is pushed to.
    save_hf_repo_id: RyanYr/grpo_neg0.001-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d_actor
    tokenizer_chat_template: null
  actor:
    strategy: fsdp
    ppo_mini_batch_size: 128
    ppo_micro_batch_size: null       # unset; the *_per_gpu variant below is used instead (presumably)
    ppo_micro_batch_size_per_gpu: 16
    use_dynamic_bsz: false
    ppo_max_token_len_per_gpu: 16384
    grad_clip: 1.0
    clip_ratio: 0.2                  # PPO-style ratio clipping; low/high both 0.2 -> symmetric clip
    clip_ratio_low: 0.2
    clip_ratio_high: 0.2
    clip_ratio_c: 3.0
    loss_agg_mode: token-mean
    entropy_coeff: 0                 # no entropy bonus
    use_kl_loss: true                # KL to the reference is a loss term (algorithm.use_kl_in_reward is false)
    use_torch_compile: true
    kl_loss_coef: 0.001
    kl_loss_type: low_var_kl
    ppo_epochs: 1
    shuffle: false
    ulysses_sequence_parallel_size: 1
    checkpoint:
      contents:                      # what gets written to each checkpoint
        - model
        - optimizer
        - extra
    optim:
      lr: 1.0e-06
      lr_warmup_steps: -1            # negative -> presumably falls back to lr_warmup_steps_ratio; verify
      lr_warmup_steps_ratio: 0.0
      min_lr_ratio: null
      warmup_style: constant
      total_training_steps: 2000
      weight_decay: 0.01
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      optimizer_offload: false
      fsdp_size: -1
  ref:
    # Frozen reference policy for the KL penalty, loaded from a prior run's actor checkpoint.
    ref_model_path: RyanYr/grpo-aime24-qwen2.5math-1.5B-base-mbs128-n4_actor_1230-975b46d
    strategy: fsdp
    fsdp_config:
      param_offload: false
      wrap_policy:
        min_num_params: 0
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 64
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    ulysses_sequence_parallel_size: 1
  rollout:
    name: vllm                       # generation backend
    temperature: 1.0
    top_k: -1                        # -1 disables top-k sampling
    top_p: 1
    use_fire_sampling: false
    prompt_length: 1024              # mirrors data.max_prompt_length
    response_length: 3072            # mirrors data.max_response_length
    dtype: bfloat16
    gpu_memory_utilization: 0.75
    ignore_eos: false
    enforce_eager: false
    free_cache_engine: false
    load_format: dummy_dtensor
    tensor_model_parallel_size: 4    # matches trainer.n_gpus_per_node
    max_num_batched_tokens: 4096
    max_model_len: null
    max_num_seqs: 1024
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 64
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    disable_log_stats: true
    enable_chunked_prefill: true
    do_sample: true
    'n': 4                           # samples per prompt (group size; mirrors critic.rollout_n)
    engine_kwargs:
      swap_space: null
    val_kwargs:
      # Deterministic (greedy) decoding for validation.
      top_k: -1
      top_p: 1.0
      temperature: 0
      'n': 1
      do_sample: false
|
# --- Critic (value network) configuration ---
# NOTE(review): algorithm.adv_estimator is grpo_neg (group-based), so the critic is
# presumably never instantiated; these values look like framework defaults — verify.
critic:
  rollout_n: 4                       # mirrors actor_rollout_ref.rollout.n
  strategy: fsdp
  optim:
    lr: 1.0e-05
    lr_warmup_steps_ratio: 0.0
    min_lr_ratio: null
    warmup_style: constant
    total_training_steps: 2000
    weight_decay: 0.01
  model:
    path: ~/models/deepseek-llm-7b-chat   # NOTE(review): inconsistent with the actor model; looks like an unused default placeholder
    tokenizer_path: Qwen/Qwen2.5-Math-1.5B
    override_config: {}
    external_lib: null
    enable_gradient_checkpointing: true
    use_remove_padding: false
    fsdp_config:
      param_offload: false
      optimizer_offload: false
      wrap_policy:
        min_num_params: 0
      fsdp_size: -1
    save_hf_repo_id: null
  ppo_mini_batch_size: 128
  ppo_micro_batch_size: null
  ppo_micro_batch_size_per_gpu: null
  forward_micro_batch_size: null
  forward_micro_batch_size_per_gpu: null
  use_dynamic_bsz: false
  ppo_max_token_len_per_gpu: 32768
  forward_max_token_len_per_gpu: 32768
  ulysses_sequence_parallel_size: 1
  ppo_epochs: 1
  shuffle: false
  grad_clip: 1.0
  cliprange_value: 0.5
  checkpoint:
    contents:
      - model
      - optimizer
      - extra
|
# --- Learned reward model (disabled; rewards come from the reward_manager instead) ---
reward_model:
  enable: false                      # no neural reward model is loaded
  strategy: fsdp
  model:
    input_tokenizer: Qwen/Qwen2.5-Math-1.5B
    path: ~/models/FsfairX-LLaMA3-RM-v0.1   # NOTE(review): unused while enable is false; looks like a default placeholder
    external_lib: null
    use_remove_padding: false
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      fsdp_size: -1
  micro_batch_size: null
  micro_batch_size_per_gpu: null
  max_length: null
  ulysses_sequence_parallel_size: 1
  use_dynamic_bsz: false
  forward_max_token_len_per_gpu: 32768
  reward_manager: prime              # rule-based reward scorer used in place of a learned model (presumably)
|
# --- Optional user-supplied reward function (unused: path is null) ---
custom_reward_function:
  path: null
  name: compute_score                # entry-point name looked up when a path is given
|
# --- Advantage estimation and KL control ---
algorithm:
  gamma: 1.0                         # no discounting
  lam: 1.0
  adv_estimator: grpo_neg            # GRPO variant; its extra knob is the grpo_neg block below
  use_kl_in_reward: false            # KL is applied as an actor loss term instead (actor.use_kl_loss: true)
  grpo_neg:
    mean_penalty: 0.001              # penalty coefficient specific to grpo_neg (matches the 'grpo_neg0.001' run name)
  kl_penalty: kl
  kl_ctrl:
    type: fixed                      # fixed KL coefficient; horizon/target_kl presumably only matter for adaptive control
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
|
# --- Run orchestration, logging, and checkpointing ---
trainer:
  balance_batch: true
  total_epochs: 1000000000000        # effectively unbounded; total_training_steps is the real stop condition
  total_training_steps: 2000
  project_name: value-LLM
  experiment_name: grpo_neg0.001-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d
  logger:
    - console
    - wandb
  log_val_generations: 0             # do not log sampled validation generations
  nnodes: 1
  n_gpus_per_node: 4                 # matches rollout.tensor_model_parallel_size
  save_freq: 5                       # checkpoint every 5 steps (presumably)
  resume_mode: auto
  resume_from_path: null
  val_before_train: false
  test_freq: -1                      # negative -> presumably disables periodic validation; verify
  critic_warmup: 0
  default_hdfs_dir: null
  del_local_ckpt_after_load: false
  default_local_dir: checkpoints/value-LLM/grpo_neg0.001-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d
  max_actor_ckpt_to_keep: 1          # keep only the newest actor checkpoint locally
  max_critic_ckpt_to_keep: 1
  ray_wait_register_center_timeout: 300
  hf_token: null                     # presumably supplied via environment rather than config
  resume_from_hf:
    # Resume actor weights from a Hugging Face checkpoint pinned to an exact revision.
    enable: true
    actor_hf_repo_id: RyanYr/grpo-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref895-82bb89a_actor
    actor_revision: 975b46d1ee3ee658c46b85220a34a95c384f4078   # pinned commit hash (matches the '975b46d' in the run name)
    critic_hf_repo_id: null
    critic_revision: main
|