RyanYr
/

grpo_neg0.001-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d_actor

Model card Files Files and versions Community

grpo_neg0.001-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d_actor / config.yaml

RyanYr

Save model at global step 1235

a738e05 verified 4 days ago

raw

history blame contribute delete

5.77 kB

	data:
	tokenizer: null
	train_files: aime24_ds_train_sample.parquet
	val_files: matheval.parquet
	prompt_key: prompt
	reward_fn_key: data_source
	max_prompt_length: 1024
	max_response_length: 3072
	train_batch_size: 256
	val_batch_size: null
	return_raw_input_ids: false
	return_raw_chat: false
	shuffle: true
	filter_overlong_prompts: true
	filter_overlong_prompts_workers: 1
	truncation: error
	image_key: images
	custom_cls:
	path: null
	name: null
	actor_rollout_ref:
	hybrid_engine: true
	model:
	path: Qwen/Qwen2.5-Math-1.5B
	external_lib: null
	override_config: {}
	enable_gradient_checkpointing: true
	use_remove_padding: true
	use_liger: false
	save_hf_repo_id: RyanYr/grpo_neg0.001-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d_actor
	tokenizer_chat_template: null
	actor:
	strategy: fsdp
	ppo_mini_batch_size: 128
	ppo_micro_batch_size: null
	ppo_micro_batch_size_per_gpu: 16
	use_dynamic_bsz: false
	ppo_max_token_len_per_gpu: 16384
	grad_clip: 1.0
	clip_ratio: 0.2
	clip_ratio_low: 0.2
	clip_ratio_high: 0.2
	clip_ratio_c: 3.0
	loss_agg_mode: token-mean
	entropy_coeff: 0
	use_kl_loss: true
	use_torch_compile: true
	kl_loss_coef: 0.001
	kl_loss_type: low_var_kl
	ppo_epochs: 1
	shuffle: false
	ulysses_sequence_parallel_size: 1
	checkpoint:
	contents:
	- model
	- optimizer
	- extra
	optim:
	lr: 1.0e-06
	lr_warmup_steps: -1
	lr_warmup_steps_ratio: 0.0
	min_lr_ratio: null
	warmup_style: constant
	total_training_steps: 2000
	weight_decay: 0.01
	fsdp_config:
	wrap_policy:
	min_num_params: 0
	param_offload: false
	optimizer_offload: false
	fsdp_size: -1
	ref:
	ref_model_path: RyanYr/grpo-aime24-qwen2.5math-1.5B-base-mbs128-n4_actor_1230-975b46d
	strategy: fsdp
	fsdp_config:
	param_offload: false
	wrap_policy:
	min_num_params: 0
	log_prob_micro_batch_size: null
	log_prob_micro_batch_size_per_gpu: 64
	log_prob_use_dynamic_bsz: false
	log_prob_max_token_len_per_gpu: 16384
	ulysses_sequence_parallel_size: 1
	rollout:
	name: vllm
	temperature: 1.0
	top_k: -1
	top_p: 1
	use_fire_sampling: false
	prompt_length: 1024
	response_length: 3072
	dtype: bfloat16
	gpu_memory_utilization: 0.75
	ignore_eos: false
	enforce_eager: false
	free_cache_engine: false
	load_format: dummy_dtensor
	tensor_model_parallel_size: 4
	max_num_batched_tokens: 4096
	max_model_len: null
	max_num_seqs: 1024
	log_prob_micro_batch_size: null
	log_prob_micro_batch_size_per_gpu: 64
	log_prob_use_dynamic_bsz: false
	log_prob_max_token_len_per_gpu: 16384
	disable_log_stats: true
	enable_chunked_prefill: true
	do_sample: true
	'n': 4
	engine_kwargs:
	swap_space: null
	val_kwargs:
	top_k: -1
	top_p: 1.0
	temperature: 0
	'n': 1
	do_sample: false
	critic:
	rollout_n: 4
	strategy: fsdp
	optim:
	lr: 1.0e-05
	lr_warmup_steps_ratio: 0.0
	min_lr_ratio: null
	warmup_style: constant
	total_training_steps: 2000
	weight_decay: 0.01
	model:
	path: ~/models/deepseek-llm-7b-chat
	tokenizer_path: Qwen/Qwen2.5-Math-1.5B
	override_config: {}
	external_lib: null
	enable_gradient_checkpointing: true
	use_remove_padding: false
	fsdp_config:
	param_offload: false
	optimizer_offload: false
	wrap_policy:
	min_num_params: 0
	fsdp_size: -1
	save_hf_repo_id: null
	ppo_mini_batch_size: 128
	ppo_micro_batch_size: null
	ppo_micro_batch_size_per_gpu: null
	forward_micro_batch_size: null
	forward_micro_batch_size_per_gpu: null
	use_dynamic_bsz: false
	ppo_max_token_len_per_gpu: 32768
	forward_max_token_len_per_gpu: 32768
	ulysses_sequence_parallel_size: 1
	ppo_epochs: 1
	shuffle: false
	grad_clip: 1.0
	cliprange_value: 0.5
	checkpoint:
	contents:
	- model
	- optimizer
	- extra
	reward_model:
	enable: false
	strategy: fsdp
	model:
	input_tokenizer: Qwen/Qwen2.5-Math-1.5B
	path: ~/models/FsfairX-LLaMA3-RM-v0.1
	external_lib: null
	use_remove_padding: false
	fsdp_config:
	wrap_policy:
	min_num_params: 0
	param_offload: false
	fsdp_size: -1
	micro_batch_size: null
	micro_batch_size_per_gpu: null
	max_length: null
	ulysses_sequence_parallel_size: 1
	use_dynamic_bsz: false
	forward_max_token_len_per_gpu: 32768
	reward_manager: prime
	custom_reward_function:
	path: null
	name: compute_score
	algorithm:
	gamma: 1.0
	lam: 1.0
	adv_estimator: grpo_neg
	use_kl_in_reward: false
	grpo_neg:
	mean_penalty: 0.001
	kl_penalty: kl
	kl_ctrl:
	type: fixed
	kl_coef: 0.001
	horizon: 10000
	target_kl: 0.1
	trainer:
	balance_batch: true
	total_epochs: 1000000000000
	total_training_steps: 2000
	project_name: value-LLM
	experiment_name: grpo_neg0.001-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d
	logger:
	- console
	- wandb
	log_val_generations: 0
	nnodes: 1
	n_gpus_per_node: 4
	save_freq: 5
	resume_mode: auto
	resume_from_path: null
	val_before_train: false
	test_freq: -1
	critic_warmup: 0
	default_hdfs_dir: null
	del_local_ckpt_after_load: false
	default_local_dir: checkpoints/value-LLM/grpo_neg0.001-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d
	max_actor_ckpt_to_keep: 1
	max_critic_ckpt_to_keep: 1
	ray_wait_register_center_timeout: 300
	hf_token: null
	resume_from_hf:
	enable: true
	actor_hf_repo_id: RyanYr/grpo-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref895-82bb89a_actor
	actor_revision: 975b46d1ee3ee658c46b85220a34a95c384f4078
	critic_hf_repo_id: null
	critic_revision: main
	hf_token: null