ridger
/

340M-UT2-100B-0.01

Model card · Files and versions

340M-UT2-100B-0.01 / config.yaml

ridger's picture

Upload config.yaml with huggingface_hub

e2adf00 verified 4 months ago

3.31 kB

	# Checkpoint save/resume settings. The keys below are children of
	# `checkpoints`, not top-level siblings.
	checkpoints:
	  checkpoint_interval: 10000
	  checkpoints_path: checkpoints/360M_UT2_original_withcos_early0.01
	  checkpoints_path_is_shared_file_system: false
	  # Both load_* flags are false, so presumably only model weights are
	  # taken from resume_checkpoint_path — TODO confirm.
	  load_lr_scheduler: false
	  load_optimizer: false
	  # NOTE(review): absolute path on a specific mount; presumably only
	  # resolvable on the original training hosts.
	  resume_checkpoint_path: /mnt/bn/ridger1/yxy/SmolLM2-nanotron-ckpt/360M/pre-decay
	  save_final_state: false
	  save_initial_state: false
	# Training data stages: a list with a single stage ("stable phase").
	# NOTE(review): nesting was rebuilt from the alphabetical key order of
	# the flattened dump — verify against the consumer's config schema.
	data_stages:
	  - data:
	      dataset:
	        # Six dataset folders; dataset_weights below also has six
	        # entries, presumably paired with these by position.
	        dataset_folder:
	          - /mnt/bn/ridger1/yxy/datasets/finewebedu-dedup
	          - /mnt/bn/ridger1/datasets/cosmopedia-v2
	          - /mnt/bn/ridger1/yxy/datasets/megamath/megamath-text-code-block
	          - /mnt/bn/ridger1/yxy/datasets/megamath/megamath-qa
	          - /mnt/bn/ridger1/datasets/megamath/megamath-translated-code
	          - /mnt/bn/ridger1/yxy/datasets/megamath/megamath-web-pro
	        # NOTE(review): these weights sum to 1.02, not 1.0 — confirm the
	        # loader normalizes them before sampling.
	        dataset_weights:
	          - 0.545
	          - 0.08
	          - 0.25
	          - 0.035
	          - 0.035
	          - 0.075
	        token_size_in_bytes: 2
	        tokenizer_name: HuggingFaceTB/cosmo2-tokenizer
	        vocab_size: 49152
	      num_loading_workers: 1
	      # Data-level seed; distinct from the `seed` under `general`.
	      seed: 42
	    name: stable phase
	    start_training_step: 1
	# Run identification and progress counters. The keys below are
	# children of `general`, not top-level siblings.
	general:
	  benchmark_csv_path: null
	  # 51200000 / 100000 (`step`) = 512 samples per step, which equals
	  # dp (32) x micro_batch_size (4) x batch_accumulation_per_replica (4)
	  # found elsewhere in this file — presumably the global batch size.
	  consumed_train_samples: 51200000
	  ignore_sanity_checks: true
	  project: diffUT
	  run: 360M_32_with_cos_constant_and_decay_early0.01
	  # Run-level seed; the data stage carries its own separate `seed`.
	  seed: 8
	  step: 100000
	# No lighteval section is configured: the value is explicitly null.
	lighteval: null
	# Logging settings. The keys below are children of `logging`, not
	# top-level siblings.
	logging:
	  iteration_step_info_interval: 1
	  log_level: info
	  log_level_replica: info
	# Model definition. `init_method` and `model_config` are nested
	# mappings under `model`.
	# NOTE(review): nesting was rebuilt from the alphabetical key order of
	# the flattened dump — verify against the consumer's config schema.
	model:
	  ddp_bucket_cap_mb: 25
	  dtype: bfloat16
	  init_method:
	    std: 0.041666666666666664
	  make_vocab_size_divisible_by: 1
	  model_config:
	    _attn_implementation: flash_attention_2
	    attention_bias: false
	    bos_token_id: 0
	    # NOTE(review): early_ratio and the unroll* keys below appear to be
	    # project-specific; their semantics are not visible in this file.
	    early_ratio: 0.01
	    eos_token_id: 0
	    hidden_act: silu
	    # 960 / 15 attention heads = 64 per head; 15 / 5 key-value heads = 3.
	    hidden_size: 960
	    initializer_range: 0.02
	    intermediate_size: 2560
	    is_llama_config: true
	    # Equals `sequence_length` (2048) in the `tokens` section.
	    max_position_embeddings: 2048
	    num_attention_heads: 15
	    num_hidden_layers: 32
	    num_key_value_heads: 5
	    pad_token_id: null
	    pretraining_tp: 1
	    rms_norm_eps: 1.0e-05
	    rope_interleaved: false
	    rope_scaling: null
	    rope_theta: 10000.0
	    tie_word_embeddings: true
	    unroll: true
	    unroll_end: 25
	    unroll_start: 5
	    unroll_type: 2
	    use_cache: true
	    # Same value as the data stage's dataset `vocab_size` (49152).
	    vocab_size: 49152
	    # NOTE(review): coefficient is set while z_loss_enabled is false;
	    # presumably unused in that case — confirm.
	    z_loss_coefficient: 0.0001
	    z_loss_enabled: false
	# Optimizer settings. `learning_rate_scheduler` and `optimizer_factory`
	# are nested mappings under `optimizer`.
	# NOTE(review): nesting was rebuilt from the alphabetical key order of
	# the flattened dump — verify against the consumer's config schema.
	optimizer:
	  accumulate_grad_in_fp32: true
	  clip_grad: 1.0
	  learning_rate_scheduler:
	    learning_rate: 0.003
	    # NOTE(review): lr_decay_starting_step + lr_decay_steps = 110000,
	    # which exceeds train_steps (102400) in the `tokens` section; if
	    # decay lasts lr_decay_steps after the start, min_decay_lr is
	    # presumably never reached — confirm this is intended.
	    lr_decay_starting_step: 10000
	    lr_decay_steps: 100000
	    lr_decay_style: linear
	    lr_warmup_steps: 1024
	    lr_warmup_style: linear
	    min_decay_lr: 0.0003
	  optimizer_factory:
	    adam_beta1: 0.9
	    adam_beta2: 0.95
	    adam_eps: 1.0e-08
	    # Optimizer name; unrelated to the data stage's `name` key.
	    name: adamW
	    torch_adam_is_fused: true
	  weight_decay: 0.01
	  weight_decay_exclude_named_params: []
	  zero_stage: 1
	# Parallelism layout. The keys below are children of `parallelism`,
	# not top-level siblings.
	parallelism:
	  context_parallel_size: 1
	  # dp (32) x tp (1) x pp (1) = 32 — presumably the total rank count.
	  dp: 32
	  expert_parallel_size: 1
	  moe_layer_recompute: false
	  pp: 1
	  pp_engine: 1f1b
	  recompute_layer: false
	  tp: 1
	  # NOTE(review): the tp_* options below presumably have no effect
	  # while tp is 1 — confirm.
	  tp_linear_async_communication: true
	  tp_mode: REDUCE_SCATTER
	  tp_recompute_allgather: true
	# Both sections are explicitly null: nothing is configured for them here.
	profiler: null
	s3_upload: null
	# Tokenizer settings. The keys below are children of `tokenizer`, not
	# top-level siblings.
	tokenizer:
	  tokenizer_max_length: null
	  # Same identifier as `tokenizer_name` in the data stage's dataset.
	  tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
	  tokenizer_revision: null
	# Batch shape and step budget. The keys below are children of
	# `tokens`, not top-level siblings.
	tokens:
	  # micro_batch_size (4) x batch_accumulation_per_replica (4) = 16
	  # sequences — presumably per data-parallel replica per step.
	  batch_accumulation_per_replica: 4
	  limit_test_batches: 0
	  # NOTE(review): limit_val_batches is 0 while val_check_interval is
	  # 10000; presumably validation is effectively disabled — confirm.
	  limit_val_batches: 0
	  micro_batch_size: 4
	  # Equals model_config.max_position_embeddings (2048).
	  sequence_length: 2048
	  train_steps: 102400
	  val_check_interval: 10000