# smollm2 / config_smollm2_135.yaml
checkpoints:
  checkpoint_interval: 2000
  checkpoints_path: checkpoints
  checkpoints_path_is_shared_file_system: false
  resume_checkpoint_path: null
  save_final_state: false
  save_initial_state: false
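  # With these settings a checkpoint is written every 2000 steps to ./checkpoints,
  # and no separate initial or final state dump is kept.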
data_stages:
- data:
dataset:
dataset_folder:
- datasets/smollm2-corpus
dataset_weights:
- 1.0
num_loading_workers: 0
seed: 8
name: stable phase
start_training_step: 1
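  # Single "stable phase" data stage: the smollm2-corpus folder at weight 1.0,
  # active from training step 1. The data_stages list can hold further stages
  # with different mixtures that start at later steps.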
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: true
  project: smollm2
  run: smollm2-135M
  seed: 8
  step: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.041666666666666664
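    # std = 1/sqrt(hidden_size) = 1/sqrt(576) = 1/24 ~ 0.0417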
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 0
    eos_token_id: 0
    hidden_act: silu
    hidden_size: 576
    initializer_range: 0.041666666666666664
    intermediate_size: 1536
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 9
    num_hidden_layers: 30
    num_key_value_heads: 3
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_interleaved: false
    rope_scaling: null
    rope_theta: 10000.0
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 49152
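    # Derived sizes: head_dim = 576 / 9 = 64; grouped-query attention with
    # 9 query heads sharing 3 KV heads (3 queries per KV head).
    # Rough parameter count: embeddings 49152 * 576 ~ 28.3M (tied with lm_head),
    # per layer attn ~ 0.88M plus MLP ~ 2.65M, so 30 * 3.54M + 28.3M ~ 135M.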
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.003
    lr_decay_starting_step: 1600000
    lr_decay_steps: 400000
    lr_decay_style: linear
    lr_warmup_steps: 2000
    lr_warmup_style: linear
    min_decay_lr: 0
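    # Warmup-stable-decay shape: linear warmup to 3e-3 over the first 2000 steps,
    # constant until step 1,600,000, then linear decay over 400,000 steps down
    # to 0 at step 2,000,000 (= train_steps below).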
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
parallelism:
  dp: 64
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  recompute_layer: false
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
  tp_recompute_allgather: true
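  # dp * tp * pp = 64 * 1 * 1: pure data parallelism across 64 ranks, with no
  # tensor or pipeline sharding of the 135M model.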
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
  tokenizer_revision: null
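  # The cosmo2-tokenizer vocabulary (49152 tokens) matches vocab_size in model_config.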
tokens:
  batch_accumulation_per_replica: 1
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 8
  sequence_length: 2048
  train_steps: 2000000
  val_check_interval: 1000
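  # Effective batch: 8 (micro) * 1 (accumulation) * 64 (dp) = 512 sequences/step,
  # i.e. 512 * 2048 ~ 1.05M tokens/step; over 2,000,000 steps that is ~ 2.1T tokens.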
# model:
# LlamaForCausalLM(
#   (model): LlamaModel(
#     (embed_tokens): Embedding(49152, 576)
#     (layers): ModuleList(
#       (0-29): 30 x LlamaDecoderLayer(
#         (self_attn): LlamaAttention(
#           (q_proj): Linear(in_features=576, out_features=576, bias=False)
#           (k_proj): Linear(in_features=576, out_features=192, bias=False)
#           (v_proj): Linear(in_features=576, out_features=192, bias=False)
#           (o_proj): Linear(in_features=576, out_features=576, bias=False)
#         )
#         (mlp): LlamaMLP(
#           (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
#           (up_proj): Linear(in_features=576, out_features=1536, bias=False)
#           (down_proj): Linear(in_features=1536, out_features=576, bias=False)
#           (act_fn): SiLU()
#         )
#         (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
#         (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
#       )
#     )
#     (norm): LlamaRMSNorm((576,), eps=1e-05)
#     (rotary_emb): LlamaRotaryEmbedding()
#   )
#   (lm_head): Linear(in_features=576, out_features=49152, bias=False)
# )
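# k_proj/v_proj project to 192 = 3 KV heads * head_dim 64, while q_proj/o_proj keep
# the full 576 = 9 heads * 64; lm_head shares its weight with embed_tokens because
# tie_word_embeddings is true.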