model_name: molmo
llm:
  d_model: 3584
  n_heads: 28
  n_kv_heads: 4
  head_dim: null
  qkv_bias: true
  clip_qkv: null
  n_layers: 28
  mlp_ratio: 4
  mlp_hidden_size: 37888
  activation_type: swiglu
  block_type: sequential
  rope: true
  rope_full_precision: true
  rope_theta: 1000000.0
  rope_type: default
  rope_factor: null
  rope_high_freq_factor: null
  rope_low_freq_factor: null
  rope_original_max_position_embeddings: null
  attention_type: sdpa
  float32_attention: true
  attention_dropout: 0.0
  attention_layer_norm: false
  attention_layer_norm_type: olmo
  residual_dropout: 0.1
  response_residual_dropout: 0.0
  layer_norm_type: rms
  layer_norm_with_affine: true
  layer_norm_eps: 1.0e-06
  attention_layer_norm_with_affine: true
  max_sequence_length: 4096
  max_position_embeddings: null
  include_bias: false
  bias_for_layer_norm: null
  norm_after: false
  moe_num_experts: 8
  moe_top_k: 2
  moe_mlp_impl: sparse
  moe_log_expert_assignment: false
  moe_shared_expert: false
  moe_lbl_in_fp32: false
  moe_interleave: false
  moe_loss_weight: 0.1
  moe_zloss_weight: null
  moe_dropless: true
  moe_capacity_factor: 1.25
  embedding_dropout: 0.0
  scale_logits: false
  vocab_size: 152064
  additional_vocab_size: 128
  weight_tying: false
  embedding_size: 152064
  use_position_ids: true
  tokenizer:
    identifier: Qwen/Qwen2.5-7B
    tokenizer_dir: null
    depth_tokens: true
  init_path: gs://mm-olmo/pretrained_llms/qwen2.5-7b.pt
  init_incremental: null
  new_embedding_init_range: 0.02
  initializer_range: 0.02
  normalize_input_embeds: false
  activation_checkpoint: whole_layer
  compile: blocks
  fix_pad_tokenizer: false
  resize_vocab: false
  init_std: 0.02
  init_fn: normal
  init_cutoff_factor: null
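# Reader note (derived, not part of the released schema): with head_dim null,
# the per-head dim is d_model / n_heads = 3584 / 28 = 128, and the 28 query
# heads share n_kv_heads = 4 KV heads (7:1 grouped-query attention). Assuming
# the OLMo-style convention that mlp_hidden_size is the fused SwiGLU gate+up
# width, the effective hidden size is 37888 / 2 = 18944, matching Qwen2.5-7B's
# intermediate size; this is consistent with init_path pointing at a converted
# Qwen2.5-7B checkpoint.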
vision_backbone:
  vit:
    image_model_type: siglip
    image_default_input_size:
    - 378
    - 378
    image_patch_size: 14
    image_pos_patch_size: 14
    image_emb_dim: 1152
    image_num_heads: 16
    image_num_key_value_heads: 16
    image_num_layers: 27
    image_head_dim: 72
    image_mlp_dim: 4304
    image_mlp_activations: gelu_pytorch_tanh
    image_dropout_rate: 0.0
    image_num_pos: 729
    image_norm_eps: 1.0e-06
    attention_dropout: 0.0
    residual_dropout: 0.0
    initializer_range: 0.02
    float32_attention: true
    attention_type: sdpa
    activation_checkpointing: true
    init_path: gs://mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt
    resize_mode: siglip
    pad_value: 0.0
    normalize: siglip
  image_pooling_2d: attention_meanq
  pooling_attention_mask: false
  image_projector: mlp
  image_padding_embed: null
  vit_layers:
  - -3
  - -9
  skip_unused_layers: true
  image_feature_dropout: 0.0
  connector_activation_checkpointing: true
  compile_vit: blocks
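# Reader note (derived, assuming standard SigLIP patching): each 378x378 crop
# splits into (378 / 14)^2 = 27 x 27 = 729 patches (image_num_pos: 729; SigLIP
# has no CLS token). Features are taken from ViT layers -3 and -9, the 2x2
# attention_meanq pooling reduces the token count roughly 4x, and the MLP
# projector maps the pooled features into the LLM's 3584-dim embedding space.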
data_formatter:
  prompt_templates: uber_model
  message_format: role
  system_prompt: demo_or_style
  always_start_with_space: false
  default_inference_len: 65
  select_answer: best
  debug: false
  image_last: false
  format_message_list: null
  p_one_message: 0.0
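# Reader note (assumption from Molmo's naming): "uber_model" selects Molmo's
# multi-task prompt templates, message_format "role" wraps turns in role
# headers, and system_prompt "demo_or_style" chooses between demo-style system
# prompts and per-task style tags.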
mm_preprocessor:
  crop_mode: overlap-and-resize-c2
  max_crops: 8
  max_images: 1
  max_multi_image_crops: 8
  pooling_w: 2
  pooling_h: 2
  overlap_margins:
  - 4
  - 4
  use_col_tokens: true
  loss_token_weighting: root_subsegments
  legacy_image_mask: false
max_answer_len: null
img_aug: false
bi_directional_attn: null
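# Reader note (assumption, based on Molmo's overlap-and-resize cropping): each
# image is tiled into up to max_crops overlapping 378x378 crops plus one
# resized full-image view; overlap_margins [4, 4] gives 4 patches of overlap
# per seam, pooling_w/pooling_h match the 2x2 vision pooling above, and
# use_col_tokens inserts a separator token per patch row to mark 2-D layout.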
lora_enable: false
lora_rank: 64
lora_alpha: 16
lora_dropout: 0.05
lora_bias: none
n_action_bins: 256
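# Reader note (assumption, OpenVLA-style binning): each action dimension is
# first normalized with the per-dataset q01/q99 statistics below,
#   a_norm = clip(2 * (a - q01) / (q99 - q01) - 1, -1, 1),
# then discretized into n_action_bins = 256 uniform bins over [-1, 1] so the
# model can read and emit actions as discrete tokens.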
norm_stats:
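  # RT-1 "Google robot" data from Open X-Embodiment. The 7-D action is
  # (x, y, z, roll, pitch, yaw, gripper); mask is false for the gripper,
  # which stays unnormalized.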
  fractal20220817_data:
    action:
      mean:
      - 0.006987582892179489
      - 0.006265917327255011
      - -0.01262515690177679
      - 0.04333311319351196
      - -0.005756212864071131
      - 0.0009130256366916001
      - 0.5354204773902893
      std:
      - 0.0692116990685463
      - 0.05970962345600128
      - 0.07353084534406662
      - 0.15610496699810028
      - 0.13164450228214264
      - 0.14593800902366638
      - 0.497110515832901
      max:
      - 2.9984593391418457
      - 22.09052848815918
      - 2.7507524490356445
      - 1.570636510848999
      - 1.5321086645126343
      - 1.5691522359848022
      - 1.0
      min:
      - -2.0204520225524902
      - -5.497899532318115
      - -2.031663417816162
      - -1.569917917251587
      - -1.569892168045044
      - -1.570419430732727
      - 0.0
      q01:
      - -0.22453527510166169
      - -0.14820013284683228
      - -0.231589707583189
      - -0.3517994859814644
      - -0.4193011274933815
      - -0.43643461108207704
      - 0.0
      q99:
      - 0.17824687153100965
      - 0.14938379630446405
      - 0.21842354819178575
      - 0.5892666035890578
      - 0.35272657424211445
      - 0.44796681255102094
      - 1.0
      mask:
      - true
      - true
      - true
      - true
      - true
      - true
      - false
    proprio:
      mean:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      std:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      max:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      min:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      q01:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      q99:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
    num_transitions: 3786400
    num_trajectories: 87212
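  # BridgeData (WidowX tabletop manipulation); same 7-D action layout.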
  bridge_orig:
    action:
      mean:
      - 0.0002334194869035855
      - 0.00013004911306779832
      - -0.00012762474943883717
      - -0.0001556558854645118
      - -0.0004039328487124294
      - 0.00023557482927571982
      - 0.5764579176902771
      std:
      - 0.009765930473804474
      - 0.013689135201275349
      - 0.012667362578213215
      - 0.028534092009067535
      - 0.030637972056865692
      - 0.07691419124603271
      - 0.4973701536655426
      max:
      - 0.41691166162490845
      - 0.25864794850349426
      - 0.21218234300613403
      - 3.122201919555664
      - 1.8618112802505493
      - 6.280478477478027
      - 1.0
      min:
      - -0.4007510244846344
      - -0.13874775171279907
      - -0.22553899884223938
      - -3.2010786533355713
      - -1.8618112802505493
      - -6.279075622558594
      - 0.0
      q01:
      - -0.02872725307941437
      - -0.04170349963009357
      - -0.026093858778476715
      - -0.08092105075716972
      - -0.09288699507713317
      - -0.20718276381492615
      - 0.0
      q99:
      - 0.028309678435325586
      - 0.040855254605412394
      - 0.040161586627364146
      - 0.08192047759890528
      - 0.07792850524187081
      - 0.20382574498653397
      - 1.0
      mask:
      - true
      - true
      - true
      - true
      - true
      - true
      - false
    proprio:
      mean:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      std:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      max:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      min:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      q01:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      q99:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
    num_transitions: 2135463
    num_trajectories: 60064
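  # BC-Z (Google robot demonstrations); same 7-D action layout.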
  bc_z:
    action:
      mean:
      - -0.009958467446267605
      - 0.0008958321413956583
      - 0.004995597992092371
      - 0.00029755113064311445
      - -0.008735382929444313
      - -0.030693737789988518
      - 0.8344562649726868
      std:
      - 0.03053455986082554
      - 0.0231423731893301
      - 0.020641816779971123
      - 0.04155943542718887
      - 0.046427831053733826
      - 0.0769818127155304
      - 0.3610210120677948
      max:
      - 0.2165454924106598
      - 0.1251407265663147
      - 0.10772687941789627
      - 0.33544227480888367
      - 0.28117990493774414
      - 0.40614867210388184
      - 1.0
      min:
      - -0.1677047461271286
      - -0.14630407094955444
      - -0.10066790133714676
      - -0.29421567916870117
      - -0.32101404666900635
      - -0.4635624885559082
      - 0.0
      q01:
      - -0.09220654994249344
      - -0.06456145539879798
      - -0.049121275544166565
      - -0.11594625547528267
      - -0.14152548640966414
      - -0.2251061636209488
      - 0.0
      q99:
      - 0.07628866866230968
      - 0.058019736707210584
      - 0.052540797740221024
      - 0.11740604028105736
      - 0.11703975558280955
      - 0.16729306846857078
      - 1.0
      mask:
      - true
      - true
      - true
      - true
      - true
      - true
      - false
    proprio:
      mean:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      std:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      max:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      min:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      q01:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      q99:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
    num_transitions: 6015535
    num_trajectories: 43264
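# Reader note (assumption, OpenVLA-style de-normalization at inference): for
# dimensions with mask true, a = 0.5 * (a_norm + 1) * (q99 - q01) + q01; the
# gripper dimension (mask false) passes through unchanged. The all-zero
# proprio blocks indicate proprioceptive statistics were not collected or
# used here.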