model_name: molmo
llm:
  d_model: 3584
  n_heads: 28
  n_kv_heads: 4
  head_dim: null
  qkv_bias: true
  clip_qkv: null
  n_layers: 28
  mlp_ratio: 4
  mlp_hidden_size: 37888
  activation_type: swiglu
  block_type: sequential
  rope: true
  rope_full_precision: true
  rope_theta: 1000000.0
  rope_type: default
  rope_factor: null
  rope_high_freq_factor: null
  rope_low_freq_factor: null
  rope_original_max_position_embeddings: null
  attention_type: sdpa
  float32_attention: true
  attention_dropout: 0.0
  attention_layer_norm: false
  attention_layer_norm_type: olmo
  residual_dropout: 0.1
  response_residual_dropout: 0.0
  layer_norm_type: rms
  layer_norm_with_affine: true
  layer_norm_eps: 1.0e-06
  attention_layer_norm_with_affine: true
  max_sequence_length: 4096
  max_position_embeddings: null
  include_bias: false
  bias_for_layer_norm: null
  norm_after: false
  moe_num_experts: 8
  moe_top_k: 2
  moe_mlp_impl: sparse
  moe_log_expert_assignment: false
  moe_shared_expert: false
  moe_lbl_in_fp32: false
  moe_interleave: false
  moe_loss_weight: 0.1
  moe_zloss_weight: null
  moe_dropless: true
  moe_capacity_factor: 1.25
  embedding_dropout: 0.0
  scale_logits: false
  vocab_size: 152064
  additional_vocab_size: 128
  weight_tying: false
  embedding_size: 152064
  use_position_ids: true
  tokenizer:
    identifier: Qwen/Qwen2.5-7B
    tokenizer_dir: null
    depth_tokens: true
  init_path: gs://mm-olmo/pretrained_llms/qwen2.5-7b.pt
  init_incremental: null
  new_embedding_init_range: 0.02
  initializer_range: 0.02
  normalize_input_embeds: false
  activation_checkpoint: whole_layer
  compile: blocks
  fix_pad_tokenizer: false
  resize_vocab: false
  init_std: 0.02
  init_fn: normal
  init_cutoff_factor: null
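# Reader note (derived, not part of the released schema): with head_dim null,
# the per-head dim is d_model / n_heads = 3584 / 28 = 128, and the 28 query
# heads share n_kv_heads = 4 KV heads (7:1 grouped-query attention). Assuming
# the OLMo-style convention that mlp_hidden_size is the fused SwiGLU gate+up
# width, the effective hidden size is 37888 / 2 = 18944, matching Qwen2.5-7B's
# intermediate size; this is consistent with init_path pointing at a converted
# Qwen2.5-7B checkpoint.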
vision_backbone:
  vit:
    image_model_type: siglip
    image_default_input_size:
    - 378
    - 378
    image_patch_size: 14
    image_pos_patch_size: 14
    image_emb_dim: 1152
    image_num_heads: 16
    image_num_key_value_heads: 16
    image_num_layers: 27
    image_head_dim: 72
    image_mlp_dim: 4304
    image_mlp_activations: gelu_pytorch_tanh
    image_dropout_rate: 0.0
    image_num_pos: 729
    image_norm_eps: 1.0e-06
    attention_dropout: 0.0
    residual_dropout: 0.0
    initializer_range: 0.02
    float32_attention: true
    attention_type: sdpa
    activation_checkpointing: true
    init_path: gs://mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt
    resize_mode: siglip
    pad_value: 0.0
    normalize: siglip
  image_pooling_2d: attention_meanq
  pooling_attention_mask: false
  image_projector: mlp
  image_padding_embed: null
  vit_layers:
  - -3
  - -9
  skip_unused_layers: true
  image_feature_dropout: 0.0
  connector_activation_checkpointing: true
  compile_vit: blocks
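# Reader note (derived, assuming standard SigLIP patching): each 378x378 crop
# splits into (378 / 14)^2 = 27 x 27 = 729 patches (image_num_pos: 729; SigLIP
# has no CLS token). Features are taken from ViT layers -3 and -9, the 2x2
# attention_meanq pooling reduces the token count roughly 4x, and the MLP
# projector maps the pooled features into the LLM's 3584-dim embedding space.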
data_formatter:
  prompt_templates: uber_model
  message_format: role
  system_prompt: demo_or_style
  always_start_with_space: false
  default_inference_len: 65
  select_answer: best
  debug: false
  image_last: false
  format_message_list: null
  p_one_message: 0.0
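# Reader note (assumption from Molmo's naming): "uber_model" selects Molmo's
# multi-task prompt templates, message_format "role" wraps turns in role
# headers, and system_prompt "demo_or_style" chooses between demo-style system
# prompts and per-task style tags.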
mm_preprocessor:
  crop_mode: overlap-and-resize-c2
  max_crops: 8
  max_images: 1
  max_multi_image_crops: 8
  pooling_w: 2
  pooling_h: 2
  overlap_margins:
  - 4
  - 4
  use_col_tokens: true
  loss_token_weighting: root_subsegments
  legacy_image_mask: false
max_answer_len: null
img_aug: false
bi_directional_attn: null
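# Reader note (assumption, based on Molmo's overlap-and-resize cropping): each
# image is tiled into up to max_crops overlapping 378x378 crops plus one
# resized full-image view; overlap_margins [4, 4] gives 4 patches of overlap
# per seam, pooling_w/pooling_h match the 2x2 vision pooling above, and
# use_col_tokens inserts a separator token per patch row to mark 2-D layout.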
lora_enable: false
lora_rank: 64
lora_alpha: 16
lora_dropout: 0.05
lora_bias: none
n_action_bins: 256
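# Reader note (assumption, OpenVLA-style binning): each action dimension is
# first normalized with the per-dataset q01/q99 statistics below,
#   a_norm = clip(2 * (a - q01) / (q99 - q01) - 1, -1, 1),
# then discretized into n_action_bins = 256 uniform bins over [-1, 1] so the
# model can read and emit actions as discrete tokens.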
norm_stats:
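  # RT-1 "Google robot" data from Open X-Embodiment. The 7-D action is
  # (x, y, z, roll, pitch, yaw, gripper); mask is false for the gripper,
  # which stays unnormalized.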
  fractal20220817_data:
    action:
      mean:
      - 0.006987582892179489
      - 0.006265917327255011
      - -0.01262515690177679
      - 0.04333311319351196
      - -0.005756212864071131
      - 0.0009130256366916001
      - 0.5354204773902893
      std:
      - 0.0692116990685463
      - 0.05970962345600128
      - 0.07353084534406662
      - 0.15610496699810028
      - 0.13164450228214264
      - 0.14593800902366638
      - 0.497110515832901
      max:
      - 2.9984593391418457
      - 22.09052848815918
      - 2.7507524490356445
      - 1.570636510848999
      - 1.5321086645126343
      - 1.5691522359848022
      - 1.0
      min:
      - -2.0204520225524902
      - -5.497899532318115
      - -2.031663417816162
      - -1.569917917251587
      - -1.569892168045044
      - -1.570419430732727
      - 0.0
      q01:
      - -0.22453527510166169
      - -0.14820013284683228
      - -0.231589707583189
      - -0.3517994859814644
      - -0.4193011274933815
      - -0.43643461108207704
      - 0.0
      q99:
      - 0.17824687153100965
      - 0.14938379630446405
      - 0.21842354819178575
      - 0.5892666035890578
      - 0.35272657424211445
      - 0.44796681255102094
      - 1.0
      mask:
      - true
      - true
      - true
      - true
      - true
      - true
      - false
    proprio:
      mean:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      std:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      max:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      min:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      q01:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      q99:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
    num_transitions: 3786400
    num_trajectories: 87212
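  # BridgeData (WidowX tabletop manipulation); same 7-D action layout.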
  bridge_orig:
    action:
      mean:
      - 0.0002334194869035855
      - 0.00013004911306779832
      - -0.00012762474943883717
      - -0.0001556558854645118
      - -0.0004039328487124294
      - 0.00023557482927571982
      - 0.5764579176902771
      std:
      - 0.009765930473804474
      - 0.013689135201275349
      - 0.012667362578213215
      - 0.028534092009067535
      - 0.030637972056865692
      - 0.07691419124603271
      - 0.4973701536655426
      max:
      - 0.41691166162490845
      - 0.25864794850349426
      - 0.21218234300613403
      - 3.122201919555664
      - 1.8618112802505493
      - 6.280478477478027
      - 1.0
      min:
      - -0.4007510244846344
      - -0.13874775171279907
      - -0.22553899884223938
      - -3.2010786533355713
      - -1.8618112802505493
      - -6.279075622558594
      - 0.0
      q01:
      - -0.02872725307941437
      - -0.04170349963009357
      - -0.026093858778476715
      - -0.08092105075716972
      - -0.09288699507713317
      - -0.20718276381492615
      - 0.0
      q99:
      - 0.028309678435325586
      - 0.040855254605412394
      - 0.040161586627364146
      - 0.08192047759890528
      - 0.07792850524187081
      - 0.20382574498653397
      - 1.0
      mask:
      - true
      - true
      - true
      - true
      - true
      - true
      - false
    proprio:
      mean:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      std:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      max:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      min:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      q01:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      q99:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
    num_transitions: 2135463
    num_trajectories: 60064
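  # BC-Z (Google robot demonstrations); same 7-D action layout.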
  bc_z:
    action:
      mean:
      - -0.009958467446267605
      - 0.0008958321413956583
      - 0.004995597992092371
      - 0.00029755113064311445
      - -0.008735382929444313
      - -0.030693737789988518
      - 0.8344562649726868
      std:
      - 0.03053455986082554
      - 0.0231423731893301
      - 0.020641816779971123
      - 0.04155943542718887
      - 0.046427831053733826
      - 0.0769818127155304
      - 0.3610210120677948
      max:
      - 0.2165454924106598
      - 0.1251407265663147
      - 0.10772687941789627
      - 0.33544227480888367
      - 0.28117990493774414
      - 0.40614867210388184
      - 1.0
      min:
      - -0.1677047461271286
      - -0.14630407094955444
      - -0.10066790133714676
      - -0.29421567916870117
      - -0.32101404666900635
      - -0.4635624885559082
      - 0.0
      q01:
      - -0.09220654994249344
      - -0.06456145539879798
      - -0.049121275544166565
      - -0.11594625547528267
      - -0.14152548640966414
      - -0.2251061636209488
      - 0.0
      q99:
      - 0.07628866866230968
      - 0.058019736707210584
      - 0.052540797740221024
      - 0.11740604028105736
      - 0.11703975558280955
      - 0.16729306846857078
      - 1.0
      mask:
      - true
      - true
      - true
      - true
      - true
      - true
      - false
    proprio:
      mean:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      std:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      max:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      min:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      q01:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      q99:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
    num_transitions: 6015535
    num_trajectories: 43264
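# Reader note (assumption, OpenVLA-style de-normalization at inference): for
# dimensions with mask true, a = 0.5 * (a_norm + 1) * (q99 - q01) + q01; the
# gripper dimension (mask false) passes through unchanged. The all-zero
# proprio blocks indicate proprioceptive statistics were not collected or
# used here.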