hqfang committed
Commit 8fd5f92 · Parent: 83e5635

update weights from bf16 to fp32

config.json CHANGED
@@ -433,7 +433,7 @@
     }
   },
   "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
+  "torch_dtype": "float32",
   "transformers_version": "4.52.3",
   "use_cache": true,
   "vit_config": {
model-00001-of-00004.safetensors → model-00001-of-00007.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:76281371a7ea8914c059546488e2df506b83f54b50cf78e2ce03fac8e08512cc
-size 4878581216
+oid sha256:d82f13698d52a6282577b6bac6e8159b9e3daa1d02db839dbd2f8f39ffcdfd7b
+size 4978520816
model-00002-of-00004.safetensors → model-00002-of-00007.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:73627dc1647f2bd8c39a4cdf27d5904839d0a7d7f62d6baaf4a0d8f40bf61bee
-size 4932745864
+oid sha256:ec2c55ce78373053f50cfb985e30bf79b7aa1798b2e569c785defed68afbe472
+size 4778633920
model-00003-of-00004.safetensors → model-00003-of-00007.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5edc3157278b7131345aa7cc4c9e302eebf8f6bc32c86271bf0bc3cd9fabf43a
-size 4994552920
+oid sha256:ff4f1f23b18f445d3c0a7425ba624e051113e2c27fc6659a532afcd343d2e0ed
+size 4661160168
model-00004-of-00004.safetensors → model-00004-of-00007.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d6e5bf65225d4e32fca54954d2e6c8b1a8b28eda7799675a747c9b10fcfca83d
-size 1433042592
+oid sha256:a250d74012468aacd85c4ba9402e3c738c71f72c730801b16cd8689a0d151968
+size 4661160192
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8938b6e27976a73b4051dbba5df29d7de86a9170d432421e4f7b03d8c93830d8
+size 4661160192
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a593ab95678081a9c8f40bceee8f92e9a1f35fdd30b42059152a8e0dee2cfde
+size 4997750712
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:990f8fece2113521a241f6b31401564b198f9e285a51f2d584458b286bba1ea7
+size 3739371680
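
The shard totals are consistent with a pure dtype cast: the four bf16 shards sum to 16,238,922,592 bytes (~16.24 GB at 2 bytes per parameter, i.e. roughly 8.1B parameters) and the seven fp32 shards to 32,477,757,680 bytes (~32.48 GB at 4 bytes per parameter), almost exactly double, with the small residual attributable to per-file safetensors headers. A quick check:

```python
# Sanity check on the shard sizes listed above.
bf16 = [4878581216, 4932745864, 4994552920, 1433042592]
fp32 = [4978520816, 4778633920, 4661160168, 4661160192,
        4661160192, 4997750712, 3739371680]
print(sum(bf16))              # 16238922592 bytes (~16.24 GB)
print(sum(fp32))              # 32477757680 bytes (~32.48 GB)
print(sum(fp32) / sum(bf16))  # ~1.99999: doubled, modulo header overhead
```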
model.safetensors.index.json CHANGED
(Diff too large to render; see the raw file.)
 
model.yaml ADDED
@@ -0,0 +1,471 @@
+model_name: molmo
+llm:
+  d_model: 3584
+  n_heads: 28
+  n_kv_heads: 4
+  head_dim: null
+  qkv_bias: true
+  clip_qkv: null
+  n_layers: 28
+  mlp_ratio: 4
+  mlp_hidden_size: 37888
+  activation_type: swiglu
+  block_type: sequential
+  rope: true
+  rope_full_precision: true
+  rope_theta: 1000000.0
+  rope_type: default
+  rope_factor: null
+  rope_high_freq_factor: null
+  rope_low_freq_factor: null
+  rope_original_max_position_embeddings: null
+  attention_type: sdpa
+  float32_attention: true
+  attention_dropout: 0.0
+  attention_layer_norm: false
+  attention_layer_norm_type: olmo
+  residual_dropout: 0.1
+  response_residual_dropout: 0.0
+  layer_norm_type: rms
+  layer_norm_with_affine: true
+  layer_norm_eps: 1.0e-06
+  attention_layer_norm_with_affine: true
+  max_sequence_length: 4096
+  max_position_embeddings: null
+  include_bias: false
+  bias_for_layer_norm: null
+  norm_after: false
+  moe_num_experts: 8
+  moe_top_k: 2
+  moe_mlp_impl: sparse
+  moe_log_expert_assignment: false
+  moe_shared_expert: false
+  moe_lbl_in_fp32: false
+  moe_interleave: false
+  moe_loss_weight: 0.1
+  moe_zloss_weight: null
+  moe_dropless: true
+  moe_capacity_factor: 1.25
+  embedding_dropout: 0.0
+  scale_logits: false
+  vocab_size: 152064
+  additional_vocab_size: 128
+  weight_tying: false
+  embedding_size: 152064
+  use_position_ids: true
+  tokenizer:
+    identifier: Qwen/Qwen2.5-7B
+    tokenizer_dir: null
+    depth_tokens: true
+  init_path: gs://mm-olmo/pretrained_llms/qwen2.5-7b.pt
+  init_incremental: null
+  new_embedding_init_range: 0.02
+  initializer_range: 0.02
+  normalize_input_embeds: false
+  activation_checkpoint: whole_layer
+  compile: blocks
+  fix_pad_tokenizer: false
+  resize_vocab: false
+  init_std: 0.02
+  init_fn: normal
+  init_cutoff_factor: null
+vision_backbone:
+  vit:
+    image_model_type: siglip
+    image_default_input_size:
+    - 378
+    - 378
+    image_patch_size: 14
+    image_pos_patch_size: 14
+    image_emb_dim: 1152
+    image_num_heads: 16
+    image_num_key_value_heads: 16
+    image_num_layers: 27
+    image_head_dim: 72
+    image_mlp_dim: 4304
+    image_mlp_activations: gelu_pytorch_tanh
+    image_dropout_rate: 0.0
+    image_num_pos: 729
+    image_norm_eps: 1.0e-06
+    attention_dropout: 0.0
+    residual_dropout: 0.0
+    initializer_range: 0.02
+    float32_attention: true
+    attention_type: sdpa
+    activation_checkpointing: true
+    init_path: gs://mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt
+    resize_mode: siglip
+    pad_value: 0.0
+    normalize: siglip
+  image_pooling_2d: attention_meanq
+  pooling_attention_mask: false
+  image_projector: mlp
+  image_padding_embed: null
+  vit_layers:
+  - -3
+  - -9
+  skip_unused_layers: true
+  image_feature_dropout: 0.0
+  connector_activation_checkpointing: true
+  compile_vit: blocks
+data_formatter:
+  prompt_templates: uber_model
+  message_format: role
+  system_prompt: demo_or_style
+  always_start_with_space: false
+  default_inference_len: 65
+  select_answer: best
+  debug: false
+  image_last: false
+  format_message_list: null
+  p_one_message: 0.0
+mm_preprocessor:
+  crop_mode: overlap-and-resize-c2
+  max_crops: 8
+  max_images: 1
+  max_multi_image_crops: 8
+  pooling_w: 2
+  pooling_h: 2
+  overlap_margins:
+  - 4
+  - 4
+  use_col_tokens: true
+  loss_token_weighting: root_subsegments
+  legacy_image_mask: false
+  max_answer_len: null
+  img_aug: false
+bi_directional_attn: null
+lora_enable: false
+lora_rank: 64
+lora_alpha: 16
+lora_dropout: 0.05
+lora_bias: none
+n_action_bins: 256
+norm_stats:
+  fractal20220817_data:
+    action:
+      mean:
+      - 0.006987582892179489
+      - 0.006265917327255011
+      - -0.01262515690177679
+      - 0.04333311319351196
+      - -0.005756212864071131
+      - 0.0009130256366916001
+      - 0.5354204773902893
+      std:
+      - 0.0692116990685463
+      - 0.05970962345600128
+      - 0.07353084534406662
+      - 0.15610496699810028
+      - 0.13164450228214264
+      - 0.14593800902366638
+      - 0.497110515832901
+      max:
+      - 2.9984593391418457
+      - 22.09052848815918
+      - 2.7507524490356445
+      - 1.570636510848999
+      - 1.5321086645126343
+      - 1.5691522359848022
+      - 1.0
+      min:
+      - -2.0204520225524902
+      - -5.497899532318115
+      - -2.031663417816162
+      - -1.569917917251587
+      - -1.569892168045044
+      - -1.570419430732727
+      - 0.0
+      q01:
+      - -0.22453527510166169
+      - -0.14820013284683228
+      - -0.231589707583189
+      - -0.3517994859814644
+      - -0.4193011274933815
+      - -0.43643461108207704
+      - 0.0
+      q99:
+      - 0.17824687153100965
+      - 0.14938379630446405
+      - 0.21842354819178575
+      - 0.5892666035890578
+      - 0.35272657424211445
+      - 0.44796681255102094
+      - 1.0
+      mask:
+      - true
+      - true
+      - true
+      - true
+      - true
+      - true
+      - false
+    proprio:
+      mean:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      std:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      max:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      min:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      q01:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      q99:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+    num_transitions: 3786400
+    num_trajectories: 87212
+  bridge_orig:
+    action:
+      mean:
+      - 0.0002334194869035855
+      - 0.00013004911306779832
+      - -0.00012762474943883717
+      - -0.0001556558854645118
+      - -0.0004039328487124294
+      - 0.00023557482927571982
+      - 0.5764579176902771
+      std:
+      - 0.009765930473804474
+      - 0.013689135201275349
+      - 0.012667362578213215
+      - 0.028534092009067535
+      - 0.030637972056865692
+      - 0.07691419124603271
+      - 0.4973701536655426
+      max:
+      - 0.41691166162490845
+      - 0.25864794850349426
+      - 0.21218234300613403
+      - 3.122201919555664
+      - 1.8618112802505493
+      - 6.280478477478027
+      - 1.0
+      min:
+      - -0.4007510244846344
+      - -0.13874775171279907
+      - -0.22553899884223938
+      - -3.2010786533355713
+      - -1.8618112802505493
+      - -6.279075622558594
+      - 0.0
+      q01:
+      - -0.02872725307941437
+      - -0.04170349963009357
+      - -0.026093858778476715
+      - -0.08092105075716972
+      - -0.09288699507713317
+      - -0.20718276381492615
+      - 0.0
+      q99:
+      - 0.028309678435325586
+      - 0.040855254605412394
+      - 0.040161586627364146
+      - 0.08192047759890528
+      - 0.07792850524187081
+      - 0.20382574498653397
+      - 1.0
+      mask:
+      - true
+      - true
+      - true
+      - true
+      - true
+      - true
+      - false
+    proprio:
+      mean:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      std:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      max:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      min:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      q01:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      q99:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+    num_transitions: 2135463
+    num_trajectories: 60064
+  bc_z:
+    action:
+      mean:
+      - -0.009958467446267605
+      - 0.0008958321413956583
+      - 0.004995597992092371
+      - 0.00029755113064311445
+      - -0.008735382929444313
+      - -0.030693737789988518
+      - 0.8344562649726868
+      std:
+      - 0.03053455986082554
+      - 0.0231423731893301
+      - 0.020641816779971123
+      - 0.04155943542718887
+      - 0.046427831053733826
+      - 0.0769818127155304
+      - 0.3610210120677948
+      max:
+      - 0.2165454924106598
+      - 0.1251407265663147
+      - 0.10772687941789627
+      - 0.33544227480888367
+      - 0.28117990493774414
+      - 0.40614867210388184
+      - 1.0
+      min:
+      - -0.1677047461271286
+      - -0.14630407094955444
+      - -0.10066790133714676
+      - -0.29421567916870117
+      - -0.32101404666900635
+      - -0.4635624885559082
+      - 0.0
+      q01:
+      - -0.09220654994249344
+      - -0.06456145539879798
+      - -0.049121275544166565
+      - -0.11594625547528267
+      - -0.14152548640966414
+      - -0.2251061636209488
+      - 0.0
+      q99:
+      - 0.07628866866230968
+      - 0.058019736707210584
+      - 0.052540797740221024
+      - 0.11740604028105736
+      - 0.11703975558280955
+      - 0.16729306846857078
+      - 1.0
+      mask:
+      - true
+      - true
+      - true
+      - true
+      - true
+      - true
+      - false
+    proprio:
+      mean:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      std:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      max:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      min:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      q01:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      q99:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+    num_transitions: 6015535
+    num_trajectories: 43264
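
The norm_stats block above holds per-dataset action-normalization statistics, and n_action_bins: 256 points to discretized action tokens. A common convention in OpenVLA-style VLA pipelines (an assumption here; this commit does not show how the stats are consumed) is to map each action dimension into [-1, 1] using the q01/q99 quantiles, with mask marking which dimensions are normalized (the binary gripper channel is passed through). De-normalization then inverts that map:

```python
import numpy as np

# Hypothetical de-normalization using the bridge_orig stats above.
# Assumes the OpenVLA-style map a_norm = 2*(a - q01)/(q99 - q01) - 1;
# whether this repo uses exactly this convention is an assumption.
q01 = np.array([-0.02872725307941437, -0.04170349963009357,
                -0.026093858778476715, -0.08092105075716972,
                -0.09288699507713317, -0.20718276381492615, 0.0])
q99 = np.array([0.028309678435325586, 0.040855254605412394,
                0.040161586627364146, 0.08192047759890528,
                0.07792850524187081, 0.20382574498653397, 1.0])
mask = np.array([True, True, True, True, True, True, False])

def unnormalize(a_norm: np.ndarray) -> np.ndarray:
    """Map normalized actions in [-1, 1] back to raw dataset units."""
    raw = 0.5 * (a_norm + 1.0) * (q99 - q01) + q01
    # Masked-out dimensions (the gripper) bypass normalization.
    return np.where(mask, raw, a_norm)

print(unnormalize(np.zeros(7)))  # the per-dimension [q01, q99] midpoint
```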