swj0419 committed on
Commit
cd2b813
·
verified ·
1 Parent(s): 824d455

Upload OLMo-2 model checkpoint

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +129 -0
  2. .metadata.json +1 -0
  3. config.json +1 -0
  4. data_paths.txt +2 -0
  5. model_and_optim/.metadata +3 -0
  6. model_and_optim/__0_0.distcp +3 -0
  7. model_and_optim/__0_1.distcp +3 -0
  8. model_and_optim/__0_10.distcp +3 -0
  9. model_and_optim/__0_11.distcp +3 -0
  10. model_and_optim/__0_12.distcp +3 -0
  11. model_and_optim/__0_13.distcp +3 -0
  12. model_and_optim/__0_14.distcp +3 -0
  13. model_and_optim/__0_15.distcp +3 -0
  14. model_and_optim/__0_2.distcp +3 -0
  15. model_and_optim/__0_3.distcp +3 -0
  16. model_and_optim/__0_4.distcp +3 -0
  17. model_and_optim/__0_5.distcp +3 -0
  18. model_and_optim/__0_6.distcp +3 -0
  19. model_and_optim/__0_7.distcp +3 -0
  20. model_and_optim/__0_8.distcp +3 -0
  21. model_and_optim/__0_9.distcp +3 -0
  22. model_and_optim/__1_0.distcp +3 -0
  23. model_and_optim/__1_1.distcp +3 -0
  24. model_and_optim/__1_10.distcp +3 -0
  25. model_and_optim/__1_11.distcp +3 -0
  26. model_and_optim/__1_12.distcp +3 -0
  27. model_and_optim/__1_13.distcp +3 -0
  28. model_and_optim/__1_14.distcp +3 -0
  29. model_and_optim/__1_15.distcp +3 -0
  30. model_and_optim/__1_2.distcp +3 -0
  31. model_and_optim/__1_3.distcp +3 -0
  32. model_and_optim/__1_4.distcp +3 -0
  33. model_and_optim/__1_5.distcp +3 -0
  34. model_and_optim/__1_6.distcp +3 -0
  35. model_and_optim/__1_7.distcp +3 -0
  36. model_and_optim/__1_8.distcp +3 -0
  37. model_and_optim/__1_9.distcp +3 -0
  38. model_and_optim/__2_0.distcp +3 -0
  39. model_and_optim/__2_1.distcp +3 -0
  40. model_and_optim/__2_10.distcp +3 -0
  41. model_and_optim/__2_11.distcp +3 -0
  42. model_and_optim/__2_12.distcp +3 -0
  43. model_and_optim/__2_13.distcp +3 -0
  44. model_and_optim/__2_14.distcp +3 -0
  45. model_and_optim/__2_15.distcp +3 -0
  46. model_and_optim/__2_2.distcp +3 -0
  47. model_and_optim/__2_3.distcp +3 -0
  48. model_and_optim/__2_4.distcp +3 -0
  49. model_and_optim/__2_5.distcp +3 -0
  50. model_and_optim/__2_6.distcp +3 -0
.gitattributes CHANGED
@@ -33,3 +33,132 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model_and_optim/.metadata filter=lfs diff=lfs merge=lfs -text
37
+ model_and_optim/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
38
+ model_and_optim/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
39
+ model_and_optim/__0_10.distcp filter=lfs diff=lfs merge=lfs -text
40
+ model_and_optim/__0_11.distcp filter=lfs diff=lfs merge=lfs -text
41
+ model_and_optim/__0_12.distcp filter=lfs diff=lfs merge=lfs -text
42
+ model_and_optim/__0_13.distcp filter=lfs diff=lfs merge=lfs -text
43
+ model_and_optim/__0_14.distcp filter=lfs diff=lfs merge=lfs -text
44
+ model_and_optim/__0_15.distcp filter=lfs diff=lfs merge=lfs -text
45
+ model_and_optim/__0_2.distcp filter=lfs diff=lfs merge=lfs -text
46
+ model_and_optim/__0_3.distcp filter=lfs diff=lfs merge=lfs -text
47
+ model_and_optim/__0_4.distcp filter=lfs diff=lfs merge=lfs -text
48
+ model_and_optim/__0_5.distcp filter=lfs diff=lfs merge=lfs -text
49
+ model_and_optim/__0_6.distcp filter=lfs diff=lfs merge=lfs -text
50
+ model_and_optim/__0_7.distcp filter=lfs diff=lfs merge=lfs -text
51
+ model_and_optim/__0_8.distcp filter=lfs diff=lfs merge=lfs -text
52
+ model_and_optim/__0_9.distcp filter=lfs diff=lfs merge=lfs -text
53
+ model_and_optim/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
54
+ model_and_optim/__1_1.distcp filter=lfs diff=lfs merge=lfs -text
55
+ model_and_optim/__1_10.distcp filter=lfs diff=lfs merge=lfs -text
56
+ model_and_optim/__1_11.distcp filter=lfs diff=lfs merge=lfs -text
57
+ model_and_optim/__1_12.distcp filter=lfs diff=lfs merge=lfs -text
58
+ model_and_optim/__1_13.distcp filter=lfs diff=lfs merge=lfs -text
59
+ model_and_optim/__1_14.distcp filter=lfs diff=lfs merge=lfs -text
60
+ model_and_optim/__1_15.distcp filter=lfs diff=lfs merge=lfs -text
61
+ model_and_optim/__1_2.distcp filter=lfs diff=lfs merge=lfs -text
62
+ model_and_optim/__1_3.distcp filter=lfs diff=lfs merge=lfs -text
63
+ model_and_optim/__1_4.distcp filter=lfs diff=lfs merge=lfs -text
64
+ model_and_optim/__1_5.distcp filter=lfs diff=lfs merge=lfs -text
65
+ model_and_optim/__1_6.distcp filter=lfs diff=lfs merge=lfs -text
66
+ model_and_optim/__1_7.distcp filter=lfs diff=lfs merge=lfs -text
67
+ model_and_optim/__1_8.distcp filter=lfs diff=lfs merge=lfs -text
68
+ model_and_optim/__1_9.distcp filter=lfs diff=lfs merge=lfs -text
69
+ model_and_optim/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
70
+ model_and_optim/__2_1.distcp filter=lfs diff=lfs merge=lfs -text
71
+ model_and_optim/__2_10.distcp filter=lfs diff=lfs merge=lfs -text
72
+ model_and_optim/__2_11.distcp filter=lfs diff=lfs merge=lfs -text
73
+ model_and_optim/__2_12.distcp filter=lfs diff=lfs merge=lfs -text
74
+ model_and_optim/__2_13.distcp filter=lfs diff=lfs merge=lfs -text
75
+ model_and_optim/__2_14.distcp filter=lfs diff=lfs merge=lfs -text
76
+ model_and_optim/__2_15.distcp filter=lfs diff=lfs merge=lfs -text
77
+ model_and_optim/__2_2.distcp filter=lfs diff=lfs merge=lfs -text
78
+ model_and_optim/__2_3.distcp filter=lfs diff=lfs merge=lfs -text
79
+ model_and_optim/__2_4.distcp filter=lfs diff=lfs merge=lfs -text
80
+ model_and_optim/__2_5.distcp filter=lfs diff=lfs merge=lfs -text
81
+ model_and_optim/__2_6.distcp filter=lfs diff=lfs merge=lfs -text
82
+ model_and_optim/__2_7.distcp filter=lfs diff=lfs merge=lfs -text
83
+ model_and_optim/__2_8.distcp filter=lfs diff=lfs merge=lfs -text
84
+ model_and_optim/__2_9.distcp filter=lfs diff=lfs merge=lfs -text
85
+ model_and_optim/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
86
+ model_and_optim/__3_1.distcp filter=lfs diff=lfs merge=lfs -text
87
+ model_and_optim/__3_10.distcp filter=lfs diff=lfs merge=lfs -text
88
+ model_and_optim/__3_11.distcp filter=lfs diff=lfs merge=lfs -text
89
+ model_and_optim/__3_12.distcp filter=lfs diff=lfs merge=lfs -text
90
+ model_and_optim/__3_13.distcp filter=lfs diff=lfs merge=lfs -text
91
+ model_and_optim/__3_14.distcp filter=lfs diff=lfs merge=lfs -text
92
+ model_and_optim/__3_15.distcp filter=lfs diff=lfs merge=lfs -text
93
+ model_and_optim/__3_2.distcp filter=lfs diff=lfs merge=lfs -text
94
+ model_and_optim/__3_3.distcp filter=lfs diff=lfs merge=lfs -text
95
+ model_and_optim/__3_4.distcp filter=lfs diff=lfs merge=lfs -text
96
+ model_and_optim/__3_5.distcp filter=lfs diff=lfs merge=lfs -text
97
+ model_and_optim/__3_6.distcp filter=lfs diff=lfs merge=lfs -text
98
+ model_and_optim/__3_7.distcp filter=lfs diff=lfs merge=lfs -text
99
+ model_and_optim/__3_8.distcp filter=lfs diff=lfs merge=lfs -text
100
+ model_and_optim/__3_9.distcp filter=lfs diff=lfs merge=lfs -text
101
+ model_and_optim/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
102
+ model_and_optim/__4_1.distcp filter=lfs diff=lfs merge=lfs -text
103
+ model_and_optim/__4_10.distcp filter=lfs diff=lfs merge=lfs -text
104
+ model_and_optim/__4_11.distcp filter=lfs diff=lfs merge=lfs -text
105
+ model_and_optim/__4_12.distcp filter=lfs diff=lfs merge=lfs -text
106
+ model_and_optim/__4_13.distcp filter=lfs diff=lfs merge=lfs -text
107
+ model_and_optim/__4_14.distcp filter=lfs diff=lfs merge=lfs -text
108
+ model_and_optim/__4_15.distcp filter=lfs diff=lfs merge=lfs -text
109
+ model_and_optim/__4_2.distcp filter=lfs diff=lfs merge=lfs -text
110
+ model_and_optim/__4_3.distcp filter=lfs diff=lfs merge=lfs -text
111
+ model_and_optim/__4_4.distcp filter=lfs diff=lfs merge=lfs -text
112
+ model_and_optim/__4_5.distcp filter=lfs diff=lfs merge=lfs -text
113
+ model_and_optim/__4_6.distcp filter=lfs diff=lfs merge=lfs -text
114
+ model_and_optim/__4_7.distcp filter=lfs diff=lfs merge=lfs -text
115
+ model_and_optim/__4_8.distcp filter=lfs diff=lfs merge=lfs -text
116
+ model_and_optim/__4_9.distcp filter=lfs diff=lfs merge=lfs -text
117
+ model_and_optim/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
118
+ model_and_optim/__5_1.distcp filter=lfs diff=lfs merge=lfs -text
119
+ model_and_optim/__5_10.distcp filter=lfs diff=lfs merge=lfs -text
120
+ model_and_optim/__5_11.distcp filter=lfs diff=lfs merge=lfs -text
121
+ model_and_optim/__5_12.distcp filter=lfs diff=lfs merge=lfs -text
122
+ model_and_optim/__5_13.distcp filter=lfs diff=lfs merge=lfs -text
123
+ model_and_optim/__5_14.distcp filter=lfs diff=lfs merge=lfs -text
124
+ model_and_optim/__5_15.distcp filter=lfs diff=lfs merge=lfs -text
125
+ model_and_optim/__5_2.distcp filter=lfs diff=lfs merge=lfs -text
126
+ model_and_optim/__5_3.distcp filter=lfs diff=lfs merge=lfs -text
127
+ model_and_optim/__5_4.distcp filter=lfs diff=lfs merge=lfs -text
128
+ model_and_optim/__5_5.distcp filter=lfs diff=lfs merge=lfs -text
129
+ model_and_optim/__5_6.distcp filter=lfs diff=lfs merge=lfs -text
130
+ model_and_optim/__5_7.distcp filter=lfs diff=lfs merge=lfs -text
131
+ model_and_optim/__5_8.distcp filter=lfs diff=lfs merge=lfs -text
132
+ model_and_optim/__5_9.distcp filter=lfs diff=lfs merge=lfs -text
133
+ model_and_optim/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
134
+ model_and_optim/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
135
+ model_and_optim/__6_10.distcp filter=lfs diff=lfs merge=lfs -text
136
+ model_and_optim/__6_11.distcp filter=lfs diff=lfs merge=lfs -text
137
+ model_and_optim/__6_12.distcp filter=lfs diff=lfs merge=lfs -text
138
+ model_and_optim/__6_13.distcp filter=lfs diff=lfs merge=lfs -text
139
+ model_and_optim/__6_14.distcp filter=lfs diff=lfs merge=lfs -text
140
+ model_and_optim/__6_15.distcp filter=lfs diff=lfs merge=lfs -text
141
+ model_and_optim/__6_2.distcp filter=lfs diff=lfs merge=lfs -text
142
+ model_and_optim/__6_3.distcp filter=lfs diff=lfs merge=lfs -text
143
+ model_and_optim/__6_4.distcp filter=lfs diff=lfs merge=lfs -text
144
+ model_and_optim/__6_5.distcp filter=lfs diff=lfs merge=lfs -text
145
+ model_and_optim/__6_6.distcp filter=lfs diff=lfs merge=lfs -text
146
+ model_and_optim/__6_7.distcp filter=lfs diff=lfs merge=lfs -text
147
+ model_and_optim/__6_8.distcp filter=lfs diff=lfs merge=lfs -text
148
+ model_and_optim/__6_9.distcp filter=lfs diff=lfs merge=lfs -text
149
+ model_and_optim/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
150
+ model_and_optim/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
151
+ model_and_optim/__7_10.distcp filter=lfs diff=lfs merge=lfs -text
152
+ model_and_optim/__7_11.distcp filter=lfs diff=lfs merge=lfs -text
153
+ model_and_optim/__7_12.distcp filter=lfs diff=lfs merge=lfs -text
154
+ model_and_optim/__7_13.distcp filter=lfs diff=lfs merge=lfs -text
155
+ model_and_optim/__7_14.distcp filter=lfs diff=lfs merge=lfs -text
156
+ model_and_optim/__7_15.distcp filter=lfs diff=lfs merge=lfs -text
157
+ model_and_optim/__7_2.distcp filter=lfs diff=lfs merge=lfs -text
158
+ model_and_optim/__7_3.distcp filter=lfs diff=lfs merge=lfs -text
159
+ model_and_optim/__7_4.distcp filter=lfs diff=lfs merge=lfs -text
160
+ model_and_optim/__7_5.distcp filter=lfs diff=lfs merge=lfs -text
161
+ model_and_optim/__7_6.distcp filter=lfs diff=lfs merge=lfs -text
162
+ model_and_optim/__7_7.distcp filter=lfs diff=lfs merge=lfs -text
163
+ model_and_optim/__7_8.distcp filter=lfs diff=lfs merge=lfs -text
164
+ model_and_optim/__7_9.distcp filter=lfs diff=lfs merge=lfs -text
.metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"version": "2.0.0"}
config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"run_name": "caia_olmo2_1b_d3", "launch": {"name": "caia_olmo2_1b_d3-train-95a8ba94", "cmd": ["src/scripts/train/OLMoE-2x1B-anneal.py", "train", "caia_olmo2_1b_d3", "ai2/jupiter-cirrascale-2", "--launch.num_nodes=1", "--launch.workspace=OLMo-modular", "--launch.priority=urgent", "--launch.beaker_image=petew/olmo-core-tch260cu124", "--trainer.callbacks.wandb.enabled=True", "--trainer.callbacks.comet.enabled=False", "--trainer.max_duration.value=200_000_000", "--trainer.max_duration.unit=tokens", "--dataset.mix_base_dir=/weka/oe-training-default/ai2-llm/preprocessed", "--dataset.mix=caia", "--trainer.load_path=/weka/oe-training-default/ai2-llm/checkpoints/swj0419/model/merge/olmo2_1b/moe2_random", "--train_module.dp_config.num_replicas=4", "--train_module.scheduler.warmup_steps=2000", "--train_module.optim.lr=4e-5", "--model.block.feed_forward_moe.router.top_k=2", "--train_module.rank_microbatch_size=4096", "--train_module.ep_config.degree=2"], "budget": "ai2/oe-base", "task_name": "train", "workspace": "OLMo-modular", "setup_steps": ["conda install gh --channel conda-forge", "gh repo clone \"$REPO_URL\" .", "git checkout \"$GIT_REF\"", "git submodule update --init --recursive", "conda shell.bash activate base", "pip install -e '.[dev,beaker,wandb,train]'", "pip freeze", "mkdir -p ~/.aws", "printenv AWS_CONFIG > ~/.aws/config", "printenv AWS_CREDENTIALS > ~/.aws/credentials"], "beaker_image": "petew/olmo-core-tch260cu124", "num_nodes": 1, "num_gpus": 8, "shared_memory": "10GiB", "clusters": ["ai2/jupiter-cirrascale-2"], "shared_filesystem": true, "priority": "urgent", "preemptible": true, "env_vars": [{"name": "NCCL_DEBUG", "value": "WARN", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvVar"}, {"name": "CUDA_LAUNCH_BLOCKING", "value": "0", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvVar"}], "env_secrets": [{"name": "GITHUB_TOKEN", "secret": "weijias_GITHUB_TOKEN", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "BEAKER_TOKEN", "secret": 
"weijias_BEAKER_TOKEN", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "WANDB_API_KEY", "secret": "weijias_WANDB_API_KEY", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "COMET_API_KEY", "secret": "weijias_COMET_API_KEY", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "AWS_CONFIG", "secret": "weijias_AWS_CONFIG", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "AWS_CREDENTIALS", "secret": "weijias_AWS_CREDENTIALS", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "R2_ENDPOINT_URL", "secret": "R2_ENDPOINT_URL", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "WEKA_ENDPOINT_URL", "secret": "WEKA_ENDPOINT_URL", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "SLACK_WEBHOOK_URL", "secret": "SLACK_WEBHOOK_URL", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}], "nfs": false, "weka_buckets": [{"bucket": "oe-training-default", "mount": "/weka/oe-training-default", "_CLASS_": "olmo_core.launch.beaker.BeakerWekaBucket"}], "allow_dirty": false, "_CLASS_": "olmo_core.launch.beaker.BeakerLaunchConfig"}, "model": {"d_model": 2048, "vocab_size": 100352, "n_layers": 16, "block": {"attention": {"name": "default", "n_heads": 16, "bias": false, "rope": {"name": "default", "theta": 500000, "full_precision": true, "_CLASS_": "olmo_core.nn.rope.RoPEConfig"}, "qk_norm": {"name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"}, "use_flash": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.attention.AttentionConfig"}, "layer_norm": {"name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"}, "feed_forward_moe": {"name": "default", "num_experts": 2, "hidden_size": 8192, "capacity_factor": 1.2, "router": {"name": "default", "top_k": 2, "uniform_expert_assignment": false, "_CLASS_": "olmo_core.nn.moe.router.MoERouterConfig"}, "lb_loss_weight": 0.0, 
"z_loss_weight": 0.001, "dtype": "float32", "_CLASS_": "olmo_core.nn.moe.moe.MoEConfig"}, "name": "moe_reordered_norm", "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"}, "lm_head": {"name": "default", "layer_norm": {"name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"}, "bias": false, "dtype": "float32", "loss_implementation": "default", "_CLASS_": "olmo_core.nn.lm_head.LMHeadConfig"}, "name": "moe", "dtype": "float32", "init_method": "normal", "init_seed": 0, "freeze_params": ["embeddings.*", "blocks.*.attention*", "blocks.*.feed_forward_norm.*", "lm_head.*"], "_CLASS_": "olmo_core.nn.transformer.config.TransformerConfig"}, "dataset": {"tokenizer": {"vocab_size": 100278, "eos_token_id": 100257, "pad_token_id": 100277, "identifier": "allenai/dolma2-tokenizer", "_CLASS_": "olmo_core.data.tokenizer.TokenizerConfig"}, "name": "fsl", "sequence_length": 4096, "max_target_sequence_length": 8192, "mix": "caia", "mix_base_dir": "/weka/oe-training-default/ai2-llm", "include_instance_metadata": true, "generate_doc_lengths": false, "expand_glob": false, "work_dir": "/weka/oe-training-default/ai2-llm/checkpoints/weijias/caia_olmo2_1b_d3/dataset-cache", "_CLASS_": "olmo_core.data.numpy_dataset.NumpyDatasetConfig"}, "data_loader": {"global_batch_size": 4194304, "seed": 34521, "num_workers": 4, "_CLASS_": "olmo_core.data.data_loader.NumpyDataLoaderConfig"}, "train_module": {"rank_microbatch_size": 4096, "max_sequence_length": 4096, "optim": {"compile": false, "fixed_fields": ["initial_lr"], "lr": 4e-05, "betas": [0.9, 0.95], "eps": 1e-08, "weight_decay": 0.0, "fused": true, "_CLASS_": "olmo_core.optim.adamw.AdamWConfig"}, "max_grad_norm": 1.0, "scheduler": {"lr_field": "lr", "initial_lr_field": "initial_lr", "warmup_steps": 2000, "alpha_f": 0.1, "warmup_min_lr": 0.0, "_CLASS_": "olmo_core.optim.scheduler.CosWithWarmup"}, "compile_model": true, "dp_config": {"name": "hsdp", "param_dtype": 
"bfloat16", "reduce_dtype": "float32", "num_replicas": 4, "wrapping_strategy": "fine_grained", "prefetch_factor": 0, "_CLASS_": "olmo_core.train.train_module.transformer.TransformerDataParallelConfig"}, "ep_config": {"degree": 2, "_CLASS_": "olmo_core.train.train_module.transformer.TransformerExpertParallelConfig"}, "z_loss_multiplier": 1e-05, "label_ignore_index": -100, "_CLASS_": "olmo_modular.train.train_module.transfomer.FreezeTransformerTrainModuleConfig"}, "trainer": {"save_folder": "/weka/oe-training-default/ai2-llm/checkpoints/weijias/caia_olmo2_1b_d3", "load_path": "/weka/oe-training-default/ai2-llm/checkpoints/swj0419/model/merge/olmo2_1b/moe2_random", "load_strategy": "if_available", "checkpointer": {"pre_download": false, "throttle_uploads": false, "_CLASS_": "olmo_core.train.checkpoint.CheckpointerConfig"}, "save_overwrite": true, "max_duration": {"value": 200000000, "unit": "tokens", "_CLASS_": "olmo_core.train.common.Duration"}, "cancel_check_interval": 1, "metrics_collect_interval": 10, "callbacks": {"downstream_evaluator": {"tasks": ["piqa"], "tokenizer": {"vocab_size": 100278, "eos_token_id": 100257, "pad_token_id": 100277, "identifier": "allenai/dolma2-tokenizer", "_CLASS_": "olmo_core.data.tokenizer.TokenizerConfig"}, "eval_interval": 500, "eval_duration": {"value": 1, "unit": "epochs", "_CLASS_": "olmo_core.train.common.Duration"}, "log_interval": 5, "enabled": true, "_CLASS_": "olmo_modular.eval.evaluator_callback.DownstreamEvaluatorUpdatedCallbackConfig"}, "checkpointer": {"save_interval": 10000, "ephemeral_save_interval": 250, "save_async": true, "remove": "ephemeral_only", "enabled": true, "_CLASS_": "olmo_core.train.callbacks.checkpointer.CheckpointerCallback"}, "comet": {"enabled": false, "name": "caia_olmo2_1b_d3", "project": "OLMo-modular", "workspace": "ai2", "cancel_tags": ["cancel", "canceled", "cancelled"], "cancel_check_interval": 10, "notifications": "none", "failure_tag": "failed", "_CLASS_": 
"olmo_core.train.callbacks.comet.CometCallback"}, "wandb": {"enabled": true, "name": "caia_olmo2_1b_d3", "project": "OLMo-modular", "entity": "ai2-llm", "cancel_tags": ["cancel", "canceled", "cancelled"], "cancel_check_interval": 10, "_CLASS_": "olmo_core.train.callbacks.wandb.WandBCallback"}, "config_saver": {"fname": "config.json", "_CLASS_": "olmo_core.train.callbacks.config_saver.ConfigSaverCallback"}, "profiler": {"skip_first": 0, "wait": 1, "warmup": 5, "active": 3, "repeat": 1, "enabled": false, "_CLASS_": "olmo_core.train.callbacks.profiler.ProfilerCallback"}, "garbage_collector": {"gc_interval": 1000, "enabled": true, "_CLASS_": "olmo_core.train.callbacks.garbage_collector.GarbageCollectorCallback"}, "slack_notifier": {"name": "caia_olmo2_1b_d3", "notifications": "end_only", "enabled": false, "_CLASS_": "olmo_core.train.callbacks.slack_notifier.SlackNotifierCallback"}, "beaker": {"enabled": true, "_CLASS_": "olmo_core.train.callbacks.beaker.BeakerCallback"}, "gpu_monitor": {"_CLASS_": "olmo_core.train.callbacks.gpu_memory_monitor.GPUMemoryMonitorCallback"}, "lm_evaluator": {"eval_dataset": {"tokenizer": {"vocab_size": 100278, "eos_token_id": 100257, "pad_token_id": 100277, "identifier": "allenai/dolma2-tokenizer", "_CLASS_": "olmo_core.data.tokenizer.TokenizerConfig"}, "name": "padded_fsl", "sequence_length": 4096, "mix": "v3-small-ppl-validation", "mix_base_dir": "/weka/oe-training-default/ai2-llm", "include_instance_metadata": true, "generate_doc_lengths": false, "expand_glob": false, "work_dir": "/weka/oe-training-default/ai2-llm/checkpoints/weijias/dataset-cache", "_CLASS_": "olmo_core.data.numpy_dataset.NumpyDatasetConfig"}, "eval_interval": 1000, "eval_duration": {"value": 1, "unit": "epochs", "_CLASS_": "olmo_core.train.common.Duration"}, "log_interval": 5, "enabled": true, "_CLASS_": "olmo_core.train.callbacks.evaluator_callback.LMEvaluatorCallbackConfig"}}, "no_checkpoints": false, "no_evals": false, "_CLASS_": 
"olmo_core.train.config.TrainerConfig"}, "init_seed": 12536, "_CLASS_": "olmo_modular.internal.freeze_experiment.ExperimentConfig"}
data_paths.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ /weka/oe-training-default/ai2-llm/caia_olmo2/combine_tokenized/part-0-00000.npy
2
+ /weka/oe-training-default/ai2-llm/caia_olmo2/combine_tokenized/part-1-00000.npy
model_and_optim/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c25f5639792ad24246cb78b1ba2920fa504ea59100266013bddaeb02692e5869
3
+ size 616662
model_and_optim/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e61e7bb41da152f4f948b5f82b5f92041ee4435e242c7eed7a3996a6b253c417
3
+ size 545314056
model_and_optim/__0_1.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a75c7c4aa1231c20ede8227af30b38a6fe86d2d3103f729520935745ad5da9a1
3
+ size 545314056
model_and_optim/__0_10.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af457da84aafb48dc8b4a798a1893334b446276549c31807d8f0dea7f78b31ca
3
+ size 184666668
model_and_optim/__0_11.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3946ea81f7c32e03f2c8597b052ac370a3b32005032a71fbee38a86accedb767
3
+ size 184666668
model_and_optim/__0_12.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0febd277fa5a17bcb0fa905d5857e7791b1cb73adb483371e2bbb2cd311957a6
3
+ size 184666668
model_and_optim/__0_13.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec449b3c76a0b6292152d7d3d450a9be541240bfbbac3e55bc599ffe47d44f8b
3
+ size 184666668
model_and_optim/__0_14.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fdcd263934ff2e9860dce95862a41df865989b914a3f05a449f2d9b3c774b81
3
+ size 235006576
model_and_optim/__0_15.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a84ae79e526b03fffe0b28886e07f38e1be925dd487a6ba03179461a00567da4
3
+ size 235006576
model_and_optim/__0_2.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a50244048048de753ab58f829e8b366ba5d8a8cbc0d535dbc430851c95d97f6
3
+ size 184660144
model_and_optim/__0_3.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38a3e7ae6e4015d4f157bdbe2babe2fa9c296a2e9a5970919994235734c23b5b
3
+ size 184660144
model_and_optim/__0_4.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7caa231a1072b75e5e34c631e1e160e7a77b8c7660c17b87edfeb5edb86d0ed
3
+ size 184660144
model_and_optim/__0_5.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20598cf5e65d5cfe49aa39708d0245a856e41ac18cfb2a1c95b655ffd3a78c52
3
+ size 184660144
model_and_optim/__0_6.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0a5e29880353df214007ef10ffd4146ac4a6f84117feaf22d169f7508dd80f5
3
+ size 184660144
model_and_optim/__0_7.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0acf275433ddda4c1dc73a8fbf3cb19b6a2c5e40093d27efad768532f3c8ddf3
3
+ size 184660144
model_and_optim/__0_8.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:246dcfaf8a4ac8c8209a04b7fb625157fc26cd9f89d2791b63841687e92a0151
3
+ size 184661324
model_and_optim/__0_9.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73ab491220f462d7171250d3ff9532ec073e7512941f66927ce4f18ee6df8330
3
+ size 184667848
model_and_optim/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:727a8e4eb5e0548618ba92c313fb5b16a35ebb20477516b5cbd8e7d7e7d7de10
3
+ size 545276040
model_and_optim/__1_1.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1d910b74fba7d81ddd887b4ef43e430a69f236e7d9a40415b07484668ce5d39
3
+ size 545276040
model_and_optim/__1_10.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb0f518e97d119a9c0bde3c96bcfef8d807e27830bcd3402c5454f4f5523c858
3
+ size 184618032
model_and_optim/__1_11.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc928c2e4ca56cdbcf80ac36da987fa719ae5da64760fb31c3ff135b8241b44b
3
+ size 184618032
model_and_optim/__1_12.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73f1f684e4e1c957c70ecfa7411ec25de96d85f2ac9d4a7945f696a8c4775c2e
3
+ size 184618032
model_and_optim/__1_13.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2244bc3d52fec82f8229ff9f45cd29c42c97eab40d94ae59a6041da518afa310
3
+ size 184618032
model_and_optim/__1_14.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3bc1fab016d42eb40630d23e3e3ba3b5b4948410f46445ec22294287c7f07e3
3
+ size 234957940
model_and_optim/__1_15.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d2e18d5bd5832c507789e8771ffff5b91e181efecd5c855c22a7868d96bad97
3
+ size 234957940
model_and_optim/__1_2.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34e289e5dca974468e39067b6fc97e698ba9441cc500537d0fa9b77b01bfec86
3
+ size 184622128
model_and_optim/__1_3.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f26a272b661a76c8837848843031cd81bcfd17494dadf86d7ba5de5382c7ece
3
+ size 184622128
model_and_optim/__1_4.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:019559b5433399fd8108a958951bdd315492e3b956e5e4b2023bc05eb8bcb9d8
3
+ size 184622128
model_and_optim/__1_5.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:683a1a58a844675429ff1de10aaaa75a2c79b7e2c53198329d172aa5309a6cf2
3
+ size 184622128
model_and_optim/__1_6.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09b6e0f15b468a7acfb5c0a06720d53997dfce700c1b91d0488d1cc43885a503
3
+ size 184622128
model_and_optim/__1_7.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cf048ff61144716923c36c9a69d7abcbea3733c6e1cb500f9e89238d8f974ec
3
+ size 184622128
model_and_optim/__1_8.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e3bb846150594185392d790dfbdeb61d40011a304818aa9e7a610f933b3877b
3
+ size 184623308
model_and_optim/__1_9.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:595cdfc5038ec6b9754ebca4f704559680391ab60a409779a1d176504f4330f4
3
+ size 184618032
model_and_optim/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7328b4eb7fdc6050c7b37ab651d85f735d4315aa1b5f2a9c35ae7db9edb88f38
3
+ size 151005564
model_and_optim/__2_1.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e3f749f7b34918f8156c0c503cf23a71b7502bfc1d89d351d99a696948a0dc1
3
+ size 151005564
model_and_optim/__2_10.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d564b0fb22af7caf17399fe6a9a64f2ac19efdf2655af10f593c6dc066189b71
3
+ size 151005564
model_and_optim/__2_11.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d6b2ad31fd84ac2905aa293c33467f57ae7c531d6eb1de751c9e146d75d0b04
3
+ size 151005564
model_and_optim/__2_12.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:395aa7f87518b3fe6cf251917cc0001f0e4d83948f370ba7ec4198ccfff04d1f
3
+ size 151005564
model_and_optim/__2_13.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a740ef150903500ca40e37e57521dd1fad4b76b0601f51bda4ce4562e1c3dee2
3
+ size 151005564
model_and_optim/__2_14.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32444cc5580db282db3ca17eceaa46cff65a943d31396ebeaafbf47b7fa9c308
3
+ size 151005564
model_and_optim/__2_15.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1706d91f254a427d5e635aa9749eceb50ff41db3f27c4109e0e5cc48bb6386b6
3
+ size 151005564
model_and_optim/__2_2.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7250b55553443793b734bdcf443da683be2b8e42c00c9fe407e45c507c82f642
3
+ size 151005564
model_and_optim/__2_3.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b44ad99fa2f99a8a6838c085c19b5bef7a3b723cd80c35ee764b09db7bb9ff17
3
+ size 151005564
model_and_optim/__2_4.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0614ca129c14258e6d73ecd31faf26fb4b6ecf8e4d1fbd0eed63a0c8f5e31962
3
+ size 151005564
model_and_optim/__2_5.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:406971458d744bfe10bed6068409a36c87242c2e30c64552dee52098739a7bd6
3
+ size 151005564
model_and_optim/__2_6.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90d551a440c35b04a61c20f4aac8983b567f86215ed7e05905dabfc35c74f599
3
+ size 151005564