swj0419 committed on Sep 25

Commit

3ec3d33

verified ·

1 Parent(s): 4d0d59f

Upload OLMo-2 model checkpoint

Browse files

Files changed (20) hide show

.gitattributes +16 -0
config.json +1 -0
config.json~ +1 -0
model_and_optim/.metadata +0 -0
model_and_optim/__0_0.distcp +3 -0
model_and_optim/__0_1.distcp +3 -0
model_and_optim/__0_10.distcp +3 -0
model_and_optim/__0_11.distcp +3 -0
model_and_optim/__0_12.distcp +3 -0
model_and_optim/__0_13.distcp +3 -0
model_and_optim/__0_14.distcp +3 -0
model_and_optim/__0_15.distcp +3 -0
model_and_optim/__0_2.distcp +3 -0
model_and_optim/__0_3.distcp +3 -0
model_and_optim/__0_4.distcp +3 -0
model_and_optim/__0_5.distcp +3 -0
model_and_optim/__0_6.distcp +3 -0
model_and_optim/__0_7.distcp +3 -0
model_and_optim/__0_8.distcp +3 -0
model_and_optim/__0_9.distcp +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,19 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model_and_optim/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_and_optim/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_and_optim/__0_10.distcp filter=lfs diff=lfs merge=lfs -text
+model_and_optim/__0_11.distcp filter=lfs diff=lfs merge=lfs -text
+model_and_optim/__0_12.distcp filter=lfs diff=lfs merge=lfs -text
+model_and_optim/__0_13.distcp filter=lfs diff=lfs merge=lfs -text
+model_and_optim/__0_14.distcp filter=lfs diff=lfs merge=lfs -text
+model_and_optim/__0_15.distcp filter=lfs diff=lfs merge=lfs -text
+model_and_optim/__0_2.distcp filter=lfs diff=lfs merge=lfs -text
+model_and_optim/__0_3.distcp filter=lfs diff=lfs merge=lfs -text
+model_and_optim/__0_4.distcp filter=lfs diff=lfs merge=lfs -text
+model_and_optim/__0_5.distcp filter=lfs diff=lfs merge=lfs -text
+model_and_optim/__0_6.distcp filter=lfs diff=lfs merge=lfs -text
+model_and_optim/__0_7.distcp filter=lfs diff=lfs merge=lfs -text
+model_and_optim/__0_8.distcp filter=lfs diff=lfs merge=lfs -text
+model_and_optim/__0_9.distcp filter=lfs diff=lfs merge=lfs -text

config.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"run_name": "OLMo2-7B-anneal-from-stage1-no-math-flan", "launch": {"name": "OLMo2-7B-anneal-from-stage1-no-math-flan-train-3ce51db1", "cmd": ["src/scripts/train/OLMo2-7B-linear-decay.py", "train", "OLMo2-7B-anneal-from-stage1-no-math-flan", "/weka/oe-training-default/ai2-llm/checkpoints/akshitab/OLMo2-7B-stage1-step928646", "ai2/jupiter-cirrascale-2", "--launch.num_nodes=8", "--launch.workspace=OLMo-modular", "--launch.beaker_image=petew/olmo-core-tch260cu124", "--launch.priority=high", "--trainer.callbacks.wandb.enabled=True", "--trainer.callbacks.comet.enabled=False", "--trainer.callbacks.lm_evaluator.enabled=False", "--trainer.callbacks.downstream_evaluator.enabled=True", "--trainer.max_duration.value=50000000000", "--trainer.max_duration.unit=tokens", "--dataset.mix=dolmino_minus_math_flan", "--dataset.mix_base_dir=/weka/oe-training-default/ai2-llm/", "--train_module.float8_config.enabled=true", "--train_module.optim.lr=0.000061499", "--train_module.scheduler.warmup_steps=0"], "budget": "ai2/oe-training", "task_name": "train", "workspace": "OLMo-modular", "setup_steps": ["conda install gh --channel conda-forge", "gh repo clone \"$REPO_URL\" .", "git checkout \"$GIT_REF\"", "git submodule update --init --recursive", "conda shell.bash activate base", "pip install -e '.[dev,beaker,wandb,train]'", "pip freeze", "mkdir -p ~/.aws", "printenv AWS_CONFIG > ~/.aws/config", "printenv AWS_CREDENTIALS > ~/.aws/credentials"], "beaker_image": "petew/olmo-core-tch260cu124", "num_nodes": 8, "num_gpus": 8, "shared_memory": "10GiB", "clusters": ["ai2/jupiter-cirrascale-2"], "shared_filesystem": true, "priority": "high", "preemptible": true, "env_vars": [{"name": "NCCL_DEBUG", "value": "WARN", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvVar"}, {"name": "CUDA_LAUNCH_BLOCKING", "value": "0", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvVar"}], "env_secrets": [{"name": "GITHUB_TOKEN", "secret": "akshitab_GITHUB_TOKEN", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, 
{"name": "BEAKER_TOKEN", "secret": "akshitab_BEAKER_TOKEN", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "WANDB_API_KEY", "secret": "akshitab_WANDB_API_KEY", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "COMET_API_KEY", "secret": "akshitab_COMET_API_KEY", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "AWS_CONFIG", "secret": "akshitab_AWS_CONFIG", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "AWS_CREDENTIALS", "secret": "akshitab_AWS_CREDENTIALS", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "R2_ENDPOINT_URL", "secret": "R2_ENDPOINT_URL", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "WEKA_ENDPOINT_URL", "secret": "WEKA_ENDPOINT_URL", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "SLACK_WEBHOOK_URL", "secret": "SLACK_WEBHOOK_URL", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}], "nfs": false, "weka_buckets": [{"bucket": "oe-training-default", "mount": "/weka/oe-training-default", "_CLASS_": "olmo_core.launch.beaker.BeakerWekaBucket"}], "allow_dirty": false, "_CLASS_": "olmo_core.launch.beaker.BeakerLaunchConfig"}, "model": {"d_model": 2048, "vocab_size": 100352, "n_layers": 16, "block": {"attention": {"name": "default", "n_heads": 16, "bias": false, "rope": {"name": "default", "theta": 500000, "full_precision": true, "_CLASS_": "olmo_core.nn.rope.RoPEConfig"}, "qk_norm": {"name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"}, "use_flash": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.attention.AttentionConfig"}, "layer_norm": {"name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"}, "feed_forward": {"hidden_size": 8192, "name": "default", "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"}, "name": "reordered_norm", "_CLASS_": 
"olmo_core.nn.transformer.config.TransformerBlockConfig"}, "lm_head": {"name": "default", "layer_norm": {"name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"}, "bias": false, "dtype": "float32", "loss_implementation": "default", "_CLASS_": "olmo_core.nn.lm_head.LMHeadConfig"}, "name": "default", "dtype": "float32", "init_method": "normal", "init_seed": 0, "_CLASS_": "olmo_core.nn.transformer.config.TransformerConfig"}, "dataset": {"tokenizer": {"vocab_size": 100278, "eos_token_id": 100257, "pad_token_id": 100277, "identifier": "allenai/dolma2-tokenizer", "_CLASS_": "olmo_core.data.tokenizer.TokenizerConfig"}, "name": "fsl", "sequence_length": 4096, "max_target_sequence_length": 8192, "mix": "dolmino_minus_math_flan", "mix_base_dir": "/weka/oe-training-default/ai2-llm/", "include_instance_metadata": true, "generate_doc_lengths": false, "expand_glob": false, "work_dir": "/weka/oe-training-default/ai2-llm/checkpoints/akshitab/OLMo2-7B-anneal-from-stage1-no-math-flan/dataset-cache", "_CLASS_": "olmo_core.data.numpy_dataset.NumpyDatasetConfig"}, "data_loader": {"global_batch_size": 4194304, "seed": 34521, "num_workers": 4, "_CLASS_": "olmo_core.data.data_loader.NumpyDataLoaderConfig"}, "train_module": {"rank_microbatch_size": 8192, "max_sequence_length": 4096, "optim": {"group_overrides": [{"params": ["embeddings.weight"], "opts": {"weight_decay": 0.0}, "_CLASS_": "olmo_core.optim.config.OptimGroupOverride"}], "compile": false, "fixed_fields": ["initial_lr"], "lr": 6.1499e-05, "betas": [0.9, 0.95], "eps": 1e-08, "weight_decay": 0.1, "fused": true, "_CLASS_": "olmo_core.optim.adamw.AdamWConfig"}, "max_grad_norm": 1.0, "scheduler": {"lr_field": "lr", "initial_lr_field": "initial_lr", "alpha_f": 0.1, "warmup_steps": 0, "warmup_min_lr": 0.0, "_CLASS_": "olmo_core.optim.scheduler.LinearWithWarmup"}, "compile_model": true, "float8_config": {"ao": {"enable_fsdp_float8_all_gather": true, 
"force_recompute_fp8_weight_in_bwd": true, "round_scales_to_power_of_2": true, "_CLASS_": "olmo_core.float8.ao.AOFloat8LinearConfig"}, "enabled": true, "_CLASS_": "olmo_core.float8.Float8Config"}, "dp_config": {"name": "hsdp", "param_dtype": "bfloat16", "reduce_dtype": "float32", "wrapping_strategy": "blocks", "prefetch_factor": 0, "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerDataParallelConfig"}, "z_loss_multiplier": 1e-05, "state_dict_save_opts": {"flatten_optimizer_state_dict": true}, "label_ignore_index": -100, "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerTrainModuleConfig"}, "trainer": {"save_folder": "/weka/oe-training-default/ai2-llm/checkpoints/akshitab/OLMo2-7B-anneal-from-stage1-no-math-flan", "load_strategy": "if_available", "checkpointer": {"pre_download": false, "throttle_uploads": false, "_CLASS_": "olmo_core.train.checkpoint.CheckpointerConfig"}, "save_overwrite": true, "max_duration": {"value": 50000000000, "unit": "tokens", "_CLASS_": "olmo_core.train.common.Duration"}, "cancel_check_interval": 1, "metrics_collect_interval": 10, "callbacks": {"downstream_evaluator": {"tasks": ["piqa", "hellaswag", "winogrande", "openbook_qa", "boolq", "sciq", "xsum", "wildbench_math", "wildbench_reasoning", "wildbench_coding_debugging", "wildbench_creative_writing", "mmlu_stem_val_rc_5shot", "mmlu_humanities_val_rc_5shot", "mmlu_social_sciences_val_rc_5shot", "mmlu_other_val_rc_5shot"], "tokenizer": {"vocab_size": 100278, "eos_token_id": 100257, "pad_token_id": 100277, "identifier": "allenai/dolma2-tokenizer", "_CLASS_": "olmo_core.data.tokenizer.TokenizerConfig"}, "eval_interval": 1000, "eval_duration": {"value": 1, "unit": "epochs", "_CLASS_": "olmo_core.train.common.Duration"}, "log_interval": 5, "enabled": true, "_CLASS_": "olmo_modular.eval.evaluator_callback.DownstreamEvaluatorUpdatedCallbackConfig"}, "checkpointer": {"save_interval": 10000, "ephemeral_save_interval": 250, "save_async": true, "remove": 
"ephemeral_only", "enabled": true, "_CLASS_": "olmo_core.train.callbacks.checkpointer.CheckpointerCallback"}, "comet": {"enabled": false, "name": "OLMo2-7B-anneal-from-stage1-no-math-flan", "project": "OLMo-modular", "workspace": "ai2", "cancel_tags": ["cancel", "canceled", "cancelled"], "cancel_check_interval": 10, "notifications": "none", "failure_tag": "failed", "auto_resume": false, "_CLASS_": "olmo_core.train.callbacks.comet.CometCallback"}, "wandb": {"enabled": true, "name": "OLMo2-7B-anneal-from-stage1-no-math-flan", "project": "OLMo-modular", "entity": "ai2-llm", "cancel_tags": ["cancel", "canceled", "cancelled"], "cancel_check_interval": 10, "_CLASS_": "olmo_core.train.callbacks.wandb.WandBCallback"}, "config_saver": {"fname": "config.json", "_CLASS_": "olmo_core.train.callbacks.config_saver.ConfigSaverCallback"}, "profiler": {"skip_first": 0, "wait": 1, "warmup": 5, "active": 3, "repeat": 1, "enabled": false, "_CLASS_": "olmo_core.train.callbacks.profiler.ProfilerCallback"}, "garbage_collector": {"gc_interval": 1000, "enabled": true, "_CLASS_": "olmo_core.train.callbacks.garbage_collector.GarbageCollectorCallback"}, "slack_notifier": {"name": "OLMo2-7B-anneal-from-stage1-no-math-flan", "notifications": "end_only", "enabled": false, "_CLASS_": "olmo_core.train.callbacks.slack_notifier.SlackNotifierCallback"}, "beaker": {"enabled": true, "_CLASS_": "olmo_core.train.callbacks.beaker.BeakerCallback"}, "gpu_monitor": {"_CLASS_": "olmo_core.train.callbacks.gpu_memory_monitor.GPUMemoryMonitorCallback"}, "lm_evaluator": {"eval_dataset": {"tokenizer": {"vocab_size": 100278, "eos_token_id": 100257, "pad_token_id": 100277, "identifier": "allenai/dolma2-tokenizer", "_CLASS_": "olmo_core.data.tokenizer.TokenizerConfig"}, "name": "padded_fsl", "sequence_length": 4096, "mix": "v3-small-ppl-validation", "mix_base_dir": "/weka/oe-training-default/ai2-llm", "include_instance_metadata": true, "generate_doc_lengths": false, "expand_glob": false, "work_dir": 
"/weka/oe-training-default/ai2-llm/checkpoints/akshitab/dataset-cache", "_CLASS_": "olmo_core.data.numpy_dataset.NumpyDatasetConfig"}, "eval_interval": 1000, "eval_duration": {"value": 1, "unit": "epochs", "_CLASS_": "olmo_core.train.common.Duration"}, "log_interval": 5, "enabled": false, "_CLASS_": "olmo_core.train.callbacks.evaluator_callback.LMEvaluatorCallbackConfig"}}, "no_checkpoints": false, "no_evals": false, "_CLASS_": "olmo_core.train.config.TrainerConfig"}, "init_seed": 12536, "_CLASS_": "olmo_modular.internal.experiment.ExperimentConfig"}

config.json~ ADDED Viewed

	@@ -0,0 +1 @@

+ {"run_name": "OLMo2-7B-anneal-from-stage1-no-math-flan", "launch": {"name": "OLMo2-7B-anneal-from-stage1-no-math-flan-train-3ce51db1", "cmd": ["src/scripts/train/OLMo2-7B-linear-decay.py", "train", "OLMo2-7B-anneal-from-stage1-no-math-flan", "/weka/oe-training-default/ai2-llm/checkpoints/akshitab/OLMo2-7B-stage1-step928646", "ai2/jupiter-cirrascale-2", "--launch.num_nodes=8", "--launch.workspace=OLMo-modular", "--launch.beaker_image=petew/olmo-core-tch260cu124", "--launch.priority=high", "--trainer.callbacks.wandb.enabled=True", "--trainer.callbacks.comet.enabled=False", "--trainer.callbacks.lm_evaluator.enabled=False", "--trainer.callbacks.downstream_evaluator.enabled=True", "--trainer.max_duration.value=50000000000", "--trainer.max_duration.unit=tokens", "--dataset.mix=dolmino_minus_math_flan", "--dataset.mix_base_dir=/weka/oe-training-default/ai2-llm/", "--train_module.float8_config.enabled=true", "--train_module.optim.lr=0.000061499", "--train_module.scheduler.warmup_steps=0"], "budget": "ai2/oe-training", "task_name": "train", "workspace": "OLMo-modular", "setup_steps": ["conda install gh --channel conda-forge", "gh repo clone \"$REPO_URL\" .", "git checkout \"$GIT_REF\"", "git submodule update --init --recursive", "conda shell.bash activate base", "pip install -e '.[dev,beaker,wandb,train]'", "pip freeze", "mkdir -p ~/.aws", "printenv AWS_CONFIG > ~/.aws/config", "printenv AWS_CREDENTIALS > ~/.aws/credentials"], "beaker_image": "petew/olmo-core-tch260cu124", "num_nodes": 8, "num_gpus": 8, "shared_memory": "10GiB", "clusters": ["ai2/jupiter-cirrascale-2"], "shared_filesystem": true, "priority": "high", "preemptible": true, "env_vars": [{"name": "NCCL_DEBUG", "value": "WARN", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvVar"}, {"name": "CUDA_LAUNCH_BLOCKING", "value": "0", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvVar"}], "env_secrets": [{"name": "GITHUB_TOKEN", "secret": "akshitab_GITHUB_TOKEN", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, 
{"name": "BEAKER_TOKEN", "secret": "akshitab_BEAKER_TOKEN", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "WANDB_API_KEY", "secret": "akshitab_WANDB_API_KEY", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "COMET_API_KEY", "secret": "akshitab_COMET_API_KEY", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "AWS_CONFIG", "secret": "akshitab_AWS_CONFIG", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "AWS_CREDENTIALS", "secret": "akshitab_AWS_CREDENTIALS", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "R2_ENDPOINT_URL", "secret": "R2_ENDPOINT_URL", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "WEKA_ENDPOINT_URL", "secret": "WEKA_ENDPOINT_URL", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "SLACK_WEBHOOK_URL", "secret": "SLACK_WEBHOOK_URL", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}], "nfs": false, "weka_buckets": [{"bucket": "oe-training-default", "mount": "/weka/oe-training-default", "_CLASS_": "olmo_core.launch.beaker.BeakerWekaBucket"}], "allow_dirty": false, "_CLASS_": "olmo_core.launch.beaker.BeakerLaunchConfig"}, "model": {"d_model": 2048, "vocab_size": 100352, "n_layers": 16, "block": {"attention": {"name": "default", "n_heads": 16, "bias": false, "rope": {"name": "default", "theta": 500000, "full_precision": true, "_CLASS_": "olmo_core.nn.rope.RoPEConfig"}, "qk_norm": {"name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"}, "use_flash": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.attention.AttentionConfig"}, "layer_norm": {"name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"}, "feed_forward": {"hidden_size": 8192, "name": "default", "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"}, "name": "reordered_norm", "_CLASS_": 
"olmo_core.nn.transformer.config.TransformerBlockConfig"}, "lm_head": {"name": "default", "layer_norm": {"name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"}, "bias": false, "dtype": "float32", "loss_implementation": "default", "_CLASS_": "olmo_core.nn.lm_head.LMHeadConfig"}, "name": "default", "dtype": "float32", "init_method": "normal", "init_seed": 0, "_CLASS_": "olmo_core.nn.transformer.config.TransformerConfig"}, "dataset": {"tokenizer": {"vocab_size": 100278, "eos_token_id": 100257, "pad_token_id": 100277, "identifier": "allenai/dolma2-tokenizer", "_CLASS_": "olmo_core.data.tokenizer.TokenizerConfig"}, "name": "fsl", "sequence_length": 4096, "max_target_sequence_length": 8192, "mix": "dolmino_minus_math_flan", "mix_base_dir": "/weka/oe-training-default/ai2-llm/", "include_instance_metadata": true, "generate_doc_lengths": false, "expand_glob": false, "work_dir": "/weka/oe-training-default/ai2-llm/checkpoints/akshitab/OLMo2-7B-anneal-from-stage1-no-math-flan/dataset-cache", "_CLASS_": "olmo_core.data.numpy_dataset.NumpyDatasetConfig"}, "data_loader": {"global_batch_size": 4194304, "seed": 34521, "num_workers": 4, "_CLASS_": "olmo_core.data.data_loader.NumpyDataLoaderConfig"}, "train_module": {"rank_microbatch_size": 8192, "max_sequence_length": 4096, "optim": {"group_overrides": [{"params": ["embeddings.weight"], "opts": {"weight_decay": 0.0}, "_CLASS_": "olmo_core.optim.config.OptimGroupOverride"}], "compile": false, "fixed_fields": ["initial_lr"], "lr": 6.1499e-05, "betas": [0.9, 0.95], "eps": 1e-08, "weight_decay": 0.1, "fused": true, "_CLASS_": "olmo_core.optim.adamw.AdamWConfig"}, "max_grad_norm": 1.0, "scheduler": {"lr_field": "lr", "initial_lr_field": "initial_lr", "alpha_f": 0.1, "warmup_steps": 0, "warmup_min_lr": 0.0, "_CLASS_": "olmo_core.optim.scheduler.LinearWithWarmup"}, "compile_model": true, "float8_config": {"ao": {"enable_fsdp_float8_all_gather": true, 
"force_recompute_fp8_weight_in_bwd": true, "round_scales_to_power_of_2": true, "_CLASS_": "olmo_core.float8.ao.AOFloat8LinearConfig"}, "enabled": true, "_CLASS_": "olmo_core.float8.Float8Config"}, "dp_config": {"name": "hsdp", "param_dtype": "bfloat16", "reduce_dtype": "float32", "wrapping_strategy": "blocks", "prefetch_factor": 0, "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerDataParallelConfig"}, "z_loss_multiplier": 1e-05, "state_dict_save_opts": {"flatten_optimizer_state_dict": true}, "label_ignore_index": -100, "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerTrainModuleConfig"}, "trainer": {"save_folder": "/weka/oe-training-default/ai2-llm/checkpoints/akshitab/OLMo2-7B-anneal-from-stage1-no-math-flan", "load_strategy": "if_available", "checkpointer": {"pre_download": false, "throttle_uploads": false, "_CLASS_": "olmo_core.train.checkpoint.CheckpointerConfig"}, "save_overwrite": true, "max_duration": {"value": 50000000000, "unit": "tokens", "_CLASS_": "olmo_core.train.common.Duration"}, "cancel_check_interval": 1, "metrics_collect_interval": 10, "callbacks": {"downstream_evaluator": {"tasks": ["piqa", "hellaswag", "winogrande", "openbook_qa", "boolq", "sciq", "xsum", "wildbench_math", "wildbench_reasoning", "wildbench_coding_debugging", "wildbench_creative_writing", "mmlu_stem_val_rc_5shot", "mmlu_humanities_val_rc_5shot", "mmlu_social_sciences_val_rc_5shot", "mmlu_other_val_rc_5shot"], "tokenizer": {"vocab_size": 100278, "eos_token_id": 100257, "pad_token_id": 100277, "identifier": "allenai/dolma2-tokenizer", "_CLASS_": "olmo_core.data.tokenizer.TokenizerConfig"}, "eval_interval": 1000, "eval_duration": {"value": 1, "unit": "epochs", "_CLASS_": "olmo_core.train.common.Duration"}, "log_interval": 5, "enabled": true, "_CLASS_": "olmo_modular.eval.evaluator_callback.DownstreamEvaluatorUpdatedCallbackConfig"}, "checkpointer": {"save_interval": 10000, "ephemeral_save_interval": 250, "save_async": true, "remove": 
"ephemeral_only", "enabled": true, "_CLASS_": "olmo_core.train.callbacks.checkpointer.CheckpointerCallback"}, "comet": {"enabled": false, "name": "OLMo2-7B-anneal-from-stage1-no-math-flan", "project": "OLMo-modular", "workspace": "ai2", "cancel_tags": ["cancel", "canceled", "cancelled"], "cancel_check_interval": 10, "notifications": "none", "failure_tag": "failed", "auto_resume": false, "_CLASS_": "olmo_core.train.callbacks.comet.CometCallback"}, "wandb": {"enabled": true, "name": "OLMo2-7B-anneal-from-stage1-no-math-flan", "project": "OLMo-modular", "entity": "ai2-llm", "cancel_tags": ["cancel", "canceled", "cancelled"], "cancel_check_interval": 10, "_CLASS_": "olmo_core.train.callbacks.wandb.WandBCallback"}, "config_saver": {"fname": "config.json", "_CLASS_": "olmo_core.train.callbacks.config_saver.ConfigSaverCallback"}, "profiler": {"skip_first": 0, "wait": 1, "warmup": 5, "active": 3, "repeat": 1, "enabled": false, "_CLASS_": "olmo_core.train.callbacks.profiler.ProfilerCallback"}, "garbage_collector": {"gc_interval": 1000, "enabled": true, "_CLASS_": "olmo_core.train.callbacks.garbage_collector.GarbageCollectorCallback"}, "slack_notifier": {"name": "OLMo2-7B-anneal-from-stage1-no-math-flan", "notifications": "end_only", "enabled": false, "_CLASS_": "olmo_core.train.callbacks.slack_notifier.SlackNotifierCallback"}, "beaker": {"enabled": true, "_CLASS_": "olmo_core.train.callbacks.beaker.BeakerCallback"}, "gpu_monitor": {"_CLASS_": "olmo_core.train.callbacks.gpu_memory_monitor.GPUMemoryMonitorCallback"}, "lm_evaluator": {"eval_dataset": {"tokenizer": {"vocab_size": 100278, "eos_token_id": 100257, "pad_token_id": 100277, "identifier": "allenai/dolma2-tokenizer", "_CLASS_": "olmo_core.data.tokenizer.TokenizerConfig"}, "name": "padded_fsl", "sequence_length": 4096, "mix": "v3-small-ppl-validation", "mix_base_dir": "/weka/oe-training-default/ai2-llm", "include_instance_metadata": true, "generate_doc_lengths": false, "expand_glob": false, "work_dir": 
"/weka/oe-training-default/ai2-llm/checkpoints/akshitab/dataset-cache", "_CLASS_": "olmo_core.data.numpy_dataset.NumpyDatasetConfig"}, "eval_interval": 1000, "eval_duration": {"value": 1, "unit": "epochs", "_CLASS_": "olmo_core.train.common.Duration"}, "log_interval": 5, "enabled": false, "_CLASS_": "olmo_core.train.callbacks.evaluator_callback.LMEvaluatorCallbackConfig"}}, "no_checkpoints": false, "no_evals": false, "_CLASS_": "olmo_core.train.config.TrainerConfig"}, "init_seed": 12536, "_CLASS_": "olmo_modular.internal.experiment.ExperimentConfig"}

model_and_optim/.metadata ADDED Viewed

Binary file (53.3 kB). View file

model_and_optim/__0_0.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:44b326bd416f80165e0de2894125027fc5f86cd89457a563ba2f82c987720067
+size 822084764

model_and_optim/__0_1.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a6332ca9ab37c5397a85bd02fbd4b0d46a1c0cecf7e6b32611e13cab755dded
+size 822084764

model_and_optim/__0_10.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:583e94872dda8422b6bf38ff472c7bc1272e07254fbcff1551e98ac076466514
+size 335675596

model_and_optim/__0_11.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6cbefbd41cbab796d6d78726363719418a10126cbfe89d92c3a502f992671984
+size 335666224

model_and_optim/__0_12.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d1c2ce6e7601656ba3bd9d60b87722060f8ac5ae750615ab1e8ea9168135aefe
+size 335666224

model_and_optim/__0_13.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:902261764fe7321fe12079ea53dc52cd8d20e65db9a038269176d0c6a59f748e
+size 335666224

model_and_optim/__0_14.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2079f3729b3c97eef43222353d6619f99368a35fe5466f46a989d7c99920f96d
+size 335669764

model_and_optim/__0_15.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:22d00ad1f6844ab2b70274e4d5ac5fd34fcfb1d5bd9b29d8f35d45a2814f48d4
+size 335669764

model_and_optim/__0_2.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:48bbd8e46a3b07ae0948a90464e8e69008d6c5719c754bd2c18b480d8d5c77f2
+size 352332156

model_and_optim/__0_3.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c6b9da7a442a23f0ab2f7c813caecb2d653090aabbb8f2ec1526a498948ccd69
+size 352332156

model_and_optim/__0_4.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5dabdef7330137b13ba68a88478abc15a237d5ee4ba32a1db26bb1c930af7b3c
+size 352332156

model_and_optim/__0_5.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2fac105de17886feba3ba9db1fe63fc7854e300fc54ebece993b21eff0b17f33
+size 352332156

model_and_optim/__0_6.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dad926a48fe2c30907efacad4e1301f7458fec77b57e5b31417c0267ea32febb
+size 352332156

model_and_optim/__0_7.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3410a31e0625979f1666b0945d4251e2c0275593250ecbd43ed0512ca5d67dde
+size 352332156

model_and_optim/__0_8.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ba889408f36065ce574aedff11b4c10bc9f2be6316400331e1d570742d67dc89
+size 352332156

model_and_optim/__0_9.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:14b646f91ab06741f98a55c45e94b334662eca7410d13fc3d36bd4f0c104f229
+size 352332156