diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..81f15561d0b6a1dd1adf616c0cfeaecd1423e5c2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +train.log filter=lfs diff=lfs merge=lfs -text +metrics.jsonl filter=lfs diff=lfs merge=lfs -text +checkpoints/0000057500/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoints/0000052500/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoints/0000052500/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoints/0000055000/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoints/0000055000/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoints/0000057500/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoints/0000057500/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoints/0000057500/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoints/0000055000/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoints/0000052500/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoints/0000055000/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoints/0000052500/__3_0.distcp filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoints/0000052500/.metadata b/checkpoints/0000052500/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..700d1bb27c026f4f3f6f05174bf73cb09d558f00 Binary files /dev/null and b/checkpoints/0000052500/.metadata differ diff --git a/checkpoints/0000052500/__0_0.distcp b/checkpoints/0000052500/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0e279cb0f4bfbd8cee4274a26c67a60f1d26b5d8 --- /dev/null +++ b/checkpoints/0000052500/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba1059c3c7933e00bef22dad2c9196b61e9ed51625d8f9c4c98d40d41247220b +size 5089728720 diff --git a/checkpoints/0000052500/__1_0.distcp b/checkpoints/0000052500/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c1759aab1e75a50fd9d4813819badde79c317d56 --- /dev/null +++ b/checkpoints/0000052500/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a36dfca6c235e04d8fe21b84b9ba645984353a652d5bc883a55f660dc472eea5 +size 5089821856 diff --git a/checkpoints/0000052500/__2_0.distcp b/checkpoints/0000052500/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..116059c1240d34848a1796a857ff42065a300b82 --- /dev/null +++ b/checkpoints/0000052500/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49014ebf3b1d6e4cfa12b7120216731bef8f392cea7eb04afe03368e32a3541c +size 5089821856 diff --git a/checkpoints/0000052500/__3_0.distcp b/checkpoints/0000052500/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..09756ca7347882f290b6c98546157cbf8362a29f --- /dev/null +++ b/checkpoints/0000052500/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4afe60b60d096c63ce7c24944d1fe7dd33e5a227b503258ab80a78243a517a9 +size 5089830112 diff --git a/checkpoints/0000052500/params.json b/checkpoints/0000052500/params.json new file mode 100644 index 0000000000000000000000000000000000000000..f3d8d680491042ff81bfe1b159499212c92582b8 --- /dev/null +++ b/checkpoints/0000052500/params.json @@ -0,0 +1 @@ +{"name": "large_lm", "dump_dir": "./dump_dir_llama1b2", "seed": 777, "grad_acc_steps": 2, "gc_collect_freq": 1000, "probe_freq": null, "steps": 60000, "data": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "batch_size": 8, "seq_len": 4096, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 1024, "tokenizer": {"name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}}, "optim": {"lr": 0.003, "weight_decay": 0.033, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "cosine", "warmup": 5000, "lr_min_ratio": 1e-06, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5}, "model": {"dim": 2048, "n_layers": 25, "head_dim": null, "n_heads": 16, "n_kv_heads": null, "ffn_dim_multiplier": null, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 10000.0, "init_base_std": null, "init_std_factor": "disabled", "rope_type": "original", "rope_inv_freq_learnable": false, "max_seqlen": 4096, "use_mla": "", "q_lora_rank": 1536, "kv_lora_rank": 512, "seed": 42, "vocab_size": 100512, "weight_tying": false, "sliding_window": null}, "distributed": {"dp_shard": 1, "dp_replicate": 4, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": true, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver"}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 2500, "keep": 3}, "eval": {"every": 5000000000, "keep": -1}, "path": "dump_dir_llama1b2/checkpoints", "init_ckpt_path": null, "continue_training_from_init": false}, "profiling": {"run": true, "trace_folder": "profiling", "mem_warmup": 0, "mem_steps": 4, "profile_warmup": 100, "profile_steps": 4}, "logging": {"freq": 1, "acc_freq": null, "wandb": null}, "async_eval_gpus": 1, "eval": {"harness": {"tasks": ["hellaswag", {"task": "boolq", "dataset_kwargs": {"trust_remote_code": true}}, "piqa", {"task": "social_iqa", "dataset_kwargs": {"trust_remote_code": true}}, "winogrande", "openbookqa", "arc_easy", "arc_challenge", "race", "commonsense_qa", "copa"]}, "validation": {"max_steps": 1000}, "generator": {"max_tokens": 16384, "dtype": "bf16"}}} \ No newline at end of file diff --git a/checkpoints/0000052500/train_state_00000.json b/checkpoints/0000052500/train_state_00000.json new file mode 100644 index 0000000000000000000000000000000000000000..c46dab051e75f056d5135fb94ac7986f8f7dc3d0 --- /dev/null +++ b/checkpoints/0000052500/train_state_00000.json @@ -0,0 +1 @@ +{"step": 52500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 0, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 20632936879, "block_size": 4, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 104764078563362074937563996671037632573, "inc": 11676600559890430755450356507027720041}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 552, "rng_state": {"bit_generator": "PCG64", "state": {"state": 2395845221803597463567811736478288695, "inc": 77357518920597472829800677777012462921}, "has_uint32": 1, "uinteger": 1272976797}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 52500, "verbose": false, "_step_count": 52501, "_get_lr_called_within_step": false, "_last_lr": [0.00013555487141621536], "lr_lambdas": [{}]}} \ No newline at end of file diff --git a/checkpoints/0000052500/train_state_00001.json b/checkpoints/0000052500/train_state_00001.json new file mode 100644 index 0000000000000000000000000000000000000000..25f491a5276a078c13d54f30c5a20eef0cfbedfd --- /dev/null +++ b/checkpoints/0000052500/train_state_00001.json @@ -0,0 +1 @@ +{"step": 52500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 233, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 20814694346, "block_size": 4, "offset": 1, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 153609858426494565531725423363918940328, "inc": 239634081480473411747239400828488620799}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 552, "rng_state": {"bit_generator": "PCG64", "state": {"state": 129706445657155535128902585122205091844, "inc": 270234035871729269002159329014059236425}, "has_uint32": 1, "uinteger": 3215027928}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 52500, "verbose": false, "_step_count": 52501, "_get_lr_called_within_step": false, "_last_lr": [0.00013555487141621536], "lr_lambdas": [{}]}} \ No newline at end of file diff --git a/checkpoints/0000052500/train_state_00002.json b/checkpoints/0000052500/train_state_00002.json new file mode 100644 index 0000000000000000000000000000000000000000..118db8a4a8ad9e9dcda8c7c833694792e40b686c --- /dev/null +++ b/checkpoints/0000052500/train_state_00002.json @@ -0,0 +1 @@ +{"step": 52500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1876, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 20698029952, "block_size": 4, "offset": 2, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 331143196790987356343970786542895835071, "inc": 6027823433652931085739778990793808165}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 552, "rng_state": {"bit_generator": "PCG64", "state": {"state": 10307761932790020400304415503036386848, "inc": 188564971970541749319992297790591572713}, "has_uint32": 0, "uinteger": 2239832520}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 52500, "verbose": false, "_step_count": 52501, "_get_lr_called_within_step": false, "_last_lr": [0.00013555487141621536], "lr_lambdas": [{}]}} \ No newline at end of file diff --git a/checkpoints/0000052500/train_state_00003.json b/checkpoints/0000052500/train_state_00003.json new file mode 100644 index 0000000000000000000000000000000000000000..d9d1fd0389353477ebdc31e56e58a51021ed4372 --- /dev/null +++ b/checkpoints/0000052500/train_state_00003.json @@ -0,0 +1 @@ +{"step": 52500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1407, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 20664159395, "block_size": 4, "offset": 3, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 41541280331522767549157535813803619959, "inc": 92941856108932518968286621281627530405}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 552, "rng_state": {"bit_generator": "PCG64", "state": {"state": 294838761640198597003972074272309035329, "inc": 66050176413739185524746886687120723265}, "has_uint32": 0, "uinteger": 2517394453}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 52500, "verbose": false, "_step_count": 52501, "_get_lr_called_within_step": false, "_last_lr": [0.00013555487141621536], "lr_lambdas": [{}]}} \ No newline at end of file diff --git a/checkpoints/0000055000/.metadata b/checkpoints/0000055000/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..7e7ff784791285713c8d8bd54a2cb2e089069202 Binary files /dev/null and b/checkpoints/0000055000/.metadata differ diff --git a/checkpoints/0000055000/__0_0.distcp b/checkpoints/0000055000/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..98ec805cfaad1f8d39361c6e024d560260f8e5f5 --- /dev/null +++ b/checkpoints/0000055000/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7d6edb3a8333652a8f0172196d8ac6945b699a77d26cf04434197182d283c6d +size 5089728720 diff --git a/checkpoints/0000055000/__1_0.distcp b/checkpoints/0000055000/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5fba945dbf1a7b14acbead4cc75b20a3308870ec --- /dev/null +++ b/checkpoints/0000055000/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61fedef57addb6b20eb0c1d28b7822a2b0a07da97c15f14f5e390d79ab31bb70 +size 5089821856 diff --git a/checkpoints/0000055000/__2_0.distcp b/checkpoints/0000055000/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4c43d5eedb575c9d05caf203b95ef37a1d3be8a6 --- /dev/null +++ b/checkpoints/0000055000/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d40341bb007c8ed258605c36e918752570574def9780afcadb40087f543992db +size 5089821856 diff --git a/checkpoints/0000055000/__3_0.distcp b/checkpoints/0000055000/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9f32ecbfcdfb5a3b0c3b936fb6759ad4b04c56dc --- /dev/null +++ b/checkpoints/0000055000/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d606638e713a00934d28a6be17b11b337bc382b130e09f2683c03922319514d +size 5089830112 diff --git a/checkpoints/0000055000/params.json b/checkpoints/0000055000/params.json new file mode 100644 index 0000000000000000000000000000000000000000..f3d8d680491042ff81bfe1b159499212c92582b8 --- /dev/null +++ b/checkpoints/0000055000/params.json @@ -0,0 +1 @@ +{"name": "large_lm", "dump_dir": "./dump_dir_llama1b2", "seed": 777, "grad_acc_steps": 2, "gc_collect_freq": 1000, "probe_freq": null, "steps": 60000, "data": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "batch_size": 8, "seq_len": 4096, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 1024, "tokenizer": {"name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}}, "optim": {"lr": 0.003, "weight_decay": 0.033, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "cosine", "warmup": 5000, "lr_min_ratio": 1e-06, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5}, "model": {"dim": 2048, "n_layers": 25, "head_dim": null, "n_heads": 16, "n_kv_heads": null, "ffn_dim_multiplier": null, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 10000.0, "init_base_std": null, "init_std_factor": "disabled", "rope_type": "original", "rope_inv_freq_learnable": false, "max_seqlen": 4096, "use_mla": "", "q_lora_rank": 1536, "kv_lora_rank": 512, "seed": 42, "vocab_size": 100512, "weight_tying": false, "sliding_window": null}, "distributed": {"dp_shard": 1, "dp_replicate": 4, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": true, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver"}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 2500, "keep": 3}, "eval": {"every": 5000000000, "keep": -1}, "path": "dump_dir_llama1b2/checkpoints", "init_ckpt_path": null, "continue_training_from_init": false}, "profiling": {"run": true, "trace_folder": "profiling", "mem_warmup": 0, "mem_steps": 4, "profile_warmup": 100, "profile_steps": 4}, "logging": {"freq": 1, "acc_freq": null, "wandb": null}, "async_eval_gpus": 1, "eval": {"harness": {"tasks": ["hellaswag", {"task": "boolq", "dataset_kwargs": {"trust_remote_code": true}}, "piqa", {"task": "social_iqa", "dataset_kwargs": {"trust_remote_code": true}}, "winogrande", "openbookqa", "arc_easy", "arc_challenge", "race", "commonsense_qa", "copa"]}, "validation": {"max_steps": 1000}, "generator": {"max_tokens": 16384, "dtype": "bf16"}}} \ No newline at end of file diff --git a/checkpoints/0000055000/train_state_00000.json b/checkpoints/0000055000/train_state_00000.json new file mode 100644 index 0000000000000000000000000000000000000000..af0b9d9f762e27a06d358c597cf9960fed14a556 --- /dev/null +++ b/checkpoints/0000055000/train_state_00000.json @@ -0,0 +1 @@ +{"step": 55000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 171, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 24089526430, "block_size": 4, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 113480112311283759058948928503237735128, "inc": 11676600559890430755450356507027720041}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 432, "rng_state": {"bit_generator": "PCG64", "state": {"state": 284361689960843992510826632036204406926, "inc": 77357518920597472829800677777012462921}, "has_uint32": 1, "uinteger": 1083475834}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 55000, "verbose": false, "_step_count": 55001, "_get_lr_called_within_step": false, "_last_lr": [6.0763478817714537e-05], "lr_lambdas": [{}]}} \ No newline at end of file diff --git a/checkpoints/0000055000/train_state_00001.json b/checkpoints/0000055000/train_state_00001.json new file mode 100644 index 0000000000000000000000000000000000000000..7651e88aaa6797c8d6c8a9a58015a030227578b6 --- /dev/null +++ b/checkpoints/0000055000/train_state_00001.json @@ -0,0 +1 @@ +{"step": 55000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 629, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 24319442893, "block_size": 4, "offset": 1, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 336838252608328762642367373257392679475, "inc": 239634081480473411747239400828488620799}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 432, "rng_state": {"bit_generator": "PCG64", "state": {"state": 42938996891315000473233379793990467732, "inc": 270234035871729269002159329014059236425}, "has_uint32": 1, "uinteger": 3585551809}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 55000, "verbose": false, "_step_count": 55001, "_get_lr_called_within_step": false, "_last_lr": [6.0763478817714537e-05], "lr_lambdas": [{}]}} \ No newline at end of file diff --git a/checkpoints/0000055000/train_state_00002.json b/checkpoints/0000055000/train_state_00002.json new file mode 100644 index 0000000000000000000000000000000000000000..aee886b2595b8b6b6cdb8f130852c41e795e0e8d --- /dev/null +++ b/checkpoints/0000055000/train_state_00002.json @@ -0,0 +1 @@ +{"step": 55000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 96, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 24174101596, "block_size": 4, "offset": 2, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 117467736827509381561014469368633930793, "inc": 6027823433652931085739778990793808165}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 432, "rng_state": {"bit_generator": "PCG64", "state": {"state": 266218648628129980865360739940674595406, "inc": 188564971970541749319992297790591572713}, "has_uint32": 0, "uinteger": 3020109911}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 55000, "verbose": false, "_step_count": 55001, "_get_lr_called_within_step": false, "_last_lr": [6.0763478817714537e-05], "lr_lambdas": [{}]}} \ No newline at end of file diff --git a/checkpoints/0000055000/train_state_00003.json b/checkpoints/0000055000/train_state_00003.json new file mode 100644 index 0000000000000000000000000000000000000000..9422257abb8e4a699492168a74750926e7b5c20e --- /dev/null +++ b/checkpoints/0000055000/train_state_00003.json @@ -0,0 +1 @@ +{"step": 55000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 2506, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 24171240780, "block_size": 4, "offset": 3, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 144077674547792537725026078244309307985, "inc": 92941856108932518968286621281627530405}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 432, "rng_state": {"bit_generator": "PCG64", "state": {"state": 288361091733615707256981067197297720635, "inc": 66050176413739185524746886687120723265}, "has_uint32": 0, "uinteger": 1686196970}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 55000, "verbose": false, "_step_count": 55001, "_get_lr_called_within_step": false, "_last_lr": [6.0763478817714537e-05], "lr_lambdas": [{}]}} \ No newline at end of file diff --git a/checkpoints/0000057500/.metadata b/checkpoints/0000057500/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..44beb7b10030cc9abf41df941c162e1a3eb29d33 Binary files /dev/null and b/checkpoints/0000057500/.metadata differ diff --git a/checkpoints/0000057500/__0_0.distcp b/checkpoints/0000057500/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..19bb4ef211bd1f52f78d115248f4497b42ca27ed --- /dev/null +++ b/checkpoints/0000057500/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4e863b6332f706742956f827ea6c07f0a394130df58522c8a462df09faf4713 +size 5089728720 diff --git a/checkpoints/0000057500/__1_0.distcp b/checkpoints/0000057500/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d22e4dcbfaea92672228716503dac350d62db0dd --- /dev/null +++ b/checkpoints/0000057500/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1e3db8a7502ca8272668aadaf6970032f278244021d43d822b2e8b77784e761 +size 5089821856 diff --git a/checkpoints/0000057500/__2_0.distcp b/checkpoints/0000057500/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..189b051ccf1098c5dcad073427db8f674966f6d6 --- /dev/null +++ b/checkpoints/0000057500/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfe11383349fd9531badeb4d5d095826e6145949d4a65509f688496ea0c874a0 +size 5089821856 diff --git a/checkpoints/0000057500/__3_0.distcp b/checkpoints/0000057500/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..325f2cac9da7fe35b3e367e47b9a33e7e8305233 --- /dev/null +++ b/checkpoints/0000057500/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:401094cbd65964036806c50646e229fe446fac62c76b361a6c777721cf4b6b9a +size 5089830112 diff --git a/checkpoints/0000057500/params.json b/checkpoints/0000057500/params.json new file mode 100644 index 0000000000000000000000000000000000000000..f3d8d680491042ff81bfe1b159499212c92582b8 --- /dev/null +++ b/checkpoints/0000057500/params.json @@ -0,0 +1 @@ +{"name": "large_lm", "dump_dir": "./dump_dir_llama1b2", "seed": 777, "grad_acc_steps": 2, "gc_collect_freq": 1000, "probe_freq": null, "steps": 60000, "data": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "batch_size": 8, "seq_len": 4096, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 1024, "tokenizer": {"name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}}, "optim": {"lr": 0.003, "weight_decay": 0.033, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "cosine", "warmup": 5000, "lr_min_ratio": 1e-06, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5}, "model": {"dim": 2048, "n_layers": 25, "head_dim": null, "n_heads": 16, "n_kv_heads": null, "ffn_dim_multiplier": null, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 10000.0, "init_base_std": null, "init_std_factor": "disabled", "rope_type": "original", "rope_inv_freq_learnable": false, "max_seqlen": 4096, "use_mla": "", "q_lora_rank": 1536, "kv_lora_rank": 512, "seed": 42, "vocab_size": 100512, "weight_tying": false, "sliding_window": null}, "distributed": {"dp_shard": 1, "dp_replicate": 4, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": true, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver"}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 2500, "keep": 3}, "eval": {"every": 5000000000, "keep": -1}, "path": "dump_dir_llama1b2/checkpoints", "init_ckpt_path": null, "continue_training_from_init": false}, "profiling": {"run": true, "trace_folder": "profiling", "mem_warmup": 0, "mem_steps": 4, "profile_warmup": 100, "profile_steps": 4}, "logging": {"freq": 1, "acc_freq": null, "wandb": null}, "async_eval_gpus": 1, "eval": {"harness": {"tasks": ["hellaswag", {"task": "boolq", "dataset_kwargs": {"trust_remote_code": true}}, "piqa", {"task": "social_iqa", "dataset_kwargs": {"trust_remote_code": true}}, "winogrande", "openbookqa", "arc_easy", "arc_challenge", "race", "commonsense_qa", "copa"]}, "validation": {"max_steps": 1000}, "generator": {"max_tokens": 16384, "dtype": "bf16"}}} \ No newline at end of file diff --git a/checkpoints/0000057500/train_state_00000.json b/checkpoints/0000057500/train_state_00000.json new file mode 100644 index 0000000000000000000000000000000000000000..537fbbd68be41720c811500bbbdb9875db4759f3 --- /dev/null +++ b/checkpoints/0000057500/train_state_00000.json @@ -0,0 +1 @@ +{"step": 57500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 320, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 27577514266, "block_size": 4, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 288376215711660853319606319848059944895, "inc": 11676600559890430755450356507027720041}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 312, "rng_state": {"bit_generator": "PCG64", "state": {"state": 11493284078251286074307567083922618830, "inc": 77357518920597472829800677777012462921}, "has_uint32": 1, "uinteger": 1302367895}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 57500, "verbose": false, "_step_count": 57501, "_get_lr_called_within_step": false, "_last_lr": [1.5270821910763795e-05], "lr_lambdas": [{}]}} \ No newline at end of file diff --git a/checkpoints/0000057500/train_state_00001.json b/checkpoints/0000057500/train_state_00001.json new file mode 100644 index 0000000000000000000000000000000000000000..0ff4afbfd4c459ff67debf27d740379977ed5c79 --- /dev/null +++ b/checkpoints/0000057500/train_state_00001.json @@ -0,0 +1 @@ +{"step": 57500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 983, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 27815738732, "block_size": 4, "offset": 1, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 12028358019300960955490458574390826077, "inc": 239634081480473411747239400828488620799}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 312, "rng_state": {"bit_generator": "PCG64", "state": {"state": 187150612758938462197782101941292227468, "inc": 270234035871729269002159329014059236425}, "has_uint32": 0, "uinteger": 741317957}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 57500, "verbose": false, "_step_count": 57501, "_get_lr_called_within_step": false, "_last_lr": [1.5270821910763795e-05], "lr_lambdas": [{}]}} \ No newline at end of file diff --git a/checkpoints/0000057500/train_state_00002.json b/checkpoints/0000057500/train_state_00002.json new file mode 100644 index 0000000000000000000000000000000000000000..77eb2d64aefa730db746e0893567b1ec4a38aeae --- /dev/null +++ b/checkpoints/0000057500/train_state_00002.json @@ -0,0 +1 @@ +{"step": 57500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 317, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 27648204381, "block_size": 4, "offset": 2, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 81244764968533379643081700530451726139, "inc": 6027823433652931085739778990793808165}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 312, "rng_state": {"bit_generator": "PCG64", "state": {"state": 13311883204234167281050828487226707073, "inc": 188564971970541749319992297790591572713}, "has_uint32": 0, "uinteger": 1212867564}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 57500, "verbose": false, "_step_count": 57501, "_get_lr_called_within_step": false, "_last_lr": [1.5270821910763795e-05], "lr_lambdas": [{}]}} \ No newline at end of file diff --git a/checkpoints/0000057500/train_state_00003.json b/checkpoints/0000057500/train_state_00003.json new file mode 100644 index 0000000000000000000000000000000000000000..da24c7434d55f9baff07a992aaa0c9e54d9b0238 --- /dev/null +++ b/checkpoints/0000057500/train_state_00003.json @@ -0,0 +1 @@ +{"step": 57500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 752, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 27639381247, "block_size": 4, "offset": 3, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 286741500864262708623674448239672053530, "inc": 92941856108932518968286621281627530405}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 312, "rng_state": {"bit_generator": "PCG64", "state": {"state": 240889267456687567804847541390924151647, "inc": 66050176413739185524746886687120723265}, "has_uint32": 1, "uinteger": 2653545164}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 57500, "verbose": false, "_step_count": 57501, "_get_lr_called_within_step": false, "_last_lr": [1.5270821910763795e-05], "lr_lambdas": [{}]}} \ No newline at end of file diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c3bfab72c4715f85af09af8c60e53b21495f8e1 --- /dev/null +++ b/config.yaml @@ -0,0 +1,127 @@ +name: large_lm +dump_dir: ./dump_dir_llama1b2 +seed: 777 +grad_acc_steps: 2 +gc_collect_freq: 1000 +probe_freq: null +steps: 60000 +data: + root_dir: ./data + sources: + fineweb_edu_10bt_shuffled: 100.0 + batch_size: 8 + seq_len: 4096 + n_views: 2 + seed: 42 + add_bos: true + add_eos: true + load_async: true + prefetch_size: 1024 + tokenizer: + name: tiktoken + path: tokenizers/cl100k_base.tiktoken +optim: + lr: 0.003 + weight_decay: 0.033 + epsilon: 1.0e-08 + beta1: 0.9 + beta2: 0.95 + clip: 1.0 + scheduler: cosine + warmup: 5000 + lr_min_ratio: 1.0e-06 + cycle_length: 1.0 + cosine_theta: 1.0 + annealing_step: 1000 + decay_fraction: 0.1 + exp_factor: 0.5 +model: + dim: 2048 + n_layers: 25 + head_dim: null + n_heads: 16 + n_kv_heads: null + ffn_dim_multiplier: null + multiple_of: 256 + norm_eps: 1.0e-05 + rope_theta: 10000.0 + init_base_std: null + init_std_factor: disabled + rope_type: original + rope_inv_freq_learnable: false + max_seqlen: 4096 + use_mla: '' + q_lora_rank: 1536 + kv_lora_rank: 512 + seed: 42 + vocab_size: 100512 + weight_tying: false + sliding_window: null +distributed: + dp_shard: 1 + dp_replicate: 4 + tp_size: 1 + selective_activation_checkpointing: false + compile: true + fsdp_type: full_shard + model_dtype: bf16 + float8_recipe: null + float8_filter: layers\.[0-9]+\. + matmul_allow_tf32: true + detect_anomaly: false + compile_cache_size_limit: 8 + spawn_method: forkserver +env: + MKL_SERVICE_FORCE_INTEL: GNU + OMP_NUM_THREADS: '1' + MKL_NUM_THREADS: '1' + ENABLE_INTRA_NODE_COMM: '1' + TORCH_NCCL_AVOID_RECORD_STREAMS: '1' + NCCL_IB_TIMEOUT: '22' + NCCL_DEBUG: INFO + TORCH_NCCL_ASYNC_ERROR_HANDLING: '1' +checkpoint: + dump: + every: 2500 + keep: 3 + eval: + every: 5000000000 + keep: -1 + path: dump_dir_llama1b2/checkpoints + init_ckpt_path: null + continue_training_from_init: false +profiling: + run: true + trace_folder: profiling + mem_warmup: 0 + mem_steps: 4 + profile_warmup: 100 + profile_steps: 4 +logging: + freq: 1 + acc_freq: null + wandb: null +async_eval_gpus: 1 +eval: + harness: + tasks: + - hellaswag + - task: boolq + dataset_kwargs: + trust_remote_code: true + - piqa + - task: social_iqa + dataset_kwargs: + trust_remote_code: true + - winogrande + - openbookqa + - arc_easy + - arc_challenge + - race + - commonsense_qa + - copa + validation: + max_steps: 1000 + generator: + max_tokens: 16384 + dtype: bf16 diff --git a/metrics.jsonl b/metrics.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7446d99bfe49d2c4bc9cff46e3b29170ab11a0ac --- /dev/null +++ b/metrics.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60ccbed3e27ce72b330634d0bfa6a212eff253136067d0beb8b26cb1934673f1 +size 35436667 diff --git a/profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104882.html b/profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104882.html new file mode 100644 index 0000000000000000000000000000000000000000..353c1b1ec3400d846d0011088e3b8c522561dcc3 --- /dev/null +++ b/profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104882.html @@ -0,0 +1,12 @@ + + + + + + + + diff --git a/profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104883.html b/profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104883.html new file mode 100644 index 0000000000000000000000000000000000000000..4337834b572c60eb6c759f1e552da69060f43469 --- /dev/null +++ b/profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104883.html @@ -0,0 +1,12 @@ + + + + + + + + diff --git a/profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104884.html b/profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104884.html new file mode 100644 index 0000000000000000000000000000000000000000..f42e599a0ad568d8bdb58f7e9ea8cd650763f83e --- /dev/null +++ b/profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104884.html @@ -0,0 +1,12 @@ + + + + + + + + diff --git a/profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104885.html b/profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104885.html new file mode 100644 index 0000000000000000000000000000000000000000..206d797318a97a9465d6330be394c6d9204243ff --- /dev/null +++ b/profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104885.html @@ -0,0 +1,12 @@ + + + + + + + + diff --git a/profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104882.1736675622108438789.pt.trace.json.gz b/profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104882.1736675622108438789.pt.trace.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..81c869fbb7ba3a854f008fc95cfca2a6f57726af --- /dev/null +++ b/profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104882.1736675622108438789.pt.trace.json.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e194fb23d0de28b2144d65b59b6909603576adff80b2c65fd525dbb50a6c8172 +size 1926496 diff --git a/profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104883.1736675622100620970.pt.trace.json.gz b/profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104883.1736675622100620970.pt.trace.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..19b123bb776fa8c1d5c722db852745cf27a1d145 --- /dev/null +++ b/profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104883.1736675622100620970.pt.trace.json.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbd90924612b2a2b89d112772902d8442c4dd6f487a3b06e2ddc89472c2e0f63 +size 1933461 diff --git a/profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104884.1736675622099254891.pt.trace.json.gz b/profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104884.1736675622099254891.pt.trace.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..e5b507af6213c8d76e4d602d552cb2f5e4512195 --- /dev/null +++ b/profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104884.1736675622099254891.pt.trace.json.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c975ec2b8575f74c6297a1fc762fd9ee8c8938becedae0148f5766b8e059cd6 +size 1932617 diff --git a/profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104885.1736675622113109902.pt.trace.json.gz b/profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104885.1736675622113109902.pt.trace.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..d131ab40e45b71c443062a491c21b5f9e08e885a --- /dev/null +++ b/profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_104885.1736675622113109902.pt.trace.json.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bc95d196acb821e88127f671380d311037f68facc50ef41291d148d128493a6 +size 1933550 diff --git a/train.log b/train.log new file mode 100644 index 0000000000000000000000000000000000000000..8f6a114033ba888210d7f5230d0d2564ba6d13c4 --- /dev/null +++ b/train.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc27de864c79cc208f7490628012f3e8350a66828e1177ff724ecc241c0f35c3 +size 12290594