diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..52373fe24473b1aa44333d318f578ae6bf04b49b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoint-15260/config.json b/checkpoint-15260/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a8b10d032ea5b0e15388eb607d24979f38775a09 --- /dev/null +++ b/checkpoint-15260/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.47.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-15260/generation_config.json b/checkpoint-15260/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..73099b4966679656fb84b344ea08b348a6a6a507 --- /dev/null +++ b/checkpoint-15260/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.47.0.dev0" +} diff --git a/checkpoint-15260/model-00001-of-00007.safetensors b/checkpoint-15260/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..897d8016fa0e1e5250a8e563879dca3cb5c2949b --- /dev/null +++ b/checkpoint-15260/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e7546a0b591d1aebcb5a7a4ddc93298af1cc66f8cc0005a361a3725e93ae6f0 +size 4886466168 diff --git a/checkpoint-15260/model-00002-of-00007.safetensors b/checkpoint-15260/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1f266ba474079d34baf9b2a204bc3cecae676961 --- /dev/null +++ b/checkpoint-15260/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0901d2e156c6ec7a375c3e90ae5183970af0445d3dc0e53c031b45673775dd64 +size 4832007448 diff --git a/checkpoint-15260/model-00003-of-00007.safetensors b/checkpoint-15260/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5501e045b181a6449b2b0997f4ca9f8ce68156ff --- /dev/null +++ b/checkpoint-15260/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afa0e22bcade9527b5aeb86066e139cc30979f8492c753ff3277e6189863ab97 +size 4999813112 diff --git a/checkpoint-15260/model-00004-of-00007.safetensors b/checkpoint-15260/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0ea5edb447c5d0bf0491276907abf457c5b0f94a --- /dev/null +++ b/checkpoint-15260/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc4f5352f30ea036b0a4e24ba9b029dd94a2b30a716c81acc1c4a5e7bb47a042 +size 4999813128 diff --git a/checkpoint-15260/model-00005-of-00007.safetensors b/checkpoint-15260/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d308635d86f98645d1b129703f4ba4ecc6a84a89 --- /dev/null +++ b/checkpoint-15260/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d9450b209df4cb026b6669e618fcdb6ff4f5bd0a711fb2e502afe0b95eb3ad7 +size 4832007496 diff --git a/checkpoint-15260/model-00006-of-00007.safetensors b/checkpoint-15260/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ca407cbae64cc8f0030b7f31acf2243bf4d1ab3e --- /dev/null +++ b/checkpoint-15260/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df5d1993f96687d3eccecd21575ee99959b3c37e3e2fa3aa80d418fb95cbc368 +size 4999813120 diff --git a/checkpoint-15260/model-00007-of-00007.safetensors b/checkpoint-15260/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..90f37e0f21263f934b2e15e7150e4e700babe776 --- /dev/null +++ b/checkpoint-15260/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b19907b3ea724b991bedf1922624123ba04d5ae5406f392a3d86198358d3ff2 +size 2571158184 diff --git a/checkpoint-15260/model.safetensors.index.json b/checkpoint-15260/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-15260/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-15260/optimizer.pt b/checkpoint-15260/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..954e7bb315aede4edbaa970b037ee355869f944c --- /dev/null +++ b/checkpoint-15260/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ca25d4f5128b30a1e43381a6319446763e5218dd8258ae8a8d5278a3f895ed4 +size 15385036334 diff --git a/checkpoint-15260/rng_state.pth b/checkpoint-15260/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..93a52228ea16468eaab41b66555ced3eaaea002c --- /dev/null +++ b/checkpoint-15260/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69 +size 14244 diff --git a/checkpoint-15260/scheduler.pt b/checkpoint-15260/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..38f65fadce527ecb01fb7d7fd7ef901b9b082813 --- /dev/null +++ b/checkpoint-15260/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ce335347efaebf038b8915bc0a8d2d587a4a1aa08ad0e015b4bc7cc4fba634e +size 1064 diff --git a/checkpoint-15260/trainer_state.json b/checkpoint-15260/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7afd2d7cb04092f504d209b647e1edebebb69382 --- /dev/null +++ b/checkpoint-15260/trainer_state.json @@ -0,0 +1,3477 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2203546467972044, + "eval_steps": 500, + "global_step": 15260, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004476405013573615, + "grad_norm": 4.6696085929870605, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.9366, + "step": 31 + }, + { + "epoch": 0.000895281002714723, + "grad_norm": 4.250915050506592, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.9002, + "step": 62 + }, + { + "epoch": 0.0013429215040720846, + "grad_norm": 4.424270153045654, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.8843, + "step": 93 + }, + { + "epoch": 0.001790562005429446, + "grad_norm": 4.56964635848999, + "learning_rate": 4.062909567496723e-06, + "loss": 0.8717, + "step": 124 + }, + { + "epoch": 0.0022382025067868077, + "grad_norm": 4.051624298095703, + "learning_rate": 5.078636959370905e-06, + "loss": 0.8711, + "step": 155 + }, + { + "epoch": 0.002685843008144169, + "grad_norm": 3.98006272315979, + "learning_rate": 6.094364351245085e-06, + "loss": 0.8628, + "step": 186 + }, + { + "epoch": 0.0031334835095015307, + "grad_norm": 4.4158406257629395, + "learning_rate": 7.110091743119267e-06, + "loss": 0.871, + "step": 217 + }, + { + "epoch": 0.003581124010858892, + "grad_norm": 4.681333541870117, + "learning_rate": 8.125819134993446e-06, + "loss": 0.8593, + "step": 248 + }, + { + "epoch": 0.004028764512216254, + "grad_norm": 3.8057820796966553, + "learning_rate": 9.141546526867629e-06, + "loss": 0.8558, + "step": 279 + }, + { + "epoch": 0.0044764050135736155, + "grad_norm": 4.523633003234863, + "learning_rate": 1.015727391874181e-05, + "loss": 0.8676, + "step": 310 + }, + { + "epoch": 0.0049240455149309765, + "grad_norm": 3.7387187480926514, + "learning_rate": 1.117300131061599e-05, + "loss": 0.8585, + "step": 341 + }, + { + "epoch": 0.005371686016288338, + "grad_norm": 4.187750816345215, + "learning_rate": 1.218872870249017e-05, + "loss": 0.8592, + "step": 372 + }, + { + "epoch": 0.005819326517645699, + "grad_norm": 3.782883644104004, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.8449, + "step": 403 + }, + { + "epoch": 0.006266967019003061, + "grad_norm": 3.577796459197998, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.8418, + "step": 434 + }, + { + "epoch": 0.006714607520360423, + "grad_norm": 3.1408321857452393, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.8577, + "step": 465 + }, + { + "epoch": 0.007162248021717784, + "grad_norm": 4.090081691741943, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.8439, + "step": 496 + }, + { + "epoch": 0.007609888523075146, + "grad_norm": 2.7458200454711914, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.8468, + "step": 527 + }, + { + "epoch": 0.008057529024432507, + "grad_norm": 3.703225612640381, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.8385, + "step": 558 + }, + { + "epoch": 0.008505169525789868, + "grad_norm": 3.134650230407715, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.8418, + "step": 589 + }, + { + "epoch": 0.008952810027147231, + "grad_norm": 3.762680768966675, + "learning_rate": 2.031454783748362e-05, + "loss": 0.8312, + "step": 620 + }, + { + "epoch": 0.009400450528504592, + "grad_norm": 3.751004457473755, + "learning_rate": 2.13302752293578e-05, + "loss": 0.8251, + "step": 651 + }, + { + "epoch": 0.009848091029861953, + "grad_norm": 3.2268712520599365, + "learning_rate": 2.234600262123198e-05, + "loss": 0.8369, + "step": 682 + }, + { + "epoch": 0.010295731531219316, + "grad_norm": 3.5854289531707764, + "learning_rate": 2.336173001310616e-05, + "loss": 0.826, + "step": 713 + }, + { + "epoch": 0.010743372032576677, + "grad_norm": 3.9910435676574707, + "learning_rate": 2.437745740498034e-05, + "loss": 0.8168, + "step": 744 + }, + { + "epoch": 0.011191012533934038, + "grad_norm": 3.3059303760528564, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.8269, + "step": 775 + }, + { + "epoch": 0.011638653035291399, + "grad_norm": 3.4081811904907227, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.817, + "step": 806 + }, + { + "epoch": 0.012086293536648762, + "grad_norm": 3.2740163803100586, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.8195, + "step": 837 + }, + { + "epoch": 0.012533934038006123, + "grad_norm": 2.7206223011016846, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.8188, + "step": 868 + }, + { + "epoch": 0.012981574539363484, + "grad_norm": 2.7005629539489746, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.8127, + "step": 899 + }, + { + "epoch": 0.013429215040720846, + "grad_norm": 2.970745325088501, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.8126, + "step": 930 + }, + { + "epoch": 0.013876855542078207, + "grad_norm": 2.4761953353881836, + "learning_rate": 3.148754914809961e-05, + "loss": 0.82, + "step": 961 + }, + { + "epoch": 0.014324496043435568, + "grad_norm": 2.8555397987365723, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.8166, + "step": 992 + }, + { + "epoch": 0.01477213654479293, + "grad_norm": 2.8124194145202637, + "learning_rate": 3.351900393184797e-05, + "loss": 0.8057, + "step": 1023 + }, + { + "epoch": 0.015219777046150292, + "grad_norm": 2.353851556777954, + "learning_rate": 3.453473132372215e-05, + "loss": 0.8064, + "step": 1054 + }, + { + "epoch": 0.015667417547507653, + "grad_norm": 3.0127620697021484, + "learning_rate": 3.555045871559633e-05, + "loss": 0.8086, + "step": 1085 + }, + { + "epoch": 0.016115058048865014, + "grad_norm": 2.792686939239502, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.8152, + "step": 1116 + }, + { + "epoch": 0.016562698550222375, + "grad_norm": 2.407134532928467, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.7949, + "step": 1147 + }, + { + "epoch": 0.017010339051579736, + "grad_norm": 2.6921393871307373, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.804, + "step": 1178 + }, + { + "epoch": 0.0174579795529371, + "grad_norm": 2.3015975952148438, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.7944, + "step": 1209 + }, + { + "epoch": 0.017905620054294462, + "grad_norm": 2.8116579055786133, + "learning_rate": 4.062909567496724e-05, + "loss": 0.7977, + "step": 1240 + }, + { + "epoch": 0.018353260555651823, + "grad_norm": 2.5720036029815674, + "learning_rate": 4.164482306684142e-05, + "loss": 0.7854, + "step": 1271 + }, + { + "epoch": 0.018800901057009184, + "grad_norm": 2.0802650451660156, + "learning_rate": 4.26605504587156e-05, + "loss": 0.7892, + "step": 1302 + }, + { + "epoch": 0.019248541558366545, + "grad_norm": 2.4343624114990234, + "learning_rate": 4.367627785058978e-05, + "loss": 0.7897, + "step": 1333 + }, + { + "epoch": 0.019696182059723906, + "grad_norm": 2.509686231613159, + "learning_rate": 4.469200524246396e-05, + "loss": 0.7855, + "step": 1364 + }, + { + "epoch": 0.020143822561081267, + "grad_norm": 2.626512289047241, + "learning_rate": 4.570773263433814e-05, + "loss": 0.7873, + "step": 1395 + }, + { + "epoch": 0.02059146306243863, + "grad_norm": 2.8619399070739746, + "learning_rate": 4.672346002621232e-05, + "loss": 0.7891, + "step": 1426 + }, + { + "epoch": 0.021039103563795993, + "grad_norm": 2.724792718887329, + "learning_rate": 4.77391874180865e-05, + "loss": 0.782, + "step": 1457 + }, + { + "epoch": 0.021486744065153354, + "grad_norm": 2.6659562587738037, + "learning_rate": 4.875491480996068e-05, + "loss": 0.7856, + "step": 1488 + }, + { + "epoch": 0.021934384566510715, + "grad_norm": 2.646078586578369, + "learning_rate": 4.977064220183487e-05, + "loss": 0.7748, + "step": 1519 + }, + { + "epoch": 0.022382025067868076, + "grad_norm": 2.429288387298584, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.7722, + "step": 1550 + }, + { + "epoch": 0.022829665569225437, + "grad_norm": 1.9933409690856934, + "learning_rate": 4.999955597496219e-05, + "loss": 0.7874, + "step": 1581 + }, + { + "epoch": 0.023277306070582798, + "grad_norm": 2.314889907836914, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.7757, + "step": 1612 + }, + { + "epoch": 0.023724946571940162, + "grad_norm": 2.2891199588775635, + "learning_rate": 4.999799067923527e-05, + "loss": 0.7713, + "step": 1643 + }, + { + "epoch": 0.024172587073297523, + "grad_norm": 2.4892444610595703, + "learning_rate": 4.999678487776908e-05, + "loss": 0.7687, + "step": 1674 + }, + { + "epoch": 0.024620227574654884, + "grad_norm": 2.3015685081481934, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.7721, + "step": 1705 + }, + { + "epoch": 0.025067868076012245, + "grad_norm": 2.278954029083252, + "learning_rate": 4.999352703566763e-05, + "loss": 0.7741, + "step": 1736 + }, + { + "epoch": 0.025515508577369606, + "grad_norm": 1.7260370254516602, + "learning_rate": 4.999147503179668e-05, + "loss": 0.7681, + "step": 1767 + }, + { + "epoch": 0.025963149078726967, + "grad_norm": 2.0179309844970703, + "learning_rate": 4.998914100252672e-05, + "loss": 0.7604, + "step": 1798 + }, + { + "epoch": 0.02641078958008433, + "grad_norm": 2.53022837638855, + "learning_rate": 4.998652497419696e-05, + "loss": 0.7598, + "step": 1829 + }, + { + "epoch": 0.026858430081441693, + "grad_norm": 1.859253168106079, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.7606, + "step": 1860 + }, + { + "epoch": 0.027306070582799054, + "grad_norm": 1.759303331375122, + "learning_rate": 4.998044704162613e-05, + "loss": 0.7532, + "step": 1891 + }, + { + "epoch": 0.027753711084156415, + "grad_norm": 2.4389419555664062, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.7646, + "step": 1922 + }, + { + "epoch": 0.028201351585513776, + "grad_norm": 2.155348777770996, + "learning_rate": 4.997324150843799e-05, + "loss": 0.7569, + "step": 1953 + }, + { + "epoch": 0.028648992086871137, + "grad_norm": 2.0138537883758545, + "learning_rate": 4.99692159912661e-05, + "loss": 0.7677, + "step": 1984 + }, + { + "epoch": 0.029096632588228498, + "grad_norm": 2.5275282859802246, + "learning_rate": 4.996490869988546e-05, + "loss": 0.7519, + "step": 2015 + }, + { + "epoch": 0.02954427308958586, + "grad_norm": 1.8147333860397339, + "learning_rate": 4.996031968290326e-05, + "loss": 0.7509, + "step": 2046 + }, + { + "epoch": 0.029991913590943223, + "grad_norm": 2.1941769123077393, + "learning_rate": 4.995544899210594e-05, + "loss": 0.754, + "step": 2077 + }, + { + "epoch": 0.030439554092300584, + "grad_norm": 1.8953059911727905, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.747, + "step": 2108 + }, + { + "epoch": 0.030887194593657945, + "grad_norm": 3.3973031044006348, + "learning_rate": 4.994486281210429e-05, + "loss": 0.7513, + "step": 2139 + }, + { + "epoch": 0.031334835095015307, + "grad_norm": 2.66795015335083, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.7469, + "step": 2170 + }, + { + "epoch": 0.03178247559637267, + "grad_norm": 1.6254230737686157, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.7297, + "step": 2201 + }, + { + "epoch": 0.03223011609773003, + "grad_norm": 1.822745680809021, + "learning_rate": 4.992687246588743e-05, + "loss": 0.754, + "step": 2232 + }, + { + "epoch": 0.03267775659908739, + "grad_norm": 1.6898781061172485, + "learning_rate": 4.992031299767347e-05, + "loss": 0.7478, + "step": 2263 + }, + { + "epoch": 0.03312539710044475, + "grad_norm": 1.799280047416687, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.746, + "step": 2294 + }, + { + "epoch": 0.033573037601802115, + "grad_norm": 2.2501840591430664, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.7408, + "step": 2325 + }, + { + "epoch": 0.03402067810315947, + "grad_norm": 2.3315324783325195, + "learning_rate": 4.989894757091861e-05, + "loss": 0.7301, + "step": 2356 + }, + { + "epoch": 0.03446831860451684, + "grad_norm": 1.5820438861846924, + "learning_rate": 4.989126368919158e-05, + "loss": 0.7305, + "step": 2387 + }, + { + "epoch": 0.0349159591058742, + "grad_norm": 2.5696022510528564, + "learning_rate": 4.988329891293693e-05, + "loss": 0.7337, + "step": 2418 + }, + { + "epoch": 0.03536359960723156, + "grad_norm": 1.8880938291549683, + "learning_rate": 4.987505333203608e-05, + "loss": 0.7385, + "step": 2449 + }, + { + "epoch": 0.035811240108588924, + "grad_norm": 2.6148738861083984, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.7292, + "step": 2480 + }, + { + "epoch": 0.03625888060994628, + "grad_norm": 1.6925290822982788, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.7344, + "step": 2511 + }, + { + "epoch": 0.036706521111303646, + "grad_norm": 1.7675210237503052, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.7354, + "step": 2542 + }, + { + "epoch": 0.037154161612661, + "grad_norm": 2.1053173542022705, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.7272, + "step": 2573 + }, + { + "epoch": 0.03760180211401837, + "grad_norm": 1.9718347787857056, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.7336, + "step": 2604 + }, + { + "epoch": 0.03804944261537573, + "grad_norm": 1.5777671337127686, + "learning_rate": 4.981968838155888e-05, + "loss": 0.7182, + "step": 2635 + }, + { + "epoch": 0.03849708311673309, + "grad_norm": 1.905127763748169, + "learning_rate": 4.980947995086024e-05, + "loss": 0.7296, + "step": 2666 + }, + { + "epoch": 0.038944723618090454, + "grad_norm": 1.63962721824646, + "learning_rate": 4.979899154855234e-05, + "loss": 0.7249, + "step": 2697 + }, + { + "epoch": 0.03939236411944781, + "grad_norm": 1.584331750869751, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.7345, + "step": 2728 + }, + { + "epoch": 0.039840004620805176, + "grad_norm": 1.9111014604568481, + "learning_rate": 4.977717530570768e-05, + "loss": 0.7225, + "step": 2759 + }, + { + "epoch": 0.040287645122162534, + "grad_norm": 2.3216073513031006, + "learning_rate": 4.976584771136425e-05, + "loss": 0.7207, + "step": 2790 + }, + { + "epoch": 0.0407352856235199, + "grad_norm": 1.6002410650253296, + "learning_rate": 4.975424063779547e-05, + "loss": 0.7228, + "step": 2821 + }, + { + "epoch": 0.04118292612487726, + "grad_norm": 2.104731798171997, + "learning_rate": 4.974235421598557e-05, + "loss": 0.7127, + "step": 2852 + }, + { + "epoch": 0.04163056662623462, + "grad_norm": 1.7114660739898682, + "learning_rate": 4.973018858007122e-05, + "loss": 0.7283, + "step": 2883 + }, + { + "epoch": 0.042078207127591985, + "grad_norm": 1.948133945465088, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.7209, + "step": 2914 + }, + { + "epoch": 0.04252584762894934, + "grad_norm": 1.621764898300171, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.7272, + "step": 2945 + }, + { + "epoch": 0.04297348813030671, + "grad_norm": 1.6967558860778809, + "learning_rate": 4.969201777632205e-05, + "loss": 0.7191, + "step": 2976 + }, + { + "epoch": 0.043421128631664065, + "grad_norm": 1.6656996011734009, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.7205, + "step": 3007 + }, + { + "epoch": 0.04386876913302143, + "grad_norm": 2.151475191116333, + "learning_rate": 4.966517710419033e-05, + "loss": 0.7168, + "step": 3038 + }, + { + "epoch": 0.044316409634378794, + "grad_norm": 2.213109016418457, + "learning_rate": 4.965133917685858e-05, + "loss": 0.7139, + "step": 3069 + }, + { + "epoch": 0.04476405013573615, + "grad_norm": 1.5380377769470215, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.7237, + "step": 3100 + }, + { + "epoch": 0.045211690637093516, + "grad_norm": 2.312377452850342, + "learning_rate": 4.962282892045718e-05, + "loss": 0.7156, + "step": 3131 + }, + { + "epoch": 0.04565933113845087, + "grad_norm": 1.7220717668533325, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.7122, + "step": 3162 + }, + { + "epoch": 0.04610697163980824, + "grad_norm": 1.802856206893921, + "learning_rate": 4.959320720608049e-05, + "loss": 0.7128, + "step": 3193 + }, + { + "epoch": 0.046554612141165595, + "grad_norm": 1.6629964113235474, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.7172, + "step": 3224 + }, + { + "epoch": 0.04700225264252296, + "grad_norm": 3.440115213394165, + "learning_rate": 4.956247537083282e-05, + "loss": 0.7213, + "step": 3255 + }, + { + "epoch": 0.047449893143880324, + "grad_norm": 1.5721139907836914, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.7148, + "step": 3286 + }, + { + "epoch": 0.04789753364523768, + "grad_norm": 2.0920398235321045, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.7145, + "step": 3317 + }, + { + "epoch": 0.048345174146595046, + "grad_norm": 1.666566014289856, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.7095, + "step": 3348 + }, + { + "epoch": 0.048792814647952404, + "grad_norm": 1.8222129344940186, + "learning_rate": 4.949768693662973e-05, + "loss": 0.7138, + "step": 3379 + }, + { + "epoch": 0.04924045514930977, + "grad_norm": 1.7302964925765991, + "learning_rate": 4.948079823064559e-05, + "loss": 0.7017, + "step": 3410 + }, + { + "epoch": 0.049688095650667126, + "grad_norm": 1.7338463068008423, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6979, + "step": 3441 + }, + { + "epoch": 0.05013573615202449, + "grad_norm": 1.5637450218200684, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.7011, + "step": 3472 + }, + { + "epoch": 0.050583376653381855, + "grad_norm": 1.5632222890853882, + "learning_rate": 4.942847531574167e-05, + "loss": 0.704, + "step": 3503 + }, + { + "epoch": 0.05103101715473921, + "grad_norm": 1.588402509689331, + "learning_rate": 4.941048273452008e-05, + "loss": 0.7011, + "step": 3534 + }, + { + "epoch": 0.05147865765609658, + "grad_norm": 1.8840582370758057, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.7016, + "step": 3565 + }, + { + "epoch": 0.051926298157453935, + "grad_norm": 1.2702268362045288, + "learning_rate": 4.93736713712897e-05, + "loss": 0.7004, + "step": 3596 + }, + { + "epoch": 0.0523739386588113, + "grad_norm": 1.3812692165374756, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.7046, + "step": 3627 + }, + { + "epoch": 0.05282157916016866, + "grad_norm": 1.7257345914840698, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6976, + "step": 3658 + }, + { + "epoch": 0.05326921966152602, + "grad_norm": 1.7458925247192383, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6997, + "step": 3689 + }, + { + "epoch": 0.053716860162883386, + "grad_norm": 2.1996099948883057, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6991, + "step": 3720 + }, + { + "epoch": 0.05416450066424074, + "grad_norm": 1.6615021228790283, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.7005, + "step": 3751 + }, + { + "epoch": 0.05461214116559811, + "grad_norm": 1.6276952028274536, + "learning_rate": 4.925664290937196e-05, + "loss": 0.7097, + "step": 3782 + }, + { + "epoch": 0.055059781666955465, + "grad_norm": 1.758227825164795, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.6955, + "step": 3813 + }, + { + "epoch": 0.05550742216831283, + "grad_norm": 1.195280909538269, + "learning_rate": 4.921544116753962e-05, + "loss": 0.7073, + "step": 3844 + }, + { + "epoch": 0.05595506266967019, + "grad_norm": 1.6281015872955322, + "learning_rate": 4.919443027766935e-05, + "loss": 0.7022, + "step": 3875 + }, + { + "epoch": 0.05640270317102755, + "grad_norm": 1.3543150424957275, + "learning_rate": 4.91731463569418e-05, + "loss": 0.7036, + "step": 3906 + }, + { + "epoch": 0.056850343672384916, + "grad_norm": 2.16947078704834, + "learning_rate": 4.915158964554312e-05, + "loss": 0.7007, + "step": 3937 + }, + { + "epoch": 0.057297984173742274, + "grad_norm": 1.324578881263733, + "learning_rate": 4.912976038673786e-05, + "loss": 0.6941, + "step": 3968 + }, + { + "epoch": 0.05774562467509964, + "grad_norm": 1.9811108112335205, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6908, + "step": 3999 + }, + { + "epoch": 0.058193265176456996, + "grad_norm": 1.2975554466247559, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6936, + "step": 4030 + }, + { + "epoch": 0.05864090567781436, + "grad_norm": 1.583282232284546, + "learning_rate": 4.906263980464644e-05, + "loss": 0.698, + "step": 4061 + }, + { + "epoch": 0.05908854617917172, + "grad_norm": 1.3532944917678833, + "learning_rate": 4.903972285033178e-05, + "loss": 0.7049, + "step": 4092 + }, + { + "epoch": 0.05953618668052908, + "grad_norm": 2.1245481967926025, + "learning_rate": 4.901653461101213e-05, + "loss": 0.7016, + "step": 4123 + }, + { + "epoch": 0.05998382718188645, + "grad_norm": 1.6913797855377197, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6981, + "step": 4154 + }, + { + "epoch": 0.060431467683243804, + "grad_norm": 1.51249098777771, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6955, + "step": 4185 + }, + { + "epoch": 0.06087910818460117, + "grad_norm": 1.3880395889282227, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6934, + "step": 4216 + }, + { + "epoch": 0.061326748685958526, + "grad_norm": 1.6354159116744995, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6938, + "step": 4247 + }, + { + "epoch": 0.06177438918731589, + "grad_norm": 2.126742362976074, + "learning_rate": 4.889653340498669e-05, + "loss": 0.7003, + "step": 4278 + }, + { + "epoch": 0.06222202968867325, + "grad_norm": 1.7903707027435303, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6885, + "step": 4309 + }, + { + "epoch": 0.06266967019003061, + "grad_norm": 1.537806510925293, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6892, + "step": 4340 + }, + { + "epoch": 0.06311731069138797, + "grad_norm": 1.6445434093475342, + "learning_rate": 4.882129447892753e-05, + "loss": 0.6843, + "step": 4371 + }, + { + "epoch": 0.06356495119274534, + "grad_norm": 1.555373191833496, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6899, + "step": 4402 + }, + { + "epoch": 0.0640125916941027, + "grad_norm": 1.8370277881622314, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6813, + "step": 4433 + }, + { + "epoch": 0.06446023219546006, + "grad_norm": 1.3132514953613281, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6832, + "step": 4464 + }, + { + "epoch": 0.06490787269681741, + "grad_norm": 1.3186298608779907, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6879, + "step": 4495 + }, + { + "epoch": 0.06535551319817479, + "grad_norm": 1.4360268115997314, + "learning_rate": 4.869052379269719e-05, + "loss": 0.69, + "step": 4526 + }, + { + "epoch": 0.06580315369953214, + "grad_norm": 1.670765995979309, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6865, + "step": 4557 + }, + { + "epoch": 0.0662507942008895, + "grad_norm": 1.7548723220825195, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6852, + "step": 4588 + }, + { + "epoch": 0.06669843470224687, + "grad_norm": 1.5086426734924316, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6894, + "step": 4619 + }, + { + "epoch": 0.06714607520360423, + "grad_norm": 1.3140665292739868, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6812, + "step": 4650 + }, + { + "epoch": 0.06759371570496159, + "grad_norm": 1.4212454557418823, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6872, + "step": 4681 + }, + { + "epoch": 0.06804135620631895, + "grad_norm": 1.3034414052963257, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6901, + "step": 4712 + }, + { + "epoch": 0.06848899670767632, + "grad_norm": 1.3741438388824463, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6875, + "step": 4743 + }, + { + "epoch": 0.06893663720903367, + "grad_norm": 1.7262542247772217, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6868, + "step": 4774 + }, + { + "epoch": 0.06938427771039103, + "grad_norm": 1.3293650150299072, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6891, + "step": 4805 + }, + { + "epoch": 0.0698319182117484, + "grad_norm": 1.3448151350021362, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6765, + "step": 4836 + }, + { + "epoch": 0.07027955871310576, + "grad_norm": 2.961280584335327, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6939, + "step": 4867 + }, + { + "epoch": 0.07072719921446312, + "grad_norm": 1.8265361785888672, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.677, + "step": 4898 + }, + { + "epoch": 0.07117483971582048, + "grad_norm": 1.490349531173706, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6778, + "step": 4929 + }, + { + "epoch": 0.07162248021717785, + "grad_norm": 1.3669307231903076, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6746, + "step": 4960 + }, + { + "epoch": 0.0720701207185352, + "grad_norm": 1.3995884656906128, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6757, + "step": 4991 + }, + { + "epoch": 0.07251776121989256, + "grad_norm": 1.1217372417449951, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6832, + "step": 5022 + }, + { + "epoch": 0.07296540172124993, + "grad_norm": 1.2192097902297974, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6868, + "step": 5053 + }, + { + "epoch": 0.07341304222260729, + "grad_norm": 1.5045067071914673, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.6793, + "step": 5084 + }, + { + "epoch": 0.07386068272396465, + "grad_norm": 1.7127234935760498, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6797, + "step": 5115 + }, + { + "epoch": 0.074308323225322, + "grad_norm": 1.846561312675476, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6767, + "step": 5146 + }, + { + "epoch": 0.07475596372667938, + "grad_norm": 1.3076797723770142, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6855, + "step": 5177 + }, + { + "epoch": 0.07520360422803674, + "grad_norm": 1.4170383214950562, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.681, + "step": 5208 + }, + { + "epoch": 0.0756512447293941, + "grad_norm": 1.2504942417144775, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6778, + "step": 5239 + }, + { + "epoch": 0.07609888523075146, + "grad_norm": 1.1522283554077148, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6749, + "step": 5270 + }, + { + "epoch": 0.07654652573210882, + "grad_norm": 1.6351525783538818, + "learning_rate": 4.793722210363262e-05, + "loss": 0.6745, + "step": 5301 + }, + { + "epoch": 0.07699416623346618, + "grad_norm": 1.5093014240264893, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6747, + "step": 5332 + }, + { + "epoch": 0.07744180673482354, + "grad_norm": 1.362160563468933, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.673, + "step": 5363 + }, + { + "epoch": 0.07788944723618091, + "grad_norm": 1.2021727561950684, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6768, + "step": 5394 + }, + { + "epoch": 0.07833708773753827, + "grad_norm": 2.1543540954589844, + "learning_rate": 4.780153554146274e-05, + "loss": 0.672, + "step": 5425 + }, + { + "epoch": 0.07878472823889562, + "grad_norm": 1.882712721824646, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.6926, + "step": 5456 + }, + { + "epoch": 0.079232368740253, + "grad_norm": 1.3975650072097778, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6747, + "step": 5487 + }, + { + "epoch": 0.07968000924161035, + "grad_norm": 1.3912913799285889, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6756, + "step": 5518 + }, + { + "epoch": 0.08012764974296771, + "grad_norm": 1.7227635383605957, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6694, + "step": 5549 + }, + { + "epoch": 0.08057529024432507, + "grad_norm": 1.3151129484176636, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6715, + "step": 5580 + }, + { + "epoch": 0.08102293074568244, + "grad_norm": 1.0972425937652588, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6661, + "step": 5611 + }, + { + "epoch": 0.0814705712470398, + "grad_norm": 1.2346575260162354, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.66, + "step": 5642 + }, + { + "epoch": 0.08191821174839715, + "grad_norm": 1.4536516666412354, + "learning_rate": 4.751783684659e-05, + "loss": 0.6743, + "step": 5673 + }, + { + "epoch": 0.08236585224975453, + "grad_norm": 1.1361631155014038, + "learning_rate": 4.748122674890348e-05, + "loss": 0.6791, + "step": 5704 + }, + { + "epoch": 0.08281349275111188, + "grad_norm": 1.2605111598968506, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6797, + "step": 5735 + }, + { + "epoch": 0.08326113325246924, + "grad_norm": 1.2355903387069702, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6642, + "step": 5766 + }, + { + "epoch": 0.0837087737538266, + "grad_norm": 1.6677048206329346, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6759, + "step": 5797 + }, + { + "epoch": 0.08415641425518397, + "grad_norm": 1.4781981706619263, + "learning_rate": 4.733225355658999e-05, + "loss": 0.6707, + "step": 5828 + }, + { + "epoch": 0.08460405475654133, + "grad_norm": 1.138583779335022, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.6636, + "step": 5859 + }, + { + "epoch": 0.08505169525789869, + "grad_norm": 1.529036283493042, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6705, + "step": 5890 + }, + { + "epoch": 0.08549933575925606, + "grad_norm": 1.3216760158538818, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6714, + "step": 5921 + }, + { + "epoch": 0.08594697626061341, + "grad_norm": 1.1820168495178223, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6757, + "step": 5952 + }, + { + "epoch": 0.08639461676197077, + "grad_norm": 1.393571138381958, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6672, + "step": 5983 + }, + { + "epoch": 0.08684225726332813, + "grad_norm": 1.4574682712554932, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6655, + "step": 6014 + }, + { + "epoch": 0.0872898977646855, + "grad_norm": 1.138645887374878, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6663, + "step": 6045 + }, + { + "epoch": 0.08773753826604286, + "grad_norm": 1.7602777481079102, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6695, + "step": 6076 + }, + { + "epoch": 0.08818517876740022, + "grad_norm": 1.2323459386825562, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6636, + "step": 6107 + }, + { + "epoch": 0.08863281926875759, + "grad_norm": 1.6881431341171265, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6653, + "step": 6138 + }, + { + "epoch": 0.08908045977011494, + "grad_norm": 1.391417384147644, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6636, + "step": 6169 + }, + { + "epoch": 0.0895281002714723, + "grad_norm": 1.3066257238388062, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6777, + "step": 6200 + }, + { + "epoch": 0.08997574077282966, + "grad_norm": 1.2001326084136963, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6683, + "step": 6231 + }, + { + "epoch": 0.09042338127418703, + "grad_norm": 1.4361172914505005, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6634, + "step": 6262 + }, + { + "epoch": 0.09087102177554439, + "grad_norm": 8.04520320892334, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6752, + "step": 6293 + }, + { + "epoch": 0.09131866227690175, + "grad_norm": 1.4874210357666016, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6733, + "step": 6324 + }, + { + "epoch": 0.09176630277825912, + "grad_norm": 1.234491229057312, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.668, + "step": 6355 + }, + { + "epoch": 0.09221394327961648, + "grad_norm": 1.2088687419891357, + "learning_rate": 4.661238099862658e-05, + "loss": 0.6705, + "step": 6386 + }, + { + "epoch": 0.09266158378097383, + "grad_norm": 1.1937814950942993, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6853, + "step": 6417 + }, + { + "epoch": 0.09310922428233119, + "grad_norm": 1.5205374956130981, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.685, + "step": 6448 + }, + { + "epoch": 0.09355686478368856, + "grad_norm": 1.2221660614013672, + "learning_rate": 4.648464661063478e-05, + "loss": 0.6622, + "step": 6479 + }, + { + "epoch": 0.09400450528504592, + "grad_norm": 1.0762608051300049, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.669, + "step": 6510 + }, + { + "epoch": 0.09445214578640328, + "grad_norm": 1.4416946172714233, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.6674, + "step": 6541 + }, + { + "epoch": 0.09489978628776065, + "grad_norm": 1.559158205986023, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6681, + "step": 6572 + }, + { + "epoch": 0.095347426789118, + "grad_norm": 1.3833891153335571, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6681, + "step": 6603 + }, + { + "epoch": 0.09579506729047536, + "grad_norm": 1.6753300428390503, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6658, + "step": 6634 + }, + { + "epoch": 0.09624270779183272, + "grad_norm": 1.951198697090149, + "learning_rate": 4.622264489304762e-05, + "loss": 0.6654, + "step": 6665 + }, + { + "epoch": 0.09669034829319009, + "grad_norm": 1.2356919050216675, + "learning_rate": 4.617813681048434e-05, + "loss": 0.6651, + "step": 6696 + }, + { + "epoch": 0.09713798879454745, + "grad_norm": 1.2712593078613281, + "learning_rate": 4.61333897355256e-05, + "loss": 0.6646, + "step": 6727 + }, + { + "epoch": 0.09758562929590481, + "grad_norm": 1.1935900449752808, + "learning_rate": 4.608840417313604e-05, + "loss": 0.674, + "step": 6758 + }, + { + "epoch": 0.09803326979726218, + "grad_norm": 1.1649430990219116, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6644, + "step": 6789 + }, + { + "epoch": 0.09848091029861954, + "grad_norm": 1.4281456470489502, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6673, + "step": 6820 + }, + { + "epoch": 0.0989285507999769, + "grad_norm": 1.3064521551132202, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6584, + "step": 6851 + }, + { + "epoch": 0.09937619130133425, + "grad_norm": 1.2546554803848267, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6612, + "step": 6882 + }, + { + "epoch": 0.09982383180269162, + "grad_norm": 1.1866974830627441, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6612, + "step": 6913 + }, + { + "epoch": 0.10027147230404898, + "grad_norm": 1.6166640520095825, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6551, + "step": 6944 + }, + { + "epoch": 0.10071911280540634, + "grad_norm": 1.5471700429916382, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6607, + "step": 6975 + }, + { + "epoch": 0.10116675330676371, + "grad_norm": 1.3361026048660278, + "learning_rate": 4.571999560773736e-05, + "loss": 0.666, + "step": 7006 + }, + { + "epoch": 0.10161439380812107, + "grad_norm": 1.2938140630722046, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6547, + "step": 7037 + }, + { + "epoch": 0.10206203430947842, + "grad_norm": 1.2688400745391846, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6624, + "step": 7068 + }, + { + "epoch": 0.10250967481083578, + "grad_norm": 1.6306285858154297, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.6627, + "step": 7099 + }, + { + "epoch": 0.10295731531219315, + "grad_norm": 1.3346176147460938, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6577, + "step": 7130 + }, + { + "epoch": 0.10340495581355051, + "grad_norm": 1.0933984518051147, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6602, + "step": 7161 + }, + { + "epoch": 0.10385259631490787, + "grad_norm": 1.575859785079956, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6593, + "step": 7192 + }, + { + "epoch": 0.10430023681626524, + "grad_norm": 1.4265861511230469, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6564, + "step": 7223 + }, + { + "epoch": 0.1047478773176226, + "grad_norm": 1.737012267112732, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6593, + "step": 7254 + }, + { + "epoch": 0.10519551781897996, + "grad_norm": 1.0257115364074707, + "learning_rate": 4.528766329570536e-05, + "loss": 0.6514, + "step": 7285 + }, + { + "epoch": 0.10564315832033731, + "grad_norm": 1.5043773651123047, + "learning_rate": 4.523847434837447e-05, + "loss": 0.6635, + "step": 7316 + }, + { + "epoch": 0.10609079882169468, + "grad_norm": 1.5642234086990356, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6558, + "step": 7347 + }, + { + "epoch": 0.10653843932305204, + "grad_norm": 1.1821067333221436, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6686, + "step": 7378 + }, + { + "epoch": 0.1069860798244094, + "grad_norm": 1.5492759943008423, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6543, + "step": 7409 + }, + { + "epoch": 0.10743372032576677, + "grad_norm": 1.281914234161377, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6516, + "step": 7440 + }, + { + "epoch": 0.10788136082712413, + "grad_norm": 1.3318305015563965, + "learning_rate": 4.498911497712155e-05, + "loss": 0.656, + "step": 7471 + }, + { + "epoch": 0.10832900132848149, + "grad_norm": 1.3832449913024902, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6475, + "step": 7502 + }, + { + "epoch": 0.10877664182983884, + "grad_norm": 1.3547158241271973, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6594, + "step": 7533 + }, + { + "epoch": 0.10922428233119622, + "grad_norm": 1.4633681774139404, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.6707, + "step": 7564 + }, + { + "epoch": 0.10967192283255357, + "grad_norm": 1.1781059503555298, + "learning_rate": 4.478556380141218e-05, + "loss": 0.6626, + "step": 7595 + }, + { + "epoch": 0.11011956333391093, + "grad_norm": 1.4727883338928223, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.6535, + "step": 7626 + }, + { + "epoch": 0.1105672038352683, + "grad_norm": 1.3888640403747559, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.6606, + "step": 7657 + }, + { + "epoch": 0.11101484433662566, + "grad_norm": 1.308769941329956, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6667, + "step": 7688 + }, + { + "epoch": 0.11146248483798302, + "grad_norm": 1.3579630851745605, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6621, + "step": 7719 + }, + { + "epoch": 0.11191012533934037, + "grad_norm": 1.1285645961761475, + "learning_rate": 4.452610552959183e-05, + "loss": 0.6597, + "step": 7750 + }, + { + "epoch": 0.11235776584069775, + "grad_norm": 1.1144675016403198, + "learning_rate": 4.447355047201428e-05, + "loss": 0.6638, + "step": 7781 + }, + { + "epoch": 0.1128054063420551, + "grad_norm": 1.1993658542633057, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.6704, + "step": 7812 + }, + { + "epoch": 0.11325304684341246, + "grad_norm": 1.0608967542648315, + "learning_rate": 4.436778168330484e-05, + "loss": 0.6573, + "step": 7843 + }, + { + "epoch": 0.11370068734476983, + "grad_norm": 1.1210070848464966, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.6612, + "step": 7874 + }, + { + "epoch": 0.11414832784612719, + "grad_norm": 1.2345409393310547, + "learning_rate": 4.42611386459262e-05, + "loss": 0.65, + "step": 7905 + }, + { + "epoch": 0.11459596834748455, + "grad_norm": 1.077025294303894, + "learning_rate": 4.420749078676133e-05, + "loss": 0.6595, + "step": 7936 + }, + { + "epoch": 0.1150436088488419, + "grad_norm": 1.2079277038574219, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.6442, + "step": 7967 + }, + { + "epoch": 0.11549124935019928, + "grad_norm": 1.6710035800933838, + "learning_rate": 4.409954541451762e-05, + "loss": 0.663, + "step": 7998 + }, + { + "epoch": 0.11593888985155663, + "grad_norm": 1.3124401569366455, + "learning_rate": 4.404524911958764e-05, + "loss": 0.6512, + "step": 8029 + }, + { + "epoch": 0.11638653035291399, + "grad_norm": 1.644904375076294, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6587, + "step": 8060 + }, + { + "epoch": 0.11683417085427136, + "grad_norm": 1.181624174118042, + "learning_rate": 4.393601237573607e-05, + "loss": 0.653, + "step": 8091 + }, + { + "epoch": 0.11728181135562872, + "grad_norm": 1.4587918519973755, + "learning_rate": 4.388107315953628e-05, + "loss": 0.675, + "step": 8122 + }, + { + "epoch": 0.11772945185698608, + "grad_norm": 1.2147635221481323, + "learning_rate": 4.382592087299212e-05, + "loss": 0.6521, + "step": 8153 + }, + { + "epoch": 0.11817709235834344, + "grad_norm": 1.0448981523513794, + "learning_rate": 4.377055613848964e-05, + "loss": 0.6541, + "step": 8184 + }, + { + "epoch": 0.11862473285970081, + "grad_norm": 1.4482290744781494, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6563, + "step": 8215 + }, + { + "epoch": 0.11907237336105816, + "grad_norm": 1.1621575355529785, + "learning_rate": 4.365919182713416e-05, + "loss": 0.656, + "step": 8246 + }, + { + "epoch": 0.11952001386241552, + "grad_norm": 1.1643873453140259, + "learning_rate": 4.360319350701226e-05, + "loss": 0.6547, + "step": 8277 + }, + { + "epoch": 0.1199676543637729, + "grad_norm": 1.4016129970550537, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6582, + "step": 8308 + }, + { + "epoch": 0.12041529486513025, + "grad_norm": 1.4023685455322266, + "learning_rate": 4.349056769754021e-05, + "loss": 0.6621, + "step": 8339 + }, + { + "epoch": 0.12086293536648761, + "grad_norm": 1.3020285367965698, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.6674, + "step": 8370 + }, + { + "epoch": 0.12131057586784497, + "grad_norm": 1.2162435054779053, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.6614, + "step": 8401 + }, + { + "epoch": 0.12175821636920234, + "grad_norm": 1.1956969499588013, + "learning_rate": 4.332006561018488e-05, + "loss": 0.6557, + "step": 8432 + }, + { + "epoch": 0.1222058568705597, + "grad_norm": 1.1723664999008179, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.6633, + "step": 8463 + }, + { + "epoch": 0.12265349737191705, + "grad_norm": 1.113020658493042, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.6577, + "step": 8494 + }, + { + "epoch": 0.12310113787327442, + "grad_norm": 1.2453004121780396, + "learning_rate": 4.314770288177384e-05, + "loss": 0.6544, + "step": 8525 + }, + { + "epoch": 0.12354877837463178, + "grad_norm": 1.1493890285491943, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6533, + "step": 8556 + }, + { + "epoch": 0.12399641887598914, + "grad_norm": 1.4172496795654297, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.6542, + "step": 8587 + }, + { + "epoch": 0.1244440593773465, + "grad_norm": 1.1840728521347046, + "learning_rate": 4.297349701798505e-05, + "loss": 0.6476, + "step": 8618 + }, + { + "epoch": 0.12489169987870387, + "grad_norm": 1.3720282316207886, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.6446, + "step": 8649 + }, + { + "epoch": 0.12533934038006123, + "grad_norm": 1.1705596446990967, + "learning_rate": 4.285634454093198e-05, + "loss": 0.6537, + "step": 8680 + }, + { + "epoch": 0.1257869808814186, + "grad_norm": 1.0790083408355713, + "learning_rate": 4.279746571169086e-05, + "loss": 0.6543, + "step": 8711 + }, + { + "epoch": 0.12623462138277594, + "grad_norm": 1.1207470893859863, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.6468, + "step": 8742 + }, + { + "epoch": 0.1266822618841333, + "grad_norm": 1.1123065948486328, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.6596, + "step": 8773 + }, + { + "epoch": 0.12712990238549068, + "grad_norm": 1.1579636335372925, + "learning_rate": 4.261962684116106e-05, + "loss": 0.6458, + "step": 8804 + }, + { + "epoch": 0.12757754288684803, + "grad_norm": 1.3112802505493164, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.6483, + "step": 8835 + }, + { + "epoch": 0.1280251833882054, + "grad_norm": 1.1104832887649536, + "learning_rate": 4.250007230372134e-05, + "loss": 0.645, + "step": 8866 + }, + { + "epoch": 0.12847282388956274, + "grad_norm": 1.0218713283538818, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.6519, + "step": 8897 + }, + { + "epoch": 0.12892046439092011, + "grad_norm": 1.0053678750991821, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.6468, + "step": 8928 + }, + { + "epoch": 0.12936810489227749, + "grad_norm": 1.410933017730713, + "learning_rate": 4.231926105340768e-05, + "loss": 0.6573, + "step": 8959 + }, + { + "epoch": 0.12981574539363483, + "grad_norm": 1.5001798868179321, + "learning_rate": 4.225859883654776e-05, + "loss": 0.6483, + "step": 8990 + }, + { + "epoch": 0.1302633858949922, + "grad_norm": 1.112316608428955, + "learning_rate": 4.219774185874569e-05, + "loss": 0.6483, + "step": 9021 + }, + { + "epoch": 0.13071102639634957, + "grad_norm": 1.527464747428894, + "learning_rate": 4.213669080676418e-05, + "loss": 0.6512, + "step": 9052 + }, + { + "epoch": 0.13115866689770692, + "grad_norm": 1.1075704097747803, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.6577, + "step": 9083 + }, + { + "epoch": 0.1316063073990643, + "grad_norm": 0.9589399099349976, + "learning_rate": 4.201400923825648e-05, + "loss": 0.642, + "step": 9114 + }, + { + "epoch": 0.13205394790042166, + "grad_norm": 1.186531901359558, + "learning_rate": 4.195238010617511e-05, + "loss": 0.6553, + "step": 9145 + }, + { + "epoch": 0.132501588401779, + "grad_norm": 1.1176280975341797, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6483, + "step": 9176 + }, + { + "epoch": 0.13294922890313637, + "grad_norm": 1.4222681522369385, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.6462, + "step": 9207 + }, + { + "epoch": 0.13339686940449375, + "grad_norm": 1.1606040000915527, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.6514, + "step": 9238 + }, + { + "epoch": 0.1338445099058511, + "grad_norm": 1.313774585723877, + "learning_rate": 4.170395751189495e-05, + "loss": 0.6422, + "step": 9269 + }, + { + "epoch": 0.13429215040720846, + "grad_norm": 1.1994171142578125, + "learning_rate": 4.164137885110921e-05, + "loss": 0.6444, + "step": 9300 + }, + { + "epoch": 0.1347397909085658, + "grad_norm": 0.9376353025436401, + "learning_rate": 4.157861239462495e-05, + "loss": 0.6436, + "step": 9331 + }, + { + "epoch": 0.13518743140992318, + "grad_norm": 1.0350178480148315, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.6447, + "step": 9362 + }, + { + "epoch": 0.13563507191128055, + "grad_norm": 1.3630082607269287, + "learning_rate": 4.145251892991588e-05, + "loss": 0.6427, + "step": 9393 + }, + { + "epoch": 0.1360827124126379, + "grad_norm": 1.0362364053726196, + "learning_rate": 4.138919334463868e-05, + "loss": 0.6443, + "step": 9424 + }, + { + "epoch": 0.13653035291399526, + "grad_norm": 1.1442211866378784, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.6523, + "step": 9455 + }, + { + "epoch": 0.13697799341535263, + "grad_norm": 1.4196938276290894, + "learning_rate": 4.126198804133398e-05, + "loss": 0.6501, + "step": 9486 + }, + { + "epoch": 0.13742563391670998, + "grad_norm": 1.3853130340576172, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.6431, + "step": 9517 + }, + { + "epoch": 0.13787327441806735, + "grad_norm": 1.0350273847579956, + "learning_rate": 4.113404868280107e-05, + "loss": 0.6436, + "step": 9548 + }, + { + "epoch": 0.13832091491942472, + "grad_norm": 1.0520857572555542, + "learning_rate": 4.106980553625457e-05, + "loss": 0.6436, + "step": 9579 + }, + { + "epoch": 0.13876855542078206, + "grad_norm": 1.127038836479187, + "learning_rate": 4.100538104413674e-05, + "loss": 0.639, + "step": 9610 + }, + { + "epoch": 0.13921619592213944, + "grad_norm": 1.1070880889892578, + "learning_rate": 4.09407759334692e-05, + "loss": 0.6366, + "step": 9641 + }, + { + "epoch": 0.1396638364234968, + "grad_norm": 1.3045605421066284, + "learning_rate": 4.087599093331186e-05, + "loss": 0.6496, + "step": 9672 + }, + { + "epoch": 0.14011147692485415, + "grad_norm": 1.234647512435913, + "learning_rate": 4.081102677475462e-05, + "loss": 0.6383, + "step": 9703 + }, + { + "epoch": 0.14055911742621152, + "grad_norm": 1.1154453754425049, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.6454, + "step": 9734 + }, + { + "epoch": 0.14100675792756887, + "grad_norm": 1.2422186136245728, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.6404, + "step": 9765 + }, + { + "epoch": 0.14145439842892624, + "grad_norm": 1.2128278017044067, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.6376, + "step": 9796 + }, + { + "epoch": 0.1419020389302836, + "grad_norm": 1.3140804767608643, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.6316, + "step": 9827 + }, + { + "epoch": 0.14234967943164095, + "grad_norm": 1.9198187589645386, + "learning_rate": 4.048354433517794e-05, + "loss": 0.6383, + "step": 9858 + }, + { + "epoch": 0.14279731993299832, + "grad_norm": 1.144679307937622, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.6383, + "step": 9889 + }, + { + "epoch": 0.1432449604343557, + "grad_norm": 1.1679338216781616, + "learning_rate": 4.035132306369438e-05, + "loss": 0.6414, + "step": 9920 + }, + { + "epoch": 0.14369260093571304, + "grad_norm": 0.9563717246055603, + "learning_rate": 4.028495219804555e-05, + "loss": 0.6327, + "step": 9951 + }, + { + "epoch": 0.1441402414370704, + "grad_norm": 1.277036428451538, + "learning_rate": 4.021840884378864e-05, + "loss": 0.6365, + "step": 9982 + }, + { + "epoch": 0.14458788193842778, + "grad_norm": 0.9835182428359985, + "learning_rate": 4.015169375185633e-05, + "loss": 0.638, + "step": 10013 + }, + { + "epoch": 0.14503552243978513, + "grad_norm": 1.090118646621704, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.6437, + "step": 10044 + }, + { + "epoch": 0.1454831629411425, + "grad_norm": 1.1823488473892212, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.6326, + "step": 10075 + }, + { + "epoch": 0.14593080344249987, + "grad_norm": 1.070318341255188, + "learning_rate": 3.995052558835377e-05, + "loss": 0.6362, + "step": 10106 + }, + { + "epoch": 0.1463784439438572, + "grad_norm": 1.2451491355895996, + "learning_rate": 3.988313109368017e-05, + "loss": 0.6388, + "step": 10137 + }, + { + "epoch": 0.14682608444521458, + "grad_norm": 1.2417365312576294, + "learning_rate": 3.981556864489504e-05, + "loss": 0.6309, + "step": 10168 + }, + { + "epoch": 0.14727372494657193, + "grad_norm": 1.251518726348877, + "learning_rate": 3.974783900443142e-05, + "loss": 0.6365, + "step": 10199 + }, + { + "epoch": 0.1477213654479293, + "grad_norm": 1.359750747680664, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.6386, + "step": 10230 + }, + { + "epoch": 0.14816900594928667, + "grad_norm": 1.1073262691497803, + "learning_rate": 3.961188120762596e-05, + "loss": 0.635, + "step": 10261 + }, + { + "epoch": 0.148616646450644, + "grad_norm": 0.9850608706474304, + "learning_rate": 3.954365458554938e-05, + "loss": 0.6389, + "step": 10292 + }, + { + "epoch": 0.14906428695200138, + "grad_norm": 1.2984429597854614, + "learning_rate": 3.947526384030751e-05, + "loss": 0.6317, + "step": 10323 + }, + { + "epoch": 0.14951192745335876, + "grad_norm": 1.1622575521469116, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.6393, + "step": 10354 + }, + { + "epoch": 0.1499595679547161, + "grad_norm": 1.0856871604919434, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.6351, + "step": 10385 + }, + { + "epoch": 0.15040720845607347, + "grad_norm": 1.0153882503509521, + "learning_rate": 3.926911459260109e-05, + "loss": 0.6282, + "step": 10416 + }, + { + "epoch": 0.15085484895743084, + "grad_norm": 1.2039254903793335, + "learning_rate": 3.920007509089102e-05, + "loss": 0.6365, + "step": 10447 + }, + { + "epoch": 0.1513024894587882, + "grad_norm": 1.1179555654525757, + "learning_rate": 3.913087534326357e-05, + "loss": 0.6311, + "step": 10478 + }, + { + "epoch": 0.15175012996014556, + "grad_norm": 1.090903639793396, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.6401, + "step": 10509 + }, + { + "epoch": 0.15219777046150293, + "grad_norm": 0.9228240251541138, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.6323, + "step": 10540 + }, + { + "epoch": 0.15264541096286027, + "grad_norm": 1.0772743225097656, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.637, + "step": 10571 + }, + { + "epoch": 0.15309305146421764, + "grad_norm": 1.0854771137237549, + "learning_rate": 3.885248953871491e-05, + "loss": 0.6375, + "step": 10602 + }, + { + "epoch": 0.153540691965575, + "grad_norm": 1.3902987241744995, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.6406, + "step": 10633 + }, + { + "epoch": 0.15398833246693236, + "grad_norm": 1.180351734161377, + "learning_rate": 3.871235554965218e-05, + "loss": 0.626, + "step": 10664 + }, + { + "epoch": 0.15443597296828973, + "grad_norm": 1.1136449575424194, + "learning_rate": 3.864205604623078e-05, + "loss": 0.6395, + "step": 10695 + }, + { + "epoch": 0.15488361346964707, + "grad_norm": 1.1770708560943604, + "learning_rate": 3.857160259406107e-05, + "loss": 0.633, + "step": 10726 + }, + { + "epoch": 0.15533125397100445, + "grad_norm": 1.1615066528320312, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.632, + "step": 10757 + }, + { + "epoch": 0.15577889447236182, + "grad_norm": 1.2898380756378174, + "learning_rate": 3.843023702543556e-05, + "loss": 0.6332, + "step": 10788 + }, + { + "epoch": 0.15622653497371916, + "grad_norm": 1.0051672458648682, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.6353, + "step": 10819 + }, + { + "epoch": 0.15667417547507653, + "grad_norm": 0.9514272212982178, + "learning_rate": 3.828826522492255e-05, + "loss": 0.6383, + "step": 10850 + }, + { + "epoch": 0.1571218159764339, + "grad_norm": 1.2570873498916626, + "learning_rate": 3.821705398930713e-05, + "loss": 0.6308, + "step": 10881 + }, + { + "epoch": 0.15756945647779125, + "grad_norm": 0.992323637008667, + "learning_rate": 3.814569360103385e-05, + "loss": 0.6303, + "step": 10912 + }, + { + "epoch": 0.15801709697914862, + "grad_norm": 1.255265474319458, + "learning_rate": 3.807418486539499e-05, + "loss": 0.6349, + "step": 10943 + }, + { + "epoch": 0.158464737480506, + "grad_norm": 1.1066702604293823, + "learning_rate": 3.80025285893569e-05, + "loss": 0.6317, + "step": 10974 + }, + { + "epoch": 0.15891237798186333, + "grad_norm": 1.178690791130066, + "learning_rate": 3.793072558155093e-05, + "loss": 0.639, + "step": 11005 + }, + { + "epoch": 0.1593600184832207, + "grad_norm": 1.0850341320037842, + "learning_rate": 3.785877665226426e-05, + "loss": 0.6375, + "step": 11036 + }, + { + "epoch": 0.15980765898457805, + "grad_norm": 1.1378651857376099, + "learning_rate": 3.778668261343079e-05, + "loss": 0.6287, + "step": 11067 + }, + { + "epoch": 0.16025529948593542, + "grad_norm": 1.07688570022583, + "learning_rate": 3.771444427862192e-05, + "loss": 0.6261, + "step": 11098 + }, + { + "epoch": 0.1607029399872928, + "grad_norm": 1.108269453048706, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.6352, + "step": 11129 + }, + { + "epoch": 0.16115058048865014, + "grad_norm": 1.2582095861434937, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.6312, + "step": 11160 + }, + { + "epoch": 0.1615982209900075, + "grad_norm": 0.9823578000068665, + "learning_rate": 3.749687165842753e-05, + "loss": 0.6253, + "step": 11191 + }, + { + "epoch": 0.16204586149136488, + "grad_norm": 1.3922805786132812, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.6279, + "step": 11222 + }, + { + "epoch": 0.16249350199272222, + "grad_norm": 1.2210962772369385, + "learning_rate": 3.735111675341645e-05, + "loss": 0.6357, + "step": 11253 + }, + { + "epoch": 0.1629411424940796, + "grad_norm": 1.0463316440582275, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.6332, + "step": 11284 + }, + { + "epoch": 0.16338878299543697, + "grad_norm": 1.165583848953247, + "learning_rate": 3.720480432728287e-05, + "loss": 0.627, + "step": 11315 + }, + { + "epoch": 0.1638364234967943, + "grad_norm": 1.0995306968688965, + "learning_rate": 3.71314411067092e-05, + "loss": 0.6283, + "step": 11346 + }, + { + "epoch": 0.16428406399815168, + "grad_norm": 1.0279158353805542, + "learning_rate": 3.70579409844715e-05, + "loss": 0.6287, + "step": 11377 + }, + { + "epoch": 0.16473170449950905, + "grad_norm": 1.51092529296875, + "learning_rate": 3.698430479000865e-05, + "loss": 0.6261, + "step": 11408 + }, + { + "epoch": 0.1651793450008664, + "grad_norm": 1.020936369895935, + "learning_rate": 3.691053335429509e-05, + "loss": 0.6327, + "step": 11439 + }, + { + "epoch": 0.16562698550222377, + "grad_norm": 1.0198683738708496, + "learning_rate": 3.683662750983147e-05, + "loss": 0.6422, + "step": 11470 + }, + { + "epoch": 0.1660746260035811, + "grad_norm": 1.2650995254516602, + "learning_rate": 3.676258809063518e-05, + "loss": 0.6354, + "step": 11501 + }, + { + "epoch": 0.16652226650493848, + "grad_norm": 1.1653568744659424, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.6325, + "step": 11532 + }, + { + "epoch": 0.16696990700629585, + "grad_norm": 1.1461430788040161, + "learning_rate": 3.661411187164166e-05, + "loss": 0.6251, + "step": 11563 + }, + { + "epoch": 0.1674175475076532, + "grad_norm": 1.2535974979400635, + "learning_rate": 3.65396767473784e-05, + "loss": 0.6245, + "step": 11594 + }, + { + "epoch": 0.16786518800901057, + "grad_norm": 1.115191102027893, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.6294, + "step": 11625 + }, + { + "epoch": 0.16831282851036794, + "grad_norm": 1.0482964515686035, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.6247, + "step": 11656 + }, + { + "epoch": 0.16876046901172528, + "grad_norm": 1.1431951522827148, + "learning_rate": 3.63155933997859e-05, + "loss": 0.63, + "step": 11687 + }, + { + "epoch": 0.16920810951308266, + "grad_norm": 1.0254175662994385, + "learning_rate": 3.624064243537758e-05, + "loss": 0.6212, + "step": 11718 + }, + { + "epoch": 0.16965575001444003, + "grad_norm": 0.9481080174446106, + "learning_rate": 3.616556462184716e-05, + "loss": 0.6278, + "step": 11749 + }, + { + "epoch": 0.17010339051579737, + "grad_norm": 1.118394374847412, + "learning_rate": 3.609036080643755e-05, + "loss": 0.6244, + "step": 11780 + }, + { + "epoch": 0.17055103101715474, + "grad_norm": 1.1592167615890503, + "learning_rate": 3.60150318378136e-05, + "loss": 0.621, + "step": 11811 + }, + { + "epoch": 0.1709986715185121, + "grad_norm": 0.9984686374664307, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.6319, + "step": 11842 + }, + { + "epoch": 0.17144631201986946, + "grad_norm": 1.0091164112091064, + "learning_rate": 3.586400184263408e-05, + "loss": 0.6345, + "step": 11873 + }, + { + "epoch": 0.17189395252122683, + "grad_norm": 1.0355888605117798, + "learning_rate": 3.578830252043148e-05, + "loss": 0.6171, + "step": 11904 + }, + { + "epoch": 0.17234159302258417, + "grad_norm": 1.1437592506408691, + "learning_rate": 3.571248145370125e-05, + "loss": 0.6201, + "step": 11935 + }, + { + "epoch": 0.17278923352394154, + "grad_norm": 0.9440962672233582, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.6236, + "step": 11966 + }, + { + "epoch": 0.17323687402529891, + "grad_norm": 0.9761082530021667, + "learning_rate": 3.556047751054378e-05, + "loss": 0.6291, + "step": 11997 + }, + { + "epoch": 0.17368451452665626, + "grad_norm": 1.1858127117156982, + "learning_rate": 3.548429634946039e-05, + "loss": 0.6299, + "step": 12028 + }, + { + "epoch": 0.17413215502801363, + "grad_norm": 1.0180195569992065, + "learning_rate": 3.540799687451768e-05, + "loss": 0.6227, + "step": 12059 + }, + { + "epoch": 0.174579795529371, + "grad_norm": 0.9683852195739746, + "learning_rate": 3.533157994674485e-05, + "loss": 0.626, + "step": 12090 + }, + { + "epoch": 0.17502743603072834, + "grad_norm": 1.0338289737701416, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.6377, + "step": 12121 + }, + { + "epoch": 0.17547507653208572, + "grad_norm": 1.1238298416137695, + "learning_rate": 3.517839718344311e-05, + "loss": 0.6338, + "step": 12152 + }, + { + "epoch": 0.1759227170334431, + "grad_norm": 1.0541973114013672, + "learning_rate": 3.510163307656086e-05, + "loss": 0.6222, + "step": 12183 + }, + { + "epoch": 0.17637035753480043, + "grad_norm": 1.1677592992782593, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.6256, + "step": 12214 + }, + { + "epoch": 0.1768179980361578, + "grad_norm": 1.0191985368728638, + "learning_rate": 3.494776374368643e-05, + "loss": 0.6301, + "step": 12245 + }, + { + "epoch": 0.17726563853751517, + "grad_norm": 1.1535918712615967, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.6253, + "step": 12276 + }, + { + "epoch": 0.17771327903887252, + "grad_norm": 1.0887985229492188, + "learning_rate": 3.479344537543164e-05, + "loss": 0.6335, + "step": 12307 + }, + { + "epoch": 0.1781609195402299, + "grad_norm": 1.010688066482544, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.6251, + "step": 12338 + }, + { + "epoch": 0.17860856004158723, + "grad_norm": 0.9745127558708191, + "learning_rate": 3.463868493762412e-05, + "loss": 0.6241, + "step": 12369 + }, + { + "epoch": 0.1790562005429446, + "grad_norm": 1.0414716005325317, + "learning_rate": 3.456114112492418e-05, + "loss": 0.6237, + "step": 12400 + }, + { + "epoch": 0.17950384104430198, + "grad_norm": 1.0457465648651123, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.6258, + "step": 12431 + }, + { + "epoch": 0.17995148154565932, + "grad_norm": 1.0389049053192139, + "learning_rate": 3.440573068727905e-05, + "loss": 0.6262, + "step": 12462 + }, + { + "epoch": 0.1803991220470167, + "grad_norm": 1.255600094795227, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.6305, + "step": 12493 + }, + { + "epoch": 0.18084676254837406, + "grad_norm": 1.0340358018875122, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.6212, + "step": 12524 + }, + { + "epoch": 0.1812944030497314, + "grad_norm": 1.0317034721374512, + "learning_rate": 3.417182116258899e-05, + "loss": 0.6279, + "step": 12555 + }, + { + "epoch": 0.18174204355108878, + "grad_norm": 1.1320221424102783, + "learning_rate": 3.409364314116074e-05, + "loss": 0.631, + "step": 12586 + }, + { + "epoch": 0.18218968405244615, + "grad_norm": 0.9674787521362305, + "learning_rate": 3.401536249920559e-05, + "loss": 0.627, + "step": 12617 + }, + { + "epoch": 0.1826373245538035, + "grad_norm": 0.9329623579978943, + "learning_rate": 3.393698012010998e-05, + "loss": 0.6244, + "step": 12648 + }, + { + "epoch": 0.18308496505516086, + "grad_norm": 1.2081501483917236, + "learning_rate": 3.385849688840839e-05, + "loss": 0.6295, + "step": 12679 + }, + { + "epoch": 0.18353260555651824, + "grad_norm": 0.9842090010643005, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.6276, + "step": 12710 + }, + { + "epoch": 0.18398024605787558, + "grad_norm": 1.1417752504348755, + "learning_rate": 3.370123141100578e-05, + "loss": 0.6266, + "step": 12741 + }, + { + "epoch": 0.18442788655923295, + "grad_norm": 0.9693592190742493, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.6245, + "step": 12772 + }, + { + "epoch": 0.1848755270605903, + "grad_norm": 1.1920111179351807, + "learning_rate": 3.35435731658559e-05, + "loss": 0.6227, + "step": 12803 + }, + { + "epoch": 0.18532316756194767, + "grad_norm": 0.9865401387214661, + "learning_rate": 3.346459897862552e-05, + "loss": 0.6287, + "step": 12834 + }, + { + "epoch": 0.18577080806330504, + "grad_norm": 0.9544184803962708, + "learning_rate": 3.338552926954613e-05, + "loss": 0.6236, + "step": 12865 + }, + { + "epoch": 0.18621844856466238, + "grad_norm": 1.0202548503875732, + "learning_rate": 3.330636493090868e-05, + "loss": 0.6269, + "step": 12896 + }, + { + "epoch": 0.18666608906601975, + "grad_norm": 1.1385433673858643, + "learning_rate": 3.322710685607193e-05, + "loss": 0.6385, + "step": 12927 + }, + { + "epoch": 0.18711372956737712, + "grad_norm": 1.0102901458740234, + "learning_rate": 3.314775593945251e-05, + "loss": 0.6241, + "step": 12958 + }, + { + "epoch": 0.18756137006873447, + "grad_norm": 0.9830989241600037, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.6243, + "step": 12989 + }, + { + "epoch": 0.18800901057009184, + "grad_norm": 1.0044376850128174, + "learning_rate": 3.298877916376047e-05, + "loss": 0.619, + "step": 13020 + }, + { + "epoch": 0.1884566510714492, + "grad_norm": 1.0714712142944336, + "learning_rate": 3.290915509871915e-05, + "loss": 0.6243, + "step": 13051 + }, + { + "epoch": 0.18890429157280655, + "grad_norm": 0.9379229545593262, + "learning_rate": 3.282944177993753e-05, + "loss": 0.6216, + "step": 13082 + }, + { + "epoch": 0.18935193207416393, + "grad_norm": 1.2717514038085938, + "learning_rate": 3.274964010696957e-05, + "loss": 0.6206, + "step": 13113 + }, + { + "epoch": 0.1897995725755213, + "grad_norm": 1.1147576570510864, + "learning_rate": 3.266975098036629e-05, + "loss": 0.6234, + "step": 13144 + }, + { + "epoch": 0.19024721307687864, + "grad_norm": 0.9994730949401855, + "learning_rate": 3.258977530166562e-05, + "loss": 0.6146, + "step": 13175 + }, + { + "epoch": 0.190694853578236, + "grad_norm": 1.195367693901062, + "learning_rate": 3.250971397338227e-05, + "loss": 0.624, + "step": 13206 + }, + { + "epoch": 0.19114249407959336, + "grad_norm": 1.0008747577667236, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.6182, + "step": 13237 + }, + { + "epoch": 0.19159013458095073, + "grad_norm": 1.3223299980163574, + "learning_rate": 3.234933798294859e-05, + "loss": 0.6193, + "step": 13268 + }, + { + "epoch": 0.1920377750823081, + "grad_norm": 1.1946437358856201, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.6201, + "step": 13299 + }, + { + "epoch": 0.19248541558366544, + "grad_norm": 1.1597986221313477, + "learning_rate": 3.218863024832985e-05, + "loss": 0.6212, + "step": 13330 + }, + { + "epoch": 0.1929330560850228, + "grad_norm": 0.9518936276435852, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.6154, + "step": 13361 + }, + { + "epoch": 0.19338069658638019, + "grad_norm": 0.890487790107727, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.6203, + "step": 13392 + }, + { + "epoch": 0.19382833708773753, + "grad_norm": 0.9918534755706787, + "learning_rate": 3.194696249871729e-05, + "loss": 0.6319, + "step": 13423 + }, + { + "epoch": 0.1942759775890949, + "grad_norm": 1.1954073905944824, + "learning_rate": 3.186624857814164e-05, + "loss": 0.619, + "step": 13454 + }, + { + "epoch": 0.19472361809045227, + "grad_norm": 1.1521157026290894, + "learning_rate": 3.178545717288401e-05, + "loss": 0.6326, + "step": 13485 + }, + { + "epoch": 0.19517125859180962, + "grad_norm": 1.0131208896636963, + "learning_rate": 3.170458919466444e-05, + "loss": 0.6234, + "step": 13516 + }, + { + "epoch": 0.195618899093167, + "grad_norm": 1.0429494380950928, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.6146, + "step": 13547 + }, + { + "epoch": 0.19606653959452436, + "grad_norm": 0.9586461782455444, + "learning_rate": 3.154262717052985e-05, + "loss": 0.6192, + "step": 13578 + }, + { + "epoch": 0.1965141800958817, + "grad_norm": 0.9385515451431274, + "learning_rate": 3.146153495233426e-05, + "loss": 0.6186, + "step": 13609 + }, + { + "epoch": 0.19696182059723907, + "grad_norm": 0.9109722375869751, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.6223, + "step": 13640 + }, + { + "epoch": 0.19740946109859642, + "grad_norm": 1.0564444065093994, + "learning_rate": 3.129913267924946e-05, + "loss": 0.6235, + "step": 13671 + }, + { + "epoch": 0.1978571015999538, + "grad_norm": 1.1656286716461182, + "learning_rate": 3.121782445704782e-05, + "loss": 0.6176, + "step": 13702 + }, + { + "epoch": 0.19830474210131116, + "grad_norm": 1.1301069259643555, + "learning_rate": 3.11364460675423e-05, + "loss": 0.6253, + "step": 13733 + }, + { + "epoch": 0.1987523826026685, + "grad_norm": 0.9939395785331726, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.6223, + "step": 13764 + }, + { + "epoch": 0.19920002310402588, + "grad_norm": 1.2881885766983032, + "learning_rate": 3.097348246077728e-05, + "loss": 0.6177, + "step": 13795 + }, + { + "epoch": 0.19964766360538325, + "grad_norm": 1.1002579927444458, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.6139, + "step": 13826 + }, + { + "epoch": 0.2000953041067406, + "grad_norm": 1.045394778251648, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.6192, + "step": 13857 + }, + { + "epoch": 0.20054294460809796, + "grad_norm": 0.9559116959571838, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.6155, + "step": 13888 + }, + { + "epoch": 0.20099058510945533, + "grad_norm": 0.9250887036323547, + "learning_rate": 3.064675369851637e-05, + "loss": 0.6235, + "step": 13919 + }, + { + "epoch": 0.20143822561081268, + "grad_norm": 1.0655368566513062, + "learning_rate": 3.056490989455289e-05, + "loss": 0.628, + "step": 13950 + }, + { + "epoch": 0.20188586611217005, + "grad_norm": 1.07636559009552, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.6244, + "step": 13981 + }, + { + "epoch": 0.20233350661352742, + "grad_norm": 1.050580620765686, + "learning_rate": 3.040103481317539e-05, + "loss": 0.6222, + "step": 14012 + }, + { + "epoch": 0.20278114711488476, + "grad_norm": 1.3754404783248901, + "learning_rate": 3.03190053850694e-05, + "loss": 0.6151, + "step": 14043 + }, + { + "epoch": 0.20322878761624213, + "grad_norm": 1.0527547597885132, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.6153, + "step": 14074 + }, + { + "epoch": 0.20367642811759948, + "grad_norm": 0.9438226819038391, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.618, + "step": 14105 + }, + { + "epoch": 0.20412406861895685, + "grad_norm": 1.0383126735687256, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.6162, + "step": 14136 + }, + { + "epoch": 0.20457170912031422, + "grad_norm": 1.1412239074707031, + "learning_rate": 2.999029669712431e-05, + "loss": 0.6284, + "step": 14167 + }, + { + "epoch": 0.20501934962167156, + "grad_norm": 1.1064159870147705, + "learning_rate": 2.990797641805408e-05, + "loss": 0.6223, + "step": 14198 + }, + { + "epoch": 0.20546699012302894, + "grad_norm": 1.0044069290161133, + "learning_rate": 2.982560075313704e-05, + "loss": 0.6191, + "step": 14229 + }, + { + "epoch": 0.2059146306243863, + "grad_norm": 0.9315604567527771, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.6207, + "step": 14260 + }, + { + "epoch": 0.20636227112574365, + "grad_norm": 0.941224217414856, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.6207, + "step": 14291 + }, + { + "epoch": 0.20680991162710102, + "grad_norm": 1.1239089965820312, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.6252, + "step": 14322 + }, + { + "epoch": 0.2072575521284584, + "grad_norm": 0.9484926462173462, + "learning_rate": 2.949556283611942e-05, + "loss": 0.6136, + "step": 14353 + }, + { + "epoch": 0.20770519262981574, + "grad_norm": 0.9437084197998047, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.6154, + "step": 14384 + }, + { + "epoch": 0.2081528331311731, + "grad_norm": 0.9578093886375427, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.6191, + "step": 14415 + }, + { + "epoch": 0.20860047363253048, + "grad_norm": 1.0657248497009277, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.6178, + "step": 14446 + }, + { + "epoch": 0.20904811413388782, + "grad_norm": 0.853568434715271, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.6152, + "step": 14477 + }, + { + "epoch": 0.2094957546352452, + "grad_norm": 1.0403015613555908, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.6108, + "step": 14508 + }, + { + "epoch": 0.20994339513660254, + "grad_norm": 1.0344171524047852, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.6155, + "step": 14539 + }, + { + "epoch": 0.2103910356379599, + "grad_norm": 1.0755060911178589, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.6156, + "step": 14570 + }, + { + "epoch": 0.21083867613931728, + "grad_norm": 0.8636776208877563, + "learning_rate": 2.883311164593017e-05, + "loss": 0.6193, + "step": 14601 + }, + { + "epoch": 0.21128631664067463, + "grad_norm": 1.0264644622802734, + "learning_rate": 2.875010077160754e-05, + "loss": 0.6138, + "step": 14632 + }, + { + "epoch": 0.211733957142032, + "grad_norm": 1.2590196132659912, + "learning_rate": 2.866704757790741e-05, + "loss": 0.6202, + "step": 14663 + }, + { + "epoch": 0.21218159764338937, + "grad_norm": 1.1028645038604736, + "learning_rate": 2.858395300207376e-05, + "loss": 0.614, + "step": 14694 + }, + { + "epoch": 0.2126292381447467, + "grad_norm": 0.8904405236244202, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.6152, + "step": 14725 + }, + { + "epoch": 0.21307687864610408, + "grad_norm": 0.9810163974761963, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.6088, + "step": 14756 + }, + { + "epoch": 0.21352451914746146, + "grad_norm": 0.9837898015975952, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.6129, + "step": 14787 + }, + { + "epoch": 0.2139721596488188, + "grad_norm": 0.987639844417572, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.6081, + "step": 14818 + }, + { + "epoch": 0.21441980015017617, + "grad_norm": 1.1478586196899414, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.6146, + "step": 14849 + }, + { + "epoch": 0.21486744065153354, + "grad_norm": 1.0885242223739624, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.6183, + "step": 14880 + }, + { + "epoch": 0.21531508115289089, + "grad_norm": 0.9934699535369873, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.6157, + "step": 14911 + }, + { + "epoch": 0.21576272165424826, + "grad_norm": 0.9285492300987244, + "learning_rate": 2.791781925709473e-05, + "loss": 0.6196, + "step": 14942 + }, + { + "epoch": 0.2162103621556056, + "grad_norm": 1.243133783340454, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.6109, + "step": 14973 + }, + { + "epoch": 0.21665800265696297, + "grad_norm": 1.0712559223175049, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.6268, + "step": 15004 + }, + { + "epoch": 0.21710564315832034, + "grad_norm": 1.0762903690338135, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.6162, + "step": 15035 + }, + { + "epoch": 0.2175532836596777, + "grad_norm": 1.043479084968567, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.6174, + "step": 15066 + }, + { + "epoch": 0.21800092416103506, + "grad_norm": 0.9906991720199585, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.6153, + "step": 15097 + }, + { + "epoch": 0.21844856466239243, + "grad_norm": 0.8844815492630005, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.6113, + "step": 15128 + }, + { + "epoch": 0.21889620516374977, + "grad_norm": 1.0258604288101196, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.6111, + "step": 15159 + }, + { + "epoch": 0.21934384566510715, + "grad_norm": 0.8992047309875488, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.6083, + "step": 15190 + }, + { + "epoch": 0.21979148616646452, + "grad_norm": 0.991061806678772, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.6219, + "step": 15221 + }, + { + "epoch": 0.22023912666782186, + "grad_norm": 0.9700108766555786, + "learning_rate": 2.708224532974953e-05, + "loss": 0.6119, + "step": 15252 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 7630, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1258280394232955e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-15260/training_args.bin b/checkpoint-15260/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3ec322270d2d1a136ed4e6ec9a7cdf4837f5f542 --- /dev/null +++ b/checkpoint-15260/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4073c7e634db6217c9c50dbc3e636b8c05df4a3686548baa23c4d653fa3409e3 +size 5432 diff --git a/checkpoint-22890/config.json b/checkpoint-22890/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a8b10d032ea5b0e15388eb607d24979f38775a09 --- /dev/null +++ b/checkpoint-22890/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.47.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-22890/generation_config.json b/checkpoint-22890/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..73099b4966679656fb84b344ea08b348a6a6a507 --- /dev/null +++ b/checkpoint-22890/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.47.0.dev0" +} diff --git a/checkpoint-22890/model-00001-of-00007.safetensors b/checkpoint-22890/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..481f5f0eceeecb729de47911bc850cda733bf744 --- /dev/null +++ b/checkpoint-22890/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a65a75b982671f6fe429d8b4f2a643bc8cc123b81a8b3e23954d5f9dd4dfe741 +size 4886466168 diff --git a/checkpoint-22890/model-00002-of-00007.safetensors b/checkpoint-22890/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1f266ba474079d34baf9b2a204bc3cecae676961 --- /dev/null +++ b/checkpoint-22890/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0901d2e156c6ec7a375c3e90ae5183970af0445d3dc0e53c031b45673775dd64 +size 4832007448 diff --git a/checkpoint-22890/model-00003-of-00007.safetensors b/checkpoint-22890/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5501e045b181a6449b2b0997f4ca9f8ce68156ff --- /dev/null +++ b/checkpoint-22890/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afa0e22bcade9527b5aeb86066e139cc30979f8492c753ff3277e6189863ab97 +size 4999813112 diff --git a/checkpoint-22890/model-00004-of-00007.safetensors b/checkpoint-22890/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0ea5edb447c5d0bf0491276907abf457c5b0f94a --- /dev/null +++ b/checkpoint-22890/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc4f5352f30ea036b0a4e24ba9b029dd94a2b30a716c81acc1c4a5e7bb47a042 +size 4999813128 diff --git a/checkpoint-22890/model-00005-of-00007.safetensors b/checkpoint-22890/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d308635d86f98645d1b129703f4ba4ecc6a84a89 --- /dev/null +++ b/checkpoint-22890/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d9450b209df4cb026b6669e618fcdb6ff4f5bd0a711fb2e502afe0b95eb3ad7 +size 4832007496 diff --git a/checkpoint-22890/model-00006-of-00007.safetensors b/checkpoint-22890/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6857e095b693a515a6b452b2978d8ec6beb1914d --- /dev/null +++ b/checkpoint-22890/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1428de016ad23aaf1f31a13925e31fb9295785a7a8a83467da982e10e32ab7f1 +size 4999813120 diff --git a/checkpoint-22890/model-00007-of-00007.safetensors b/checkpoint-22890/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..28d49af3d02ee8b841af095b31254b0104cf488e --- /dev/null +++ b/checkpoint-22890/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b44e4ad77bf2ef84e92e62da86ec1c56c5090c09e1194074f9b844107d09805 +size 2571158184 diff --git a/checkpoint-22890/model.safetensors.index.json b/checkpoint-22890/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-22890/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-22890/optimizer.pt b/checkpoint-22890/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..debe2927cf1c50506929531bbec61e776094d63c --- /dev/null +++ b/checkpoint-22890/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a7f0e296306f8b81a363f3e59074fa525d4d241d146378e78b07b98214f8e1d +size 15385036334 diff --git a/checkpoint-22890/rng_state.pth b/checkpoint-22890/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..93a52228ea16468eaab41b66555ced3eaaea002c --- /dev/null +++ b/checkpoint-22890/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69 +size 14244 diff --git a/checkpoint-22890/scheduler.pt b/checkpoint-22890/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..52df82b086a98dd5b315cd1cef2e8d75fddc67aa --- /dev/null +++ b/checkpoint-22890/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cceae98d85860224f83926a9c2d0797a9f4411abc8a933e66f23226e9ba794a +size 1064 diff --git a/checkpoint-22890/trainer_state.json b/checkpoint-22890/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d09807aa8783e017c749e1097c4302742da03a --- /dev/null +++ b/checkpoint-22890/trainer_state.json @@ -0,0 +1,5199 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3305319701958066, + "eval_steps": 500, + "global_step": 22890, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004476405013573615, + "grad_norm": 4.6696085929870605, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.9366, + "step": 31 + }, + { + "epoch": 0.000895281002714723, + "grad_norm": 4.250915050506592, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.9002, + "step": 62 + }, + { + "epoch": 0.0013429215040720846, + "grad_norm": 4.424270153045654, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.8843, + "step": 93 + }, + { + "epoch": 0.001790562005429446, + "grad_norm": 4.56964635848999, + "learning_rate": 4.062909567496723e-06, + "loss": 0.8717, + "step": 124 + }, + { + "epoch": 0.0022382025067868077, + "grad_norm": 4.051624298095703, + "learning_rate": 5.078636959370905e-06, + "loss": 0.8711, + "step": 155 + }, + { + "epoch": 0.002685843008144169, + "grad_norm": 3.98006272315979, + "learning_rate": 6.094364351245085e-06, + "loss": 0.8628, + "step": 186 + }, + { + "epoch": 0.0031334835095015307, + "grad_norm": 4.4158406257629395, + "learning_rate": 7.110091743119267e-06, + "loss": 0.871, + "step": 217 + }, + { + "epoch": 0.003581124010858892, + "grad_norm": 4.681333541870117, + "learning_rate": 8.125819134993446e-06, + "loss": 0.8593, + "step": 248 + }, + { + "epoch": 0.004028764512216254, + "grad_norm": 3.8057820796966553, + "learning_rate": 9.141546526867629e-06, + "loss": 0.8558, + "step": 279 + }, + { + "epoch": 0.0044764050135736155, + "grad_norm": 4.523633003234863, + "learning_rate": 1.015727391874181e-05, + "loss": 0.8676, + "step": 310 + }, + { + "epoch": 0.0049240455149309765, + "grad_norm": 3.7387187480926514, + "learning_rate": 1.117300131061599e-05, + "loss": 0.8585, + "step": 341 + }, + { + "epoch": 0.005371686016288338, + "grad_norm": 4.187750816345215, + "learning_rate": 1.218872870249017e-05, + "loss": 0.8592, + "step": 372 + }, + { + "epoch": 0.005819326517645699, + "grad_norm": 3.782883644104004, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.8449, + "step": 403 + }, + { + "epoch": 0.006266967019003061, + "grad_norm": 3.577796459197998, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.8418, + "step": 434 + }, + { + "epoch": 0.006714607520360423, + "grad_norm": 3.1408321857452393, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.8577, + "step": 465 + }, + { + "epoch": 0.007162248021717784, + "grad_norm": 4.090081691741943, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.8439, + "step": 496 + }, + { + "epoch": 0.007609888523075146, + "grad_norm": 2.7458200454711914, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.8468, + "step": 527 + }, + { + "epoch": 0.008057529024432507, + "grad_norm": 3.703225612640381, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.8385, + "step": 558 + }, + { + "epoch": 0.008505169525789868, + "grad_norm": 3.134650230407715, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.8418, + "step": 589 + }, + { + "epoch": 0.008952810027147231, + "grad_norm": 3.762680768966675, + "learning_rate": 2.031454783748362e-05, + "loss": 0.8312, + "step": 620 + }, + { + "epoch": 0.009400450528504592, + "grad_norm": 3.751004457473755, + "learning_rate": 2.13302752293578e-05, + "loss": 0.8251, + "step": 651 + }, + { + "epoch": 0.009848091029861953, + "grad_norm": 3.2268712520599365, + "learning_rate": 2.234600262123198e-05, + "loss": 0.8369, + "step": 682 + }, + { + "epoch": 0.010295731531219316, + "grad_norm": 3.5854289531707764, + "learning_rate": 2.336173001310616e-05, + "loss": 0.826, + "step": 713 + }, + { + "epoch": 0.010743372032576677, + "grad_norm": 3.9910435676574707, + "learning_rate": 2.437745740498034e-05, + "loss": 0.8168, + "step": 744 + }, + { + "epoch": 0.011191012533934038, + "grad_norm": 3.3059303760528564, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.8269, + "step": 775 + }, + { + "epoch": 0.011638653035291399, + "grad_norm": 3.4081811904907227, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.817, + "step": 806 + }, + { + "epoch": 0.012086293536648762, + "grad_norm": 3.2740163803100586, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.8195, + "step": 837 + }, + { + "epoch": 0.012533934038006123, + "grad_norm": 2.7206223011016846, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.8188, + "step": 868 + }, + { + "epoch": 0.012981574539363484, + "grad_norm": 2.7005629539489746, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.8127, + "step": 899 + }, + { + "epoch": 0.013429215040720846, + "grad_norm": 2.970745325088501, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.8126, + "step": 930 + }, + { + "epoch": 0.013876855542078207, + "grad_norm": 2.4761953353881836, + "learning_rate": 3.148754914809961e-05, + "loss": 0.82, + "step": 961 + }, + { + "epoch": 0.014324496043435568, + "grad_norm": 2.8555397987365723, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.8166, + "step": 992 + }, + { + "epoch": 0.01477213654479293, + "grad_norm": 2.8124194145202637, + "learning_rate": 3.351900393184797e-05, + "loss": 0.8057, + "step": 1023 + }, + { + "epoch": 0.015219777046150292, + "grad_norm": 2.353851556777954, + "learning_rate": 3.453473132372215e-05, + "loss": 0.8064, + "step": 1054 + }, + { + "epoch": 0.015667417547507653, + "grad_norm": 3.0127620697021484, + "learning_rate": 3.555045871559633e-05, + "loss": 0.8086, + "step": 1085 + }, + { + "epoch": 0.016115058048865014, + "grad_norm": 2.792686939239502, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.8152, + "step": 1116 + }, + { + "epoch": 0.016562698550222375, + "grad_norm": 2.407134532928467, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.7949, + "step": 1147 + }, + { + "epoch": 0.017010339051579736, + "grad_norm": 2.6921393871307373, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.804, + "step": 1178 + }, + { + "epoch": 0.0174579795529371, + "grad_norm": 2.3015975952148438, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.7944, + "step": 1209 + }, + { + "epoch": 0.017905620054294462, + "grad_norm": 2.8116579055786133, + "learning_rate": 4.062909567496724e-05, + "loss": 0.7977, + "step": 1240 + }, + { + "epoch": 0.018353260555651823, + "grad_norm": 2.5720036029815674, + "learning_rate": 4.164482306684142e-05, + "loss": 0.7854, + "step": 1271 + }, + { + "epoch": 0.018800901057009184, + "grad_norm": 2.0802650451660156, + "learning_rate": 4.26605504587156e-05, + "loss": 0.7892, + "step": 1302 + }, + { + "epoch": 0.019248541558366545, + "grad_norm": 2.4343624114990234, + "learning_rate": 4.367627785058978e-05, + "loss": 0.7897, + "step": 1333 + }, + { + "epoch": 0.019696182059723906, + "grad_norm": 2.509686231613159, + "learning_rate": 4.469200524246396e-05, + "loss": 0.7855, + "step": 1364 + }, + { + "epoch": 0.020143822561081267, + "grad_norm": 2.626512289047241, + "learning_rate": 4.570773263433814e-05, + "loss": 0.7873, + "step": 1395 + }, + { + "epoch": 0.02059146306243863, + "grad_norm": 2.8619399070739746, + "learning_rate": 4.672346002621232e-05, + "loss": 0.7891, + "step": 1426 + }, + { + "epoch": 0.021039103563795993, + "grad_norm": 2.724792718887329, + "learning_rate": 4.77391874180865e-05, + "loss": 0.782, + "step": 1457 + }, + { + "epoch": 0.021486744065153354, + "grad_norm": 2.6659562587738037, + "learning_rate": 4.875491480996068e-05, + "loss": 0.7856, + "step": 1488 + }, + { + "epoch": 0.021934384566510715, + "grad_norm": 2.646078586578369, + "learning_rate": 4.977064220183487e-05, + "loss": 0.7748, + "step": 1519 + }, + { + "epoch": 0.022382025067868076, + "grad_norm": 2.429288387298584, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.7722, + "step": 1550 + }, + { + "epoch": 0.022829665569225437, + "grad_norm": 1.9933409690856934, + "learning_rate": 4.999955597496219e-05, + "loss": 0.7874, + "step": 1581 + }, + { + "epoch": 0.023277306070582798, + "grad_norm": 2.314889907836914, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.7757, + "step": 1612 + }, + { + "epoch": 0.023724946571940162, + "grad_norm": 2.2891199588775635, + "learning_rate": 4.999799067923527e-05, + "loss": 0.7713, + "step": 1643 + }, + { + "epoch": 0.024172587073297523, + "grad_norm": 2.4892444610595703, + "learning_rate": 4.999678487776908e-05, + "loss": 0.7687, + "step": 1674 + }, + { + "epoch": 0.024620227574654884, + "grad_norm": 2.3015685081481934, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.7721, + "step": 1705 + }, + { + "epoch": 0.025067868076012245, + "grad_norm": 2.278954029083252, + "learning_rate": 4.999352703566763e-05, + "loss": 0.7741, + "step": 1736 + }, + { + "epoch": 0.025515508577369606, + "grad_norm": 1.7260370254516602, + "learning_rate": 4.999147503179668e-05, + "loss": 0.7681, + "step": 1767 + }, + { + "epoch": 0.025963149078726967, + "grad_norm": 2.0179309844970703, + "learning_rate": 4.998914100252672e-05, + "loss": 0.7604, + "step": 1798 + }, + { + "epoch": 0.02641078958008433, + "grad_norm": 2.53022837638855, + "learning_rate": 4.998652497419696e-05, + "loss": 0.7598, + "step": 1829 + }, + { + "epoch": 0.026858430081441693, + "grad_norm": 1.859253168106079, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.7606, + "step": 1860 + }, + { + "epoch": 0.027306070582799054, + "grad_norm": 1.759303331375122, + "learning_rate": 4.998044704162613e-05, + "loss": 0.7532, + "step": 1891 + }, + { + "epoch": 0.027753711084156415, + "grad_norm": 2.4389419555664062, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.7646, + "step": 1922 + }, + { + "epoch": 0.028201351585513776, + "grad_norm": 2.155348777770996, + "learning_rate": 4.997324150843799e-05, + "loss": 0.7569, + "step": 1953 + }, + { + "epoch": 0.028648992086871137, + "grad_norm": 2.0138537883758545, + "learning_rate": 4.99692159912661e-05, + "loss": 0.7677, + "step": 1984 + }, + { + "epoch": 0.029096632588228498, + "grad_norm": 2.5275282859802246, + "learning_rate": 4.996490869988546e-05, + "loss": 0.7519, + "step": 2015 + }, + { + "epoch": 0.02954427308958586, + "grad_norm": 1.8147333860397339, + "learning_rate": 4.996031968290326e-05, + "loss": 0.7509, + "step": 2046 + }, + { + "epoch": 0.029991913590943223, + "grad_norm": 2.1941769123077393, + "learning_rate": 4.995544899210594e-05, + "loss": 0.754, + "step": 2077 + }, + { + "epoch": 0.030439554092300584, + "grad_norm": 1.8953059911727905, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.747, + "step": 2108 + }, + { + "epoch": 0.030887194593657945, + "grad_norm": 3.3973031044006348, + "learning_rate": 4.994486281210429e-05, + "loss": 0.7513, + "step": 2139 + }, + { + "epoch": 0.031334835095015307, + "grad_norm": 2.66795015335083, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.7469, + "step": 2170 + }, + { + "epoch": 0.03178247559637267, + "grad_norm": 1.6254230737686157, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.7297, + "step": 2201 + }, + { + "epoch": 0.03223011609773003, + "grad_norm": 1.822745680809021, + "learning_rate": 4.992687246588743e-05, + "loss": 0.754, + "step": 2232 + }, + { + "epoch": 0.03267775659908739, + "grad_norm": 1.6898781061172485, + "learning_rate": 4.992031299767347e-05, + "loss": 0.7478, + "step": 2263 + }, + { + "epoch": 0.03312539710044475, + "grad_norm": 1.799280047416687, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.746, + "step": 2294 + }, + { + "epoch": 0.033573037601802115, + "grad_norm": 2.2501840591430664, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.7408, + "step": 2325 + }, + { + "epoch": 0.03402067810315947, + "grad_norm": 2.3315324783325195, + "learning_rate": 4.989894757091861e-05, + "loss": 0.7301, + "step": 2356 + }, + { + "epoch": 0.03446831860451684, + "grad_norm": 1.5820438861846924, + "learning_rate": 4.989126368919158e-05, + "loss": 0.7305, + "step": 2387 + }, + { + "epoch": 0.0349159591058742, + "grad_norm": 2.5696022510528564, + "learning_rate": 4.988329891293693e-05, + "loss": 0.7337, + "step": 2418 + }, + { + "epoch": 0.03536359960723156, + "grad_norm": 1.8880938291549683, + "learning_rate": 4.987505333203608e-05, + "loss": 0.7385, + "step": 2449 + }, + { + "epoch": 0.035811240108588924, + "grad_norm": 2.6148738861083984, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.7292, + "step": 2480 + }, + { + "epoch": 0.03625888060994628, + "grad_norm": 1.6925290822982788, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.7344, + "step": 2511 + }, + { + "epoch": 0.036706521111303646, + "grad_norm": 1.7675210237503052, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.7354, + "step": 2542 + }, + { + "epoch": 0.037154161612661, + "grad_norm": 2.1053173542022705, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.7272, + "step": 2573 + }, + { + "epoch": 0.03760180211401837, + "grad_norm": 1.9718347787857056, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.7336, + "step": 2604 + }, + { + "epoch": 0.03804944261537573, + "grad_norm": 1.5777671337127686, + "learning_rate": 4.981968838155888e-05, + "loss": 0.7182, + "step": 2635 + }, + { + "epoch": 0.03849708311673309, + "grad_norm": 1.905127763748169, + "learning_rate": 4.980947995086024e-05, + "loss": 0.7296, + "step": 2666 + }, + { + "epoch": 0.038944723618090454, + "grad_norm": 1.63962721824646, + "learning_rate": 4.979899154855234e-05, + "loss": 0.7249, + "step": 2697 + }, + { + "epoch": 0.03939236411944781, + "grad_norm": 1.584331750869751, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.7345, + "step": 2728 + }, + { + "epoch": 0.039840004620805176, + "grad_norm": 1.9111014604568481, + "learning_rate": 4.977717530570768e-05, + "loss": 0.7225, + "step": 2759 + }, + { + "epoch": 0.040287645122162534, + "grad_norm": 2.3216073513031006, + "learning_rate": 4.976584771136425e-05, + "loss": 0.7207, + "step": 2790 + }, + { + "epoch": 0.0407352856235199, + "grad_norm": 1.6002410650253296, + "learning_rate": 4.975424063779547e-05, + "loss": 0.7228, + "step": 2821 + }, + { + "epoch": 0.04118292612487726, + "grad_norm": 2.104731798171997, + "learning_rate": 4.974235421598557e-05, + "loss": 0.7127, + "step": 2852 + }, + { + "epoch": 0.04163056662623462, + "grad_norm": 1.7114660739898682, + "learning_rate": 4.973018858007122e-05, + "loss": 0.7283, + "step": 2883 + }, + { + "epoch": 0.042078207127591985, + "grad_norm": 1.948133945465088, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.7209, + "step": 2914 + }, + { + "epoch": 0.04252584762894934, + "grad_norm": 1.621764898300171, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.7272, + "step": 2945 + }, + { + "epoch": 0.04297348813030671, + "grad_norm": 1.6967558860778809, + "learning_rate": 4.969201777632205e-05, + "loss": 0.7191, + "step": 2976 + }, + { + "epoch": 0.043421128631664065, + "grad_norm": 1.6656996011734009, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.7205, + "step": 3007 + }, + { + "epoch": 0.04386876913302143, + "grad_norm": 2.151475191116333, + "learning_rate": 4.966517710419033e-05, + "loss": 0.7168, + "step": 3038 + }, + { + "epoch": 0.044316409634378794, + "grad_norm": 2.213109016418457, + "learning_rate": 4.965133917685858e-05, + "loss": 0.7139, + "step": 3069 + }, + { + "epoch": 0.04476405013573615, + "grad_norm": 1.5380377769470215, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.7237, + "step": 3100 + }, + { + "epoch": 0.045211690637093516, + "grad_norm": 2.312377452850342, + "learning_rate": 4.962282892045718e-05, + "loss": 0.7156, + "step": 3131 + }, + { + "epoch": 0.04565933113845087, + "grad_norm": 1.7220717668533325, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.7122, + "step": 3162 + }, + { + "epoch": 0.04610697163980824, + "grad_norm": 1.802856206893921, + "learning_rate": 4.959320720608049e-05, + "loss": 0.7128, + "step": 3193 + }, + { + "epoch": 0.046554612141165595, + "grad_norm": 1.6629964113235474, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.7172, + "step": 3224 + }, + { + "epoch": 0.04700225264252296, + "grad_norm": 3.440115213394165, + "learning_rate": 4.956247537083282e-05, + "loss": 0.7213, + "step": 3255 + }, + { + "epoch": 0.047449893143880324, + "grad_norm": 1.5721139907836914, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.7148, + "step": 3286 + }, + { + "epoch": 0.04789753364523768, + "grad_norm": 2.0920398235321045, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.7145, + "step": 3317 + }, + { + "epoch": 0.048345174146595046, + "grad_norm": 1.666566014289856, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.7095, + "step": 3348 + }, + { + "epoch": 0.048792814647952404, + "grad_norm": 1.8222129344940186, + "learning_rate": 4.949768693662973e-05, + "loss": 0.7138, + "step": 3379 + }, + { + "epoch": 0.04924045514930977, + "grad_norm": 1.7302964925765991, + "learning_rate": 4.948079823064559e-05, + "loss": 0.7017, + "step": 3410 + }, + { + "epoch": 0.049688095650667126, + "grad_norm": 1.7338463068008423, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6979, + "step": 3441 + }, + { + "epoch": 0.05013573615202449, + "grad_norm": 1.5637450218200684, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.7011, + "step": 3472 + }, + { + "epoch": 0.050583376653381855, + "grad_norm": 1.5632222890853882, + "learning_rate": 4.942847531574167e-05, + "loss": 0.704, + "step": 3503 + }, + { + "epoch": 0.05103101715473921, + "grad_norm": 1.588402509689331, + "learning_rate": 4.941048273452008e-05, + "loss": 0.7011, + "step": 3534 + }, + { + "epoch": 0.05147865765609658, + "grad_norm": 1.8840582370758057, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.7016, + "step": 3565 + }, + { + "epoch": 0.051926298157453935, + "grad_norm": 1.2702268362045288, + "learning_rate": 4.93736713712897e-05, + "loss": 0.7004, + "step": 3596 + }, + { + "epoch": 0.0523739386588113, + "grad_norm": 1.3812692165374756, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.7046, + "step": 3627 + }, + { + "epoch": 0.05282157916016866, + "grad_norm": 1.7257345914840698, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6976, + "step": 3658 + }, + { + "epoch": 0.05326921966152602, + "grad_norm": 1.7458925247192383, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6997, + "step": 3689 + }, + { + "epoch": 0.053716860162883386, + "grad_norm": 2.1996099948883057, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6991, + "step": 3720 + }, + { + "epoch": 0.05416450066424074, + "grad_norm": 1.6615021228790283, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.7005, + "step": 3751 + }, + { + "epoch": 0.05461214116559811, + "grad_norm": 1.6276952028274536, + "learning_rate": 4.925664290937196e-05, + "loss": 0.7097, + "step": 3782 + }, + { + "epoch": 0.055059781666955465, + "grad_norm": 1.758227825164795, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.6955, + "step": 3813 + }, + { + "epoch": 0.05550742216831283, + "grad_norm": 1.195280909538269, + "learning_rate": 4.921544116753962e-05, + "loss": 0.7073, + "step": 3844 + }, + { + "epoch": 0.05595506266967019, + "grad_norm": 1.6281015872955322, + "learning_rate": 4.919443027766935e-05, + "loss": 0.7022, + "step": 3875 + }, + { + "epoch": 0.05640270317102755, + "grad_norm": 1.3543150424957275, + "learning_rate": 4.91731463569418e-05, + "loss": 0.7036, + "step": 3906 + }, + { + "epoch": 0.056850343672384916, + "grad_norm": 2.16947078704834, + "learning_rate": 4.915158964554312e-05, + "loss": 0.7007, + "step": 3937 + }, + { + "epoch": 0.057297984173742274, + "grad_norm": 1.324578881263733, + "learning_rate": 4.912976038673786e-05, + "loss": 0.6941, + "step": 3968 + }, + { + "epoch": 0.05774562467509964, + "grad_norm": 1.9811108112335205, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6908, + "step": 3999 + }, + { + "epoch": 0.058193265176456996, + "grad_norm": 1.2975554466247559, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6936, + "step": 4030 + }, + { + "epoch": 0.05864090567781436, + "grad_norm": 1.583282232284546, + "learning_rate": 4.906263980464644e-05, + "loss": 0.698, + "step": 4061 + }, + { + "epoch": 0.05908854617917172, + "grad_norm": 1.3532944917678833, + "learning_rate": 4.903972285033178e-05, + "loss": 0.7049, + "step": 4092 + }, + { + "epoch": 0.05953618668052908, + "grad_norm": 2.1245481967926025, + "learning_rate": 4.901653461101213e-05, + "loss": 0.7016, + "step": 4123 + }, + { + "epoch": 0.05998382718188645, + "grad_norm": 1.6913797855377197, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6981, + "step": 4154 + }, + { + "epoch": 0.060431467683243804, + "grad_norm": 1.51249098777771, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6955, + "step": 4185 + }, + { + "epoch": 0.06087910818460117, + "grad_norm": 1.3880395889282227, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6934, + "step": 4216 + }, + { + "epoch": 0.061326748685958526, + "grad_norm": 1.6354159116744995, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6938, + "step": 4247 + }, + { + "epoch": 0.06177438918731589, + "grad_norm": 2.126742362976074, + "learning_rate": 4.889653340498669e-05, + "loss": 0.7003, + "step": 4278 + }, + { + "epoch": 0.06222202968867325, + "grad_norm": 1.7903707027435303, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6885, + "step": 4309 + }, + { + "epoch": 0.06266967019003061, + "grad_norm": 1.537806510925293, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6892, + "step": 4340 + }, + { + "epoch": 0.06311731069138797, + "grad_norm": 1.6445434093475342, + "learning_rate": 4.882129447892753e-05, + "loss": 0.6843, + "step": 4371 + }, + { + "epoch": 0.06356495119274534, + "grad_norm": 1.555373191833496, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6899, + "step": 4402 + }, + { + "epoch": 0.0640125916941027, + "grad_norm": 1.8370277881622314, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6813, + "step": 4433 + }, + { + "epoch": 0.06446023219546006, + "grad_norm": 1.3132514953613281, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6832, + "step": 4464 + }, + { + "epoch": 0.06490787269681741, + "grad_norm": 1.3186298608779907, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6879, + "step": 4495 + }, + { + "epoch": 0.06535551319817479, + "grad_norm": 1.4360268115997314, + "learning_rate": 4.869052379269719e-05, + "loss": 0.69, + "step": 4526 + }, + { + "epoch": 0.06580315369953214, + "grad_norm": 1.670765995979309, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6865, + "step": 4557 + }, + { + "epoch": 0.0662507942008895, + "grad_norm": 1.7548723220825195, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6852, + "step": 4588 + }, + { + "epoch": 0.06669843470224687, + "grad_norm": 1.5086426734924316, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6894, + "step": 4619 + }, + { + "epoch": 0.06714607520360423, + "grad_norm": 1.3140665292739868, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6812, + "step": 4650 + }, + { + "epoch": 0.06759371570496159, + "grad_norm": 1.4212454557418823, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6872, + "step": 4681 + }, + { + "epoch": 0.06804135620631895, + "grad_norm": 1.3034414052963257, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6901, + "step": 4712 + }, + { + "epoch": 0.06848899670767632, + "grad_norm": 1.3741438388824463, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6875, + "step": 4743 + }, + { + "epoch": 0.06893663720903367, + "grad_norm": 1.7262542247772217, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6868, + "step": 4774 + }, + { + "epoch": 0.06938427771039103, + "grad_norm": 1.3293650150299072, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6891, + "step": 4805 + }, + { + "epoch": 0.0698319182117484, + "grad_norm": 1.3448151350021362, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6765, + "step": 4836 + }, + { + "epoch": 0.07027955871310576, + "grad_norm": 2.961280584335327, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6939, + "step": 4867 + }, + { + "epoch": 0.07072719921446312, + "grad_norm": 1.8265361785888672, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.677, + "step": 4898 + }, + { + "epoch": 0.07117483971582048, + "grad_norm": 1.490349531173706, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6778, + "step": 4929 + }, + { + "epoch": 0.07162248021717785, + "grad_norm": 1.3669307231903076, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6746, + "step": 4960 + }, + { + "epoch": 0.0720701207185352, + "grad_norm": 1.3995884656906128, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6757, + "step": 4991 + }, + { + "epoch": 0.07251776121989256, + "grad_norm": 1.1217372417449951, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6832, + "step": 5022 + }, + { + "epoch": 0.07296540172124993, + "grad_norm": 1.2192097902297974, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6868, + "step": 5053 + }, + { + "epoch": 0.07341304222260729, + "grad_norm": 1.5045067071914673, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.6793, + "step": 5084 + }, + { + "epoch": 0.07386068272396465, + "grad_norm": 1.7127234935760498, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6797, + "step": 5115 + }, + { + "epoch": 0.074308323225322, + "grad_norm": 1.846561312675476, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6767, + "step": 5146 + }, + { + "epoch": 0.07475596372667938, + "grad_norm": 1.3076797723770142, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6855, + "step": 5177 + }, + { + "epoch": 0.07520360422803674, + "grad_norm": 1.4170383214950562, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.681, + "step": 5208 + }, + { + "epoch": 0.0756512447293941, + "grad_norm": 1.2504942417144775, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6778, + "step": 5239 + }, + { + "epoch": 0.07609888523075146, + "grad_norm": 1.1522283554077148, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6749, + "step": 5270 + }, + { + "epoch": 0.07654652573210882, + "grad_norm": 1.6351525783538818, + "learning_rate": 4.793722210363262e-05, + "loss": 0.6745, + "step": 5301 + }, + { + "epoch": 0.07699416623346618, + "grad_norm": 1.5093014240264893, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6747, + "step": 5332 + }, + { + "epoch": 0.07744180673482354, + "grad_norm": 1.362160563468933, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.673, + "step": 5363 + }, + { + "epoch": 0.07788944723618091, + "grad_norm": 1.2021727561950684, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6768, + "step": 5394 + }, + { + "epoch": 0.07833708773753827, + "grad_norm": 2.1543540954589844, + "learning_rate": 4.780153554146274e-05, + "loss": 0.672, + "step": 5425 + }, + { + "epoch": 0.07878472823889562, + "grad_norm": 1.882712721824646, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.6926, + "step": 5456 + }, + { + "epoch": 0.079232368740253, + "grad_norm": 1.3975650072097778, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6747, + "step": 5487 + }, + { + "epoch": 0.07968000924161035, + "grad_norm": 1.3912913799285889, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6756, + "step": 5518 + }, + { + "epoch": 0.08012764974296771, + "grad_norm": 1.7227635383605957, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6694, + "step": 5549 + }, + { + "epoch": 0.08057529024432507, + "grad_norm": 1.3151129484176636, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6715, + "step": 5580 + }, + { + "epoch": 0.08102293074568244, + "grad_norm": 1.0972425937652588, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6661, + "step": 5611 + }, + { + "epoch": 0.0814705712470398, + "grad_norm": 1.2346575260162354, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.66, + "step": 5642 + }, + { + "epoch": 0.08191821174839715, + "grad_norm": 1.4536516666412354, + "learning_rate": 4.751783684659e-05, + "loss": 0.6743, + "step": 5673 + }, + { + "epoch": 0.08236585224975453, + "grad_norm": 1.1361631155014038, + "learning_rate": 4.748122674890348e-05, + "loss": 0.6791, + "step": 5704 + }, + { + "epoch": 0.08281349275111188, + "grad_norm": 1.2605111598968506, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6797, + "step": 5735 + }, + { + "epoch": 0.08326113325246924, + "grad_norm": 1.2355903387069702, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6642, + "step": 5766 + }, + { + "epoch": 0.0837087737538266, + "grad_norm": 1.6677048206329346, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6759, + "step": 5797 + }, + { + "epoch": 0.08415641425518397, + "grad_norm": 1.4781981706619263, + "learning_rate": 4.733225355658999e-05, + "loss": 0.6707, + "step": 5828 + }, + { + "epoch": 0.08460405475654133, + "grad_norm": 1.138583779335022, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.6636, + "step": 5859 + }, + { + "epoch": 0.08505169525789869, + "grad_norm": 1.529036283493042, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6705, + "step": 5890 + }, + { + "epoch": 0.08549933575925606, + "grad_norm": 1.3216760158538818, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6714, + "step": 5921 + }, + { + "epoch": 0.08594697626061341, + "grad_norm": 1.1820168495178223, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6757, + "step": 5952 + }, + { + "epoch": 0.08639461676197077, + "grad_norm": 1.393571138381958, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6672, + "step": 5983 + }, + { + "epoch": 0.08684225726332813, + "grad_norm": 1.4574682712554932, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6655, + "step": 6014 + }, + { + "epoch": 0.0872898977646855, + "grad_norm": 1.138645887374878, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6663, + "step": 6045 + }, + { + "epoch": 0.08773753826604286, + "grad_norm": 1.7602777481079102, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6695, + "step": 6076 + }, + { + "epoch": 0.08818517876740022, + "grad_norm": 1.2323459386825562, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6636, + "step": 6107 + }, + { + "epoch": 0.08863281926875759, + "grad_norm": 1.6881431341171265, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6653, + "step": 6138 + }, + { + "epoch": 0.08908045977011494, + "grad_norm": 1.391417384147644, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6636, + "step": 6169 + }, + { + "epoch": 0.0895281002714723, + "grad_norm": 1.3066257238388062, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6777, + "step": 6200 + }, + { + "epoch": 0.08997574077282966, + "grad_norm": 1.2001326084136963, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6683, + "step": 6231 + }, + { + "epoch": 0.09042338127418703, + "grad_norm": 1.4361172914505005, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6634, + "step": 6262 + }, + { + "epoch": 0.09087102177554439, + "grad_norm": 8.04520320892334, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6752, + "step": 6293 + }, + { + "epoch": 0.09131866227690175, + "grad_norm": 1.4874210357666016, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6733, + "step": 6324 + }, + { + "epoch": 0.09176630277825912, + "grad_norm": 1.234491229057312, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.668, + "step": 6355 + }, + { + "epoch": 0.09221394327961648, + "grad_norm": 1.2088687419891357, + "learning_rate": 4.661238099862658e-05, + "loss": 0.6705, + "step": 6386 + }, + { + "epoch": 0.09266158378097383, + "grad_norm": 1.1937814950942993, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6853, + "step": 6417 + }, + { + "epoch": 0.09310922428233119, + "grad_norm": 1.5205374956130981, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.685, + "step": 6448 + }, + { + "epoch": 0.09355686478368856, + "grad_norm": 1.2221660614013672, + "learning_rate": 4.648464661063478e-05, + "loss": 0.6622, + "step": 6479 + }, + { + "epoch": 0.09400450528504592, + "grad_norm": 1.0762608051300049, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.669, + "step": 6510 + }, + { + "epoch": 0.09445214578640328, + "grad_norm": 1.4416946172714233, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.6674, + "step": 6541 + }, + { + "epoch": 0.09489978628776065, + "grad_norm": 1.559158205986023, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6681, + "step": 6572 + }, + { + "epoch": 0.095347426789118, + "grad_norm": 1.3833891153335571, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6681, + "step": 6603 + }, + { + "epoch": 0.09579506729047536, + "grad_norm": 1.6753300428390503, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6658, + "step": 6634 + }, + { + "epoch": 0.09624270779183272, + "grad_norm": 1.951198697090149, + "learning_rate": 4.622264489304762e-05, + "loss": 0.6654, + "step": 6665 + }, + { + "epoch": 0.09669034829319009, + "grad_norm": 1.2356919050216675, + "learning_rate": 4.617813681048434e-05, + "loss": 0.6651, + "step": 6696 + }, + { + "epoch": 0.09713798879454745, + "grad_norm": 1.2712593078613281, + "learning_rate": 4.61333897355256e-05, + "loss": 0.6646, + "step": 6727 + }, + { + "epoch": 0.09758562929590481, + "grad_norm": 1.1935900449752808, + "learning_rate": 4.608840417313604e-05, + "loss": 0.674, + "step": 6758 + }, + { + "epoch": 0.09803326979726218, + "grad_norm": 1.1649430990219116, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6644, + "step": 6789 + }, + { + "epoch": 0.09848091029861954, + "grad_norm": 1.4281456470489502, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6673, + "step": 6820 + }, + { + "epoch": 0.0989285507999769, + "grad_norm": 1.3064521551132202, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6584, + "step": 6851 + }, + { + "epoch": 0.09937619130133425, + "grad_norm": 1.2546554803848267, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6612, + "step": 6882 + }, + { + "epoch": 0.09982383180269162, + "grad_norm": 1.1866974830627441, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6612, + "step": 6913 + }, + { + "epoch": 0.10027147230404898, + "grad_norm": 1.6166640520095825, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6551, + "step": 6944 + }, + { + "epoch": 0.10071911280540634, + "grad_norm": 1.5471700429916382, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6607, + "step": 6975 + }, + { + "epoch": 0.10116675330676371, + "grad_norm": 1.3361026048660278, + "learning_rate": 4.571999560773736e-05, + "loss": 0.666, + "step": 7006 + }, + { + "epoch": 0.10161439380812107, + "grad_norm": 1.2938140630722046, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6547, + "step": 7037 + }, + { + "epoch": 0.10206203430947842, + "grad_norm": 1.2688400745391846, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6624, + "step": 7068 + }, + { + "epoch": 0.10250967481083578, + "grad_norm": 1.6306285858154297, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.6627, + "step": 7099 + }, + { + "epoch": 0.10295731531219315, + "grad_norm": 1.3346176147460938, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6577, + "step": 7130 + }, + { + "epoch": 0.10340495581355051, + "grad_norm": 1.0933984518051147, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6602, + "step": 7161 + }, + { + "epoch": 0.10385259631490787, + "grad_norm": 1.575859785079956, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6593, + "step": 7192 + }, + { + "epoch": 0.10430023681626524, + "grad_norm": 1.4265861511230469, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6564, + "step": 7223 + }, + { + "epoch": 0.1047478773176226, + "grad_norm": 1.737012267112732, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6593, + "step": 7254 + }, + { + "epoch": 0.10519551781897996, + "grad_norm": 1.0257115364074707, + "learning_rate": 4.528766329570536e-05, + "loss": 0.6514, + "step": 7285 + }, + { + "epoch": 0.10564315832033731, + "grad_norm": 1.5043773651123047, + "learning_rate": 4.523847434837447e-05, + "loss": 0.6635, + "step": 7316 + }, + { + "epoch": 0.10609079882169468, + "grad_norm": 1.5642234086990356, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6558, + "step": 7347 + }, + { + "epoch": 0.10653843932305204, + "grad_norm": 1.1821067333221436, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6686, + "step": 7378 + }, + { + "epoch": 0.1069860798244094, + "grad_norm": 1.5492759943008423, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6543, + "step": 7409 + }, + { + "epoch": 0.10743372032576677, + "grad_norm": 1.281914234161377, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6516, + "step": 7440 + }, + { + "epoch": 0.10788136082712413, + "grad_norm": 1.3318305015563965, + "learning_rate": 4.498911497712155e-05, + "loss": 0.656, + "step": 7471 + }, + { + "epoch": 0.10832900132848149, + "grad_norm": 1.3832449913024902, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6475, + "step": 7502 + }, + { + "epoch": 0.10877664182983884, + "grad_norm": 1.3547158241271973, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6594, + "step": 7533 + }, + { + "epoch": 0.10922428233119622, + "grad_norm": 1.4633681774139404, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.6707, + "step": 7564 + }, + { + "epoch": 0.10967192283255357, + "grad_norm": 1.1781059503555298, + "learning_rate": 4.478556380141218e-05, + "loss": 0.6626, + "step": 7595 + }, + { + "epoch": 0.11011956333391093, + "grad_norm": 1.4727883338928223, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.6535, + "step": 7626 + }, + { + "epoch": 0.1105672038352683, + "grad_norm": 1.3888640403747559, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.6606, + "step": 7657 + }, + { + "epoch": 0.11101484433662566, + "grad_norm": 1.308769941329956, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6667, + "step": 7688 + }, + { + "epoch": 0.11146248483798302, + "grad_norm": 1.3579630851745605, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6621, + "step": 7719 + }, + { + "epoch": 0.11191012533934037, + "grad_norm": 1.1285645961761475, + "learning_rate": 4.452610552959183e-05, + "loss": 0.6597, + "step": 7750 + }, + { + "epoch": 0.11235776584069775, + "grad_norm": 1.1144675016403198, + "learning_rate": 4.447355047201428e-05, + "loss": 0.6638, + "step": 7781 + }, + { + "epoch": 0.1128054063420551, + "grad_norm": 1.1993658542633057, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.6704, + "step": 7812 + }, + { + "epoch": 0.11325304684341246, + "grad_norm": 1.0608967542648315, + "learning_rate": 4.436778168330484e-05, + "loss": 0.6573, + "step": 7843 + }, + { + "epoch": 0.11370068734476983, + "grad_norm": 1.1210070848464966, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.6612, + "step": 7874 + }, + { + "epoch": 0.11414832784612719, + "grad_norm": 1.2345409393310547, + "learning_rate": 4.42611386459262e-05, + "loss": 0.65, + "step": 7905 + }, + { + "epoch": 0.11459596834748455, + "grad_norm": 1.077025294303894, + "learning_rate": 4.420749078676133e-05, + "loss": 0.6595, + "step": 7936 + }, + { + "epoch": 0.1150436088488419, + "grad_norm": 1.2079277038574219, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.6442, + "step": 7967 + }, + { + "epoch": 0.11549124935019928, + "grad_norm": 1.6710035800933838, + "learning_rate": 4.409954541451762e-05, + "loss": 0.663, + "step": 7998 + }, + { + "epoch": 0.11593888985155663, + "grad_norm": 1.3124401569366455, + "learning_rate": 4.404524911958764e-05, + "loss": 0.6512, + "step": 8029 + }, + { + "epoch": 0.11638653035291399, + "grad_norm": 1.644904375076294, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6587, + "step": 8060 + }, + { + "epoch": 0.11683417085427136, + "grad_norm": 1.181624174118042, + "learning_rate": 4.393601237573607e-05, + "loss": 0.653, + "step": 8091 + }, + { + "epoch": 0.11728181135562872, + "grad_norm": 1.4587918519973755, + "learning_rate": 4.388107315953628e-05, + "loss": 0.675, + "step": 8122 + }, + { + "epoch": 0.11772945185698608, + "grad_norm": 1.2147635221481323, + "learning_rate": 4.382592087299212e-05, + "loss": 0.6521, + "step": 8153 + }, + { + "epoch": 0.11817709235834344, + "grad_norm": 1.0448981523513794, + "learning_rate": 4.377055613848964e-05, + "loss": 0.6541, + "step": 8184 + }, + { + "epoch": 0.11862473285970081, + "grad_norm": 1.4482290744781494, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6563, + "step": 8215 + }, + { + "epoch": 0.11907237336105816, + "grad_norm": 1.1621575355529785, + "learning_rate": 4.365919182713416e-05, + "loss": 0.656, + "step": 8246 + }, + { + "epoch": 0.11952001386241552, + "grad_norm": 1.1643873453140259, + "learning_rate": 4.360319350701226e-05, + "loss": 0.6547, + "step": 8277 + }, + { + "epoch": 0.1199676543637729, + "grad_norm": 1.4016129970550537, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6582, + "step": 8308 + }, + { + "epoch": 0.12041529486513025, + "grad_norm": 1.4023685455322266, + "learning_rate": 4.349056769754021e-05, + "loss": 0.6621, + "step": 8339 + }, + { + "epoch": 0.12086293536648761, + "grad_norm": 1.3020285367965698, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.6674, + "step": 8370 + }, + { + "epoch": 0.12131057586784497, + "grad_norm": 1.2162435054779053, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.6614, + "step": 8401 + }, + { + "epoch": 0.12175821636920234, + "grad_norm": 1.1956969499588013, + "learning_rate": 4.332006561018488e-05, + "loss": 0.6557, + "step": 8432 + }, + { + "epoch": 0.1222058568705597, + "grad_norm": 1.1723664999008179, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.6633, + "step": 8463 + }, + { + "epoch": 0.12265349737191705, + "grad_norm": 1.113020658493042, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.6577, + "step": 8494 + }, + { + "epoch": 0.12310113787327442, + "grad_norm": 1.2453004121780396, + "learning_rate": 4.314770288177384e-05, + "loss": 0.6544, + "step": 8525 + }, + { + "epoch": 0.12354877837463178, + "grad_norm": 1.1493890285491943, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6533, + "step": 8556 + }, + { + "epoch": 0.12399641887598914, + "grad_norm": 1.4172496795654297, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.6542, + "step": 8587 + }, + { + "epoch": 0.1244440593773465, + "grad_norm": 1.1840728521347046, + "learning_rate": 4.297349701798505e-05, + "loss": 0.6476, + "step": 8618 + }, + { + "epoch": 0.12489169987870387, + "grad_norm": 1.3720282316207886, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.6446, + "step": 8649 + }, + { + "epoch": 0.12533934038006123, + "grad_norm": 1.1705596446990967, + "learning_rate": 4.285634454093198e-05, + "loss": 0.6537, + "step": 8680 + }, + { + "epoch": 0.1257869808814186, + "grad_norm": 1.0790083408355713, + "learning_rate": 4.279746571169086e-05, + "loss": 0.6543, + "step": 8711 + }, + { + "epoch": 0.12623462138277594, + "grad_norm": 1.1207470893859863, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.6468, + "step": 8742 + }, + { + "epoch": 0.1266822618841333, + "grad_norm": 1.1123065948486328, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.6596, + "step": 8773 + }, + { + "epoch": 0.12712990238549068, + "grad_norm": 1.1579636335372925, + "learning_rate": 4.261962684116106e-05, + "loss": 0.6458, + "step": 8804 + }, + { + "epoch": 0.12757754288684803, + "grad_norm": 1.3112802505493164, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.6483, + "step": 8835 + }, + { + "epoch": 0.1280251833882054, + "grad_norm": 1.1104832887649536, + "learning_rate": 4.250007230372134e-05, + "loss": 0.645, + "step": 8866 + }, + { + "epoch": 0.12847282388956274, + "grad_norm": 1.0218713283538818, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.6519, + "step": 8897 + }, + { + "epoch": 0.12892046439092011, + "grad_norm": 1.0053678750991821, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.6468, + "step": 8928 + }, + { + "epoch": 0.12936810489227749, + "grad_norm": 1.410933017730713, + "learning_rate": 4.231926105340768e-05, + "loss": 0.6573, + "step": 8959 + }, + { + "epoch": 0.12981574539363483, + "grad_norm": 1.5001798868179321, + "learning_rate": 4.225859883654776e-05, + "loss": 0.6483, + "step": 8990 + }, + { + "epoch": 0.1302633858949922, + "grad_norm": 1.112316608428955, + "learning_rate": 4.219774185874569e-05, + "loss": 0.6483, + "step": 9021 + }, + { + "epoch": 0.13071102639634957, + "grad_norm": 1.527464747428894, + "learning_rate": 4.213669080676418e-05, + "loss": 0.6512, + "step": 9052 + }, + { + "epoch": 0.13115866689770692, + "grad_norm": 1.1075704097747803, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.6577, + "step": 9083 + }, + { + "epoch": 0.1316063073990643, + "grad_norm": 0.9589399099349976, + "learning_rate": 4.201400923825648e-05, + "loss": 0.642, + "step": 9114 + }, + { + "epoch": 0.13205394790042166, + "grad_norm": 1.186531901359558, + "learning_rate": 4.195238010617511e-05, + "loss": 0.6553, + "step": 9145 + }, + { + "epoch": 0.132501588401779, + "grad_norm": 1.1176280975341797, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6483, + "step": 9176 + }, + { + "epoch": 0.13294922890313637, + "grad_norm": 1.4222681522369385, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.6462, + "step": 9207 + }, + { + "epoch": 0.13339686940449375, + "grad_norm": 1.1606040000915527, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.6514, + "step": 9238 + }, + { + "epoch": 0.1338445099058511, + "grad_norm": 1.313774585723877, + "learning_rate": 4.170395751189495e-05, + "loss": 0.6422, + "step": 9269 + }, + { + "epoch": 0.13429215040720846, + "grad_norm": 1.1994171142578125, + "learning_rate": 4.164137885110921e-05, + "loss": 0.6444, + "step": 9300 + }, + { + "epoch": 0.1347397909085658, + "grad_norm": 0.9376353025436401, + "learning_rate": 4.157861239462495e-05, + "loss": 0.6436, + "step": 9331 + }, + { + "epoch": 0.13518743140992318, + "grad_norm": 1.0350178480148315, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.6447, + "step": 9362 + }, + { + "epoch": 0.13563507191128055, + "grad_norm": 1.3630082607269287, + "learning_rate": 4.145251892991588e-05, + "loss": 0.6427, + "step": 9393 + }, + { + "epoch": 0.1360827124126379, + "grad_norm": 1.0362364053726196, + "learning_rate": 4.138919334463868e-05, + "loss": 0.6443, + "step": 9424 + }, + { + "epoch": 0.13653035291399526, + "grad_norm": 1.1442211866378784, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.6523, + "step": 9455 + }, + { + "epoch": 0.13697799341535263, + "grad_norm": 1.4196938276290894, + "learning_rate": 4.126198804133398e-05, + "loss": 0.6501, + "step": 9486 + }, + { + "epoch": 0.13742563391670998, + "grad_norm": 1.3853130340576172, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.6431, + "step": 9517 + }, + { + "epoch": 0.13787327441806735, + "grad_norm": 1.0350273847579956, + "learning_rate": 4.113404868280107e-05, + "loss": 0.6436, + "step": 9548 + }, + { + "epoch": 0.13832091491942472, + "grad_norm": 1.0520857572555542, + "learning_rate": 4.106980553625457e-05, + "loss": 0.6436, + "step": 9579 + }, + { + "epoch": 0.13876855542078206, + "grad_norm": 1.127038836479187, + "learning_rate": 4.100538104413674e-05, + "loss": 0.639, + "step": 9610 + }, + { + "epoch": 0.13921619592213944, + "grad_norm": 1.1070880889892578, + "learning_rate": 4.09407759334692e-05, + "loss": 0.6366, + "step": 9641 + }, + { + "epoch": 0.1396638364234968, + "grad_norm": 1.3045605421066284, + "learning_rate": 4.087599093331186e-05, + "loss": 0.6496, + "step": 9672 + }, + { + "epoch": 0.14011147692485415, + "grad_norm": 1.234647512435913, + "learning_rate": 4.081102677475462e-05, + "loss": 0.6383, + "step": 9703 + }, + { + "epoch": 0.14055911742621152, + "grad_norm": 1.1154453754425049, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.6454, + "step": 9734 + }, + { + "epoch": 0.14100675792756887, + "grad_norm": 1.2422186136245728, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.6404, + "step": 9765 + }, + { + "epoch": 0.14145439842892624, + "grad_norm": 1.2128278017044067, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.6376, + "step": 9796 + }, + { + "epoch": 0.1419020389302836, + "grad_norm": 1.3140804767608643, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.6316, + "step": 9827 + }, + { + "epoch": 0.14234967943164095, + "grad_norm": 1.9198187589645386, + "learning_rate": 4.048354433517794e-05, + "loss": 0.6383, + "step": 9858 + }, + { + "epoch": 0.14279731993299832, + "grad_norm": 1.144679307937622, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.6383, + "step": 9889 + }, + { + "epoch": 0.1432449604343557, + "grad_norm": 1.1679338216781616, + "learning_rate": 4.035132306369438e-05, + "loss": 0.6414, + "step": 9920 + }, + { + "epoch": 0.14369260093571304, + "grad_norm": 0.9563717246055603, + "learning_rate": 4.028495219804555e-05, + "loss": 0.6327, + "step": 9951 + }, + { + "epoch": 0.1441402414370704, + "grad_norm": 1.277036428451538, + "learning_rate": 4.021840884378864e-05, + "loss": 0.6365, + "step": 9982 + }, + { + "epoch": 0.14458788193842778, + "grad_norm": 0.9835182428359985, + "learning_rate": 4.015169375185633e-05, + "loss": 0.638, + "step": 10013 + }, + { + "epoch": 0.14503552243978513, + "grad_norm": 1.090118646621704, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.6437, + "step": 10044 + }, + { + "epoch": 0.1454831629411425, + "grad_norm": 1.1823488473892212, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.6326, + "step": 10075 + }, + { + "epoch": 0.14593080344249987, + "grad_norm": 1.070318341255188, + "learning_rate": 3.995052558835377e-05, + "loss": 0.6362, + "step": 10106 + }, + { + "epoch": 0.1463784439438572, + "grad_norm": 1.2451491355895996, + "learning_rate": 3.988313109368017e-05, + "loss": 0.6388, + "step": 10137 + }, + { + "epoch": 0.14682608444521458, + "grad_norm": 1.2417365312576294, + "learning_rate": 3.981556864489504e-05, + "loss": 0.6309, + "step": 10168 + }, + { + "epoch": 0.14727372494657193, + "grad_norm": 1.251518726348877, + "learning_rate": 3.974783900443142e-05, + "loss": 0.6365, + "step": 10199 + }, + { + "epoch": 0.1477213654479293, + "grad_norm": 1.359750747680664, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.6386, + "step": 10230 + }, + { + "epoch": 0.14816900594928667, + "grad_norm": 1.1073262691497803, + "learning_rate": 3.961188120762596e-05, + "loss": 0.635, + "step": 10261 + }, + { + "epoch": 0.148616646450644, + "grad_norm": 0.9850608706474304, + "learning_rate": 3.954365458554938e-05, + "loss": 0.6389, + "step": 10292 + }, + { + "epoch": 0.14906428695200138, + "grad_norm": 1.2984429597854614, + "learning_rate": 3.947526384030751e-05, + "loss": 0.6317, + "step": 10323 + }, + { + "epoch": 0.14951192745335876, + "grad_norm": 1.1622575521469116, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.6393, + "step": 10354 + }, + { + "epoch": 0.1499595679547161, + "grad_norm": 1.0856871604919434, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.6351, + "step": 10385 + }, + { + "epoch": 0.15040720845607347, + "grad_norm": 1.0153882503509521, + "learning_rate": 3.926911459260109e-05, + "loss": 0.6282, + "step": 10416 + }, + { + "epoch": 0.15085484895743084, + "grad_norm": 1.2039254903793335, + "learning_rate": 3.920007509089102e-05, + "loss": 0.6365, + "step": 10447 + }, + { + "epoch": 0.1513024894587882, + "grad_norm": 1.1179555654525757, + "learning_rate": 3.913087534326357e-05, + "loss": 0.6311, + "step": 10478 + }, + { + "epoch": 0.15175012996014556, + "grad_norm": 1.090903639793396, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.6401, + "step": 10509 + }, + { + "epoch": 0.15219777046150293, + "grad_norm": 0.9228240251541138, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.6323, + "step": 10540 + }, + { + "epoch": 0.15264541096286027, + "grad_norm": 1.0772743225097656, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.637, + "step": 10571 + }, + { + "epoch": 0.15309305146421764, + "grad_norm": 1.0854771137237549, + "learning_rate": 3.885248953871491e-05, + "loss": 0.6375, + "step": 10602 + }, + { + "epoch": 0.153540691965575, + "grad_norm": 1.3902987241744995, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.6406, + "step": 10633 + }, + { + "epoch": 0.15398833246693236, + "grad_norm": 1.180351734161377, + "learning_rate": 3.871235554965218e-05, + "loss": 0.626, + "step": 10664 + }, + { + "epoch": 0.15443597296828973, + "grad_norm": 1.1136449575424194, + "learning_rate": 3.864205604623078e-05, + "loss": 0.6395, + "step": 10695 + }, + { + "epoch": 0.15488361346964707, + "grad_norm": 1.1770708560943604, + "learning_rate": 3.857160259406107e-05, + "loss": 0.633, + "step": 10726 + }, + { + "epoch": 0.15533125397100445, + "grad_norm": 1.1615066528320312, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.632, + "step": 10757 + }, + { + "epoch": 0.15577889447236182, + "grad_norm": 1.2898380756378174, + "learning_rate": 3.843023702543556e-05, + "loss": 0.6332, + "step": 10788 + }, + { + "epoch": 0.15622653497371916, + "grad_norm": 1.0051672458648682, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.6353, + "step": 10819 + }, + { + "epoch": 0.15667417547507653, + "grad_norm": 0.9514272212982178, + "learning_rate": 3.828826522492255e-05, + "loss": 0.6383, + "step": 10850 + }, + { + "epoch": 0.1571218159764339, + "grad_norm": 1.2570873498916626, + "learning_rate": 3.821705398930713e-05, + "loss": 0.6308, + "step": 10881 + }, + { + "epoch": 0.15756945647779125, + "grad_norm": 0.992323637008667, + "learning_rate": 3.814569360103385e-05, + "loss": 0.6303, + "step": 10912 + }, + { + "epoch": 0.15801709697914862, + "grad_norm": 1.255265474319458, + "learning_rate": 3.807418486539499e-05, + "loss": 0.6349, + "step": 10943 + }, + { + "epoch": 0.158464737480506, + "grad_norm": 1.1066702604293823, + "learning_rate": 3.80025285893569e-05, + "loss": 0.6317, + "step": 10974 + }, + { + "epoch": 0.15891237798186333, + "grad_norm": 1.178690791130066, + "learning_rate": 3.793072558155093e-05, + "loss": 0.639, + "step": 11005 + }, + { + "epoch": 0.1593600184832207, + "grad_norm": 1.0850341320037842, + "learning_rate": 3.785877665226426e-05, + "loss": 0.6375, + "step": 11036 + }, + { + "epoch": 0.15980765898457805, + "grad_norm": 1.1378651857376099, + "learning_rate": 3.778668261343079e-05, + "loss": 0.6287, + "step": 11067 + }, + { + "epoch": 0.16025529948593542, + "grad_norm": 1.07688570022583, + "learning_rate": 3.771444427862192e-05, + "loss": 0.6261, + "step": 11098 + }, + { + "epoch": 0.1607029399872928, + "grad_norm": 1.108269453048706, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.6352, + "step": 11129 + }, + { + "epoch": 0.16115058048865014, + "grad_norm": 1.2582095861434937, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.6312, + "step": 11160 + }, + { + "epoch": 0.1615982209900075, + "grad_norm": 0.9823578000068665, + "learning_rate": 3.749687165842753e-05, + "loss": 0.6253, + "step": 11191 + }, + { + "epoch": 0.16204586149136488, + "grad_norm": 1.3922805786132812, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.6279, + "step": 11222 + }, + { + "epoch": 0.16249350199272222, + "grad_norm": 1.2210962772369385, + "learning_rate": 3.735111675341645e-05, + "loss": 0.6357, + "step": 11253 + }, + { + "epoch": 0.1629411424940796, + "grad_norm": 1.0463316440582275, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.6332, + "step": 11284 + }, + { + "epoch": 0.16338878299543697, + "grad_norm": 1.165583848953247, + "learning_rate": 3.720480432728287e-05, + "loss": 0.627, + "step": 11315 + }, + { + "epoch": 0.1638364234967943, + "grad_norm": 1.0995306968688965, + "learning_rate": 3.71314411067092e-05, + "loss": 0.6283, + "step": 11346 + }, + { + "epoch": 0.16428406399815168, + "grad_norm": 1.0279158353805542, + "learning_rate": 3.70579409844715e-05, + "loss": 0.6287, + "step": 11377 + }, + { + "epoch": 0.16473170449950905, + "grad_norm": 1.51092529296875, + "learning_rate": 3.698430479000865e-05, + "loss": 0.6261, + "step": 11408 + }, + { + "epoch": 0.1651793450008664, + "grad_norm": 1.020936369895935, + "learning_rate": 3.691053335429509e-05, + "loss": 0.6327, + "step": 11439 + }, + { + "epoch": 0.16562698550222377, + "grad_norm": 1.0198683738708496, + "learning_rate": 3.683662750983147e-05, + "loss": 0.6422, + "step": 11470 + }, + { + "epoch": 0.1660746260035811, + "grad_norm": 1.2650995254516602, + "learning_rate": 3.676258809063518e-05, + "loss": 0.6354, + "step": 11501 + }, + { + "epoch": 0.16652226650493848, + "grad_norm": 1.1653568744659424, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.6325, + "step": 11532 + }, + { + "epoch": 0.16696990700629585, + "grad_norm": 1.1461430788040161, + "learning_rate": 3.661411187164166e-05, + "loss": 0.6251, + "step": 11563 + }, + { + "epoch": 0.1674175475076532, + "grad_norm": 1.2535974979400635, + "learning_rate": 3.65396767473784e-05, + "loss": 0.6245, + "step": 11594 + }, + { + "epoch": 0.16786518800901057, + "grad_norm": 1.115191102027893, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.6294, + "step": 11625 + }, + { + "epoch": 0.16831282851036794, + "grad_norm": 1.0482964515686035, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.6247, + "step": 11656 + }, + { + "epoch": 0.16876046901172528, + "grad_norm": 1.1431951522827148, + "learning_rate": 3.63155933997859e-05, + "loss": 0.63, + "step": 11687 + }, + { + "epoch": 0.16920810951308266, + "grad_norm": 1.0254175662994385, + "learning_rate": 3.624064243537758e-05, + "loss": 0.6212, + "step": 11718 + }, + { + "epoch": 0.16965575001444003, + "grad_norm": 0.9481080174446106, + "learning_rate": 3.616556462184716e-05, + "loss": 0.6278, + "step": 11749 + }, + { + "epoch": 0.17010339051579737, + "grad_norm": 1.118394374847412, + "learning_rate": 3.609036080643755e-05, + "loss": 0.6244, + "step": 11780 + }, + { + "epoch": 0.17055103101715474, + "grad_norm": 1.1592167615890503, + "learning_rate": 3.60150318378136e-05, + "loss": 0.621, + "step": 11811 + }, + { + "epoch": 0.1709986715185121, + "grad_norm": 0.9984686374664307, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.6319, + "step": 11842 + }, + { + "epoch": 0.17144631201986946, + "grad_norm": 1.0091164112091064, + "learning_rate": 3.586400184263408e-05, + "loss": 0.6345, + "step": 11873 + }, + { + "epoch": 0.17189395252122683, + "grad_norm": 1.0355888605117798, + "learning_rate": 3.578830252043148e-05, + "loss": 0.6171, + "step": 11904 + }, + { + "epoch": 0.17234159302258417, + "grad_norm": 1.1437592506408691, + "learning_rate": 3.571248145370125e-05, + "loss": 0.6201, + "step": 11935 + }, + { + "epoch": 0.17278923352394154, + "grad_norm": 0.9440962672233582, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.6236, + "step": 11966 + }, + { + "epoch": 0.17323687402529891, + "grad_norm": 0.9761082530021667, + "learning_rate": 3.556047751054378e-05, + "loss": 0.6291, + "step": 11997 + }, + { + "epoch": 0.17368451452665626, + "grad_norm": 1.1858127117156982, + "learning_rate": 3.548429634946039e-05, + "loss": 0.6299, + "step": 12028 + }, + { + "epoch": 0.17413215502801363, + "grad_norm": 1.0180195569992065, + "learning_rate": 3.540799687451768e-05, + "loss": 0.6227, + "step": 12059 + }, + { + "epoch": 0.174579795529371, + "grad_norm": 0.9683852195739746, + "learning_rate": 3.533157994674485e-05, + "loss": 0.626, + "step": 12090 + }, + { + "epoch": 0.17502743603072834, + "grad_norm": 1.0338289737701416, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.6377, + "step": 12121 + }, + { + "epoch": 0.17547507653208572, + "grad_norm": 1.1238298416137695, + "learning_rate": 3.517839718344311e-05, + "loss": 0.6338, + "step": 12152 + }, + { + "epoch": 0.1759227170334431, + "grad_norm": 1.0541973114013672, + "learning_rate": 3.510163307656086e-05, + "loss": 0.6222, + "step": 12183 + }, + { + "epoch": 0.17637035753480043, + "grad_norm": 1.1677592992782593, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.6256, + "step": 12214 + }, + { + "epoch": 0.1768179980361578, + "grad_norm": 1.0191985368728638, + "learning_rate": 3.494776374368643e-05, + "loss": 0.6301, + "step": 12245 + }, + { + "epoch": 0.17726563853751517, + "grad_norm": 1.1535918712615967, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.6253, + "step": 12276 + }, + { + "epoch": 0.17771327903887252, + "grad_norm": 1.0887985229492188, + "learning_rate": 3.479344537543164e-05, + "loss": 0.6335, + "step": 12307 + }, + { + "epoch": 0.1781609195402299, + "grad_norm": 1.010688066482544, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.6251, + "step": 12338 + }, + { + "epoch": 0.17860856004158723, + "grad_norm": 0.9745127558708191, + "learning_rate": 3.463868493762412e-05, + "loss": 0.6241, + "step": 12369 + }, + { + "epoch": 0.1790562005429446, + "grad_norm": 1.0414716005325317, + "learning_rate": 3.456114112492418e-05, + "loss": 0.6237, + "step": 12400 + }, + { + "epoch": 0.17950384104430198, + "grad_norm": 1.0457465648651123, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.6258, + "step": 12431 + }, + { + "epoch": 0.17995148154565932, + "grad_norm": 1.0389049053192139, + "learning_rate": 3.440573068727905e-05, + "loss": 0.6262, + "step": 12462 + }, + { + "epoch": 0.1803991220470167, + "grad_norm": 1.255600094795227, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.6305, + "step": 12493 + }, + { + "epoch": 0.18084676254837406, + "grad_norm": 1.0340358018875122, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.6212, + "step": 12524 + }, + { + "epoch": 0.1812944030497314, + "grad_norm": 1.0317034721374512, + "learning_rate": 3.417182116258899e-05, + "loss": 0.6279, + "step": 12555 + }, + { + "epoch": 0.18174204355108878, + "grad_norm": 1.1320221424102783, + "learning_rate": 3.409364314116074e-05, + "loss": 0.631, + "step": 12586 + }, + { + "epoch": 0.18218968405244615, + "grad_norm": 0.9674787521362305, + "learning_rate": 3.401536249920559e-05, + "loss": 0.627, + "step": 12617 + }, + { + "epoch": 0.1826373245538035, + "grad_norm": 0.9329623579978943, + "learning_rate": 3.393698012010998e-05, + "loss": 0.6244, + "step": 12648 + }, + { + "epoch": 0.18308496505516086, + "grad_norm": 1.2081501483917236, + "learning_rate": 3.385849688840839e-05, + "loss": 0.6295, + "step": 12679 + }, + { + "epoch": 0.18353260555651824, + "grad_norm": 0.9842090010643005, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.6276, + "step": 12710 + }, + { + "epoch": 0.18398024605787558, + "grad_norm": 1.1417752504348755, + "learning_rate": 3.370123141100578e-05, + "loss": 0.6266, + "step": 12741 + }, + { + "epoch": 0.18442788655923295, + "grad_norm": 0.9693592190742493, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.6245, + "step": 12772 + }, + { + "epoch": 0.1848755270605903, + "grad_norm": 1.1920111179351807, + "learning_rate": 3.35435731658559e-05, + "loss": 0.6227, + "step": 12803 + }, + { + "epoch": 0.18532316756194767, + "grad_norm": 0.9865401387214661, + "learning_rate": 3.346459897862552e-05, + "loss": 0.6287, + "step": 12834 + }, + { + "epoch": 0.18577080806330504, + "grad_norm": 0.9544184803962708, + "learning_rate": 3.338552926954613e-05, + "loss": 0.6236, + "step": 12865 + }, + { + "epoch": 0.18621844856466238, + "grad_norm": 1.0202548503875732, + "learning_rate": 3.330636493090868e-05, + "loss": 0.6269, + "step": 12896 + }, + { + "epoch": 0.18666608906601975, + "grad_norm": 1.1385433673858643, + "learning_rate": 3.322710685607193e-05, + "loss": 0.6385, + "step": 12927 + }, + { + "epoch": 0.18711372956737712, + "grad_norm": 1.0102901458740234, + "learning_rate": 3.314775593945251e-05, + "loss": 0.6241, + "step": 12958 + }, + { + "epoch": 0.18756137006873447, + "grad_norm": 0.9830989241600037, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.6243, + "step": 12989 + }, + { + "epoch": 0.18800901057009184, + "grad_norm": 1.0044376850128174, + "learning_rate": 3.298877916376047e-05, + "loss": 0.619, + "step": 13020 + }, + { + "epoch": 0.1884566510714492, + "grad_norm": 1.0714712142944336, + "learning_rate": 3.290915509871915e-05, + "loss": 0.6243, + "step": 13051 + }, + { + "epoch": 0.18890429157280655, + "grad_norm": 0.9379229545593262, + "learning_rate": 3.282944177993753e-05, + "loss": 0.6216, + "step": 13082 + }, + { + "epoch": 0.18935193207416393, + "grad_norm": 1.2717514038085938, + "learning_rate": 3.274964010696957e-05, + "loss": 0.6206, + "step": 13113 + }, + { + "epoch": 0.1897995725755213, + "grad_norm": 1.1147576570510864, + "learning_rate": 3.266975098036629e-05, + "loss": 0.6234, + "step": 13144 + }, + { + "epoch": 0.19024721307687864, + "grad_norm": 0.9994730949401855, + "learning_rate": 3.258977530166562e-05, + "loss": 0.6146, + "step": 13175 + }, + { + "epoch": 0.190694853578236, + "grad_norm": 1.195367693901062, + "learning_rate": 3.250971397338227e-05, + "loss": 0.624, + "step": 13206 + }, + { + "epoch": 0.19114249407959336, + "grad_norm": 1.0008747577667236, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.6182, + "step": 13237 + }, + { + "epoch": 0.19159013458095073, + "grad_norm": 1.3223299980163574, + "learning_rate": 3.234933798294859e-05, + "loss": 0.6193, + "step": 13268 + }, + { + "epoch": 0.1920377750823081, + "grad_norm": 1.1946437358856201, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.6201, + "step": 13299 + }, + { + "epoch": 0.19248541558366544, + "grad_norm": 1.1597986221313477, + "learning_rate": 3.218863024832985e-05, + "loss": 0.6212, + "step": 13330 + }, + { + "epoch": 0.1929330560850228, + "grad_norm": 0.9518936276435852, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.6154, + "step": 13361 + }, + { + "epoch": 0.19338069658638019, + "grad_norm": 0.890487790107727, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.6203, + "step": 13392 + }, + { + "epoch": 0.19382833708773753, + "grad_norm": 0.9918534755706787, + "learning_rate": 3.194696249871729e-05, + "loss": 0.6319, + "step": 13423 + }, + { + "epoch": 0.1942759775890949, + "grad_norm": 1.1954073905944824, + "learning_rate": 3.186624857814164e-05, + "loss": 0.619, + "step": 13454 + }, + { + "epoch": 0.19472361809045227, + "grad_norm": 1.1521157026290894, + "learning_rate": 3.178545717288401e-05, + "loss": 0.6326, + "step": 13485 + }, + { + "epoch": 0.19517125859180962, + "grad_norm": 1.0131208896636963, + "learning_rate": 3.170458919466444e-05, + "loss": 0.6234, + "step": 13516 + }, + { + "epoch": 0.195618899093167, + "grad_norm": 1.0429494380950928, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.6146, + "step": 13547 + }, + { + "epoch": 0.19606653959452436, + "grad_norm": 0.9586461782455444, + "learning_rate": 3.154262717052985e-05, + "loss": 0.6192, + "step": 13578 + }, + { + "epoch": 0.1965141800958817, + "grad_norm": 0.9385515451431274, + "learning_rate": 3.146153495233426e-05, + "loss": 0.6186, + "step": 13609 + }, + { + "epoch": 0.19696182059723907, + "grad_norm": 0.9109722375869751, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.6223, + "step": 13640 + }, + { + "epoch": 0.19740946109859642, + "grad_norm": 1.0564444065093994, + "learning_rate": 3.129913267924946e-05, + "loss": 0.6235, + "step": 13671 + }, + { + "epoch": 0.1978571015999538, + "grad_norm": 1.1656286716461182, + "learning_rate": 3.121782445704782e-05, + "loss": 0.6176, + "step": 13702 + }, + { + "epoch": 0.19830474210131116, + "grad_norm": 1.1301069259643555, + "learning_rate": 3.11364460675423e-05, + "loss": 0.6253, + "step": 13733 + }, + { + "epoch": 0.1987523826026685, + "grad_norm": 0.9939395785331726, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.6223, + "step": 13764 + }, + { + "epoch": 0.19920002310402588, + "grad_norm": 1.2881885766983032, + "learning_rate": 3.097348246077728e-05, + "loss": 0.6177, + "step": 13795 + }, + { + "epoch": 0.19964766360538325, + "grad_norm": 1.1002579927444458, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.6139, + "step": 13826 + }, + { + "epoch": 0.2000953041067406, + "grad_norm": 1.045394778251648, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.6192, + "step": 13857 + }, + { + "epoch": 0.20054294460809796, + "grad_norm": 0.9559116959571838, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.6155, + "step": 13888 + }, + { + "epoch": 0.20099058510945533, + "grad_norm": 0.9250887036323547, + "learning_rate": 3.064675369851637e-05, + "loss": 0.6235, + "step": 13919 + }, + { + "epoch": 0.20143822561081268, + "grad_norm": 1.0655368566513062, + "learning_rate": 3.056490989455289e-05, + "loss": 0.628, + "step": 13950 + }, + { + "epoch": 0.20188586611217005, + "grad_norm": 1.07636559009552, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.6244, + "step": 13981 + }, + { + "epoch": 0.20233350661352742, + "grad_norm": 1.050580620765686, + "learning_rate": 3.040103481317539e-05, + "loss": 0.6222, + "step": 14012 + }, + { + "epoch": 0.20278114711488476, + "grad_norm": 1.3754404783248901, + "learning_rate": 3.03190053850694e-05, + "loss": 0.6151, + "step": 14043 + }, + { + "epoch": 0.20322878761624213, + "grad_norm": 1.0527547597885132, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.6153, + "step": 14074 + }, + { + "epoch": 0.20367642811759948, + "grad_norm": 0.9438226819038391, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.618, + "step": 14105 + }, + { + "epoch": 0.20412406861895685, + "grad_norm": 1.0383126735687256, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.6162, + "step": 14136 + }, + { + "epoch": 0.20457170912031422, + "grad_norm": 1.1412239074707031, + "learning_rate": 2.999029669712431e-05, + "loss": 0.6284, + "step": 14167 + }, + { + "epoch": 0.20501934962167156, + "grad_norm": 1.1064159870147705, + "learning_rate": 2.990797641805408e-05, + "loss": 0.6223, + "step": 14198 + }, + { + "epoch": 0.20546699012302894, + "grad_norm": 1.0044069290161133, + "learning_rate": 2.982560075313704e-05, + "loss": 0.6191, + "step": 14229 + }, + { + "epoch": 0.2059146306243863, + "grad_norm": 0.9315604567527771, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.6207, + "step": 14260 + }, + { + "epoch": 0.20636227112574365, + "grad_norm": 0.941224217414856, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.6207, + "step": 14291 + }, + { + "epoch": 0.20680991162710102, + "grad_norm": 1.1239089965820312, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.6252, + "step": 14322 + }, + { + "epoch": 0.2072575521284584, + "grad_norm": 0.9484926462173462, + "learning_rate": 2.949556283611942e-05, + "loss": 0.6136, + "step": 14353 + }, + { + "epoch": 0.20770519262981574, + "grad_norm": 0.9437084197998047, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.6154, + "step": 14384 + }, + { + "epoch": 0.2081528331311731, + "grad_norm": 0.9578093886375427, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.6191, + "step": 14415 + }, + { + "epoch": 0.20860047363253048, + "grad_norm": 1.0657248497009277, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.6178, + "step": 14446 + }, + { + "epoch": 0.20904811413388782, + "grad_norm": 0.853568434715271, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.6152, + "step": 14477 + }, + { + "epoch": 0.2094957546352452, + "grad_norm": 1.0403015613555908, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.6108, + "step": 14508 + }, + { + "epoch": 0.20994339513660254, + "grad_norm": 1.0344171524047852, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.6155, + "step": 14539 + }, + { + "epoch": 0.2103910356379599, + "grad_norm": 1.0755060911178589, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.6156, + "step": 14570 + }, + { + "epoch": 0.21083867613931728, + "grad_norm": 0.8636776208877563, + "learning_rate": 2.883311164593017e-05, + "loss": 0.6193, + "step": 14601 + }, + { + "epoch": 0.21128631664067463, + "grad_norm": 1.0264644622802734, + "learning_rate": 2.875010077160754e-05, + "loss": 0.6138, + "step": 14632 + }, + { + "epoch": 0.211733957142032, + "grad_norm": 1.2590196132659912, + "learning_rate": 2.866704757790741e-05, + "loss": 0.6202, + "step": 14663 + }, + { + "epoch": 0.21218159764338937, + "grad_norm": 1.1028645038604736, + "learning_rate": 2.858395300207376e-05, + "loss": 0.614, + "step": 14694 + }, + { + "epoch": 0.2126292381447467, + "grad_norm": 0.8904405236244202, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.6152, + "step": 14725 + }, + { + "epoch": 0.21307687864610408, + "grad_norm": 0.9810163974761963, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.6088, + "step": 14756 + }, + { + "epoch": 0.21352451914746146, + "grad_norm": 0.9837898015975952, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.6129, + "step": 14787 + }, + { + "epoch": 0.2139721596488188, + "grad_norm": 0.987639844417572, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.6081, + "step": 14818 + }, + { + "epoch": 0.21441980015017617, + "grad_norm": 1.1478586196899414, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.6146, + "step": 14849 + }, + { + "epoch": 0.21486744065153354, + "grad_norm": 1.0885242223739624, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.6183, + "step": 14880 + }, + { + "epoch": 0.21531508115289089, + "grad_norm": 0.9934699535369873, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.6157, + "step": 14911 + }, + { + "epoch": 0.21576272165424826, + "grad_norm": 0.9285492300987244, + "learning_rate": 2.791781925709473e-05, + "loss": 0.6196, + "step": 14942 + }, + { + "epoch": 0.2162103621556056, + "grad_norm": 1.243133783340454, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.6109, + "step": 14973 + }, + { + "epoch": 0.21665800265696297, + "grad_norm": 1.0712559223175049, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.6268, + "step": 15004 + }, + { + "epoch": 0.21710564315832034, + "grad_norm": 1.0762903690338135, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.6162, + "step": 15035 + }, + { + "epoch": 0.2175532836596777, + "grad_norm": 1.043479084968567, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.6174, + "step": 15066 + }, + { + "epoch": 0.21800092416103506, + "grad_norm": 0.9906991720199585, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.6153, + "step": 15097 + }, + { + "epoch": 0.21844856466239243, + "grad_norm": 0.8844815492630005, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.6113, + "step": 15128 + }, + { + "epoch": 0.21889620516374977, + "grad_norm": 1.0258604288101196, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.6111, + "step": 15159 + }, + { + "epoch": 0.21934384566510715, + "grad_norm": 0.8992047309875488, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.6083, + "step": 15190 + }, + { + "epoch": 0.21979148616646452, + "grad_norm": 0.991061806678772, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.6219, + "step": 15221 + }, + { + "epoch": 0.22023912666782186, + "grad_norm": 0.9700108766555786, + "learning_rate": 2.708224532974953e-05, + "loss": 0.6119, + "step": 15252 + }, + { + "epoch": 0.22068676716917923, + "grad_norm": 0.904680609703064, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.6135, + "step": 15283 + }, + { + "epoch": 0.2211344076705366, + "grad_norm": 0.9015173316001892, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.611, + "step": 15314 + }, + { + "epoch": 0.22158204817189395, + "grad_norm": 1.020070195198059, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.617, + "step": 15345 + }, + { + "epoch": 0.22202968867325132, + "grad_norm": 1.0938650369644165, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.6135, + "step": 15376 + }, + { + "epoch": 0.22247732917460866, + "grad_norm": 1.2179347276687622, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.6108, + "step": 15407 + }, + { + "epoch": 0.22292496967596603, + "grad_norm": 0.9314635396003723, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.6109, + "step": 15438 + }, + { + "epoch": 0.2233726101773234, + "grad_norm": 0.9413474798202515, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.6114, + "step": 15469 + }, + { + "epoch": 0.22382025067868075, + "grad_norm": 1.0556674003601074, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.6114, + "step": 15500 + }, + { + "epoch": 0.22426789118003812, + "grad_norm": 0.9029526114463806, + "learning_rate": 2.632819298478939e-05, + "loss": 0.6152, + "step": 15531 + }, + { + "epoch": 0.2247155316813955, + "grad_norm": 1.0554165840148926, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.6077, + "step": 15562 + }, + { + "epoch": 0.22516317218275284, + "grad_norm": 0.9897674918174744, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.6099, + "step": 15593 + }, + { + "epoch": 0.2256108126841102, + "grad_norm": 1.036055326461792, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.6115, + "step": 15624 + }, + { + "epoch": 0.22605845318546758, + "grad_norm": 1.1050103902816772, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.6132, + "step": 15655 + }, + { + "epoch": 0.22650609368682492, + "grad_norm": 1.0429555177688599, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.6114, + "step": 15686 + }, + { + "epoch": 0.2269537341881823, + "grad_norm": 0.8952310681343079, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.6155, + "step": 15717 + }, + { + "epoch": 0.22740137468953966, + "grad_norm": 0.9422932267189026, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.6115, + "step": 15748 + }, + { + "epoch": 0.227849015190897, + "grad_norm": 0.8615415096282959, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.6147, + "step": 15779 + }, + { + "epoch": 0.22829665569225438, + "grad_norm": 1.0953892469406128, + "learning_rate": 2.557292666450159e-05, + "loss": 0.6141, + "step": 15810 + }, + { + "epoch": 0.22874429619361172, + "grad_norm": 1.038050651550293, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.6118, + "step": 15841 + }, + { + "epoch": 0.2291919366949691, + "grad_norm": 1.0005477666854858, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.6071, + "step": 15872 + }, + { + "epoch": 0.22963957719632647, + "grad_norm": 1.1400154829025269, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.6073, + "step": 15903 + }, + { + "epoch": 0.2300872176976838, + "grad_norm": 0.9465575218200684, + "learning_rate": 2.52370435981567e-05, + "loss": 0.6139, + "step": 15934 + }, + { + "epoch": 0.23053485819904118, + "grad_norm": 0.9524116516113281, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.6112, + "step": 15965 + }, + { + "epoch": 0.23098249870039855, + "grad_norm": 1.0909959077835083, + "learning_rate": 2.506908222441045e-05, + "loss": 0.6062, + "step": 15996 + }, + { + "epoch": 0.2314301392017559, + "grad_norm": 0.9520925879478455, + "learning_rate": 2.498509989430187e-05, + "loss": 0.6066, + "step": 16027 + }, + { + "epoch": 0.23187777970311327, + "grad_norm": 0.9747080206871033, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.6073, + "step": 16058 + }, + { + "epoch": 0.23232542020447064, + "grad_norm": 0.8820034265518188, + "learning_rate": 2.481713668624899e-05, + "loss": 0.6042, + "step": 16089 + }, + { + "epoch": 0.23277306070582798, + "grad_norm": 0.873534619808197, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.6115, + "step": 16120 + }, + { + "epoch": 0.23322070120718535, + "grad_norm": 1.0529483556747437, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.604, + "step": 16151 + }, + { + "epoch": 0.23366834170854273, + "grad_norm": 1.0236808061599731, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.6109, + "step": 16182 + }, + { + "epoch": 0.23411598220990007, + "grad_norm": 0.926750898361206, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.6103, + "step": 16213 + }, + { + "epoch": 0.23456362271125744, + "grad_norm": 0.9616347551345825, + "learning_rate": 2.439728136286796e-05, + "loss": 0.6115, + "step": 16244 + }, + { + "epoch": 0.23501126321261478, + "grad_norm": 0.9181815981864929, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.6093, + "step": 16275 + }, + { + "epoch": 0.23545890371397216, + "grad_norm": 1.08785879611969, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.6056, + "step": 16306 + }, + { + "epoch": 0.23590654421532953, + "grad_norm": 1.3456270694732666, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.5999, + "step": 16337 + }, + { + "epoch": 0.23635418471668687, + "grad_norm": 0.9001489281654358, + "learning_rate": 2.406151384602059e-05, + "loss": 0.6147, + "step": 16368 + }, + { + "epoch": 0.23680182521804424, + "grad_norm": 0.9228829145431519, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.6089, + "step": 16399 + }, + { + "epoch": 0.23724946571940161, + "grad_norm": 0.8185672760009766, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.6064, + "step": 16430 + }, + { + "epoch": 0.23769710622075896, + "grad_norm": 0.848855197429657, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.6078, + "step": 16461 + }, + { + "epoch": 0.23814474672211633, + "grad_norm": 1.5285366773605347, + "learning_rate": 2.372591577780202e-05, + "loss": 0.6016, + "step": 16492 + }, + { + "epoch": 0.2385923872234737, + "grad_norm": 1.0771571397781372, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.6132, + "step": 16523 + }, + { + "epoch": 0.23904002772483104, + "grad_norm": 0.8987991809844971, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.6098, + "step": 16554 + }, + { + "epoch": 0.23948766822618842, + "grad_norm": 0.8981488943099976, + "learning_rate": 2.347436487983929e-05, + "loss": 0.6168, + "step": 16585 + }, + { + "epoch": 0.2399353087275458, + "grad_norm": 0.9029120802879333, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.6116, + "step": 16616 + }, + { + "epoch": 0.24038294922890313, + "grad_norm": 1.0437650680541992, + "learning_rate": 2.330674878704035e-05, + "loss": 0.599, + "step": 16647 + }, + { + "epoch": 0.2408305897302605, + "grad_norm": 0.9616511464118958, + "learning_rate": 2.322296892997561e-05, + "loss": 0.614, + "step": 16678 + }, + { + "epoch": 0.24127823023161785, + "grad_norm": 0.8985153436660767, + "learning_rate": 2.313920912646497e-05, + "loss": 0.6087, + "step": 16709 + }, + { + "epoch": 0.24172587073297522, + "grad_norm": 1.0448508262634277, + "learning_rate": 2.305547032172643e-05, + "loss": 0.6062, + "step": 16740 + }, + { + "epoch": 0.2421735112343326, + "grad_norm": 0.9185760021209717, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.6099, + "step": 16771 + }, + { + "epoch": 0.24262115173568993, + "grad_norm": 1.1951557397842407, + "learning_rate": 2.288805948824212e-05, + "loss": 0.6091, + "step": 16802 + }, + { + "epoch": 0.2430687922370473, + "grad_norm": 0.8947639465332031, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.6101, + "step": 16833 + }, + { + "epoch": 0.24351643273840468, + "grad_norm": 1.0335516929626465, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.6053, + "step": 16864 + }, + { + "epoch": 0.24396407323976202, + "grad_norm": 0.9719113111495972, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.6093, + "step": 16895 + }, + { + "epoch": 0.2444117137411194, + "grad_norm": 0.9017343521118164, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.6096, + "step": 16926 + }, + { + "epoch": 0.24485935424247676, + "grad_norm": 0.9254065155982971, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.6031, + "step": 16957 + }, + { + "epoch": 0.2453069947438341, + "grad_norm": 0.9550548195838928, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.6136, + "step": 16988 + }, + { + "epoch": 0.24575463524519148, + "grad_norm": 0.913746178150177, + "learning_rate": 2.230292185905114e-05, + "loss": 0.6041, + "step": 17019 + }, + { + "epoch": 0.24620227574654885, + "grad_norm": 1.0998092889785767, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.6031, + "step": 17050 + }, + { + "epoch": 0.2466499162479062, + "grad_norm": 0.8995510339736938, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.6043, + "step": 17081 + }, + { + "epoch": 0.24709755674926356, + "grad_norm": 1.0682373046875, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.6091, + "step": 17112 + }, + { + "epoch": 0.2475451972506209, + "grad_norm": 0.9658533334732056, + "learning_rate": 2.196920634473666e-05, + "loss": 0.6062, + "step": 17143 + }, + { + "epoch": 0.24799283775197828, + "grad_norm": 0.9547036290168762, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.6083, + "step": 17174 + }, + { + "epoch": 0.24844047825333565, + "grad_norm": 1.1252254247665405, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.6047, + "step": 17205 + }, + { + "epoch": 0.248888118754693, + "grad_norm": 0.8774239420890808, + "learning_rate": 2.171927553439363e-05, + "loss": 0.6091, + "step": 17236 + }, + { + "epoch": 0.24933575925605037, + "grad_norm": 0.9929160475730896, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.6037, + "step": 17267 + }, + { + "epoch": 0.24978339975740774, + "grad_norm": 1.0022073984146118, + "learning_rate": 2.155283853988844e-05, + "loss": 0.6106, + "step": 17298 + }, + { + "epoch": 0.2502310402587651, + "grad_norm": 0.9709188938140869, + "learning_rate": 2.146967792431106e-05, + "loss": 0.6043, + "step": 17329 + }, + { + "epoch": 0.25067868076012245, + "grad_norm": 0.9158416986465454, + "learning_rate": 2.138655714793849e-05, + "loss": 0.6002, + "step": 17360 + }, + { + "epoch": 0.2511263212614798, + "grad_norm": 1.045093059539795, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.6027, + "step": 17391 + }, + { + "epoch": 0.2515739617628372, + "grad_norm": 0.9029024243354797, + "learning_rate": 2.122043886437421e-05, + "loss": 0.6095, + "step": 17422 + }, + { + "epoch": 0.2520216022641945, + "grad_norm": 1.0147509574890137, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.6072, + "step": 17453 + }, + { + "epoch": 0.2524692427655519, + "grad_norm": 0.9794949293136597, + "learning_rate": 2.105449118766347e-05, + "loss": 0.6048, + "step": 17484 + }, + { + "epoch": 0.25291688326690925, + "grad_norm": 1.186495304107666, + "learning_rate": 2.097158366805287e-05, + "loss": 0.6079, + "step": 17515 + }, + { + "epoch": 0.2533645237682666, + "grad_norm": 0.9781451225280762, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.6033, + "step": 17546 + }, + { + "epoch": 0.253812164269624, + "grad_norm": 0.9464316964149475, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.6057, + "step": 17577 + }, + { + "epoch": 0.25425980477098137, + "grad_norm": 0.9456629753112793, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.6039, + "step": 17608 + }, + { + "epoch": 0.2547074452723387, + "grad_norm": 0.9119940400123596, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.6059, + "step": 17639 + }, + { + "epoch": 0.25515508577369606, + "grad_norm": 1.1009196043014526, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.6081, + "step": 17670 + }, + { + "epoch": 0.2556027262750534, + "grad_norm": 1.010501503944397, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.6081, + "step": 17701 + }, + { + "epoch": 0.2560503667764108, + "grad_norm": 0.9617831707000732, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.6066, + "step": 17732 + }, + { + "epoch": 0.25649800727776817, + "grad_norm": 1.0574779510498047, + "learning_rate": 2.031003855589343e-05, + "loss": 0.6025, + "step": 17763 + }, + { + "epoch": 0.2569456477791255, + "grad_norm": 0.9515939950942993, + "learning_rate": 2.022757379528727e-05, + "loss": 0.6147, + "step": 17794 + }, + { + "epoch": 0.25739328828048286, + "grad_norm": 0.8629471659660339, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.6018, + "step": 17825 + }, + { + "epoch": 0.25784092878184023, + "grad_norm": 1.0973188877105713, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.5974, + "step": 17856 + }, + { + "epoch": 0.2582885692831976, + "grad_norm": 1.0111137628555298, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.6007, + "step": 17887 + }, + { + "epoch": 0.25873620978455497, + "grad_norm": 0.938352644443512, + "learning_rate": 1.989826261153015e-05, + "loss": 0.6062, + "step": 17918 + }, + { + "epoch": 0.25918385028591234, + "grad_norm": 0.8754394054412842, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.5999, + "step": 17949 + }, + { + "epoch": 0.25963149078726966, + "grad_norm": 0.9272274374961853, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.5958, + "step": 17980 + }, + { + "epoch": 0.26007913128862703, + "grad_norm": 0.9161437749862671, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.6068, + "step": 18011 + }, + { + "epoch": 0.2605267717899844, + "grad_norm": 1.0073903799057007, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.6075, + "step": 18042 + }, + { + "epoch": 0.2609744122913418, + "grad_norm": 0.8590899705886841, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.6017, + "step": 18073 + }, + { + "epoch": 0.26142205279269914, + "grad_norm": 0.9320747256278992, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.6025, + "step": 18104 + }, + { + "epoch": 0.2618696932940565, + "grad_norm": 1.000109076499939, + "learning_rate": 1.932422022132275e-05, + "loss": 0.6025, + "step": 18135 + }, + { + "epoch": 0.26231733379541383, + "grad_norm": 0.8880858421325684, + "learning_rate": 1.924246297341414e-05, + "loss": 0.6029, + "step": 18166 + }, + { + "epoch": 0.2627649742967712, + "grad_norm": 0.9839984178543091, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.6009, + "step": 18197 + }, + { + "epoch": 0.2632126147981286, + "grad_norm": 0.9712537527084351, + "learning_rate": 1.907914431855156e-05, + "loss": 0.6005, + "step": 18228 + }, + { + "epoch": 0.26366025529948595, + "grad_norm": 0.9593982100486755, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.5967, + "step": 18259 + }, + { + "epoch": 0.2641078958008433, + "grad_norm": 0.9100329279899597, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.5926, + "step": 18290 + }, + { + "epoch": 0.26455553630220063, + "grad_norm": 0.8858036398887634, + "learning_rate": 1.883466975572098e-05, + "loss": 0.5972, + "step": 18321 + }, + { + "epoch": 0.265003176803558, + "grad_norm": 1.0127744674682617, + "learning_rate": 1.8753316159214312e-05, + "loss": 0.6029, + "step": 18352 + }, + { + "epoch": 0.2654508173049154, + "grad_norm": 0.9447472095489502, + "learning_rate": 1.8672033055685766e-05, + "loss": 0.6066, + "step": 18383 + }, + { + "epoch": 0.26589845780627275, + "grad_norm": 0.818134605884552, + "learning_rate": 1.8590821362404116e-05, + "loss": 0.5953, + "step": 18414 + }, + { + "epoch": 0.2663460983076301, + "grad_norm": 0.9979908466339111, + "learning_rate": 1.8509681995832294e-05, + "loss": 0.5978, + "step": 18445 + }, + { + "epoch": 0.2667937388089875, + "grad_norm": 0.8588074445724487, + "learning_rate": 1.8428615871617004e-05, + "loss": 0.6004, + "step": 18476 + }, + { + "epoch": 0.2672413793103448, + "grad_norm": 0.9643010497093201, + "learning_rate": 1.8347623904578448e-05, + "loss": 0.6071, + "step": 18507 + }, + { + "epoch": 0.2676890198117022, + "grad_norm": 0.8365680575370789, + "learning_rate": 1.8266707008699975e-05, + "loss": 0.5998, + "step": 18538 + }, + { + "epoch": 0.26813666031305955, + "grad_norm": 0.8986954689025879, + "learning_rate": 1.818586609711774e-05, + "loss": 0.5982, + "step": 18569 + }, + { + "epoch": 0.2685843008144169, + "grad_norm": 1.0341336727142334, + "learning_rate": 1.8105102082110462e-05, + "loss": 0.6008, + "step": 18600 + }, + { + "epoch": 0.2690319413157743, + "grad_norm": 1.0030567646026611, + "learning_rate": 1.8024415875089058e-05, + "loss": 0.6011, + "step": 18631 + }, + { + "epoch": 0.2694795818171316, + "grad_norm": 0.9385823607444763, + "learning_rate": 1.7943808386586407e-05, + "loss": 0.6077, + "step": 18662 + }, + { + "epoch": 0.269927222318489, + "grad_norm": 0.8827871680259705, + "learning_rate": 1.7863280526247073e-05, + "loss": 0.6073, + "step": 18693 + }, + { + "epoch": 0.27037486281984635, + "grad_norm": 0.9739916324615479, + "learning_rate": 1.7782833202817003e-05, + "loss": 0.595, + "step": 18724 + }, + { + "epoch": 0.2708225033212037, + "grad_norm": 0.9108980298042297, + "learning_rate": 1.7702467324133327e-05, + "loss": 0.587, + "step": 18755 + }, + { + "epoch": 0.2712701438225611, + "grad_norm": 1.0579863786697388, + "learning_rate": 1.7622183797114042e-05, + "loss": 0.6043, + "step": 18786 + }, + { + "epoch": 0.27171778432391847, + "grad_norm": 0.9881874322891235, + "learning_rate": 1.7541983527747838e-05, + "loss": 0.5905, + "step": 18817 + }, + { + "epoch": 0.2721654248252758, + "grad_norm": 0.9560896158218384, + "learning_rate": 1.746186742108387e-05, + "loss": 0.6033, + "step": 18848 + }, + { + "epoch": 0.27261306532663315, + "grad_norm": 0.9506632685661316, + "learning_rate": 1.73818363812215e-05, + "loss": 0.5935, + "step": 18879 + }, + { + "epoch": 0.2730607058279905, + "grad_norm": 0.9935999512672424, + "learning_rate": 1.7301891311300153e-05, + "loss": 0.5997, + "step": 18910 + }, + { + "epoch": 0.2735083463293479, + "grad_norm": 0.9102685451507568, + "learning_rate": 1.7222033113489055e-05, + "loss": 0.5982, + "step": 18941 + }, + { + "epoch": 0.27395598683070527, + "grad_norm": 1.0436829328536987, + "learning_rate": 1.7142262688977127e-05, + "loss": 0.603, + "step": 18972 + }, + { + "epoch": 0.27440362733206264, + "grad_norm": 1.0441209077835083, + "learning_rate": 1.7062580937962764e-05, + "loss": 0.5957, + "step": 19003 + }, + { + "epoch": 0.27485126783341995, + "grad_norm": 0.9903119206428528, + "learning_rate": 1.698298875964369e-05, + "loss": 0.5972, + "step": 19034 + }, + { + "epoch": 0.2752989083347773, + "grad_norm": 0.8914598226547241, + "learning_rate": 1.690348705220684e-05, + "loss": 0.6027, + "step": 19065 + }, + { + "epoch": 0.2757465488361347, + "grad_norm": 0.9678306579589844, + "learning_rate": 1.6824076712818156e-05, + "loss": 0.5999, + "step": 19096 + }, + { + "epoch": 0.27619418933749207, + "grad_norm": 0.9159491658210754, + "learning_rate": 1.6744758637612533e-05, + "loss": 0.6029, + "step": 19127 + }, + { + "epoch": 0.27664182983884944, + "grad_norm": 1.0948030948638916, + "learning_rate": 1.6665533721683664e-05, + "loss": 0.604, + "step": 19158 + }, + { + "epoch": 0.27708947034020676, + "grad_norm": 0.866001546382904, + "learning_rate": 1.6586402859073974e-05, + "loss": 0.5863, + "step": 19189 + }, + { + "epoch": 0.2775371108415641, + "grad_norm": 1.0278693437576294, + "learning_rate": 1.6507366942764463e-05, + "loss": 0.5937, + "step": 19220 + }, + { + "epoch": 0.2779847513429215, + "grad_norm": 0.9074748158454895, + "learning_rate": 1.6428426864664732e-05, + "loss": 0.602, + "step": 19251 + }, + { + "epoch": 0.27843239184427887, + "grad_norm": 0.9951406717300415, + "learning_rate": 1.6349583515602816e-05, + "loss": 0.5982, + "step": 19282 + }, + { + "epoch": 0.27888003234563624, + "grad_norm": 1.0565474033355713, + "learning_rate": 1.6270837785315208e-05, + "loss": 0.6008, + "step": 19313 + }, + { + "epoch": 0.2793276728469936, + "grad_norm": 0.9266191124916077, + "learning_rate": 1.619219056243676e-05, + "loss": 0.5994, + "step": 19344 + }, + { + "epoch": 0.27977531334835093, + "grad_norm": 0.8990464806556702, + "learning_rate": 1.6113642734490698e-05, + "loss": 0.5984, + "step": 19375 + }, + { + "epoch": 0.2802229538497083, + "grad_norm": 0.9231170415878296, + "learning_rate": 1.6035195187878577e-05, + "loss": 0.5952, + "step": 19406 + }, + { + "epoch": 0.28067059435106567, + "grad_norm": 1.035946011543274, + "learning_rate": 1.5956848807870305e-05, + "loss": 0.5985, + "step": 19437 + }, + { + "epoch": 0.28111823485242304, + "grad_norm": 0.8787546157836914, + "learning_rate": 1.587860447859413e-05, + "loss": 0.5999, + "step": 19468 + }, + { + "epoch": 0.2815658753537804, + "grad_norm": 0.8387063145637512, + "learning_rate": 1.5800463083026686e-05, + "loss": 0.5973, + "step": 19499 + }, + { + "epoch": 0.28201351585513773, + "grad_norm": 1.025985598564148, + "learning_rate": 1.572242550298298e-05, + "loss": 0.597, + "step": 19530 + }, + { + "epoch": 0.2824611563564951, + "grad_norm": 0.9072343707084656, + "learning_rate": 1.56444926191065e-05, + "loss": 0.5868, + "step": 19561 + }, + { + "epoch": 0.2829087968578525, + "grad_norm": 0.9914515614509583, + "learning_rate": 1.5566665310859257e-05, + "loss": 0.5926, + "step": 19592 + }, + { + "epoch": 0.28335643735920985, + "grad_norm": 0.9568142294883728, + "learning_rate": 1.5488944456511846e-05, + "loss": 0.6023, + "step": 19623 + }, + { + "epoch": 0.2838040778605672, + "grad_norm": 0.8508808016777039, + "learning_rate": 1.5411330933133546e-05, + "loss": 0.5991, + "step": 19654 + }, + { + "epoch": 0.2842517183619246, + "grad_norm": 0.9583558440208435, + "learning_rate": 1.533382561658241e-05, + "loss": 0.6031, + "step": 19685 + }, + { + "epoch": 0.2846993588632819, + "grad_norm": 0.9079626798629761, + "learning_rate": 1.525642938149541e-05, + "loss": 0.6021, + "step": 19716 + }, + { + "epoch": 0.2851469993646393, + "grad_norm": 0.8839224576950073, + "learning_rate": 1.5179143101278536e-05, + "loss": 0.5974, + "step": 19747 + }, + { + "epoch": 0.28559463986599665, + "grad_norm": 0.9244747161865234, + "learning_rate": 1.5101967648096955e-05, + "loss": 0.5951, + "step": 19778 + }, + { + "epoch": 0.286042280367354, + "grad_norm": 0.937430202960968, + "learning_rate": 1.5024903892865172e-05, + "loss": 0.5981, + "step": 19809 + }, + { + "epoch": 0.2864899208687114, + "grad_norm": 1.0390359163284302, + "learning_rate": 1.4947952705237184e-05, + "loss": 0.6017, + "step": 19840 + }, + { + "epoch": 0.28693756137006876, + "grad_norm": 0.9726883172988892, + "learning_rate": 1.4871114953596682e-05, + "loss": 0.5956, + "step": 19871 + }, + { + "epoch": 0.2873852018714261, + "grad_norm": 0.8611225485801697, + "learning_rate": 1.4794391505047256e-05, + "loss": 0.5875, + "step": 19902 + }, + { + "epoch": 0.28783284237278345, + "grad_norm": 0.9599292278289795, + "learning_rate": 1.4717783225402596e-05, + "loss": 0.5948, + "step": 19933 + }, + { + "epoch": 0.2882804828741408, + "grad_norm": 0.9473167061805725, + "learning_rate": 1.4641290979176735e-05, + "loss": 0.5967, + "step": 19964 + }, + { + "epoch": 0.2887281233754982, + "grad_norm": 0.9631912708282471, + "learning_rate": 1.4564915629574246e-05, + "loss": 0.5962, + "step": 19995 + }, + { + "epoch": 0.28917576387685556, + "grad_norm": 0.9674975872039795, + "learning_rate": 1.4488658038480601e-05, + "loss": 0.59, + "step": 20026 + }, + { + "epoch": 0.2896234043782129, + "grad_norm": 1.1209561824798584, + "learning_rate": 1.4412519066452323e-05, + "loss": 0.6032, + "step": 20057 + }, + { + "epoch": 0.29007104487957025, + "grad_norm": 0.9360538125038147, + "learning_rate": 1.4336499572707373e-05, + "loss": 0.5975, + "step": 20088 + }, + { + "epoch": 0.2905186853809276, + "grad_norm": 0.9791879653930664, + "learning_rate": 1.4260600415115433e-05, + "loss": 0.6051, + "step": 20119 + }, + { + "epoch": 0.290966325882285, + "grad_norm": 1.0199767351150513, + "learning_rate": 1.4184822450188137e-05, + "loss": 0.5912, + "step": 20150 + }, + { + "epoch": 0.29141396638364236, + "grad_norm": 0.8803568482398987, + "learning_rate": 1.410916653306954e-05, + "loss": 0.6007, + "step": 20181 + }, + { + "epoch": 0.29186160688499974, + "grad_norm": 0.9544051289558411, + "learning_rate": 1.403363351752639e-05, + "loss": 0.5901, + "step": 20212 + }, + { + "epoch": 0.29230924738635705, + "grad_norm": 1.0661756992340088, + "learning_rate": 1.3958224255938485e-05, + "loss": 0.595, + "step": 20243 + }, + { + "epoch": 0.2927568878877144, + "grad_norm": 0.9343761801719666, + "learning_rate": 1.388293959928911e-05, + "loss": 0.6016, + "step": 20274 + }, + { + "epoch": 0.2932045283890718, + "grad_norm": 1.0200270414352417, + "learning_rate": 1.3807780397155379e-05, + "loss": 0.6, + "step": 20305 + }, + { + "epoch": 0.29365216889042917, + "grad_norm": 0.8452933430671692, + "learning_rate": 1.3732747497698655e-05, + "loss": 0.5978, + "step": 20336 + }, + { + "epoch": 0.29409980939178654, + "grad_norm": 1.000546932220459, + "learning_rate": 1.3657841747655038e-05, + "loss": 0.5933, + "step": 20367 + }, + { + "epoch": 0.29454744989314385, + "grad_norm": 0.9047265648841858, + "learning_rate": 1.3583063992325706e-05, + "loss": 0.5954, + "step": 20398 + }, + { + "epoch": 0.2949950903945012, + "grad_norm": 0.877160906791687, + "learning_rate": 1.3508415075567496e-05, + "loss": 0.5921, + "step": 20429 + }, + { + "epoch": 0.2954427308958586, + "grad_norm": 1.0855872631072998, + "learning_rate": 1.343389583978327e-05, + "loss": 0.5992, + "step": 20460 + }, + { + "epoch": 0.29589037139721597, + "grad_norm": 1.006057620048523, + "learning_rate": 1.3359507125912468e-05, + "loss": 0.5916, + "step": 20491 + }, + { + "epoch": 0.29633801189857334, + "grad_norm": 1.003037929534912, + "learning_rate": 1.3285249773421627e-05, + "loss": 0.5918, + "step": 20522 + }, + { + "epoch": 0.2967856523999307, + "grad_norm": 0.9983749389648438, + "learning_rate": 1.3211124620294884e-05, + "loss": 0.5922, + "step": 20553 + }, + { + "epoch": 0.297233292901288, + "grad_norm": 1.0387030839920044, + "learning_rate": 1.313713250302451e-05, + "loss": 0.5991, + "step": 20584 + }, + { + "epoch": 0.2976809334026454, + "grad_norm": 0.8586576581001282, + "learning_rate": 1.3063274256601479e-05, + "loss": 0.6001, + "step": 20615 + }, + { + "epoch": 0.29812857390400277, + "grad_norm": 0.9596696496009827, + "learning_rate": 1.2989550714506086e-05, + "loss": 0.5988, + "step": 20646 + }, + { + "epoch": 0.29857621440536014, + "grad_norm": 0.9584054350852966, + "learning_rate": 1.291596270869846e-05, + "loss": 0.5946, + "step": 20677 + }, + { + "epoch": 0.2990238549067175, + "grad_norm": 1.02814519405365, + "learning_rate": 1.284251106960927e-05, + "loss": 0.5941, + "step": 20708 + }, + { + "epoch": 0.2994714954080749, + "grad_norm": 1.1163685321807861, + "learning_rate": 1.2769196626130263e-05, + "loss": 0.6002, + "step": 20739 + }, + { + "epoch": 0.2999191359094322, + "grad_norm": 0.9234864711761475, + "learning_rate": 1.2696020205604969e-05, + "loss": 0.5919, + "step": 20770 + }, + { + "epoch": 0.30036677641078957, + "grad_norm": 0.9402379393577576, + "learning_rate": 1.2622982633819359e-05, + "loss": 0.5931, + "step": 20801 + }, + { + "epoch": 0.30081441691214694, + "grad_norm": 0.9681121110916138, + "learning_rate": 1.2550084734992484e-05, + "loss": 0.5904, + "step": 20832 + }, + { + "epoch": 0.3012620574135043, + "grad_norm": 0.9508892893791199, + "learning_rate": 1.247732733176724e-05, + "loss": 0.5997, + "step": 20863 + }, + { + "epoch": 0.3017096979148617, + "grad_norm": 0.8664924502372742, + "learning_rate": 1.2404711245201044e-05, + "loss": 0.594, + "step": 20894 + }, + { + "epoch": 0.302157338416219, + "grad_norm": 0.8919743299484253, + "learning_rate": 1.2332237294756535e-05, + "loss": 0.5873, + "step": 20925 + }, + { + "epoch": 0.3026049789175764, + "grad_norm": 0.9090976119041443, + "learning_rate": 1.225990629829241e-05, + "loss": 0.5966, + "step": 20956 + }, + { + "epoch": 0.30305261941893374, + "grad_norm": 0.8878434896469116, + "learning_rate": 1.2187719072054136e-05, + "loss": 0.5939, + "step": 20987 + }, + { + "epoch": 0.3035002599202911, + "grad_norm": 0.8897850513458252, + "learning_rate": 1.2115676430664735e-05, + "loss": 0.5978, + "step": 21018 + }, + { + "epoch": 0.3039479004216485, + "grad_norm": 0.8866651654243469, + "learning_rate": 1.2043779187115647e-05, + "loss": 0.5873, + "step": 21049 + }, + { + "epoch": 0.30439554092300586, + "grad_norm": 0.8519348502159119, + "learning_rate": 1.1972028152757476e-05, + "loss": 0.5991, + "step": 21080 + }, + { + "epoch": 0.3048431814243632, + "grad_norm": 1.146201252937317, + "learning_rate": 1.1900424137290889e-05, + "loss": 0.5928, + "step": 21111 + }, + { + "epoch": 0.30529082192572055, + "grad_norm": 1.0777043104171753, + "learning_rate": 1.1828967948757482e-05, + "loss": 0.5966, + "step": 21142 + }, + { + "epoch": 0.3057384624270779, + "grad_norm": 1.0404378175735474, + "learning_rate": 1.175766039353062e-05, + "loss": 0.607, + "step": 21173 + }, + { + "epoch": 0.3061861029284353, + "grad_norm": 0.8684154152870178, + "learning_rate": 1.1686502276306382e-05, + "loss": 0.5992, + "step": 21204 + }, + { + "epoch": 0.30663374342979266, + "grad_norm": 0.9449039101600647, + "learning_rate": 1.1615494400094445e-05, + "loss": 0.5937, + "step": 21235 + }, + { + "epoch": 0.30708138393115, + "grad_norm": 0.9459973573684692, + "learning_rate": 1.1544637566209029e-05, + "loss": 0.5953, + "step": 21266 + }, + { + "epoch": 0.30752902443250735, + "grad_norm": 0.8467513918876648, + "learning_rate": 1.1473932574259886e-05, + "loss": 0.5937, + "step": 21297 + }, + { + "epoch": 0.3079766649338647, + "grad_norm": 0.953157901763916, + "learning_rate": 1.1403380222143247e-05, + "loss": 0.592, + "step": 21328 + }, + { + "epoch": 0.3084243054352221, + "grad_norm": 0.9762019515037537, + "learning_rate": 1.1332981306032808e-05, + "loss": 0.6009, + "step": 21359 + }, + { + "epoch": 0.30887194593657946, + "grad_norm": 0.7931903600692749, + "learning_rate": 1.1262736620370762e-05, + "loss": 0.5951, + "step": 21390 + }, + { + "epoch": 0.30931958643793683, + "grad_norm": 1.042128324508667, + "learning_rate": 1.1192646957858854e-05, + "loss": 0.5951, + "step": 21421 + }, + { + "epoch": 0.30976722693929415, + "grad_norm": 0.9942502379417419, + "learning_rate": 1.1122713109449381e-05, + "loss": 0.5945, + "step": 21452 + }, + { + "epoch": 0.3102148674406515, + "grad_norm": 0.9771155118942261, + "learning_rate": 1.105293586433634e-05, + "loss": 0.5929, + "step": 21483 + }, + { + "epoch": 0.3106625079420089, + "grad_norm": 0.9390444159507751, + "learning_rate": 1.0983316009946446e-05, + "loss": 0.5944, + "step": 21514 + }, + { + "epoch": 0.31111014844336626, + "grad_norm": 0.9289217591285706, + "learning_rate": 1.0913854331930282e-05, + "loss": 0.5902, + "step": 21545 + }, + { + "epoch": 0.31155778894472363, + "grad_norm": 0.8977670669555664, + "learning_rate": 1.0844551614153456e-05, + "loss": 0.5951, + "step": 21576 + }, + { + "epoch": 0.312005429446081, + "grad_norm": 0.9898940920829773, + "learning_rate": 1.0775408638687725e-05, + "loss": 0.5997, + "step": 21607 + }, + { + "epoch": 0.3124530699474383, + "grad_norm": 0.9756447076797485, + "learning_rate": 1.0706426185802165e-05, + "loss": 0.5969, + "step": 21638 + }, + { + "epoch": 0.3129007104487957, + "grad_norm": 1.0475540161132812, + "learning_rate": 1.0637605033954371e-05, + "loss": 0.5908, + "step": 21669 + }, + { + "epoch": 0.31334835095015307, + "grad_norm": 0.9765790700912476, + "learning_rate": 1.05689459597817e-05, + "loss": 0.5903, + "step": 21700 + }, + { + "epoch": 0.31379599145151044, + "grad_norm": 0.8677023649215698, + "learning_rate": 1.050044973809246e-05, + "loss": 0.5907, + "step": 21731 + }, + { + "epoch": 0.3142436319528678, + "grad_norm": 0.937731921672821, + "learning_rate": 1.043211714185722e-05, + "loss": 0.603, + "step": 21762 + }, + { + "epoch": 0.3146912724542251, + "grad_norm": 0.8233932256698608, + "learning_rate": 1.036394894220003e-05, + "loss": 0.5955, + "step": 21793 + }, + { + "epoch": 0.3151389129555825, + "grad_norm": 1.1260769367218018, + "learning_rate": 1.0295945908389751e-05, + "loss": 0.605, + "step": 21824 + }, + { + "epoch": 0.31558655345693987, + "grad_norm": 0.9366801977157593, + "learning_rate": 1.0228108807831393e-05, + "loss": 0.5963, + "step": 21855 + }, + { + "epoch": 0.31603419395829724, + "grad_norm": 0.871155321598053, + "learning_rate": 1.01604384060574e-05, + "loss": 0.5876, + "step": 21886 + }, + { + "epoch": 0.3164818344596546, + "grad_norm": 0.9532550573348999, + "learning_rate": 1.009293546671907e-05, + "loss": 0.5863, + "step": 21917 + }, + { + "epoch": 0.316929474961012, + "grad_norm": 1.045569658279419, + "learning_rate": 1.002560075157791e-05, + "loss": 0.5899, + "step": 21948 + }, + { + "epoch": 0.3173771154623693, + "grad_norm": 0.9291980862617493, + "learning_rate": 9.958435020496995e-06, + "loss": 0.5904, + "step": 21979 + }, + { + "epoch": 0.31782475596372667, + "grad_norm": 0.8881365060806274, + "learning_rate": 9.89143903143249e-06, + "loss": 0.5997, + "step": 22010 + }, + { + "epoch": 0.31827239646508404, + "grad_norm": 0.9601870179176331, + "learning_rate": 9.824613540425038e-06, + "loss": 0.5965, + "step": 22041 + }, + { + "epoch": 0.3187200369664414, + "grad_norm": 0.8519198298454285, + "learning_rate": 9.757959301591197e-06, + "loss": 0.5887, + "step": 22072 + }, + { + "epoch": 0.3191676774677988, + "grad_norm": 0.9262305498123169, + "learning_rate": 9.691477067115017e-06, + "loss": 0.5846, + "step": 22103 + }, + { + "epoch": 0.3196153179691561, + "grad_norm": 1.0259448289871216, + "learning_rate": 9.625167587239467e-06, + "loss": 0.5865, + "step": 22134 + }, + { + "epoch": 0.32006295847051347, + "grad_norm": 0.9057780504226685, + "learning_rate": 9.559031610258007e-06, + "loss": 0.592, + "step": 22165 + }, + { + "epoch": 0.32051059897187084, + "grad_norm": 0.905604362487793, + "learning_rate": 9.493069882506164e-06, + "loss": 0.5958, + "step": 22196 + }, + { + "epoch": 0.3209582394732282, + "grad_norm": 0.8837811946868896, + "learning_rate": 9.427283148353056e-06, + "loss": 0.5955, + "step": 22227 + }, + { + "epoch": 0.3214058799745856, + "grad_norm": 0.9125133752822876, + "learning_rate": 9.361672150193052e-06, + "loss": 0.5915, + "step": 22258 + }, + { + "epoch": 0.32185352047594296, + "grad_norm": 0.8553541898727417, + "learning_rate": 9.29623762843734e-06, + "loss": 0.586, + "step": 22289 + }, + { + "epoch": 0.32230116097730027, + "grad_norm": 0.8609781861305237, + "learning_rate": 9.230980321505594e-06, + "loss": 0.5867, + "step": 22320 + }, + { + "epoch": 0.32274880147865764, + "grad_norm": 0.8896780014038086, + "learning_rate": 9.165900965817668e-06, + "loss": 0.5862, + "step": 22351 + }, + { + "epoch": 0.323196441980015, + "grad_norm": 1.0318437814712524, + "learning_rate": 9.101000295785245e-06, + "loss": 0.5906, + "step": 22382 + }, + { + "epoch": 0.3236440824813724, + "grad_norm": 1.0346667766571045, + "learning_rate": 9.036279043803565e-06, + "loss": 0.594, + "step": 22413 + }, + { + "epoch": 0.32409172298272976, + "grad_norm": 0.899023175239563, + "learning_rate": 8.971737940243147e-06, + "loss": 0.5983, + "step": 22444 + }, + { + "epoch": 0.32453936348408713, + "grad_norm": 0.8427733182907104, + "learning_rate": 8.907377713441592e-06, + "loss": 0.5928, + "step": 22475 + }, + { + "epoch": 0.32498700398544444, + "grad_norm": 0.8469851613044739, + "learning_rate": 8.843199089695293e-06, + "loss": 0.5867, + "step": 22506 + }, + { + "epoch": 0.3254346444868018, + "grad_norm": 0.8703016638755798, + "learning_rate": 8.779202793251311e-06, + "loss": 0.5894, + "step": 22537 + }, + { + "epoch": 0.3258822849881592, + "grad_norm": 0.9438649415969849, + "learning_rate": 8.715389546299149e-06, + "loss": 0.5949, + "step": 22568 + }, + { + "epoch": 0.32632992548951656, + "grad_norm": 0.8361387848854065, + "learning_rate": 8.651760068962617e-06, + "loss": 0.5956, + "step": 22599 + }, + { + "epoch": 0.32677756599087393, + "grad_norm": 0.8810434341430664, + "learning_rate": 8.588315079291733e-06, + "loss": 0.5904, + "step": 22630 + }, + { + "epoch": 0.32722520649223125, + "grad_norm": 0.9140039682388306, + "learning_rate": 8.52505529325457e-06, + "loss": 0.5871, + "step": 22661 + }, + { + "epoch": 0.3276728469935886, + "grad_norm": 0.8848084211349487, + "learning_rate": 8.461981424729216e-06, + "loss": 0.5973, + "step": 22692 + }, + { + "epoch": 0.328120487494946, + "grad_norm": 0.8551177382469177, + "learning_rate": 8.399094185495725e-06, + "loss": 0.5925, + "step": 22723 + }, + { + "epoch": 0.32856812799630336, + "grad_norm": 0.9873132705688477, + "learning_rate": 8.336394285228017e-06, + "loss": 0.592, + "step": 22754 + }, + { + "epoch": 0.32901576849766073, + "grad_norm": 0.9582761526107788, + "learning_rate": 8.273882431485952e-06, + "loss": 0.5957, + "step": 22785 + }, + { + "epoch": 0.3294634089990181, + "grad_norm": 0.9337429404258728, + "learning_rate": 8.211559329707316e-06, + "loss": 0.5893, + "step": 22816 + }, + { + "epoch": 0.3299110495003754, + "grad_norm": 0.8926681280136108, + "learning_rate": 8.149425683199823e-06, + "loss": 0.593, + "step": 22847 + }, + { + "epoch": 0.3303586900017328, + "grad_norm": 0.8568328022956848, + "learning_rate": 8.08748219313325e-06, + "loss": 0.5895, + "step": 22878 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 7630, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6887420591349432e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-22890/training_args.bin b/checkpoint-22890/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3ec322270d2d1a136ed4e6ec9a7cdf4837f5f542 --- /dev/null +++ b/checkpoint-22890/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4073c7e634db6217c9c50dbc3e636b8c05df4a3686548baa23c4d653fa3409e3 +size 5432 diff --git a/checkpoint-30517/config.json b/checkpoint-30517/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a8b10d032ea5b0e15388eb607d24979f38775a09 --- /dev/null +++ b/checkpoint-30517/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.47.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-30517/generation_config.json b/checkpoint-30517/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..73099b4966679656fb84b344ea08b348a6a6a507 --- /dev/null +++ b/checkpoint-30517/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.47.0.dev0" +} diff --git a/checkpoint-30517/model-00001-of-00007.safetensors b/checkpoint-30517/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..542f9d7381f168eb0b0a63a75a9adf93a5deee06 --- /dev/null +++ b/checkpoint-30517/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13878f97ef55d85d9b352c717dba406c909afe1bae3e88a6a4424a428c0bccc6 +size 4886466168 diff --git a/checkpoint-30517/model-00002-of-00007.safetensors b/checkpoint-30517/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1f266ba474079d34baf9b2a204bc3cecae676961 --- /dev/null +++ b/checkpoint-30517/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0901d2e156c6ec7a375c3e90ae5183970af0445d3dc0e53c031b45673775dd64 +size 4832007448 diff --git a/checkpoint-30517/model-00003-of-00007.safetensors b/checkpoint-30517/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5501e045b181a6449b2b0997f4ca9f8ce68156ff --- /dev/null +++ b/checkpoint-30517/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afa0e22bcade9527b5aeb86066e139cc30979f8492c753ff3277e6189863ab97 +size 4999813112 diff --git a/checkpoint-30517/model-00004-of-00007.safetensors b/checkpoint-30517/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0ea5edb447c5d0bf0491276907abf457c5b0f94a --- /dev/null +++ b/checkpoint-30517/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc4f5352f30ea036b0a4e24ba9b029dd94a2b30a716c81acc1c4a5e7bb47a042 +size 4999813128 diff --git a/checkpoint-30517/model-00005-of-00007.safetensors b/checkpoint-30517/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d308635d86f98645d1b129703f4ba4ecc6a84a89 --- /dev/null +++ b/checkpoint-30517/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d9450b209df4cb026b6669e618fcdb6ff4f5bd0a711fb2e502afe0b95eb3ad7 +size 4832007496 diff --git a/checkpoint-30517/model-00006-of-00007.safetensors b/checkpoint-30517/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..524e69fde1febd3510b4725ea7cfc5103d79e8d1 --- /dev/null +++ b/checkpoint-30517/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edf01cd5fe74bfd002b701e409ad8e68a02b64c59f6cfb2302b9427953c464f3 +size 4999813120 diff --git a/checkpoint-30517/model-00007-of-00007.safetensors b/checkpoint-30517/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b73e0db4c4b0097276bbc78a364a5ff57002d272 --- /dev/null +++ b/checkpoint-30517/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9dd6df4fc7f009aa39a3b019a55cb97a8509bb44690419b3255311faaf9e89b +size 2571158184 diff --git a/checkpoint-30517/model.safetensors.index.json b/checkpoint-30517/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-30517/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-30517/optimizer.pt b/checkpoint-30517/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..82e8d1479868d74224afb62a58ff9a716f173ee2 --- /dev/null +++ b/checkpoint-30517/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78882a3bd9b13b3cc04e81dbcb7a5549988fdbd8420fa4d16b5bf8114af0a37e +size 15385036334 diff --git a/checkpoint-30517/rng_state.pth b/checkpoint-30517/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..93a52228ea16468eaab41b66555ced3eaaea002c --- /dev/null +++ b/checkpoint-30517/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69 +size 14244 diff --git a/checkpoint-30517/scheduler.pt b/checkpoint-30517/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d585b62f4a9bc8119a4f1d03cf2bb269e99411b --- /dev/null +++ b/checkpoint-30517/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d91c597306f26392e8da0d33a09fdcca77cfdeb5ad42248045521772fa3d64e +size 1064 diff --git a/checkpoint-30517/trainer_state.json b/checkpoint-30517/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..70ec5cf1ca59042ec3c40d0130e41247e284e3a1 --- /dev/null +++ b/checkpoint-30517/trainer_state.json @@ -0,0 +1,6921 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4406659735458904, + "eval_steps": 500, + "global_step": 30517, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004476405013573615, + "grad_norm": 4.6696085929870605, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.9366, + "step": 31 + }, + { + "epoch": 0.000895281002714723, + "grad_norm": 4.250915050506592, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.9002, + "step": 62 + }, + { + "epoch": 0.0013429215040720846, + "grad_norm": 4.424270153045654, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.8843, + "step": 93 + }, + { + "epoch": 0.001790562005429446, + "grad_norm": 4.56964635848999, + "learning_rate": 4.062909567496723e-06, + "loss": 0.8717, + "step": 124 + }, + { + "epoch": 0.0022382025067868077, + "grad_norm": 4.051624298095703, + "learning_rate": 5.078636959370905e-06, + "loss": 0.8711, + "step": 155 + }, + { + "epoch": 0.002685843008144169, + "grad_norm": 3.98006272315979, + "learning_rate": 6.094364351245085e-06, + "loss": 0.8628, + "step": 186 + }, + { + "epoch": 0.0031334835095015307, + "grad_norm": 4.4158406257629395, + "learning_rate": 7.110091743119267e-06, + "loss": 0.871, + "step": 217 + }, + { + "epoch": 0.003581124010858892, + "grad_norm": 4.681333541870117, + "learning_rate": 8.125819134993446e-06, + "loss": 0.8593, + "step": 248 + }, + { + "epoch": 0.004028764512216254, + "grad_norm": 3.8057820796966553, + "learning_rate": 9.141546526867629e-06, + "loss": 0.8558, + "step": 279 + }, + { + "epoch": 0.0044764050135736155, + "grad_norm": 4.523633003234863, + "learning_rate": 1.015727391874181e-05, + "loss": 0.8676, + "step": 310 + }, + { + "epoch": 0.0049240455149309765, + "grad_norm": 3.7387187480926514, + "learning_rate": 1.117300131061599e-05, + "loss": 0.8585, + "step": 341 + }, + { + "epoch": 0.005371686016288338, + "grad_norm": 4.187750816345215, + "learning_rate": 1.218872870249017e-05, + "loss": 0.8592, + "step": 372 + }, + { + "epoch": 0.005819326517645699, + "grad_norm": 3.782883644104004, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.8449, + "step": 403 + }, + { + "epoch": 0.006266967019003061, + "grad_norm": 3.577796459197998, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.8418, + "step": 434 + }, + { + "epoch": 0.006714607520360423, + "grad_norm": 3.1408321857452393, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.8577, + "step": 465 + }, + { + "epoch": 0.007162248021717784, + "grad_norm": 4.090081691741943, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.8439, + "step": 496 + }, + { + "epoch": 0.007609888523075146, + "grad_norm": 2.7458200454711914, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.8468, + "step": 527 + }, + { + "epoch": 0.008057529024432507, + "grad_norm": 3.703225612640381, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.8385, + "step": 558 + }, + { + "epoch": 0.008505169525789868, + "grad_norm": 3.134650230407715, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.8418, + "step": 589 + }, + { + "epoch": 0.008952810027147231, + "grad_norm": 3.762680768966675, + "learning_rate": 2.031454783748362e-05, + "loss": 0.8312, + "step": 620 + }, + { + "epoch": 0.009400450528504592, + "grad_norm": 3.751004457473755, + "learning_rate": 2.13302752293578e-05, + "loss": 0.8251, + "step": 651 + }, + { + "epoch": 0.009848091029861953, + "grad_norm": 3.2268712520599365, + "learning_rate": 2.234600262123198e-05, + "loss": 0.8369, + "step": 682 + }, + { + "epoch": 0.010295731531219316, + "grad_norm": 3.5854289531707764, + "learning_rate": 2.336173001310616e-05, + "loss": 0.826, + "step": 713 + }, + { + "epoch": 0.010743372032576677, + "grad_norm": 3.9910435676574707, + "learning_rate": 2.437745740498034e-05, + "loss": 0.8168, + "step": 744 + }, + { + "epoch": 0.011191012533934038, + "grad_norm": 3.3059303760528564, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.8269, + "step": 775 + }, + { + "epoch": 0.011638653035291399, + "grad_norm": 3.4081811904907227, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.817, + "step": 806 + }, + { + "epoch": 0.012086293536648762, + "grad_norm": 3.2740163803100586, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.8195, + "step": 837 + }, + { + "epoch": 0.012533934038006123, + "grad_norm": 2.7206223011016846, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.8188, + "step": 868 + }, + { + "epoch": 0.012981574539363484, + "grad_norm": 2.7005629539489746, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.8127, + "step": 899 + }, + { + "epoch": 0.013429215040720846, + "grad_norm": 2.970745325088501, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.8126, + "step": 930 + }, + { + "epoch": 0.013876855542078207, + "grad_norm": 2.4761953353881836, + "learning_rate": 3.148754914809961e-05, + "loss": 0.82, + "step": 961 + }, + { + "epoch": 0.014324496043435568, + "grad_norm": 2.8555397987365723, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.8166, + "step": 992 + }, + { + "epoch": 0.01477213654479293, + "grad_norm": 2.8124194145202637, + "learning_rate": 3.351900393184797e-05, + "loss": 0.8057, + "step": 1023 + }, + { + "epoch": 0.015219777046150292, + "grad_norm": 2.353851556777954, + "learning_rate": 3.453473132372215e-05, + "loss": 0.8064, + "step": 1054 + }, + { + "epoch": 0.015667417547507653, + "grad_norm": 3.0127620697021484, + "learning_rate": 3.555045871559633e-05, + "loss": 0.8086, + "step": 1085 + }, + { + "epoch": 0.016115058048865014, + "grad_norm": 2.792686939239502, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.8152, + "step": 1116 + }, + { + "epoch": 0.016562698550222375, + "grad_norm": 2.407134532928467, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.7949, + "step": 1147 + }, + { + "epoch": 0.017010339051579736, + "grad_norm": 2.6921393871307373, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.804, + "step": 1178 + }, + { + "epoch": 0.0174579795529371, + "grad_norm": 2.3015975952148438, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.7944, + "step": 1209 + }, + { + "epoch": 0.017905620054294462, + "grad_norm": 2.8116579055786133, + "learning_rate": 4.062909567496724e-05, + "loss": 0.7977, + "step": 1240 + }, + { + "epoch": 0.018353260555651823, + "grad_norm": 2.5720036029815674, + "learning_rate": 4.164482306684142e-05, + "loss": 0.7854, + "step": 1271 + }, + { + "epoch": 0.018800901057009184, + "grad_norm": 2.0802650451660156, + "learning_rate": 4.26605504587156e-05, + "loss": 0.7892, + "step": 1302 + }, + { + "epoch": 0.019248541558366545, + "grad_norm": 2.4343624114990234, + "learning_rate": 4.367627785058978e-05, + "loss": 0.7897, + "step": 1333 + }, + { + "epoch": 0.019696182059723906, + "grad_norm": 2.509686231613159, + "learning_rate": 4.469200524246396e-05, + "loss": 0.7855, + "step": 1364 + }, + { + "epoch": 0.020143822561081267, + "grad_norm": 2.626512289047241, + "learning_rate": 4.570773263433814e-05, + "loss": 0.7873, + "step": 1395 + }, + { + "epoch": 0.02059146306243863, + "grad_norm": 2.8619399070739746, + "learning_rate": 4.672346002621232e-05, + "loss": 0.7891, + "step": 1426 + }, + { + "epoch": 0.021039103563795993, + "grad_norm": 2.724792718887329, + "learning_rate": 4.77391874180865e-05, + "loss": 0.782, + "step": 1457 + }, + { + "epoch": 0.021486744065153354, + "grad_norm": 2.6659562587738037, + "learning_rate": 4.875491480996068e-05, + "loss": 0.7856, + "step": 1488 + }, + { + "epoch": 0.021934384566510715, + "grad_norm": 2.646078586578369, + "learning_rate": 4.977064220183487e-05, + "loss": 0.7748, + "step": 1519 + }, + { + "epoch": 0.022382025067868076, + "grad_norm": 2.429288387298584, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.7722, + "step": 1550 + }, + { + "epoch": 0.022829665569225437, + "grad_norm": 1.9933409690856934, + "learning_rate": 4.999955597496219e-05, + "loss": 0.7874, + "step": 1581 + }, + { + "epoch": 0.023277306070582798, + "grad_norm": 2.314889907836914, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.7757, + "step": 1612 + }, + { + "epoch": 0.023724946571940162, + "grad_norm": 2.2891199588775635, + "learning_rate": 4.999799067923527e-05, + "loss": 0.7713, + "step": 1643 + }, + { + "epoch": 0.024172587073297523, + "grad_norm": 2.4892444610595703, + "learning_rate": 4.999678487776908e-05, + "loss": 0.7687, + "step": 1674 + }, + { + "epoch": 0.024620227574654884, + "grad_norm": 2.3015685081481934, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.7721, + "step": 1705 + }, + { + "epoch": 0.025067868076012245, + "grad_norm": 2.278954029083252, + "learning_rate": 4.999352703566763e-05, + "loss": 0.7741, + "step": 1736 + }, + { + "epoch": 0.025515508577369606, + "grad_norm": 1.7260370254516602, + "learning_rate": 4.999147503179668e-05, + "loss": 0.7681, + "step": 1767 + }, + { + "epoch": 0.025963149078726967, + "grad_norm": 2.0179309844970703, + "learning_rate": 4.998914100252672e-05, + "loss": 0.7604, + "step": 1798 + }, + { + "epoch": 0.02641078958008433, + "grad_norm": 2.53022837638855, + "learning_rate": 4.998652497419696e-05, + "loss": 0.7598, + "step": 1829 + }, + { + "epoch": 0.026858430081441693, + "grad_norm": 1.859253168106079, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.7606, + "step": 1860 + }, + { + "epoch": 0.027306070582799054, + "grad_norm": 1.759303331375122, + "learning_rate": 4.998044704162613e-05, + "loss": 0.7532, + "step": 1891 + }, + { + "epoch": 0.027753711084156415, + "grad_norm": 2.4389419555664062, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.7646, + "step": 1922 + }, + { + "epoch": 0.028201351585513776, + "grad_norm": 2.155348777770996, + "learning_rate": 4.997324150843799e-05, + "loss": 0.7569, + "step": 1953 + }, + { + "epoch": 0.028648992086871137, + "grad_norm": 2.0138537883758545, + "learning_rate": 4.99692159912661e-05, + "loss": 0.7677, + "step": 1984 + }, + { + "epoch": 0.029096632588228498, + "grad_norm": 2.5275282859802246, + "learning_rate": 4.996490869988546e-05, + "loss": 0.7519, + "step": 2015 + }, + { + "epoch": 0.02954427308958586, + "grad_norm": 1.8147333860397339, + "learning_rate": 4.996031968290326e-05, + "loss": 0.7509, + "step": 2046 + }, + { + "epoch": 0.029991913590943223, + "grad_norm": 2.1941769123077393, + "learning_rate": 4.995544899210594e-05, + "loss": 0.754, + "step": 2077 + }, + { + "epoch": 0.030439554092300584, + "grad_norm": 1.8953059911727905, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.747, + "step": 2108 + }, + { + "epoch": 0.030887194593657945, + "grad_norm": 3.3973031044006348, + "learning_rate": 4.994486281210429e-05, + "loss": 0.7513, + "step": 2139 + }, + { + "epoch": 0.031334835095015307, + "grad_norm": 2.66795015335083, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.7469, + "step": 2170 + }, + { + "epoch": 0.03178247559637267, + "grad_norm": 1.6254230737686157, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.7297, + "step": 2201 + }, + { + "epoch": 0.03223011609773003, + "grad_norm": 1.822745680809021, + "learning_rate": 4.992687246588743e-05, + "loss": 0.754, + "step": 2232 + }, + { + "epoch": 0.03267775659908739, + "grad_norm": 1.6898781061172485, + "learning_rate": 4.992031299767347e-05, + "loss": 0.7478, + "step": 2263 + }, + { + "epoch": 0.03312539710044475, + "grad_norm": 1.799280047416687, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.746, + "step": 2294 + }, + { + "epoch": 0.033573037601802115, + "grad_norm": 2.2501840591430664, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.7408, + "step": 2325 + }, + { + "epoch": 0.03402067810315947, + "grad_norm": 2.3315324783325195, + "learning_rate": 4.989894757091861e-05, + "loss": 0.7301, + "step": 2356 + }, + { + "epoch": 0.03446831860451684, + "grad_norm": 1.5820438861846924, + "learning_rate": 4.989126368919158e-05, + "loss": 0.7305, + "step": 2387 + }, + { + "epoch": 0.0349159591058742, + "grad_norm": 2.5696022510528564, + "learning_rate": 4.988329891293693e-05, + "loss": 0.7337, + "step": 2418 + }, + { + "epoch": 0.03536359960723156, + "grad_norm": 1.8880938291549683, + "learning_rate": 4.987505333203608e-05, + "loss": 0.7385, + "step": 2449 + }, + { + "epoch": 0.035811240108588924, + "grad_norm": 2.6148738861083984, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.7292, + "step": 2480 + }, + { + "epoch": 0.03625888060994628, + "grad_norm": 1.6925290822982788, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.7344, + "step": 2511 + }, + { + "epoch": 0.036706521111303646, + "grad_norm": 1.7675210237503052, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.7354, + "step": 2542 + }, + { + "epoch": 0.037154161612661, + "grad_norm": 2.1053173542022705, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.7272, + "step": 2573 + }, + { + "epoch": 0.03760180211401837, + "grad_norm": 1.9718347787857056, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.7336, + "step": 2604 + }, + { + "epoch": 0.03804944261537573, + "grad_norm": 1.5777671337127686, + "learning_rate": 4.981968838155888e-05, + "loss": 0.7182, + "step": 2635 + }, + { + "epoch": 0.03849708311673309, + "grad_norm": 1.905127763748169, + "learning_rate": 4.980947995086024e-05, + "loss": 0.7296, + "step": 2666 + }, + { + "epoch": 0.038944723618090454, + "grad_norm": 1.63962721824646, + "learning_rate": 4.979899154855234e-05, + "loss": 0.7249, + "step": 2697 + }, + { + "epoch": 0.03939236411944781, + "grad_norm": 1.584331750869751, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.7345, + "step": 2728 + }, + { + "epoch": 0.039840004620805176, + "grad_norm": 1.9111014604568481, + "learning_rate": 4.977717530570768e-05, + "loss": 0.7225, + "step": 2759 + }, + { + "epoch": 0.040287645122162534, + "grad_norm": 2.3216073513031006, + "learning_rate": 4.976584771136425e-05, + "loss": 0.7207, + "step": 2790 + }, + { + "epoch": 0.0407352856235199, + "grad_norm": 1.6002410650253296, + "learning_rate": 4.975424063779547e-05, + "loss": 0.7228, + "step": 2821 + }, + { + "epoch": 0.04118292612487726, + "grad_norm": 2.104731798171997, + "learning_rate": 4.974235421598557e-05, + "loss": 0.7127, + "step": 2852 + }, + { + "epoch": 0.04163056662623462, + "grad_norm": 1.7114660739898682, + "learning_rate": 4.973018858007122e-05, + "loss": 0.7283, + "step": 2883 + }, + { + "epoch": 0.042078207127591985, + "grad_norm": 1.948133945465088, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.7209, + "step": 2914 + }, + { + "epoch": 0.04252584762894934, + "grad_norm": 1.621764898300171, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.7272, + "step": 2945 + }, + { + "epoch": 0.04297348813030671, + "grad_norm": 1.6967558860778809, + "learning_rate": 4.969201777632205e-05, + "loss": 0.7191, + "step": 2976 + }, + { + "epoch": 0.043421128631664065, + "grad_norm": 1.6656996011734009, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.7205, + "step": 3007 + }, + { + "epoch": 0.04386876913302143, + "grad_norm": 2.151475191116333, + "learning_rate": 4.966517710419033e-05, + "loss": 0.7168, + "step": 3038 + }, + { + "epoch": 0.044316409634378794, + "grad_norm": 2.213109016418457, + "learning_rate": 4.965133917685858e-05, + "loss": 0.7139, + "step": 3069 + }, + { + "epoch": 0.04476405013573615, + "grad_norm": 1.5380377769470215, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.7237, + "step": 3100 + }, + { + "epoch": 0.045211690637093516, + "grad_norm": 2.312377452850342, + "learning_rate": 4.962282892045718e-05, + "loss": 0.7156, + "step": 3131 + }, + { + "epoch": 0.04565933113845087, + "grad_norm": 1.7220717668533325, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.7122, + "step": 3162 + }, + { + "epoch": 0.04610697163980824, + "grad_norm": 1.802856206893921, + "learning_rate": 4.959320720608049e-05, + "loss": 0.7128, + "step": 3193 + }, + { + "epoch": 0.046554612141165595, + "grad_norm": 1.6629964113235474, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.7172, + "step": 3224 + }, + { + "epoch": 0.04700225264252296, + "grad_norm": 3.440115213394165, + "learning_rate": 4.956247537083282e-05, + "loss": 0.7213, + "step": 3255 + }, + { + "epoch": 0.047449893143880324, + "grad_norm": 1.5721139907836914, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.7148, + "step": 3286 + }, + { + "epoch": 0.04789753364523768, + "grad_norm": 2.0920398235321045, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.7145, + "step": 3317 + }, + { + "epoch": 0.048345174146595046, + "grad_norm": 1.666566014289856, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.7095, + "step": 3348 + }, + { + "epoch": 0.048792814647952404, + "grad_norm": 1.8222129344940186, + "learning_rate": 4.949768693662973e-05, + "loss": 0.7138, + "step": 3379 + }, + { + "epoch": 0.04924045514930977, + "grad_norm": 1.7302964925765991, + "learning_rate": 4.948079823064559e-05, + "loss": 0.7017, + "step": 3410 + }, + { + "epoch": 0.049688095650667126, + "grad_norm": 1.7338463068008423, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6979, + "step": 3441 + }, + { + "epoch": 0.05013573615202449, + "grad_norm": 1.5637450218200684, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.7011, + "step": 3472 + }, + { + "epoch": 0.050583376653381855, + "grad_norm": 1.5632222890853882, + "learning_rate": 4.942847531574167e-05, + "loss": 0.704, + "step": 3503 + }, + { + "epoch": 0.05103101715473921, + "grad_norm": 1.588402509689331, + "learning_rate": 4.941048273452008e-05, + "loss": 0.7011, + "step": 3534 + }, + { + "epoch": 0.05147865765609658, + "grad_norm": 1.8840582370758057, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.7016, + "step": 3565 + }, + { + "epoch": 0.051926298157453935, + "grad_norm": 1.2702268362045288, + "learning_rate": 4.93736713712897e-05, + "loss": 0.7004, + "step": 3596 + }, + { + "epoch": 0.0523739386588113, + "grad_norm": 1.3812692165374756, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.7046, + "step": 3627 + }, + { + "epoch": 0.05282157916016866, + "grad_norm": 1.7257345914840698, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6976, + "step": 3658 + }, + { + "epoch": 0.05326921966152602, + "grad_norm": 1.7458925247192383, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6997, + "step": 3689 + }, + { + "epoch": 0.053716860162883386, + "grad_norm": 2.1996099948883057, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6991, + "step": 3720 + }, + { + "epoch": 0.05416450066424074, + "grad_norm": 1.6615021228790283, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.7005, + "step": 3751 + }, + { + "epoch": 0.05461214116559811, + "grad_norm": 1.6276952028274536, + "learning_rate": 4.925664290937196e-05, + "loss": 0.7097, + "step": 3782 + }, + { + "epoch": 0.055059781666955465, + "grad_norm": 1.758227825164795, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.6955, + "step": 3813 + }, + { + "epoch": 0.05550742216831283, + "grad_norm": 1.195280909538269, + "learning_rate": 4.921544116753962e-05, + "loss": 0.7073, + "step": 3844 + }, + { + "epoch": 0.05595506266967019, + "grad_norm": 1.6281015872955322, + "learning_rate": 4.919443027766935e-05, + "loss": 0.7022, + "step": 3875 + }, + { + "epoch": 0.05640270317102755, + "grad_norm": 1.3543150424957275, + "learning_rate": 4.91731463569418e-05, + "loss": 0.7036, + "step": 3906 + }, + { + "epoch": 0.056850343672384916, + "grad_norm": 2.16947078704834, + "learning_rate": 4.915158964554312e-05, + "loss": 0.7007, + "step": 3937 + }, + { + "epoch": 0.057297984173742274, + "grad_norm": 1.324578881263733, + "learning_rate": 4.912976038673786e-05, + "loss": 0.6941, + "step": 3968 + }, + { + "epoch": 0.05774562467509964, + "grad_norm": 1.9811108112335205, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6908, + "step": 3999 + }, + { + "epoch": 0.058193265176456996, + "grad_norm": 1.2975554466247559, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6936, + "step": 4030 + }, + { + "epoch": 0.05864090567781436, + "grad_norm": 1.583282232284546, + "learning_rate": 4.906263980464644e-05, + "loss": 0.698, + "step": 4061 + }, + { + "epoch": 0.05908854617917172, + "grad_norm": 1.3532944917678833, + "learning_rate": 4.903972285033178e-05, + "loss": 0.7049, + "step": 4092 + }, + { + "epoch": 0.05953618668052908, + "grad_norm": 2.1245481967926025, + "learning_rate": 4.901653461101213e-05, + "loss": 0.7016, + "step": 4123 + }, + { + "epoch": 0.05998382718188645, + "grad_norm": 1.6913797855377197, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6981, + "step": 4154 + }, + { + "epoch": 0.060431467683243804, + "grad_norm": 1.51249098777771, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6955, + "step": 4185 + }, + { + "epoch": 0.06087910818460117, + "grad_norm": 1.3880395889282227, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6934, + "step": 4216 + }, + { + "epoch": 0.061326748685958526, + "grad_norm": 1.6354159116744995, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6938, + "step": 4247 + }, + { + "epoch": 0.06177438918731589, + "grad_norm": 2.126742362976074, + "learning_rate": 4.889653340498669e-05, + "loss": 0.7003, + "step": 4278 + }, + { + "epoch": 0.06222202968867325, + "grad_norm": 1.7903707027435303, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6885, + "step": 4309 + }, + { + "epoch": 0.06266967019003061, + "grad_norm": 1.537806510925293, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6892, + "step": 4340 + }, + { + "epoch": 0.06311731069138797, + "grad_norm": 1.6445434093475342, + "learning_rate": 4.882129447892753e-05, + "loss": 0.6843, + "step": 4371 + }, + { + "epoch": 0.06356495119274534, + "grad_norm": 1.555373191833496, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6899, + "step": 4402 + }, + { + "epoch": 0.0640125916941027, + "grad_norm": 1.8370277881622314, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6813, + "step": 4433 + }, + { + "epoch": 0.06446023219546006, + "grad_norm": 1.3132514953613281, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6832, + "step": 4464 + }, + { + "epoch": 0.06490787269681741, + "grad_norm": 1.3186298608779907, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6879, + "step": 4495 + }, + { + "epoch": 0.06535551319817479, + "grad_norm": 1.4360268115997314, + "learning_rate": 4.869052379269719e-05, + "loss": 0.69, + "step": 4526 + }, + { + "epoch": 0.06580315369953214, + "grad_norm": 1.670765995979309, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6865, + "step": 4557 + }, + { + "epoch": 0.0662507942008895, + "grad_norm": 1.7548723220825195, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6852, + "step": 4588 + }, + { + "epoch": 0.06669843470224687, + "grad_norm": 1.5086426734924316, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6894, + "step": 4619 + }, + { + "epoch": 0.06714607520360423, + "grad_norm": 1.3140665292739868, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6812, + "step": 4650 + }, + { + "epoch": 0.06759371570496159, + "grad_norm": 1.4212454557418823, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6872, + "step": 4681 + }, + { + "epoch": 0.06804135620631895, + "grad_norm": 1.3034414052963257, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6901, + "step": 4712 + }, + { + "epoch": 0.06848899670767632, + "grad_norm": 1.3741438388824463, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6875, + "step": 4743 + }, + { + "epoch": 0.06893663720903367, + "grad_norm": 1.7262542247772217, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6868, + "step": 4774 + }, + { + "epoch": 0.06938427771039103, + "grad_norm": 1.3293650150299072, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6891, + "step": 4805 + }, + { + "epoch": 0.0698319182117484, + "grad_norm": 1.3448151350021362, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6765, + "step": 4836 + }, + { + "epoch": 0.07027955871310576, + "grad_norm": 2.961280584335327, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6939, + "step": 4867 + }, + { + "epoch": 0.07072719921446312, + "grad_norm": 1.8265361785888672, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.677, + "step": 4898 + }, + { + "epoch": 0.07117483971582048, + "grad_norm": 1.490349531173706, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6778, + "step": 4929 + }, + { + "epoch": 0.07162248021717785, + "grad_norm": 1.3669307231903076, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6746, + "step": 4960 + }, + { + "epoch": 0.0720701207185352, + "grad_norm": 1.3995884656906128, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6757, + "step": 4991 + }, + { + "epoch": 0.07251776121989256, + "grad_norm": 1.1217372417449951, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6832, + "step": 5022 + }, + { + "epoch": 0.07296540172124993, + "grad_norm": 1.2192097902297974, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6868, + "step": 5053 + }, + { + "epoch": 0.07341304222260729, + "grad_norm": 1.5045067071914673, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.6793, + "step": 5084 + }, + { + "epoch": 0.07386068272396465, + "grad_norm": 1.7127234935760498, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6797, + "step": 5115 + }, + { + "epoch": 0.074308323225322, + "grad_norm": 1.846561312675476, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6767, + "step": 5146 + }, + { + "epoch": 0.07475596372667938, + "grad_norm": 1.3076797723770142, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6855, + "step": 5177 + }, + { + "epoch": 0.07520360422803674, + "grad_norm": 1.4170383214950562, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.681, + "step": 5208 + }, + { + "epoch": 0.0756512447293941, + "grad_norm": 1.2504942417144775, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6778, + "step": 5239 + }, + { + "epoch": 0.07609888523075146, + "grad_norm": 1.1522283554077148, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6749, + "step": 5270 + }, + { + "epoch": 0.07654652573210882, + "grad_norm": 1.6351525783538818, + "learning_rate": 4.793722210363262e-05, + "loss": 0.6745, + "step": 5301 + }, + { + "epoch": 0.07699416623346618, + "grad_norm": 1.5093014240264893, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6747, + "step": 5332 + }, + { + "epoch": 0.07744180673482354, + "grad_norm": 1.362160563468933, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.673, + "step": 5363 + }, + { + "epoch": 0.07788944723618091, + "grad_norm": 1.2021727561950684, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6768, + "step": 5394 + }, + { + "epoch": 0.07833708773753827, + "grad_norm": 2.1543540954589844, + "learning_rate": 4.780153554146274e-05, + "loss": 0.672, + "step": 5425 + }, + { + "epoch": 0.07878472823889562, + "grad_norm": 1.882712721824646, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.6926, + "step": 5456 + }, + { + "epoch": 0.079232368740253, + "grad_norm": 1.3975650072097778, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6747, + "step": 5487 + }, + { + "epoch": 0.07968000924161035, + "grad_norm": 1.3912913799285889, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6756, + "step": 5518 + }, + { + "epoch": 0.08012764974296771, + "grad_norm": 1.7227635383605957, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6694, + "step": 5549 + }, + { + "epoch": 0.08057529024432507, + "grad_norm": 1.3151129484176636, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6715, + "step": 5580 + }, + { + "epoch": 0.08102293074568244, + "grad_norm": 1.0972425937652588, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6661, + "step": 5611 + }, + { + "epoch": 0.0814705712470398, + "grad_norm": 1.2346575260162354, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.66, + "step": 5642 + }, + { + "epoch": 0.08191821174839715, + "grad_norm": 1.4536516666412354, + "learning_rate": 4.751783684659e-05, + "loss": 0.6743, + "step": 5673 + }, + { + "epoch": 0.08236585224975453, + "grad_norm": 1.1361631155014038, + "learning_rate": 4.748122674890348e-05, + "loss": 0.6791, + "step": 5704 + }, + { + "epoch": 0.08281349275111188, + "grad_norm": 1.2605111598968506, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6797, + "step": 5735 + }, + { + "epoch": 0.08326113325246924, + "grad_norm": 1.2355903387069702, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6642, + "step": 5766 + }, + { + "epoch": 0.0837087737538266, + "grad_norm": 1.6677048206329346, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6759, + "step": 5797 + }, + { + "epoch": 0.08415641425518397, + "grad_norm": 1.4781981706619263, + "learning_rate": 4.733225355658999e-05, + "loss": 0.6707, + "step": 5828 + }, + { + "epoch": 0.08460405475654133, + "grad_norm": 1.138583779335022, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.6636, + "step": 5859 + }, + { + "epoch": 0.08505169525789869, + "grad_norm": 1.529036283493042, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6705, + "step": 5890 + }, + { + "epoch": 0.08549933575925606, + "grad_norm": 1.3216760158538818, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6714, + "step": 5921 + }, + { + "epoch": 0.08594697626061341, + "grad_norm": 1.1820168495178223, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6757, + "step": 5952 + }, + { + "epoch": 0.08639461676197077, + "grad_norm": 1.393571138381958, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6672, + "step": 5983 + }, + { + "epoch": 0.08684225726332813, + "grad_norm": 1.4574682712554932, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6655, + "step": 6014 + }, + { + "epoch": 0.0872898977646855, + "grad_norm": 1.138645887374878, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6663, + "step": 6045 + }, + { + "epoch": 0.08773753826604286, + "grad_norm": 1.7602777481079102, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6695, + "step": 6076 + }, + { + "epoch": 0.08818517876740022, + "grad_norm": 1.2323459386825562, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6636, + "step": 6107 + }, + { + "epoch": 0.08863281926875759, + "grad_norm": 1.6881431341171265, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6653, + "step": 6138 + }, + { + "epoch": 0.08908045977011494, + "grad_norm": 1.391417384147644, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6636, + "step": 6169 + }, + { + "epoch": 0.0895281002714723, + "grad_norm": 1.3066257238388062, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6777, + "step": 6200 + }, + { + "epoch": 0.08997574077282966, + "grad_norm": 1.2001326084136963, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6683, + "step": 6231 + }, + { + "epoch": 0.09042338127418703, + "grad_norm": 1.4361172914505005, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6634, + "step": 6262 + }, + { + "epoch": 0.09087102177554439, + "grad_norm": 8.04520320892334, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6752, + "step": 6293 + }, + { + "epoch": 0.09131866227690175, + "grad_norm": 1.4874210357666016, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6733, + "step": 6324 + }, + { + "epoch": 0.09176630277825912, + "grad_norm": 1.234491229057312, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.668, + "step": 6355 + }, + { + "epoch": 0.09221394327961648, + "grad_norm": 1.2088687419891357, + "learning_rate": 4.661238099862658e-05, + "loss": 0.6705, + "step": 6386 + }, + { + "epoch": 0.09266158378097383, + "grad_norm": 1.1937814950942993, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6853, + "step": 6417 + }, + { + "epoch": 0.09310922428233119, + "grad_norm": 1.5205374956130981, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.685, + "step": 6448 + }, + { + "epoch": 0.09355686478368856, + "grad_norm": 1.2221660614013672, + "learning_rate": 4.648464661063478e-05, + "loss": 0.6622, + "step": 6479 + }, + { + "epoch": 0.09400450528504592, + "grad_norm": 1.0762608051300049, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.669, + "step": 6510 + }, + { + "epoch": 0.09445214578640328, + "grad_norm": 1.4416946172714233, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.6674, + "step": 6541 + }, + { + "epoch": 0.09489978628776065, + "grad_norm": 1.559158205986023, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6681, + "step": 6572 + }, + { + "epoch": 0.095347426789118, + "grad_norm": 1.3833891153335571, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6681, + "step": 6603 + }, + { + "epoch": 0.09579506729047536, + "grad_norm": 1.6753300428390503, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6658, + "step": 6634 + }, + { + "epoch": 0.09624270779183272, + "grad_norm": 1.951198697090149, + "learning_rate": 4.622264489304762e-05, + "loss": 0.6654, + "step": 6665 + }, + { + "epoch": 0.09669034829319009, + "grad_norm": 1.2356919050216675, + "learning_rate": 4.617813681048434e-05, + "loss": 0.6651, + "step": 6696 + }, + { + "epoch": 0.09713798879454745, + "grad_norm": 1.2712593078613281, + "learning_rate": 4.61333897355256e-05, + "loss": 0.6646, + "step": 6727 + }, + { + "epoch": 0.09758562929590481, + "grad_norm": 1.1935900449752808, + "learning_rate": 4.608840417313604e-05, + "loss": 0.674, + "step": 6758 + }, + { + "epoch": 0.09803326979726218, + "grad_norm": 1.1649430990219116, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6644, + "step": 6789 + }, + { + "epoch": 0.09848091029861954, + "grad_norm": 1.4281456470489502, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6673, + "step": 6820 + }, + { + "epoch": 0.0989285507999769, + "grad_norm": 1.3064521551132202, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6584, + "step": 6851 + }, + { + "epoch": 0.09937619130133425, + "grad_norm": 1.2546554803848267, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6612, + "step": 6882 + }, + { + "epoch": 0.09982383180269162, + "grad_norm": 1.1866974830627441, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6612, + "step": 6913 + }, + { + "epoch": 0.10027147230404898, + "grad_norm": 1.6166640520095825, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6551, + "step": 6944 + }, + { + "epoch": 0.10071911280540634, + "grad_norm": 1.5471700429916382, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6607, + "step": 6975 + }, + { + "epoch": 0.10116675330676371, + "grad_norm": 1.3361026048660278, + "learning_rate": 4.571999560773736e-05, + "loss": 0.666, + "step": 7006 + }, + { + "epoch": 0.10161439380812107, + "grad_norm": 1.2938140630722046, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6547, + "step": 7037 + }, + { + "epoch": 0.10206203430947842, + "grad_norm": 1.2688400745391846, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6624, + "step": 7068 + }, + { + "epoch": 0.10250967481083578, + "grad_norm": 1.6306285858154297, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.6627, + "step": 7099 + }, + { + "epoch": 0.10295731531219315, + "grad_norm": 1.3346176147460938, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6577, + "step": 7130 + }, + { + "epoch": 0.10340495581355051, + "grad_norm": 1.0933984518051147, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6602, + "step": 7161 + }, + { + "epoch": 0.10385259631490787, + "grad_norm": 1.575859785079956, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6593, + "step": 7192 + }, + { + "epoch": 0.10430023681626524, + "grad_norm": 1.4265861511230469, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6564, + "step": 7223 + }, + { + "epoch": 0.1047478773176226, + "grad_norm": 1.737012267112732, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6593, + "step": 7254 + }, + { + "epoch": 0.10519551781897996, + "grad_norm": 1.0257115364074707, + "learning_rate": 4.528766329570536e-05, + "loss": 0.6514, + "step": 7285 + }, + { + "epoch": 0.10564315832033731, + "grad_norm": 1.5043773651123047, + "learning_rate": 4.523847434837447e-05, + "loss": 0.6635, + "step": 7316 + }, + { + "epoch": 0.10609079882169468, + "grad_norm": 1.5642234086990356, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6558, + "step": 7347 + }, + { + "epoch": 0.10653843932305204, + "grad_norm": 1.1821067333221436, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6686, + "step": 7378 + }, + { + "epoch": 0.1069860798244094, + "grad_norm": 1.5492759943008423, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6543, + "step": 7409 + }, + { + "epoch": 0.10743372032576677, + "grad_norm": 1.281914234161377, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6516, + "step": 7440 + }, + { + "epoch": 0.10788136082712413, + "grad_norm": 1.3318305015563965, + "learning_rate": 4.498911497712155e-05, + "loss": 0.656, + "step": 7471 + }, + { + "epoch": 0.10832900132848149, + "grad_norm": 1.3832449913024902, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6475, + "step": 7502 + }, + { + "epoch": 0.10877664182983884, + "grad_norm": 1.3547158241271973, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6594, + "step": 7533 + }, + { + "epoch": 0.10922428233119622, + "grad_norm": 1.4633681774139404, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.6707, + "step": 7564 + }, + { + "epoch": 0.10967192283255357, + "grad_norm": 1.1781059503555298, + "learning_rate": 4.478556380141218e-05, + "loss": 0.6626, + "step": 7595 + }, + { + "epoch": 0.11011956333391093, + "grad_norm": 1.4727883338928223, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.6535, + "step": 7626 + }, + { + "epoch": 0.1105672038352683, + "grad_norm": 1.3888640403747559, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.6606, + "step": 7657 + }, + { + "epoch": 0.11101484433662566, + "grad_norm": 1.308769941329956, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6667, + "step": 7688 + }, + { + "epoch": 0.11146248483798302, + "grad_norm": 1.3579630851745605, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6621, + "step": 7719 + }, + { + "epoch": 0.11191012533934037, + "grad_norm": 1.1285645961761475, + "learning_rate": 4.452610552959183e-05, + "loss": 0.6597, + "step": 7750 + }, + { + "epoch": 0.11235776584069775, + "grad_norm": 1.1144675016403198, + "learning_rate": 4.447355047201428e-05, + "loss": 0.6638, + "step": 7781 + }, + { + "epoch": 0.1128054063420551, + "grad_norm": 1.1993658542633057, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.6704, + "step": 7812 + }, + { + "epoch": 0.11325304684341246, + "grad_norm": 1.0608967542648315, + "learning_rate": 4.436778168330484e-05, + "loss": 0.6573, + "step": 7843 + }, + { + "epoch": 0.11370068734476983, + "grad_norm": 1.1210070848464966, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.6612, + "step": 7874 + }, + { + "epoch": 0.11414832784612719, + "grad_norm": 1.2345409393310547, + "learning_rate": 4.42611386459262e-05, + "loss": 0.65, + "step": 7905 + }, + { + "epoch": 0.11459596834748455, + "grad_norm": 1.077025294303894, + "learning_rate": 4.420749078676133e-05, + "loss": 0.6595, + "step": 7936 + }, + { + "epoch": 0.1150436088488419, + "grad_norm": 1.2079277038574219, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.6442, + "step": 7967 + }, + { + "epoch": 0.11549124935019928, + "grad_norm": 1.6710035800933838, + "learning_rate": 4.409954541451762e-05, + "loss": 0.663, + "step": 7998 + }, + { + "epoch": 0.11593888985155663, + "grad_norm": 1.3124401569366455, + "learning_rate": 4.404524911958764e-05, + "loss": 0.6512, + "step": 8029 + }, + { + "epoch": 0.11638653035291399, + "grad_norm": 1.644904375076294, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6587, + "step": 8060 + }, + { + "epoch": 0.11683417085427136, + "grad_norm": 1.181624174118042, + "learning_rate": 4.393601237573607e-05, + "loss": 0.653, + "step": 8091 + }, + { + "epoch": 0.11728181135562872, + "grad_norm": 1.4587918519973755, + "learning_rate": 4.388107315953628e-05, + "loss": 0.675, + "step": 8122 + }, + { + "epoch": 0.11772945185698608, + "grad_norm": 1.2147635221481323, + "learning_rate": 4.382592087299212e-05, + "loss": 0.6521, + "step": 8153 + }, + { + "epoch": 0.11817709235834344, + "grad_norm": 1.0448981523513794, + "learning_rate": 4.377055613848964e-05, + "loss": 0.6541, + "step": 8184 + }, + { + "epoch": 0.11862473285970081, + "grad_norm": 1.4482290744781494, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6563, + "step": 8215 + }, + { + "epoch": 0.11907237336105816, + "grad_norm": 1.1621575355529785, + "learning_rate": 4.365919182713416e-05, + "loss": 0.656, + "step": 8246 + }, + { + "epoch": 0.11952001386241552, + "grad_norm": 1.1643873453140259, + "learning_rate": 4.360319350701226e-05, + "loss": 0.6547, + "step": 8277 + }, + { + "epoch": 0.1199676543637729, + "grad_norm": 1.4016129970550537, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6582, + "step": 8308 + }, + { + "epoch": 0.12041529486513025, + "grad_norm": 1.4023685455322266, + "learning_rate": 4.349056769754021e-05, + "loss": 0.6621, + "step": 8339 + }, + { + "epoch": 0.12086293536648761, + "grad_norm": 1.3020285367965698, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.6674, + "step": 8370 + }, + { + "epoch": 0.12131057586784497, + "grad_norm": 1.2162435054779053, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.6614, + "step": 8401 + }, + { + "epoch": 0.12175821636920234, + "grad_norm": 1.1956969499588013, + "learning_rate": 4.332006561018488e-05, + "loss": 0.6557, + "step": 8432 + }, + { + "epoch": 0.1222058568705597, + "grad_norm": 1.1723664999008179, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.6633, + "step": 8463 + }, + { + "epoch": 0.12265349737191705, + "grad_norm": 1.113020658493042, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.6577, + "step": 8494 + }, + { + "epoch": 0.12310113787327442, + "grad_norm": 1.2453004121780396, + "learning_rate": 4.314770288177384e-05, + "loss": 0.6544, + "step": 8525 + }, + { + "epoch": 0.12354877837463178, + "grad_norm": 1.1493890285491943, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6533, + "step": 8556 + }, + { + "epoch": 0.12399641887598914, + "grad_norm": 1.4172496795654297, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.6542, + "step": 8587 + }, + { + "epoch": 0.1244440593773465, + "grad_norm": 1.1840728521347046, + "learning_rate": 4.297349701798505e-05, + "loss": 0.6476, + "step": 8618 + }, + { + "epoch": 0.12489169987870387, + "grad_norm": 1.3720282316207886, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.6446, + "step": 8649 + }, + { + "epoch": 0.12533934038006123, + "grad_norm": 1.1705596446990967, + "learning_rate": 4.285634454093198e-05, + "loss": 0.6537, + "step": 8680 + }, + { + "epoch": 0.1257869808814186, + "grad_norm": 1.0790083408355713, + "learning_rate": 4.279746571169086e-05, + "loss": 0.6543, + "step": 8711 + }, + { + "epoch": 0.12623462138277594, + "grad_norm": 1.1207470893859863, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.6468, + "step": 8742 + }, + { + "epoch": 0.1266822618841333, + "grad_norm": 1.1123065948486328, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.6596, + "step": 8773 + }, + { + "epoch": 0.12712990238549068, + "grad_norm": 1.1579636335372925, + "learning_rate": 4.261962684116106e-05, + "loss": 0.6458, + "step": 8804 + }, + { + "epoch": 0.12757754288684803, + "grad_norm": 1.3112802505493164, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.6483, + "step": 8835 + }, + { + "epoch": 0.1280251833882054, + "grad_norm": 1.1104832887649536, + "learning_rate": 4.250007230372134e-05, + "loss": 0.645, + "step": 8866 + }, + { + "epoch": 0.12847282388956274, + "grad_norm": 1.0218713283538818, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.6519, + "step": 8897 + }, + { + "epoch": 0.12892046439092011, + "grad_norm": 1.0053678750991821, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.6468, + "step": 8928 + }, + { + "epoch": 0.12936810489227749, + "grad_norm": 1.410933017730713, + "learning_rate": 4.231926105340768e-05, + "loss": 0.6573, + "step": 8959 + }, + { + "epoch": 0.12981574539363483, + "grad_norm": 1.5001798868179321, + "learning_rate": 4.225859883654776e-05, + "loss": 0.6483, + "step": 8990 + }, + { + "epoch": 0.1302633858949922, + "grad_norm": 1.112316608428955, + "learning_rate": 4.219774185874569e-05, + "loss": 0.6483, + "step": 9021 + }, + { + "epoch": 0.13071102639634957, + "grad_norm": 1.527464747428894, + "learning_rate": 4.213669080676418e-05, + "loss": 0.6512, + "step": 9052 + }, + { + "epoch": 0.13115866689770692, + "grad_norm": 1.1075704097747803, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.6577, + "step": 9083 + }, + { + "epoch": 0.1316063073990643, + "grad_norm": 0.9589399099349976, + "learning_rate": 4.201400923825648e-05, + "loss": 0.642, + "step": 9114 + }, + { + "epoch": 0.13205394790042166, + "grad_norm": 1.186531901359558, + "learning_rate": 4.195238010617511e-05, + "loss": 0.6553, + "step": 9145 + }, + { + "epoch": 0.132501588401779, + "grad_norm": 1.1176280975341797, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6483, + "step": 9176 + }, + { + "epoch": 0.13294922890313637, + "grad_norm": 1.4222681522369385, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.6462, + "step": 9207 + }, + { + "epoch": 0.13339686940449375, + "grad_norm": 1.1606040000915527, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.6514, + "step": 9238 + }, + { + "epoch": 0.1338445099058511, + "grad_norm": 1.313774585723877, + "learning_rate": 4.170395751189495e-05, + "loss": 0.6422, + "step": 9269 + }, + { + "epoch": 0.13429215040720846, + "grad_norm": 1.1994171142578125, + "learning_rate": 4.164137885110921e-05, + "loss": 0.6444, + "step": 9300 + }, + { + "epoch": 0.1347397909085658, + "grad_norm": 0.9376353025436401, + "learning_rate": 4.157861239462495e-05, + "loss": 0.6436, + "step": 9331 + }, + { + "epoch": 0.13518743140992318, + "grad_norm": 1.0350178480148315, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.6447, + "step": 9362 + }, + { + "epoch": 0.13563507191128055, + "grad_norm": 1.3630082607269287, + "learning_rate": 4.145251892991588e-05, + "loss": 0.6427, + "step": 9393 + }, + { + "epoch": 0.1360827124126379, + "grad_norm": 1.0362364053726196, + "learning_rate": 4.138919334463868e-05, + "loss": 0.6443, + "step": 9424 + }, + { + "epoch": 0.13653035291399526, + "grad_norm": 1.1442211866378784, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.6523, + "step": 9455 + }, + { + "epoch": 0.13697799341535263, + "grad_norm": 1.4196938276290894, + "learning_rate": 4.126198804133398e-05, + "loss": 0.6501, + "step": 9486 + }, + { + "epoch": 0.13742563391670998, + "grad_norm": 1.3853130340576172, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.6431, + "step": 9517 + }, + { + "epoch": 0.13787327441806735, + "grad_norm": 1.0350273847579956, + "learning_rate": 4.113404868280107e-05, + "loss": 0.6436, + "step": 9548 + }, + { + "epoch": 0.13832091491942472, + "grad_norm": 1.0520857572555542, + "learning_rate": 4.106980553625457e-05, + "loss": 0.6436, + "step": 9579 + }, + { + "epoch": 0.13876855542078206, + "grad_norm": 1.127038836479187, + "learning_rate": 4.100538104413674e-05, + "loss": 0.639, + "step": 9610 + }, + { + "epoch": 0.13921619592213944, + "grad_norm": 1.1070880889892578, + "learning_rate": 4.09407759334692e-05, + "loss": 0.6366, + "step": 9641 + }, + { + "epoch": 0.1396638364234968, + "grad_norm": 1.3045605421066284, + "learning_rate": 4.087599093331186e-05, + "loss": 0.6496, + "step": 9672 + }, + { + "epoch": 0.14011147692485415, + "grad_norm": 1.234647512435913, + "learning_rate": 4.081102677475462e-05, + "loss": 0.6383, + "step": 9703 + }, + { + "epoch": 0.14055911742621152, + "grad_norm": 1.1154453754425049, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.6454, + "step": 9734 + }, + { + "epoch": 0.14100675792756887, + "grad_norm": 1.2422186136245728, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.6404, + "step": 9765 + }, + { + "epoch": 0.14145439842892624, + "grad_norm": 1.2128278017044067, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.6376, + "step": 9796 + }, + { + "epoch": 0.1419020389302836, + "grad_norm": 1.3140804767608643, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.6316, + "step": 9827 + }, + { + "epoch": 0.14234967943164095, + "grad_norm": 1.9198187589645386, + "learning_rate": 4.048354433517794e-05, + "loss": 0.6383, + "step": 9858 + }, + { + "epoch": 0.14279731993299832, + "grad_norm": 1.144679307937622, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.6383, + "step": 9889 + }, + { + "epoch": 0.1432449604343557, + "grad_norm": 1.1679338216781616, + "learning_rate": 4.035132306369438e-05, + "loss": 0.6414, + "step": 9920 + }, + { + "epoch": 0.14369260093571304, + "grad_norm": 0.9563717246055603, + "learning_rate": 4.028495219804555e-05, + "loss": 0.6327, + "step": 9951 + }, + { + "epoch": 0.1441402414370704, + "grad_norm": 1.277036428451538, + "learning_rate": 4.021840884378864e-05, + "loss": 0.6365, + "step": 9982 + }, + { + "epoch": 0.14458788193842778, + "grad_norm": 0.9835182428359985, + "learning_rate": 4.015169375185633e-05, + "loss": 0.638, + "step": 10013 + }, + { + "epoch": 0.14503552243978513, + "grad_norm": 1.090118646621704, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.6437, + "step": 10044 + }, + { + "epoch": 0.1454831629411425, + "grad_norm": 1.1823488473892212, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.6326, + "step": 10075 + }, + { + "epoch": 0.14593080344249987, + "grad_norm": 1.070318341255188, + "learning_rate": 3.995052558835377e-05, + "loss": 0.6362, + "step": 10106 + }, + { + "epoch": 0.1463784439438572, + "grad_norm": 1.2451491355895996, + "learning_rate": 3.988313109368017e-05, + "loss": 0.6388, + "step": 10137 + }, + { + "epoch": 0.14682608444521458, + "grad_norm": 1.2417365312576294, + "learning_rate": 3.981556864489504e-05, + "loss": 0.6309, + "step": 10168 + }, + { + "epoch": 0.14727372494657193, + "grad_norm": 1.251518726348877, + "learning_rate": 3.974783900443142e-05, + "loss": 0.6365, + "step": 10199 + }, + { + "epoch": 0.1477213654479293, + "grad_norm": 1.359750747680664, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.6386, + "step": 10230 + }, + { + "epoch": 0.14816900594928667, + "grad_norm": 1.1073262691497803, + "learning_rate": 3.961188120762596e-05, + "loss": 0.635, + "step": 10261 + }, + { + "epoch": 0.148616646450644, + "grad_norm": 0.9850608706474304, + "learning_rate": 3.954365458554938e-05, + "loss": 0.6389, + "step": 10292 + }, + { + "epoch": 0.14906428695200138, + "grad_norm": 1.2984429597854614, + "learning_rate": 3.947526384030751e-05, + "loss": 0.6317, + "step": 10323 + }, + { + "epoch": 0.14951192745335876, + "grad_norm": 1.1622575521469116, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.6393, + "step": 10354 + }, + { + "epoch": 0.1499595679547161, + "grad_norm": 1.0856871604919434, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.6351, + "step": 10385 + }, + { + "epoch": 0.15040720845607347, + "grad_norm": 1.0153882503509521, + "learning_rate": 3.926911459260109e-05, + "loss": 0.6282, + "step": 10416 + }, + { + "epoch": 0.15085484895743084, + "grad_norm": 1.2039254903793335, + "learning_rate": 3.920007509089102e-05, + "loss": 0.6365, + "step": 10447 + }, + { + "epoch": 0.1513024894587882, + "grad_norm": 1.1179555654525757, + "learning_rate": 3.913087534326357e-05, + "loss": 0.6311, + "step": 10478 + }, + { + "epoch": 0.15175012996014556, + "grad_norm": 1.090903639793396, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.6401, + "step": 10509 + }, + { + "epoch": 0.15219777046150293, + "grad_norm": 0.9228240251541138, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.6323, + "step": 10540 + }, + { + "epoch": 0.15264541096286027, + "grad_norm": 1.0772743225097656, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.637, + "step": 10571 + }, + { + "epoch": 0.15309305146421764, + "grad_norm": 1.0854771137237549, + "learning_rate": 3.885248953871491e-05, + "loss": 0.6375, + "step": 10602 + }, + { + "epoch": 0.153540691965575, + "grad_norm": 1.3902987241744995, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.6406, + "step": 10633 + }, + { + "epoch": 0.15398833246693236, + "grad_norm": 1.180351734161377, + "learning_rate": 3.871235554965218e-05, + "loss": 0.626, + "step": 10664 + }, + { + "epoch": 0.15443597296828973, + "grad_norm": 1.1136449575424194, + "learning_rate": 3.864205604623078e-05, + "loss": 0.6395, + "step": 10695 + }, + { + "epoch": 0.15488361346964707, + "grad_norm": 1.1770708560943604, + "learning_rate": 3.857160259406107e-05, + "loss": 0.633, + "step": 10726 + }, + { + "epoch": 0.15533125397100445, + "grad_norm": 1.1615066528320312, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.632, + "step": 10757 + }, + { + "epoch": 0.15577889447236182, + "grad_norm": 1.2898380756378174, + "learning_rate": 3.843023702543556e-05, + "loss": 0.6332, + "step": 10788 + }, + { + "epoch": 0.15622653497371916, + "grad_norm": 1.0051672458648682, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.6353, + "step": 10819 + }, + { + "epoch": 0.15667417547507653, + "grad_norm": 0.9514272212982178, + "learning_rate": 3.828826522492255e-05, + "loss": 0.6383, + "step": 10850 + }, + { + "epoch": 0.1571218159764339, + "grad_norm": 1.2570873498916626, + "learning_rate": 3.821705398930713e-05, + "loss": 0.6308, + "step": 10881 + }, + { + "epoch": 0.15756945647779125, + "grad_norm": 0.992323637008667, + "learning_rate": 3.814569360103385e-05, + "loss": 0.6303, + "step": 10912 + }, + { + "epoch": 0.15801709697914862, + "grad_norm": 1.255265474319458, + "learning_rate": 3.807418486539499e-05, + "loss": 0.6349, + "step": 10943 + }, + { + "epoch": 0.158464737480506, + "grad_norm": 1.1066702604293823, + "learning_rate": 3.80025285893569e-05, + "loss": 0.6317, + "step": 10974 + }, + { + "epoch": 0.15891237798186333, + "grad_norm": 1.178690791130066, + "learning_rate": 3.793072558155093e-05, + "loss": 0.639, + "step": 11005 + }, + { + "epoch": 0.1593600184832207, + "grad_norm": 1.0850341320037842, + "learning_rate": 3.785877665226426e-05, + "loss": 0.6375, + "step": 11036 + }, + { + "epoch": 0.15980765898457805, + "grad_norm": 1.1378651857376099, + "learning_rate": 3.778668261343079e-05, + "loss": 0.6287, + "step": 11067 + }, + { + "epoch": 0.16025529948593542, + "grad_norm": 1.07688570022583, + "learning_rate": 3.771444427862192e-05, + "loss": 0.6261, + "step": 11098 + }, + { + "epoch": 0.1607029399872928, + "grad_norm": 1.108269453048706, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.6352, + "step": 11129 + }, + { + "epoch": 0.16115058048865014, + "grad_norm": 1.2582095861434937, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.6312, + "step": 11160 + }, + { + "epoch": 0.1615982209900075, + "grad_norm": 0.9823578000068665, + "learning_rate": 3.749687165842753e-05, + "loss": 0.6253, + "step": 11191 + }, + { + "epoch": 0.16204586149136488, + "grad_norm": 1.3922805786132812, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.6279, + "step": 11222 + }, + { + "epoch": 0.16249350199272222, + "grad_norm": 1.2210962772369385, + "learning_rate": 3.735111675341645e-05, + "loss": 0.6357, + "step": 11253 + }, + { + "epoch": 0.1629411424940796, + "grad_norm": 1.0463316440582275, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.6332, + "step": 11284 + }, + { + "epoch": 0.16338878299543697, + "grad_norm": 1.165583848953247, + "learning_rate": 3.720480432728287e-05, + "loss": 0.627, + "step": 11315 + }, + { + "epoch": 0.1638364234967943, + "grad_norm": 1.0995306968688965, + "learning_rate": 3.71314411067092e-05, + "loss": 0.6283, + "step": 11346 + }, + { + "epoch": 0.16428406399815168, + "grad_norm": 1.0279158353805542, + "learning_rate": 3.70579409844715e-05, + "loss": 0.6287, + "step": 11377 + }, + { + "epoch": 0.16473170449950905, + "grad_norm": 1.51092529296875, + "learning_rate": 3.698430479000865e-05, + "loss": 0.6261, + "step": 11408 + }, + { + "epoch": 0.1651793450008664, + "grad_norm": 1.020936369895935, + "learning_rate": 3.691053335429509e-05, + "loss": 0.6327, + "step": 11439 + }, + { + "epoch": 0.16562698550222377, + "grad_norm": 1.0198683738708496, + "learning_rate": 3.683662750983147e-05, + "loss": 0.6422, + "step": 11470 + }, + { + "epoch": 0.1660746260035811, + "grad_norm": 1.2650995254516602, + "learning_rate": 3.676258809063518e-05, + "loss": 0.6354, + "step": 11501 + }, + { + "epoch": 0.16652226650493848, + "grad_norm": 1.1653568744659424, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.6325, + "step": 11532 + }, + { + "epoch": 0.16696990700629585, + "grad_norm": 1.1461430788040161, + "learning_rate": 3.661411187164166e-05, + "loss": 0.6251, + "step": 11563 + }, + { + "epoch": 0.1674175475076532, + "grad_norm": 1.2535974979400635, + "learning_rate": 3.65396767473784e-05, + "loss": 0.6245, + "step": 11594 + }, + { + "epoch": 0.16786518800901057, + "grad_norm": 1.115191102027893, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.6294, + "step": 11625 + }, + { + "epoch": 0.16831282851036794, + "grad_norm": 1.0482964515686035, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.6247, + "step": 11656 + }, + { + "epoch": 0.16876046901172528, + "grad_norm": 1.1431951522827148, + "learning_rate": 3.63155933997859e-05, + "loss": 0.63, + "step": 11687 + }, + { + "epoch": 0.16920810951308266, + "grad_norm": 1.0254175662994385, + "learning_rate": 3.624064243537758e-05, + "loss": 0.6212, + "step": 11718 + }, + { + "epoch": 0.16965575001444003, + "grad_norm": 0.9481080174446106, + "learning_rate": 3.616556462184716e-05, + "loss": 0.6278, + "step": 11749 + }, + { + "epoch": 0.17010339051579737, + "grad_norm": 1.118394374847412, + "learning_rate": 3.609036080643755e-05, + "loss": 0.6244, + "step": 11780 + }, + { + "epoch": 0.17055103101715474, + "grad_norm": 1.1592167615890503, + "learning_rate": 3.60150318378136e-05, + "loss": 0.621, + "step": 11811 + }, + { + "epoch": 0.1709986715185121, + "grad_norm": 0.9984686374664307, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.6319, + "step": 11842 + }, + { + "epoch": 0.17144631201986946, + "grad_norm": 1.0091164112091064, + "learning_rate": 3.586400184263408e-05, + "loss": 0.6345, + "step": 11873 + }, + { + "epoch": 0.17189395252122683, + "grad_norm": 1.0355888605117798, + "learning_rate": 3.578830252043148e-05, + "loss": 0.6171, + "step": 11904 + }, + { + "epoch": 0.17234159302258417, + "grad_norm": 1.1437592506408691, + "learning_rate": 3.571248145370125e-05, + "loss": 0.6201, + "step": 11935 + }, + { + "epoch": 0.17278923352394154, + "grad_norm": 0.9440962672233582, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.6236, + "step": 11966 + }, + { + "epoch": 0.17323687402529891, + "grad_norm": 0.9761082530021667, + "learning_rate": 3.556047751054378e-05, + "loss": 0.6291, + "step": 11997 + }, + { + "epoch": 0.17368451452665626, + "grad_norm": 1.1858127117156982, + "learning_rate": 3.548429634946039e-05, + "loss": 0.6299, + "step": 12028 + }, + { + "epoch": 0.17413215502801363, + "grad_norm": 1.0180195569992065, + "learning_rate": 3.540799687451768e-05, + "loss": 0.6227, + "step": 12059 + }, + { + "epoch": 0.174579795529371, + "grad_norm": 0.9683852195739746, + "learning_rate": 3.533157994674485e-05, + "loss": 0.626, + "step": 12090 + }, + { + "epoch": 0.17502743603072834, + "grad_norm": 1.0338289737701416, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.6377, + "step": 12121 + }, + { + "epoch": 0.17547507653208572, + "grad_norm": 1.1238298416137695, + "learning_rate": 3.517839718344311e-05, + "loss": 0.6338, + "step": 12152 + }, + { + "epoch": 0.1759227170334431, + "grad_norm": 1.0541973114013672, + "learning_rate": 3.510163307656086e-05, + "loss": 0.6222, + "step": 12183 + }, + { + "epoch": 0.17637035753480043, + "grad_norm": 1.1677592992782593, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.6256, + "step": 12214 + }, + { + "epoch": 0.1768179980361578, + "grad_norm": 1.0191985368728638, + "learning_rate": 3.494776374368643e-05, + "loss": 0.6301, + "step": 12245 + }, + { + "epoch": 0.17726563853751517, + "grad_norm": 1.1535918712615967, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.6253, + "step": 12276 + }, + { + "epoch": 0.17771327903887252, + "grad_norm": 1.0887985229492188, + "learning_rate": 3.479344537543164e-05, + "loss": 0.6335, + "step": 12307 + }, + { + "epoch": 0.1781609195402299, + "grad_norm": 1.010688066482544, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.6251, + "step": 12338 + }, + { + "epoch": 0.17860856004158723, + "grad_norm": 0.9745127558708191, + "learning_rate": 3.463868493762412e-05, + "loss": 0.6241, + "step": 12369 + }, + { + "epoch": 0.1790562005429446, + "grad_norm": 1.0414716005325317, + "learning_rate": 3.456114112492418e-05, + "loss": 0.6237, + "step": 12400 + }, + { + "epoch": 0.17950384104430198, + "grad_norm": 1.0457465648651123, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.6258, + "step": 12431 + }, + { + "epoch": 0.17995148154565932, + "grad_norm": 1.0389049053192139, + "learning_rate": 3.440573068727905e-05, + "loss": 0.6262, + "step": 12462 + }, + { + "epoch": 0.1803991220470167, + "grad_norm": 1.255600094795227, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.6305, + "step": 12493 + }, + { + "epoch": 0.18084676254837406, + "grad_norm": 1.0340358018875122, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.6212, + "step": 12524 + }, + { + "epoch": 0.1812944030497314, + "grad_norm": 1.0317034721374512, + "learning_rate": 3.417182116258899e-05, + "loss": 0.6279, + "step": 12555 + }, + { + "epoch": 0.18174204355108878, + "grad_norm": 1.1320221424102783, + "learning_rate": 3.409364314116074e-05, + "loss": 0.631, + "step": 12586 + }, + { + "epoch": 0.18218968405244615, + "grad_norm": 0.9674787521362305, + "learning_rate": 3.401536249920559e-05, + "loss": 0.627, + "step": 12617 + }, + { + "epoch": 0.1826373245538035, + "grad_norm": 0.9329623579978943, + "learning_rate": 3.393698012010998e-05, + "loss": 0.6244, + "step": 12648 + }, + { + "epoch": 0.18308496505516086, + "grad_norm": 1.2081501483917236, + "learning_rate": 3.385849688840839e-05, + "loss": 0.6295, + "step": 12679 + }, + { + "epoch": 0.18353260555651824, + "grad_norm": 0.9842090010643005, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.6276, + "step": 12710 + }, + { + "epoch": 0.18398024605787558, + "grad_norm": 1.1417752504348755, + "learning_rate": 3.370123141100578e-05, + "loss": 0.6266, + "step": 12741 + }, + { + "epoch": 0.18442788655923295, + "grad_norm": 0.9693592190742493, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.6245, + "step": 12772 + }, + { + "epoch": 0.1848755270605903, + "grad_norm": 1.1920111179351807, + "learning_rate": 3.35435731658559e-05, + "loss": 0.6227, + "step": 12803 + }, + { + "epoch": 0.18532316756194767, + "grad_norm": 0.9865401387214661, + "learning_rate": 3.346459897862552e-05, + "loss": 0.6287, + "step": 12834 + }, + { + "epoch": 0.18577080806330504, + "grad_norm": 0.9544184803962708, + "learning_rate": 3.338552926954613e-05, + "loss": 0.6236, + "step": 12865 + }, + { + "epoch": 0.18621844856466238, + "grad_norm": 1.0202548503875732, + "learning_rate": 3.330636493090868e-05, + "loss": 0.6269, + "step": 12896 + }, + { + "epoch": 0.18666608906601975, + "grad_norm": 1.1385433673858643, + "learning_rate": 3.322710685607193e-05, + "loss": 0.6385, + "step": 12927 + }, + { + "epoch": 0.18711372956737712, + "grad_norm": 1.0102901458740234, + "learning_rate": 3.314775593945251e-05, + "loss": 0.6241, + "step": 12958 + }, + { + "epoch": 0.18756137006873447, + "grad_norm": 0.9830989241600037, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.6243, + "step": 12989 + }, + { + "epoch": 0.18800901057009184, + "grad_norm": 1.0044376850128174, + "learning_rate": 3.298877916376047e-05, + "loss": 0.619, + "step": 13020 + }, + { + "epoch": 0.1884566510714492, + "grad_norm": 1.0714712142944336, + "learning_rate": 3.290915509871915e-05, + "loss": 0.6243, + "step": 13051 + }, + { + "epoch": 0.18890429157280655, + "grad_norm": 0.9379229545593262, + "learning_rate": 3.282944177993753e-05, + "loss": 0.6216, + "step": 13082 + }, + { + "epoch": 0.18935193207416393, + "grad_norm": 1.2717514038085938, + "learning_rate": 3.274964010696957e-05, + "loss": 0.6206, + "step": 13113 + }, + { + "epoch": 0.1897995725755213, + "grad_norm": 1.1147576570510864, + "learning_rate": 3.266975098036629e-05, + "loss": 0.6234, + "step": 13144 + }, + { + "epoch": 0.19024721307687864, + "grad_norm": 0.9994730949401855, + "learning_rate": 3.258977530166562e-05, + "loss": 0.6146, + "step": 13175 + }, + { + "epoch": 0.190694853578236, + "grad_norm": 1.195367693901062, + "learning_rate": 3.250971397338227e-05, + "loss": 0.624, + "step": 13206 + }, + { + "epoch": 0.19114249407959336, + "grad_norm": 1.0008747577667236, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.6182, + "step": 13237 + }, + { + "epoch": 0.19159013458095073, + "grad_norm": 1.3223299980163574, + "learning_rate": 3.234933798294859e-05, + "loss": 0.6193, + "step": 13268 + }, + { + "epoch": 0.1920377750823081, + "grad_norm": 1.1946437358856201, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.6201, + "step": 13299 + }, + { + "epoch": 0.19248541558366544, + "grad_norm": 1.1597986221313477, + "learning_rate": 3.218863024832985e-05, + "loss": 0.6212, + "step": 13330 + }, + { + "epoch": 0.1929330560850228, + "grad_norm": 0.9518936276435852, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.6154, + "step": 13361 + }, + { + "epoch": 0.19338069658638019, + "grad_norm": 0.890487790107727, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.6203, + "step": 13392 + }, + { + "epoch": 0.19382833708773753, + "grad_norm": 0.9918534755706787, + "learning_rate": 3.194696249871729e-05, + "loss": 0.6319, + "step": 13423 + }, + { + "epoch": 0.1942759775890949, + "grad_norm": 1.1954073905944824, + "learning_rate": 3.186624857814164e-05, + "loss": 0.619, + "step": 13454 + }, + { + "epoch": 0.19472361809045227, + "grad_norm": 1.1521157026290894, + "learning_rate": 3.178545717288401e-05, + "loss": 0.6326, + "step": 13485 + }, + { + "epoch": 0.19517125859180962, + "grad_norm": 1.0131208896636963, + "learning_rate": 3.170458919466444e-05, + "loss": 0.6234, + "step": 13516 + }, + { + "epoch": 0.195618899093167, + "grad_norm": 1.0429494380950928, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.6146, + "step": 13547 + }, + { + "epoch": 0.19606653959452436, + "grad_norm": 0.9586461782455444, + "learning_rate": 3.154262717052985e-05, + "loss": 0.6192, + "step": 13578 + }, + { + "epoch": 0.1965141800958817, + "grad_norm": 0.9385515451431274, + "learning_rate": 3.146153495233426e-05, + "loss": 0.6186, + "step": 13609 + }, + { + "epoch": 0.19696182059723907, + "grad_norm": 0.9109722375869751, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.6223, + "step": 13640 + }, + { + "epoch": 0.19740946109859642, + "grad_norm": 1.0564444065093994, + "learning_rate": 3.129913267924946e-05, + "loss": 0.6235, + "step": 13671 + }, + { + "epoch": 0.1978571015999538, + "grad_norm": 1.1656286716461182, + "learning_rate": 3.121782445704782e-05, + "loss": 0.6176, + "step": 13702 + }, + { + "epoch": 0.19830474210131116, + "grad_norm": 1.1301069259643555, + "learning_rate": 3.11364460675423e-05, + "loss": 0.6253, + "step": 13733 + }, + { + "epoch": 0.1987523826026685, + "grad_norm": 0.9939395785331726, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.6223, + "step": 13764 + }, + { + "epoch": 0.19920002310402588, + "grad_norm": 1.2881885766983032, + "learning_rate": 3.097348246077728e-05, + "loss": 0.6177, + "step": 13795 + }, + { + "epoch": 0.19964766360538325, + "grad_norm": 1.1002579927444458, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.6139, + "step": 13826 + }, + { + "epoch": 0.2000953041067406, + "grad_norm": 1.045394778251648, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.6192, + "step": 13857 + }, + { + "epoch": 0.20054294460809796, + "grad_norm": 0.9559116959571838, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.6155, + "step": 13888 + }, + { + "epoch": 0.20099058510945533, + "grad_norm": 0.9250887036323547, + "learning_rate": 3.064675369851637e-05, + "loss": 0.6235, + "step": 13919 + }, + { + "epoch": 0.20143822561081268, + "grad_norm": 1.0655368566513062, + "learning_rate": 3.056490989455289e-05, + "loss": 0.628, + "step": 13950 + }, + { + "epoch": 0.20188586611217005, + "grad_norm": 1.07636559009552, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.6244, + "step": 13981 + }, + { + "epoch": 0.20233350661352742, + "grad_norm": 1.050580620765686, + "learning_rate": 3.040103481317539e-05, + "loss": 0.6222, + "step": 14012 + }, + { + "epoch": 0.20278114711488476, + "grad_norm": 1.3754404783248901, + "learning_rate": 3.03190053850694e-05, + "loss": 0.6151, + "step": 14043 + }, + { + "epoch": 0.20322878761624213, + "grad_norm": 1.0527547597885132, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.6153, + "step": 14074 + }, + { + "epoch": 0.20367642811759948, + "grad_norm": 0.9438226819038391, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.618, + "step": 14105 + }, + { + "epoch": 0.20412406861895685, + "grad_norm": 1.0383126735687256, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.6162, + "step": 14136 + }, + { + "epoch": 0.20457170912031422, + "grad_norm": 1.1412239074707031, + "learning_rate": 2.999029669712431e-05, + "loss": 0.6284, + "step": 14167 + }, + { + "epoch": 0.20501934962167156, + "grad_norm": 1.1064159870147705, + "learning_rate": 2.990797641805408e-05, + "loss": 0.6223, + "step": 14198 + }, + { + "epoch": 0.20546699012302894, + "grad_norm": 1.0044069290161133, + "learning_rate": 2.982560075313704e-05, + "loss": 0.6191, + "step": 14229 + }, + { + "epoch": 0.2059146306243863, + "grad_norm": 0.9315604567527771, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.6207, + "step": 14260 + }, + { + "epoch": 0.20636227112574365, + "grad_norm": 0.941224217414856, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.6207, + "step": 14291 + }, + { + "epoch": 0.20680991162710102, + "grad_norm": 1.1239089965820312, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.6252, + "step": 14322 + }, + { + "epoch": 0.2072575521284584, + "grad_norm": 0.9484926462173462, + "learning_rate": 2.949556283611942e-05, + "loss": 0.6136, + "step": 14353 + }, + { + "epoch": 0.20770519262981574, + "grad_norm": 0.9437084197998047, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.6154, + "step": 14384 + }, + { + "epoch": 0.2081528331311731, + "grad_norm": 0.9578093886375427, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.6191, + "step": 14415 + }, + { + "epoch": 0.20860047363253048, + "grad_norm": 1.0657248497009277, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.6178, + "step": 14446 + }, + { + "epoch": 0.20904811413388782, + "grad_norm": 0.853568434715271, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.6152, + "step": 14477 + }, + { + "epoch": 0.2094957546352452, + "grad_norm": 1.0403015613555908, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.6108, + "step": 14508 + }, + { + "epoch": 0.20994339513660254, + "grad_norm": 1.0344171524047852, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.6155, + "step": 14539 + }, + { + "epoch": 0.2103910356379599, + "grad_norm": 1.0755060911178589, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.6156, + "step": 14570 + }, + { + "epoch": 0.21083867613931728, + "grad_norm": 0.8636776208877563, + "learning_rate": 2.883311164593017e-05, + "loss": 0.6193, + "step": 14601 + }, + { + "epoch": 0.21128631664067463, + "grad_norm": 1.0264644622802734, + "learning_rate": 2.875010077160754e-05, + "loss": 0.6138, + "step": 14632 + }, + { + "epoch": 0.211733957142032, + "grad_norm": 1.2590196132659912, + "learning_rate": 2.866704757790741e-05, + "loss": 0.6202, + "step": 14663 + }, + { + "epoch": 0.21218159764338937, + "grad_norm": 1.1028645038604736, + "learning_rate": 2.858395300207376e-05, + "loss": 0.614, + "step": 14694 + }, + { + "epoch": 0.2126292381447467, + "grad_norm": 0.8904405236244202, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.6152, + "step": 14725 + }, + { + "epoch": 0.21307687864610408, + "grad_norm": 0.9810163974761963, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.6088, + "step": 14756 + }, + { + "epoch": 0.21352451914746146, + "grad_norm": 0.9837898015975952, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.6129, + "step": 14787 + }, + { + "epoch": 0.2139721596488188, + "grad_norm": 0.987639844417572, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.6081, + "step": 14818 + }, + { + "epoch": 0.21441980015017617, + "grad_norm": 1.1478586196899414, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.6146, + "step": 14849 + }, + { + "epoch": 0.21486744065153354, + "grad_norm": 1.0885242223739624, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.6183, + "step": 14880 + }, + { + "epoch": 0.21531508115289089, + "grad_norm": 0.9934699535369873, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.6157, + "step": 14911 + }, + { + "epoch": 0.21576272165424826, + "grad_norm": 0.9285492300987244, + "learning_rate": 2.791781925709473e-05, + "loss": 0.6196, + "step": 14942 + }, + { + "epoch": 0.2162103621556056, + "grad_norm": 1.243133783340454, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.6109, + "step": 14973 + }, + { + "epoch": 0.21665800265696297, + "grad_norm": 1.0712559223175049, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.6268, + "step": 15004 + }, + { + "epoch": 0.21710564315832034, + "grad_norm": 1.0762903690338135, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.6162, + "step": 15035 + }, + { + "epoch": 0.2175532836596777, + "grad_norm": 1.043479084968567, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.6174, + "step": 15066 + }, + { + "epoch": 0.21800092416103506, + "grad_norm": 0.9906991720199585, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.6153, + "step": 15097 + }, + { + "epoch": 0.21844856466239243, + "grad_norm": 0.8844815492630005, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.6113, + "step": 15128 + }, + { + "epoch": 0.21889620516374977, + "grad_norm": 1.0258604288101196, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.6111, + "step": 15159 + }, + { + "epoch": 0.21934384566510715, + "grad_norm": 0.8992047309875488, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.6083, + "step": 15190 + }, + { + "epoch": 0.21979148616646452, + "grad_norm": 0.991061806678772, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.6219, + "step": 15221 + }, + { + "epoch": 0.22023912666782186, + "grad_norm": 0.9700108766555786, + "learning_rate": 2.708224532974953e-05, + "loss": 0.6119, + "step": 15252 + }, + { + "epoch": 0.22068676716917923, + "grad_norm": 0.904680609703064, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.6135, + "step": 15283 + }, + { + "epoch": 0.2211344076705366, + "grad_norm": 0.9015173316001892, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.611, + "step": 15314 + }, + { + "epoch": 0.22158204817189395, + "grad_norm": 1.020070195198059, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.617, + "step": 15345 + }, + { + "epoch": 0.22202968867325132, + "grad_norm": 1.0938650369644165, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.6135, + "step": 15376 + }, + { + "epoch": 0.22247732917460866, + "grad_norm": 1.2179347276687622, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.6108, + "step": 15407 + }, + { + "epoch": 0.22292496967596603, + "grad_norm": 0.9314635396003723, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.6109, + "step": 15438 + }, + { + "epoch": 0.2233726101773234, + "grad_norm": 0.9413474798202515, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.6114, + "step": 15469 + }, + { + "epoch": 0.22382025067868075, + "grad_norm": 1.0556674003601074, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.6114, + "step": 15500 + }, + { + "epoch": 0.22426789118003812, + "grad_norm": 0.9029526114463806, + "learning_rate": 2.632819298478939e-05, + "loss": 0.6152, + "step": 15531 + }, + { + "epoch": 0.2247155316813955, + "grad_norm": 1.0554165840148926, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.6077, + "step": 15562 + }, + { + "epoch": 0.22516317218275284, + "grad_norm": 0.9897674918174744, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.6099, + "step": 15593 + }, + { + "epoch": 0.2256108126841102, + "grad_norm": 1.036055326461792, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.6115, + "step": 15624 + }, + { + "epoch": 0.22605845318546758, + "grad_norm": 1.1050103902816772, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.6132, + "step": 15655 + }, + { + "epoch": 0.22650609368682492, + "grad_norm": 1.0429555177688599, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.6114, + "step": 15686 + }, + { + "epoch": 0.2269537341881823, + "grad_norm": 0.8952310681343079, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.6155, + "step": 15717 + }, + { + "epoch": 0.22740137468953966, + "grad_norm": 0.9422932267189026, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.6115, + "step": 15748 + }, + { + "epoch": 0.227849015190897, + "grad_norm": 0.8615415096282959, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.6147, + "step": 15779 + }, + { + "epoch": 0.22829665569225438, + "grad_norm": 1.0953892469406128, + "learning_rate": 2.557292666450159e-05, + "loss": 0.6141, + "step": 15810 + }, + { + "epoch": 0.22874429619361172, + "grad_norm": 1.038050651550293, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.6118, + "step": 15841 + }, + { + "epoch": 0.2291919366949691, + "grad_norm": 1.0005477666854858, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.6071, + "step": 15872 + }, + { + "epoch": 0.22963957719632647, + "grad_norm": 1.1400154829025269, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.6073, + "step": 15903 + }, + { + "epoch": 0.2300872176976838, + "grad_norm": 0.9465575218200684, + "learning_rate": 2.52370435981567e-05, + "loss": 0.6139, + "step": 15934 + }, + { + "epoch": 0.23053485819904118, + "grad_norm": 0.9524116516113281, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.6112, + "step": 15965 + }, + { + "epoch": 0.23098249870039855, + "grad_norm": 1.0909959077835083, + "learning_rate": 2.506908222441045e-05, + "loss": 0.6062, + "step": 15996 + }, + { + "epoch": 0.2314301392017559, + "grad_norm": 0.9520925879478455, + "learning_rate": 2.498509989430187e-05, + "loss": 0.6066, + "step": 16027 + }, + { + "epoch": 0.23187777970311327, + "grad_norm": 0.9747080206871033, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.6073, + "step": 16058 + }, + { + "epoch": 0.23232542020447064, + "grad_norm": 0.8820034265518188, + "learning_rate": 2.481713668624899e-05, + "loss": 0.6042, + "step": 16089 + }, + { + "epoch": 0.23277306070582798, + "grad_norm": 0.873534619808197, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.6115, + "step": 16120 + }, + { + "epoch": 0.23322070120718535, + "grad_norm": 1.0529483556747437, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.604, + "step": 16151 + }, + { + "epoch": 0.23366834170854273, + "grad_norm": 1.0236808061599731, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.6109, + "step": 16182 + }, + { + "epoch": 0.23411598220990007, + "grad_norm": 0.926750898361206, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.6103, + "step": 16213 + }, + { + "epoch": 0.23456362271125744, + "grad_norm": 0.9616347551345825, + "learning_rate": 2.439728136286796e-05, + "loss": 0.6115, + "step": 16244 + }, + { + "epoch": 0.23501126321261478, + "grad_norm": 0.9181815981864929, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.6093, + "step": 16275 + }, + { + "epoch": 0.23545890371397216, + "grad_norm": 1.08785879611969, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.6056, + "step": 16306 + }, + { + "epoch": 0.23590654421532953, + "grad_norm": 1.3456270694732666, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.5999, + "step": 16337 + }, + { + "epoch": 0.23635418471668687, + "grad_norm": 0.9001489281654358, + "learning_rate": 2.406151384602059e-05, + "loss": 0.6147, + "step": 16368 + }, + { + "epoch": 0.23680182521804424, + "grad_norm": 0.9228829145431519, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.6089, + "step": 16399 + }, + { + "epoch": 0.23724946571940161, + "grad_norm": 0.8185672760009766, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.6064, + "step": 16430 + }, + { + "epoch": 0.23769710622075896, + "grad_norm": 0.848855197429657, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.6078, + "step": 16461 + }, + { + "epoch": 0.23814474672211633, + "grad_norm": 1.5285366773605347, + "learning_rate": 2.372591577780202e-05, + "loss": 0.6016, + "step": 16492 + }, + { + "epoch": 0.2385923872234737, + "grad_norm": 1.0771571397781372, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.6132, + "step": 16523 + }, + { + "epoch": 0.23904002772483104, + "grad_norm": 0.8987991809844971, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.6098, + "step": 16554 + }, + { + "epoch": 0.23948766822618842, + "grad_norm": 0.8981488943099976, + "learning_rate": 2.347436487983929e-05, + "loss": 0.6168, + "step": 16585 + }, + { + "epoch": 0.2399353087275458, + "grad_norm": 0.9029120802879333, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.6116, + "step": 16616 + }, + { + "epoch": 0.24038294922890313, + "grad_norm": 1.0437650680541992, + "learning_rate": 2.330674878704035e-05, + "loss": 0.599, + "step": 16647 + }, + { + "epoch": 0.2408305897302605, + "grad_norm": 0.9616511464118958, + "learning_rate": 2.322296892997561e-05, + "loss": 0.614, + "step": 16678 + }, + { + "epoch": 0.24127823023161785, + "grad_norm": 0.8985153436660767, + "learning_rate": 2.313920912646497e-05, + "loss": 0.6087, + "step": 16709 + }, + { + "epoch": 0.24172587073297522, + "grad_norm": 1.0448508262634277, + "learning_rate": 2.305547032172643e-05, + "loss": 0.6062, + "step": 16740 + }, + { + "epoch": 0.2421735112343326, + "grad_norm": 0.9185760021209717, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.6099, + "step": 16771 + }, + { + "epoch": 0.24262115173568993, + "grad_norm": 1.1951557397842407, + "learning_rate": 2.288805948824212e-05, + "loss": 0.6091, + "step": 16802 + }, + { + "epoch": 0.2430687922370473, + "grad_norm": 0.8947639465332031, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.6101, + "step": 16833 + }, + { + "epoch": 0.24351643273840468, + "grad_norm": 1.0335516929626465, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.6053, + "step": 16864 + }, + { + "epoch": 0.24396407323976202, + "grad_norm": 0.9719113111495972, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.6093, + "step": 16895 + }, + { + "epoch": 0.2444117137411194, + "grad_norm": 0.9017343521118164, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.6096, + "step": 16926 + }, + { + "epoch": 0.24485935424247676, + "grad_norm": 0.9254065155982971, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.6031, + "step": 16957 + }, + { + "epoch": 0.2453069947438341, + "grad_norm": 0.9550548195838928, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.6136, + "step": 16988 + }, + { + "epoch": 0.24575463524519148, + "grad_norm": 0.913746178150177, + "learning_rate": 2.230292185905114e-05, + "loss": 0.6041, + "step": 17019 + }, + { + "epoch": 0.24620227574654885, + "grad_norm": 1.0998092889785767, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.6031, + "step": 17050 + }, + { + "epoch": 0.2466499162479062, + "grad_norm": 0.8995510339736938, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.6043, + "step": 17081 + }, + { + "epoch": 0.24709755674926356, + "grad_norm": 1.0682373046875, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.6091, + "step": 17112 + }, + { + "epoch": 0.2475451972506209, + "grad_norm": 0.9658533334732056, + "learning_rate": 2.196920634473666e-05, + "loss": 0.6062, + "step": 17143 + }, + { + "epoch": 0.24799283775197828, + "grad_norm": 0.9547036290168762, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.6083, + "step": 17174 + }, + { + "epoch": 0.24844047825333565, + "grad_norm": 1.1252254247665405, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.6047, + "step": 17205 + }, + { + "epoch": 0.248888118754693, + "grad_norm": 0.8774239420890808, + "learning_rate": 2.171927553439363e-05, + "loss": 0.6091, + "step": 17236 + }, + { + "epoch": 0.24933575925605037, + "grad_norm": 0.9929160475730896, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.6037, + "step": 17267 + }, + { + "epoch": 0.24978339975740774, + "grad_norm": 1.0022073984146118, + "learning_rate": 2.155283853988844e-05, + "loss": 0.6106, + "step": 17298 + }, + { + "epoch": 0.2502310402587651, + "grad_norm": 0.9709188938140869, + "learning_rate": 2.146967792431106e-05, + "loss": 0.6043, + "step": 17329 + }, + { + "epoch": 0.25067868076012245, + "grad_norm": 0.9158416986465454, + "learning_rate": 2.138655714793849e-05, + "loss": 0.6002, + "step": 17360 + }, + { + "epoch": 0.2511263212614798, + "grad_norm": 1.045093059539795, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.6027, + "step": 17391 + }, + { + "epoch": 0.2515739617628372, + "grad_norm": 0.9029024243354797, + "learning_rate": 2.122043886437421e-05, + "loss": 0.6095, + "step": 17422 + }, + { + "epoch": 0.2520216022641945, + "grad_norm": 1.0147509574890137, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.6072, + "step": 17453 + }, + { + "epoch": 0.2524692427655519, + "grad_norm": 0.9794949293136597, + "learning_rate": 2.105449118766347e-05, + "loss": 0.6048, + "step": 17484 + }, + { + "epoch": 0.25291688326690925, + "grad_norm": 1.186495304107666, + "learning_rate": 2.097158366805287e-05, + "loss": 0.6079, + "step": 17515 + }, + { + "epoch": 0.2533645237682666, + "grad_norm": 0.9781451225280762, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.6033, + "step": 17546 + }, + { + "epoch": 0.253812164269624, + "grad_norm": 0.9464316964149475, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.6057, + "step": 17577 + }, + { + "epoch": 0.25425980477098137, + "grad_norm": 0.9456629753112793, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.6039, + "step": 17608 + }, + { + "epoch": 0.2547074452723387, + "grad_norm": 0.9119940400123596, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.6059, + "step": 17639 + }, + { + "epoch": 0.25515508577369606, + "grad_norm": 1.1009196043014526, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.6081, + "step": 17670 + }, + { + "epoch": 0.2556027262750534, + "grad_norm": 1.010501503944397, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.6081, + "step": 17701 + }, + { + "epoch": 0.2560503667764108, + "grad_norm": 0.9617831707000732, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.6066, + "step": 17732 + }, + { + "epoch": 0.25649800727776817, + "grad_norm": 1.0574779510498047, + "learning_rate": 2.031003855589343e-05, + "loss": 0.6025, + "step": 17763 + }, + { + "epoch": 0.2569456477791255, + "grad_norm": 0.9515939950942993, + "learning_rate": 2.022757379528727e-05, + "loss": 0.6147, + "step": 17794 + }, + { + "epoch": 0.25739328828048286, + "grad_norm": 0.8629471659660339, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.6018, + "step": 17825 + }, + { + "epoch": 0.25784092878184023, + "grad_norm": 1.0973188877105713, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.5974, + "step": 17856 + }, + { + "epoch": 0.2582885692831976, + "grad_norm": 1.0111137628555298, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.6007, + "step": 17887 + }, + { + "epoch": 0.25873620978455497, + "grad_norm": 0.938352644443512, + "learning_rate": 1.989826261153015e-05, + "loss": 0.6062, + "step": 17918 + }, + { + "epoch": 0.25918385028591234, + "grad_norm": 0.8754394054412842, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.5999, + "step": 17949 + }, + { + "epoch": 0.25963149078726966, + "grad_norm": 0.9272274374961853, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.5958, + "step": 17980 + }, + { + "epoch": 0.26007913128862703, + "grad_norm": 0.9161437749862671, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.6068, + "step": 18011 + }, + { + "epoch": 0.2605267717899844, + "grad_norm": 1.0073903799057007, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.6075, + "step": 18042 + }, + { + "epoch": 0.2609744122913418, + "grad_norm": 0.8590899705886841, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.6017, + "step": 18073 + }, + { + "epoch": 0.26142205279269914, + "grad_norm": 0.9320747256278992, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.6025, + "step": 18104 + }, + { + "epoch": 0.2618696932940565, + "grad_norm": 1.000109076499939, + "learning_rate": 1.932422022132275e-05, + "loss": 0.6025, + "step": 18135 + }, + { + "epoch": 0.26231733379541383, + "grad_norm": 0.8880858421325684, + "learning_rate": 1.924246297341414e-05, + "loss": 0.6029, + "step": 18166 + }, + { + "epoch": 0.2627649742967712, + "grad_norm": 0.9839984178543091, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.6009, + "step": 18197 + }, + { + "epoch": 0.2632126147981286, + "grad_norm": 0.9712537527084351, + "learning_rate": 1.907914431855156e-05, + "loss": 0.6005, + "step": 18228 + }, + { + "epoch": 0.26366025529948595, + "grad_norm": 0.9593982100486755, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.5967, + "step": 18259 + }, + { + "epoch": 0.2641078958008433, + "grad_norm": 0.9100329279899597, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.5926, + "step": 18290 + }, + { + "epoch": 0.26455553630220063, + "grad_norm": 0.8858036398887634, + "learning_rate": 1.883466975572098e-05, + "loss": 0.5972, + "step": 18321 + }, + { + "epoch": 0.265003176803558, + "grad_norm": 1.0127744674682617, + "learning_rate": 1.8753316159214312e-05, + "loss": 0.6029, + "step": 18352 + }, + { + "epoch": 0.2654508173049154, + "grad_norm": 0.9447472095489502, + "learning_rate": 1.8672033055685766e-05, + "loss": 0.6066, + "step": 18383 + }, + { + "epoch": 0.26589845780627275, + "grad_norm": 0.818134605884552, + "learning_rate": 1.8590821362404116e-05, + "loss": 0.5953, + "step": 18414 + }, + { + "epoch": 0.2663460983076301, + "grad_norm": 0.9979908466339111, + "learning_rate": 1.8509681995832294e-05, + "loss": 0.5978, + "step": 18445 + }, + { + "epoch": 0.2667937388089875, + "grad_norm": 0.8588074445724487, + "learning_rate": 1.8428615871617004e-05, + "loss": 0.6004, + "step": 18476 + }, + { + "epoch": 0.2672413793103448, + "grad_norm": 0.9643010497093201, + "learning_rate": 1.8347623904578448e-05, + "loss": 0.6071, + "step": 18507 + }, + { + "epoch": 0.2676890198117022, + "grad_norm": 0.8365680575370789, + "learning_rate": 1.8266707008699975e-05, + "loss": 0.5998, + "step": 18538 + }, + { + "epoch": 0.26813666031305955, + "grad_norm": 0.8986954689025879, + "learning_rate": 1.818586609711774e-05, + "loss": 0.5982, + "step": 18569 + }, + { + "epoch": 0.2685843008144169, + "grad_norm": 1.0341336727142334, + "learning_rate": 1.8105102082110462e-05, + "loss": 0.6008, + "step": 18600 + }, + { + "epoch": 0.2690319413157743, + "grad_norm": 1.0030567646026611, + "learning_rate": 1.8024415875089058e-05, + "loss": 0.6011, + "step": 18631 + }, + { + "epoch": 0.2694795818171316, + "grad_norm": 0.9385823607444763, + "learning_rate": 1.7943808386586407e-05, + "loss": 0.6077, + "step": 18662 + }, + { + "epoch": 0.269927222318489, + "grad_norm": 0.8827871680259705, + "learning_rate": 1.7863280526247073e-05, + "loss": 0.6073, + "step": 18693 + }, + { + "epoch": 0.27037486281984635, + "grad_norm": 0.9739916324615479, + "learning_rate": 1.7782833202817003e-05, + "loss": 0.595, + "step": 18724 + }, + { + "epoch": 0.2708225033212037, + "grad_norm": 0.9108980298042297, + "learning_rate": 1.7702467324133327e-05, + "loss": 0.587, + "step": 18755 + }, + { + "epoch": 0.2712701438225611, + "grad_norm": 1.0579863786697388, + "learning_rate": 1.7622183797114042e-05, + "loss": 0.6043, + "step": 18786 + }, + { + "epoch": 0.27171778432391847, + "grad_norm": 0.9881874322891235, + "learning_rate": 1.7541983527747838e-05, + "loss": 0.5905, + "step": 18817 + }, + { + "epoch": 0.2721654248252758, + "grad_norm": 0.9560896158218384, + "learning_rate": 1.746186742108387e-05, + "loss": 0.6033, + "step": 18848 + }, + { + "epoch": 0.27261306532663315, + "grad_norm": 0.9506632685661316, + "learning_rate": 1.73818363812215e-05, + "loss": 0.5935, + "step": 18879 + }, + { + "epoch": 0.2730607058279905, + "grad_norm": 0.9935999512672424, + "learning_rate": 1.7301891311300153e-05, + "loss": 0.5997, + "step": 18910 + }, + { + "epoch": 0.2735083463293479, + "grad_norm": 0.9102685451507568, + "learning_rate": 1.7222033113489055e-05, + "loss": 0.5982, + "step": 18941 + }, + { + "epoch": 0.27395598683070527, + "grad_norm": 1.0436829328536987, + "learning_rate": 1.7142262688977127e-05, + "loss": 0.603, + "step": 18972 + }, + { + "epoch": 0.27440362733206264, + "grad_norm": 1.0441209077835083, + "learning_rate": 1.7062580937962764e-05, + "loss": 0.5957, + "step": 19003 + }, + { + "epoch": 0.27485126783341995, + "grad_norm": 0.9903119206428528, + "learning_rate": 1.698298875964369e-05, + "loss": 0.5972, + "step": 19034 + }, + { + "epoch": 0.2752989083347773, + "grad_norm": 0.8914598226547241, + "learning_rate": 1.690348705220684e-05, + "loss": 0.6027, + "step": 19065 + }, + { + "epoch": 0.2757465488361347, + "grad_norm": 0.9678306579589844, + "learning_rate": 1.6824076712818156e-05, + "loss": 0.5999, + "step": 19096 + }, + { + "epoch": 0.27619418933749207, + "grad_norm": 0.9159491658210754, + "learning_rate": 1.6744758637612533e-05, + "loss": 0.6029, + "step": 19127 + }, + { + "epoch": 0.27664182983884944, + "grad_norm": 1.0948030948638916, + "learning_rate": 1.6665533721683664e-05, + "loss": 0.604, + "step": 19158 + }, + { + "epoch": 0.27708947034020676, + "grad_norm": 0.866001546382904, + "learning_rate": 1.6586402859073974e-05, + "loss": 0.5863, + "step": 19189 + }, + { + "epoch": 0.2775371108415641, + "grad_norm": 1.0278693437576294, + "learning_rate": 1.6507366942764463e-05, + "loss": 0.5937, + "step": 19220 + }, + { + "epoch": 0.2779847513429215, + "grad_norm": 0.9074748158454895, + "learning_rate": 1.6428426864664732e-05, + "loss": 0.602, + "step": 19251 + }, + { + "epoch": 0.27843239184427887, + "grad_norm": 0.9951406717300415, + "learning_rate": 1.6349583515602816e-05, + "loss": 0.5982, + "step": 19282 + }, + { + "epoch": 0.27888003234563624, + "grad_norm": 1.0565474033355713, + "learning_rate": 1.6270837785315208e-05, + "loss": 0.6008, + "step": 19313 + }, + { + "epoch": 0.2793276728469936, + "grad_norm": 0.9266191124916077, + "learning_rate": 1.619219056243676e-05, + "loss": 0.5994, + "step": 19344 + }, + { + "epoch": 0.27977531334835093, + "grad_norm": 0.8990464806556702, + "learning_rate": 1.6113642734490698e-05, + "loss": 0.5984, + "step": 19375 + }, + { + "epoch": 0.2802229538497083, + "grad_norm": 0.9231170415878296, + "learning_rate": 1.6035195187878577e-05, + "loss": 0.5952, + "step": 19406 + }, + { + "epoch": 0.28067059435106567, + "grad_norm": 1.035946011543274, + "learning_rate": 1.5956848807870305e-05, + "loss": 0.5985, + "step": 19437 + }, + { + "epoch": 0.28111823485242304, + "grad_norm": 0.8787546157836914, + "learning_rate": 1.587860447859413e-05, + "loss": 0.5999, + "step": 19468 + }, + { + "epoch": 0.2815658753537804, + "grad_norm": 0.8387063145637512, + "learning_rate": 1.5800463083026686e-05, + "loss": 0.5973, + "step": 19499 + }, + { + "epoch": 0.28201351585513773, + "grad_norm": 1.025985598564148, + "learning_rate": 1.572242550298298e-05, + "loss": 0.597, + "step": 19530 + }, + { + "epoch": 0.2824611563564951, + "grad_norm": 0.9072343707084656, + "learning_rate": 1.56444926191065e-05, + "loss": 0.5868, + "step": 19561 + }, + { + "epoch": 0.2829087968578525, + "grad_norm": 0.9914515614509583, + "learning_rate": 1.5566665310859257e-05, + "loss": 0.5926, + "step": 19592 + }, + { + "epoch": 0.28335643735920985, + "grad_norm": 0.9568142294883728, + "learning_rate": 1.5488944456511846e-05, + "loss": 0.6023, + "step": 19623 + }, + { + "epoch": 0.2838040778605672, + "grad_norm": 0.8508808016777039, + "learning_rate": 1.5411330933133546e-05, + "loss": 0.5991, + "step": 19654 + }, + { + "epoch": 0.2842517183619246, + "grad_norm": 0.9583558440208435, + "learning_rate": 1.533382561658241e-05, + "loss": 0.6031, + "step": 19685 + }, + { + "epoch": 0.2846993588632819, + "grad_norm": 0.9079626798629761, + "learning_rate": 1.525642938149541e-05, + "loss": 0.6021, + "step": 19716 + }, + { + "epoch": 0.2851469993646393, + "grad_norm": 0.8839224576950073, + "learning_rate": 1.5179143101278536e-05, + "loss": 0.5974, + "step": 19747 + }, + { + "epoch": 0.28559463986599665, + "grad_norm": 0.9244747161865234, + "learning_rate": 1.5101967648096955e-05, + "loss": 0.5951, + "step": 19778 + }, + { + "epoch": 0.286042280367354, + "grad_norm": 0.937430202960968, + "learning_rate": 1.5024903892865172e-05, + "loss": 0.5981, + "step": 19809 + }, + { + "epoch": 0.2864899208687114, + "grad_norm": 1.0390359163284302, + "learning_rate": 1.4947952705237184e-05, + "loss": 0.6017, + "step": 19840 + }, + { + "epoch": 0.28693756137006876, + "grad_norm": 0.9726883172988892, + "learning_rate": 1.4871114953596682e-05, + "loss": 0.5956, + "step": 19871 + }, + { + "epoch": 0.2873852018714261, + "grad_norm": 0.8611225485801697, + "learning_rate": 1.4794391505047256e-05, + "loss": 0.5875, + "step": 19902 + }, + { + "epoch": 0.28783284237278345, + "grad_norm": 0.9599292278289795, + "learning_rate": 1.4717783225402596e-05, + "loss": 0.5948, + "step": 19933 + }, + { + "epoch": 0.2882804828741408, + "grad_norm": 0.9473167061805725, + "learning_rate": 1.4641290979176735e-05, + "loss": 0.5967, + "step": 19964 + }, + { + "epoch": 0.2887281233754982, + "grad_norm": 0.9631912708282471, + "learning_rate": 1.4564915629574246e-05, + "loss": 0.5962, + "step": 19995 + }, + { + "epoch": 0.28917576387685556, + "grad_norm": 0.9674975872039795, + "learning_rate": 1.4488658038480601e-05, + "loss": 0.59, + "step": 20026 + }, + { + "epoch": 0.2896234043782129, + "grad_norm": 1.1209561824798584, + "learning_rate": 1.4412519066452323e-05, + "loss": 0.6032, + "step": 20057 + }, + { + "epoch": 0.29007104487957025, + "grad_norm": 0.9360538125038147, + "learning_rate": 1.4336499572707373e-05, + "loss": 0.5975, + "step": 20088 + }, + { + "epoch": 0.2905186853809276, + "grad_norm": 0.9791879653930664, + "learning_rate": 1.4260600415115433e-05, + "loss": 0.6051, + "step": 20119 + }, + { + "epoch": 0.290966325882285, + "grad_norm": 1.0199767351150513, + "learning_rate": 1.4184822450188137e-05, + "loss": 0.5912, + "step": 20150 + }, + { + "epoch": 0.29141396638364236, + "grad_norm": 0.8803568482398987, + "learning_rate": 1.410916653306954e-05, + "loss": 0.6007, + "step": 20181 + }, + { + "epoch": 0.29186160688499974, + "grad_norm": 0.9544051289558411, + "learning_rate": 1.403363351752639e-05, + "loss": 0.5901, + "step": 20212 + }, + { + "epoch": 0.29230924738635705, + "grad_norm": 1.0661756992340088, + "learning_rate": 1.3958224255938485e-05, + "loss": 0.595, + "step": 20243 + }, + { + "epoch": 0.2927568878877144, + "grad_norm": 0.9343761801719666, + "learning_rate": 1.388293959928911e-05, + "loss": 0.6016, + "step": 20274 + }, + { + "epoch": 0.2932045283890718, + "grad_norm": 1.0200270414352417, + "learning_rate": 1.3807780397155379e-05, + "loss": 0.6, + "step": 20305 + }, + { + "epoch": 0.29365216889042917, + "grad_norm": 0.8452933430671692, + "learning_rate": 1.3732747497698655e-05, + "loss": 0.5978, + "step": 20336 + }, + { + "epoch": 0.29409980939178654, + "grad_norm": 1.000546932220459, + "learning_rate": 1.3657841747655038e-05, + "loss": 0.5933, + "step": 20367 + }, + { + "epoch": 0.29454744989314385, + "grad_norm": 0.9047265648841858, + "learning_rate": 1.3583063992325706e-05, + "loss": 0.5954, + "step": 20398 + }, + { + "epoch": 0.2949950903945012, + "grad_norm": 0.877160906791687, + "learning_rate": 1.3508415075567496e-05, + "loss": 0.5921, + "step": 20429 + }, + { + "epoch": 0.2954427308958586, + "grad_norm": 1.0855872631072998, + "learning_rate": 1.343389583978327e-05, + "loss": 0.5992, + "step": 20460 + }, + { + "epoch": 0.29589037139721597, + "grad_norm": 1.006057620048523, + "learning_rate": 1.3359507125912468e-05, + "loss": 0.5916, + "step": 20491 + }, + { + "epoch": 0.29633801189857334, + "grad_norm": 1.003037929534912, + "learning_rate": 1.3285249773421627e-05, + "loss": 0.5918, + "step": 20522 + }, + { + "epoch": 0.2967856523999307, + "grad_norm": 0.9983749389648438, + "learning_rate": 1.3211124620294884e-05, + "loss": 0.5922, + "step": 20553 + }, + { + "epoch": 0.297233292901288, + "grad_norm": 1.0387030839920044, + "learning_rate": 1.313713250302451e-05, + "loss": 0.5991, + "step": 20584 + }, + { + "epoch": 0.2976809334026454, + "grad_norm": 0.8586576581001282, + "learning_rate": 1.3063274256601479e-05, + "loss": 0.6001, + "step": 20615 + }, + { + "epoch": 0.29812857390400277, + "grad_norm": 0.9596696496009827, + "learning_rate": 1.2989550714506086e-05, + "loss": 0.5988, + "step": 20646 + }, + { + "epoch": 0.29857621440536014, + "grad_norm": 0.9584054350852966, + "learning_rate": 1.291596270869846e-05, + "loss": 0.5946, + "step": 20677 + }, + { + "epoch": 0.2990238549067175, + "grad_norm": 1.02814519405365, + "learning_rate": 1.284251106960927e-05, + "loss": 0.5941, + "step": 20708 + }, + { + "epoch": 0.2994714954080749, + "grad_norm": 1.1163685321807861, + "learning_rate": 1.2769196626130263e-05, + "loss": 0.6002, + "step": 20739 + }, + { + "epoch": 0.2999191359094322, + "grad_norm": 0.9234864711761475, + "learning_rate": 1.2696020205604969e-05, + "loss": 0.5919, + "step": 20770 + }, + { + "epoch": 0.30036677641078957, + "grad_norm": 0.9402379393577576, + "learning_rate": 1.2622982633819359e-05, + "loss": 0.5931, + "step": 20801 + }, + { + "epoch": 0.30081441691214694, + "grad_norm": 0.9681121110916138, + "learning_rate": 1.2550084734992484e-05, + "loss": 0.5904, + "step": 20832 + }, + { + "epoch": 0.3012620574135043, + "grad_norm": 0.9508892893791199, + "learning_rate": 1.247732733176724e-05, + "loss": 0.5997, + "step": 20863 + }, + { + "epoch": 0.3017096979148617, + "grad_norm": 0.8664924502372742, + "learning_rate": 1.2404711245201044e-05, + "loss": 0.594, + "step": 20894 + }, + { + "epoch": 0.302157338416219, + "grad_norm": 0.8919743299484253, + "learning_rate": 1.2332237294756535e-05, + "loss": 0.5873, + "step": 20925 + }, + { + "epoch": 0.3026049789175764, + "grad_norm": 0.9090976119041443, + "learning_rate": 1.225990629829241e-05, + "loss": 0.5966, + "step": 20956 + }, + { + "epoch": 0.30305261941893374, + "grad_norm": 0.8878434896469116, + "learning_rate": 1.2187719072054136e-05, + "loss": 0.5939, + "step": 20987 + }, + { + "epoch": 0.3035002599202911, + "grad_norm": 0.8897850513458252, + "learning_rate": 1.2115676430664735e-05, + "loss": 0.5978, + "step": 21018 + }, + { + "epoch": 0.3039479004216485, + "grad_norm": 0.8866651654243469, + "learning_rate": 1.2043779187115647e-05, + "loss": 0.5873, + "step": 21049 + }, + { + "epoch": 0.30439554092300586, + "grad_norm": 0.8519348502159119, + "learning_rate": 1.1972028152757476e-05, + "loss": 0.5991, + "step": 21080 + }, + { + "epoch": 0.3048431814243632, + "grad_norm": 1.146201252937317, + "learning_rate": 1.1900424137290889e-05, + "loss": 0.5928, + "step": 21111 + }, + { + "epoch": 0.30529082192572055, + "grad_norm": 1.0777043104171753, + "learning_rate": 1.1828967948757482e-05, + "loss": 0.5966, + "step": 21142 + }, + { + "epoch": 0.3057384624270779, + "grad_norm": 1.0404378175735474, + "learning_rate": 1.175766039353062e-05, + "loss": 0.607, + "step": 21173 + }, + { + "epoch": 0.3061861029284353, + "grad_norm": 0.8684154152870178, + "learning_rate": 1.1686502276306382e-05, + "loss": 0.5992, + "step": 21204 + }, + { + "epoch": 0.30663374342979266, + "grad_norm": 0.9449039101600647, + "learning_rate": 1.1615494400094445e-05, + "loss": 0.5937, + "step": 21235 + }, + { + "epoch": 0.30708138393115, + "grad_norm": 0.9459973573684692, + "learning_rate": 1.1544637566209029e-05, + "loss": 0.5953, + "step": 21266 + }, + { + "epoch": 0.30752902443250735, + "grad_norm": 0.8467513918876648, + "learning_rate": 1.1473932574259886e-05, + "loss": 0.5937, + "step": 21297 + }, + { + "epoch": 0.3079766649338647, + "grad_norm": 0.953157901763916, + "learning_rate": 1.1403380222143247e-05, + "loss": 0.592, + "step": 21328 + }, + { + "epoch": 0.3084243054352221, + "grad_norm": 0.9762019515037537, + "learning_rate": 1.1332981306032808e-05, + "loss": 0.6009, + "step": 21359 + }, + { + "epoch": 0.30887194593657946, + "grad_norm": 0.7931903600692749, + "learning_rate": 1.1262736620370762e-05, + "loss": 0.5951, + "step": 21390 + }, + { + "epoch": 0.30931958643793683, + "grad_norm": 1.042128324508667, + "learning_rate": 1.1192646957858854e-05, + "loss": 0.5951, + "step": 21421 + }, + { + "epoch": 0.30976722693929415, + "grad_norm": 0.9942502379417419, + "learning_rate": 1.1122713109449381e-05, + "loss": 0.5945, + "step": 21452 + }, + { + "epoch": 0.3102148674406515, + "grad_norm": 0.9771155118942261, + "learning_rate": 1.105293586433634e-05, + "loss": 0.5929, + "step": 21483 + }, + { + "epoch": 0.3106625079420089, + "grad_norm": 0.9390444159507751, + "learning_rate": 1.0983316009946446e-05, + "loss": 0.5944, + "step": 21514 + }, + { + "epoch": 0.31111014844336626, + "grad_norm": 0.9289217591285706, + "learning_rate": 1.0913854331930282e-05, + "loss": 0.5902, + "step": 21545 + }, + { + "epoch": 0.31155778894472363, + "grad_norm": 0.8977670669555664, + "learning_rate": 1.0844551614153456e-05, + "loss": 0.5951, + "step": 21576 + }, + { + "epoch": 0.312005429446081, + "grad_norm": 0.9898940920829773, + "learning_rate": 1.0775408638687725e-05, + "loss": 0.5997, + "step": 21607 + }, + { + "epoch": 0.3124530699474383, + "grad_norm": 0.9756447076797485, + "learning_rate": 1.0706426185802165e-05, + "loss": 0.5969, + "step": 21638 + }, + { + "epoch": 0.3129007104487957, + "grad_norm": 1.0475540161132812, + "learning_rate": 1.0637605033954371e-05, + "loss": 0.5908, + "step": 21669 + }, + { + "epoch": 0.31334835095015307, + "grad_norm": 0.9765790700912476, + "learning_rate": 1.05689459597817e-05, + "loss": 0.5903, + "step": 21700 + }, + { + "epoch": 0.31379599145151044, + "grad_norm": 0.8677023649215698, + "learning_rate": 1.050044973809246e-05, + "loss": 0.5907, + "step": 21731 + }, + { + "epoch": 0.3142436319528678, + "grad_norm": 0.937731921672821, + "learning_rate": 1.043211714185722e-05, + "loss": 0.603, + "step": 21762 + }, + { + "epoch": 0.3146912724542251, + "grad_norm": 0.8233932256698608, + "learning_rate": 1.036394894220003e-05, + "loss": 0.5955, + "step": 21793 + }, + { + "epoch": 0.3151389129555825, + "grad_norm": 1.1260769367218018, + "learning_rate": 1.0295945908389751e-05, + "loss": 0.605, + "step": 21824 + }, + { + "epoch": 0.31558655345693987, + "grad_norm": 0.9366801977157593, + "learning_rate": 1.0228108807831393e-05, + "loss": 0.5963, + "step": 21855 + }, + { + "epoch": 0.31603419395829724, + "grad_norm": 0.871155321598053, + "learning_rate": 1.01604384060574e-05, + "loss": 0.5876, + "step": 21886 + }, + { + "epoch": 0.3164818344596546, + "grad_norm": 0.9532550573348999, + "learning_rate": 1.009293546671907e-05, + "loss": 0.5863, + "step": 21917 + }, + { + "epoch": 0.316929474961012, + "grad_norm": 1.045569658279419, + "learning_rate": 1.002560075157791e-05, + "loss": 0.5899, + "step": 21948 + }, + { + "epoch": 0.3173771154623693, + "grad_norm": 0.9291980862617493, + "learning_rate": 9.958435020496995e-06, + "loss": 0.5904, + "step": 21979 + }, + { + "epoch": 0.31782475596372667, + "grad_norm": 0.8881365060806274, + "learning_rate": 9.89143903143249e-06, + "loss": 0.5997, + "step": 22010 + }, + { + "epoch": 0.31827239646508404, + "grad_norm": 0.9601870179176331, + "learning_rate": 9.824613540425038e-06, + "loss": 0.5965, + "step": 22041 + }, + { + "epoch": 0.3187200369664414, + "grad_norm": 0.8519198298454285, + "learning_rate": 9.757959301591197e-06, + "loss": 0.5887, + "step": 22072 + }, + { + "epoch": 0.3191676774677988, + "grad_norm": 0.9262305498123169, + "learning_rate": 9.691477067115017e-06, + "loss": 0.5846, + "step": 22103 + }, + { + "epoch": 0.3196153179691561, + "grad_norm": 1.0259448289871216, + "learning_rate": 9.625167587239467e-06, + "loss": 0.5865, + "step": 22134 + }, + { + "epoch": 0.32006295847051347, + "grad_norm": 0.9057780504226685, + "learning_rate": 9.559031610258007e-06, + "loss": 0.592, + "step": 22165 + }, + { + "epoch": 0.32051059897187084, + "grad_norm": 0.905604362487793, + "learning_rate": 9.493069882506164e-06, + "loss": 0.5958, + "step": 22196 + }, + { + "epoch": 0.3209582394732282, + "grad_norm": 0.8837811946868896, + "learning_rate": 9.427283148353056e-06, + "loss": 0.5955, + "step": 22227 + }, + { + "epoch": 0.3214058799745856, + "grad_norm": 0.9125133752822876, + "learning_rate": 9.361672150193052e-06, + "loss": 0.5915, + "step": 22258 + }, + { + "epoch": 0.32185352047594296, + "grad_norm": 0.8553541898727417, + "learning_rate": 9.29623762843734e-06, + "loss": 0.586, + "step": 22289 + }, + { + "epoch": 0.32230116097730027, + "grad_norm": 0.8609781861305237, + "learning_rate": 9.230980321505594e-06, + "loss": 0.5867, + "step": 22320 + }, + { + "epoch": 0.32274880147865764, + "grad_norm": 0.8896780014038086, + "learning_rate": 9.165900965817668e-06, + "loss": 0.5862, + "step": 22351 + }, + { + "epoch": 0.323196441980015, + "grad_norm": 1.0318437814712524, + "learning_rate": 9.101000295785245e-06, + "loss": 0.5906, + "step": 22382 + }, + { + "epoch": 0.3236440824813724, + "grad_norm": 1.0346667766571045, + "learning_rate": 9.036279043803565e-06, + "loss": 0.594, + "step": 22413 + }, + { + "epoch": 0.32409172298272976, + "grad_norm": 0.899023175239563, + "learning_rate": 8.971737940243147e-06, + "loss": 0.5983, + "step": 22444 + }, + { + "epoch": 0.32453936348408713, + "grad_norm": 0.8427733182907104, + "learning_rate": 8.907377713441592e-06, + "loss": 0.5928, + "step": 22475 + }, + { + "epoch": 0.32498700398544444, + "grad_norm": 0.8469851613044739, + "learning_rate": 8.843199089695293e-06, + "loss": 0.5867, + "step": 22506 + }, + { + "epoch": 0.3254346444868018, + "grad_norm": 0.8703016638755798, + "learning_rate": 8.779202793251311e-06, + "loss": 0.5894, + "step": 22537 + }, + { + "epoch": 0.3258822849881592, + "grad_norm": 0.9438649415969849, + "learning_rate": 8.715389546299149e-06, + "loss": 0.5949, + "step": 22568 + }, + { + "epoch": 0.32632992548951656, + "grad_norm": 0.8361387848854065, + "learning_rate": 8.651760068962617e-06, + "loss": 0.5956, + "step": 22599 + }, + { + "epoch": 0.32677756599087393, + "grad_norm": 0.8810434341430664, + "learning_rate": 8.588315079291733e-06, + "loss": 0.5904, + "step": 22630 + }, + { + "epoch": 0.32722520649223125, + "grad_norm": 0.9140039682388306, + "learning_rate": 8.52505529325457e-06, + "loss": 0.5871, + "step": 22661 + }, + { + "epoch": 0.3276728469935886, + "grad_norm": 0.8848084211349487, + "learning_rate": 8.461981424729216e-06, + "loss": 0.5973, + "step": 22692 + }, + { + "epoch": 0.328120487494946, + "grad_norm": 0.8551177382469177, + "learning_rate": 8.399094185495725e-06, + "loss": 0.5925, + "step": 22723 + }, + { + "epoch": 0.32856812799630336, + "grad_norm": 0.9873132705688477, + "learning_rate": 8.336394285228017e-06, + "loss": 0.592, + "step": 22754 + }, + { + "epoch": 0.32901576849766073, + "grad_norm": 0.9582761526107788, + "learning_rate": 8.273882431485952e-06, + "loss": 0.5957, + "step": 22785 + }, + { + "epoch": 0.3294634089990181, + "grad_norm": 0.9337429404258728, + "learning_rate": 8.211559329707316e-06, + "loss": 0.5893, + "step": 22816 + }, + { + "epoch": 0.3299110495003754, + "grad_norm": 0.8926681280136108, + "learning_rate": 8.149425683199823e-06, + "loss": 0.593, + "step": 22847 + }, + { + "epoch": 0.3303586900017328, + "grad_norm": 0.8568328022956848, + "learning_rate": 8.08748219313325e-06, + "loss": 0.5895, + "step": 22878 + }, + { + "epoch": 0.33080633050309016, + "grad_norm": 0.971608579158783, + "learning_rate": 8.025729558531453e-06, + "loss": 0.5946, + "step": 22909 + }, + { + "epoch": 0.33125397100444753, + "grad_norm": 0.9121518135070801, + "learning_rate": 7.964168476264508e-06, + "loss": 0.592, + "step": 22940 + }, + { + "epoch": 0.3317016115058049, + "grad_norm": 0.9045124650001526, + "learning_rate": 7.902799641040884e-06, + "loss": 0.5915, + "step": 22971 + }, + { + "epoch": 0.3321492520071622, + "grad_norm": 0.8435531258583069, + "learning_rate": 7.841623745399523e-06, + "loss": 0.5902, + "step": 23002 + }, + { + "epoch": 0.3325968925085196, + "grad_norm": 0.9218406677246094, + "learning_rate": 7.780641479702114e-06, + "loss": 0.5949, + "step": 23033 + }, + { + "epoch": 0.33304453300987696, + "grad_norm": 0.8910061120986938, + "learning_rate": 7.719853532125227e-06, + "loss": 0.589, + "step": 23064 + }, + { + "epoch": 0.33349217351123434, + "grad_norm": 0.9808411598205566, + "learning_rate": 7.65926058865258e-06, + "loss": 0.592, + "step": 23095 + }, + { + "epoch": 0.3339398140125917, + "grad_norm": 0.9253140091896057, + "learning_rate": 7.598863333067313e-06, + "loss": 0.5867, + "step": 23126 + }, + { + "epoch": 0.3343874545139491, + "grad_norm": 0.8689921498298645, + "learning_rate": 7.538662446944253e-06, + "loss": 0.5817, + "step": 23157 + }, + { + "epoch": 0.3348350950153064, + "grad_norm": 0.9247636198997498, + "learning_rate": 7.478658609642211e-06, + "loss": 0.5865, + "step": 23188 + }, + { + "epoch": 0.33528273551666377, + "grad_norm": 0.99556565284729, + "learning_rate": 7.418852498296327e-06, + "loss": 0.5936, + "step": 23219 + }, + { + "epoch": 0.33573037601802114, + "grad_norm": 0.8474295139312744, + "learning_rate": 7.359244787810457e-06, + "loss": 0.5846, + "step": 23250 + }, + { + "epoch": 0.3361780165193785, + "grad_norm": 0.9600043892860413, + "learning_rate": 7.299836150849493e-06, + "loss": 0.5858, + "step": 23281 + }, + { + "epoch": 0.3366256570207359, + "grad_norm": 0.9534376263618469, + "learning_rate": 7.240627257831847e-06, + "loss": 0.588, + "step": 23312 + }, + { + "epoch": 0.33707329752209325, + "grad_norm": 0.9994376301765442, + "learning_rate": 7.1816187769218195e-06, + "loss": 0.5924, + "step": 23343 + }, + { + "epoch": 0.33752093802345057, + "grad_norm": 0.9056828618049622, + "learning_rate": 7.1228113740220895e-06, + "loss": 0.5842, + "step": 23374 + }, + { + "epoch": 0.33796857852480794, + "grad_norm": 0.9004384875297546, + "learning_rate": 7.064205712766226e-06, + "loss": 0.5829, + "step": 23405 + }, + { + "epoch": 0.3384162190261653, + "grad_norm": 0.9889013171195984, + "learning_rate": 7.005802454511129e-06, + "loss": 0.5825, + "step": 23436 + }, + { + "epoch": 0.3388638595275227, + "grad_norm": 0.8478637933731079, + "learning_rate": 6.947602258329639e-06, + "loss": 0.5857, + "step": 23467 + }, + { + "epoch": 0.33931150002888005, + "grad_norm": 0.825160801410675, + "learning_rate": 6.889605781003078e-06, + "loss": 0.5922, + "step": 23498 + }, + { + "epoch": 0.33975914053023737, + "grad_norm": 0.968928337097168, + "learning_rate": 6.831813677013776e-06, + "loss": 0.5912, + "step": 23529 + }, + { + "epoch": 0.34020678103159474, + "grad_norm": 0.9048051238059998, + "learning_rate": 6.774226598537792e-06, + "loss": 0.5847, + "step": 23560 + }, + { + "epoch": 0.3406544215329521, + "grad_norm": 0.9165478348731995, + "learning_rate": 6.716845195437482e-06, + "loss": 0.5914, + "step": 23591 + }, + { + "epoch": 0.3411020620343095, + "grad_norm": 0.8867752552032471, + "learning_rate": 6.659670115254168e-06, + "loss": 0.5854, + "step": 23622 + }, + { + "epoch": 0.34154970253566685, + "grad_norm": 0.8337939977645874, + "learning_rate": 6.602702003200872e-06, + "loss": 0.5887, + "step": 23653 + }, + { + "epoch": 0.3419973430370242, + "grad_norm": 1.0237765312194824, + "learning_rate": 6.545941502154992e-06, + "loss": 0.5909, + "step": 23684 + }, + { + "epoch": 0.34244498353838154, + "grad_norm": 0.9445424675941467, + "learning_rate": 6.489389252651057e-06, + "loss": 0.593, + "step": 23715 + }, + { + "epoch": 0.3428926240397389, + "grad_norm": 0.874966025352478, + "learning_rate": 6.4330458928735325e-06, + "loss": 0.5858, + "step": 23746 + }, + { + "epoch": 0.3433402645410963, + "grad_norm": 1.0574617385864258, + "learning_rate": 6.376912058649559e-06, + "loss": 0.5914, + "step": 23777 + }, + { + "epoch": 0.34378790504245366, + "grad_norm": 0.8537029027938843, + "learning_rate": 6.320988383441845e-06, + "loss": 0.5839, + "step": 23808 + }, + { + "epoch": 0.34423554554381103, + "grad_norm": 0.9089046716690063, + "learning_rate": 6.265275498341452e-06, + "loss": 0.5846, + "step": 23839 + }, + { + "epoch": 0.34468318604516834, + "grad_norm": 1.014028787612915, + "learning_rate": 6.209774032060714e-06, + "loss": 0.5888, + "step": 23870 + }, + { + "epoch": 0.3451308265465257, + "grad_norm": 1.165295958518982, + "learning_rate": 6.1544846109261365e-06, + "loss": 0.5879, + "step": 23901 + }, + { + "epoch": 0.3455784670478831, + "grad_norm": 0.8614928126335144, + "learning_rate": 6.099407858871342e-06, + "loss": 0.5849, + "step": 23932 + }, + { + "epoch": 0.34602610754924046, + "grad_norm": 0.8964920043945312, + "learning_rate": 6.044544397429958e-06, + "loss": 0.5845, + "step": 23963 + }, + { + "epoch": 0.34647374805059783, + "grad_norm": 0.8400657176971436, + "learning_rate": 5.989894845728708e-06, + "loss": 0.5826, + "step": 23994 + }, + { + "epoch": 0.3469213885519552, + "grad_norm": 0.892620325088501, + "learning_rate": 5.9354598204803605e-06, + "loss": 0.5825, + "step": 24025 + }, + { + "epoch": 0.3473690290533125, + "grad_norm": 0.967951774597168, + "learning_rate": 5.881239935976762e-06, + "loss": 0.5906, + "step": 24056 + }, + { + "epoch": 0.3478166695546699, + "grad_norm": 0.9834921360015869, + "learning_rate": 5.827235804081954e-06, + "loss": 0.5829, + "step": 24087 + }, + { + "epoch": 0.34826431005602726, + "grad_norm": 0.9132741689682007, + "learning_rate": 5.773448034225221e-06, + "loss": 0.5922, + "step": 24118 + }, + { + "epoch": 0.34871195055738463, + "grad_norm": 0.9170505404472351, + "learning_rate": 5.719877233394228e-06, + "loss": 0.592, + "step": 24149 + }, + { + "epoch": 0.349159591058742, + "grad_norm": 0.9209049940109253, + "learning_rate": 5.666524006128191e-06, + "loss": 0.5831, + "step": 24180 + }, + { + "epoch": 0.3496072315600994, + "grad_norm": 0.9120186567306519, + "learning_rate": 5.613388954511015e-06, + "loss": 0.5855, + "step": 24211 + }, + { + "epoch": 0.3500548720614567, + "grad_norm": 0.8754975199699402, + "learning_rate": 5.560472678164552e-06, + "loss": 0.5837, + "step": 24242 + }, + { + "epoch": 0.35050251256281406, + "grad_norm": 0.8505347967147827, + "learning_rate": 5.507775774241775e-06, + "loss": 0.5917, + "step": 24273 + }, + { + "epoch": 0.35095015306417143, + "grad_norm": 0.8900067806243896, + "learning_rate": 5.4552988374200945e-06, + "loss": 0.586, + "step": 24304 + }, + { + "epoch": 0.3513977935655288, + "grad_norm": 0.9413267374038696, + "learning_rate": 5.403042459894597e-06, + "loss": 0.5842, + "step": 24335 + }, + { + "epoch": 0.3518454340668862, + "grad_norm": 0.8651320338249207, + "learning_rate": 5.3510072313714135e-06, + "loss": 0.5869, + "step": 24366 + }, + { + "epoch": 0.3522930745682435, + "grad_norm": 0.8826769590377808, + "learning_rate": 5.2991937390610205e-06, + "loss": 0.5916, + "step": 24397 + }, + { + "epoch": 0.35274071506960086, + "grad_norm": 0.8735561966896057, + "learning_rate": 5.247602567671625e-06, + "loss": 0.5948, + "step": 24428 + }, + { + "epoch": 0.35318835557095823, + "grad_norm": 0.9097628593444824, + "learning_rate": 5.196234299402603e-06, + "loss": 0.5832, + "step": 24459 + }, + { + "epoch": 0.3536359960723156, + "grad_norm": 0.8925328254699707, + "learning_rate": 5.145089513937865e-06, + "loss": 0.5855, + "step": 24490 + }, + { + "epoch": 0.354083636573673, + "grad_norm": 0.8319926857948303, + "learning_rate": 5.094168788439369e-06, + "loss": 0.596, + "step": 24521 + }, + { + "epoch": 0.35453127707503035, + "grad_norm": 0.8705253005027771, + "learning_rate": 5.043472697540594e-06, + "loss": 0.5807, + "step": 24552 + }, + { + "epoch": 0.35497891757638766, + "grad_norm": 0.8858280777931213, + "learning_rate": 4.993001813340012e-06, + "loss": 0.5922, + "step": 24583 + }, + { + "epoch": 0.35542655807774504, + "grad_norm": 0.8770239949226379, + "learning_rate": 4.942756705394702e-06, + "loss": 0.5959, + "step": 24614 + }, + { + "epoch": 0.3558741985791024, + "grad_norm": 0.9590134620666504, + "learning_rate": 4.892737940713884e-06, + "loss": 0.5944, + "step": 24645 + }, + { + "epoch": 0.3563218390804598, + "grad_norm": 0.9162303805351257, + "learning_rate": 4.842946083752511e-06, + "loss": 0.5858, + "step": 24676 + }, + { + "epoch": 0.35676947958181715, + "grad_norm": 1.051709771156311, + "learning_rate": 4.79338169640493e-06, + "loss": 0.5842, + "step": 24707 + }, + { + "epoch": 0.35721712008317447, + "grad_norm": 0.9024022221565247, + "learning_rate": 4.74404533799851e-06, + "loss": 0.5836, + "step": 24738 + }, + { + "epoch": 0.35766476058453184, + "grad_norm": 0.9624181985855103, + "learning_rate": 4.694937565287344e-06, + "loss": 0.5884, + "step": 24769 + }, + { + "epoch": 0.3581124010858892, + "grad_norm": 0.9708924889564514, + "learning_rate": 4.646058932445985e-06, + "loss": 0.5871, + "step": 24800 + }, + { + "epoch": 0.3585600415872466, + "grad_norm": 0.9564160704612732, + "learning_rate": 4.597409991063148e-06, + "loss": 0.5838, + "step": 24831 + }, + { + "epoch": 0.35900768208860395, + "grad_norm": 0.9520463347434998, + "learning_rate": 4.5489912901355375e-06, + "loss": 0.5891, + "step": 24862 + }, + { + "epoch": 0.3594553225899613, + "grad_norm": 0.8748462200164795, + "learning_rate": 4.500803376061608e-06, + "loss": 0.5871, + "step": 24893 + }, + { + "epoch": 0.35990296309131864, + "grad_norm": 0.8078939318656921, + "learning_rate": 4.45284679263541e-06, + "loss": 0.5828, + "step": 24924 + }, + { + "epoch": 0.360350603592676, + "grad_norm": 0.8798905611038208, + "learning_rate": 4.4051220810404775e-06, + "loss": 0.5945, + "step": 24955 + }, + { + "epoch": 0.3607982440940334, + "grad_norm": 1.0122028589248657, + "learning_rate": 4.3576297798437025e-06, + "loss": 0.5906, + "step": 24986 + }, + { + "epoch": 0.36124588459539075, + "grad_norm": 0.8810189962387085, + "learning_rate": 4.3103704249892436e-06, + "loss": 0.585, + "step": 25017 + }, + { + "epoch": 0.3616935250967481, + "grad_norm": 0.9921602606773376, + "learning_rate": 4.263344549792487e-06, + "loss": 0.5845, + "step": 25048 + }, + { + "epoch": 0.3621411655981055, + "grad_norm": 0.9266677498817444, + "learning_rate": 4.216552684934056e-06, + "loss": 0.5872, + "step": 25079 + }, + { + "epoch": 0.3625888060994628, + "grad_norm": 0.9869902729988098, + "learning_rate": 4.169995358453777e-06, + "loss": 0.5839, + "step": 25110 + }, + { + "epoch": 0.3630364466008202, + "grad_norm": 0.908527135848999, + "learning_rate": 4.123673095744757e-06, + "loss": 0.5816, + "step": 25141 + }, + { + "epoch": 0.36348408710217756, + "grad_norm": 0.9539284706115723, + "learning_rate": 4.077586419547435e-06, + "loss": 0.5895, + "step": 25172 + }, + { + "epoch": 0.3639317276035349, + "grad_norm": 0.9441227912902832, + "learning_rate": 4.03173584994368e-06, + "loss": 0.5818, + "step": 25203 + }, + { + "epoch": 0.3643793681048923, + "grad_norm": 0.9245622754096985, + "learning_rate": 3.986121904350948e-06, + "loss": 0.5894, + "step": 25234 + }, + { + "epoch": 0.3648270086062496, + "grad_norm": 0.9282262921333313, + "learning_rate": 3.940745097516407e-06, + "loss": 0.5867, + "step": 25265 + }, + { + "epoch": 0.365274649107607, + "grad_norm": 0.8341302871704102, + "learning_rate": 3.89560594151116e-06, + "loss": 0.5927, + "step": 25296 + }, + { + "epoch": 0.36572228960896436, + "grad_norm": 0.8950303792953491, + "learning_rate": 3.850704945724456e-06, + "loss": 0.5853, + "step": 25327 + }, + { + "epoch": 0.36616993011032173, + "grad_norm": 0.8331125378608704, + "learning_rate": 3.8060426168579077e-06, + "loss": 0.5913, + "step": 25358 + }, + { + "epoch": 0.3666175706116791, + "grad_norm": 0.9764972925186157, + "learning_rate": 3.7616194589198407e-06, + "loss": 0.5824, + "step": 25389 + }, + { + "epoch": 0.36706521111303647, + "grad_norm": 0.8486316800117493, + "learning_rate": 3.7174359732195574e-06, + "loss": 0.5886, + "step": 25420 + }, + { + "epoch": 0.3675128516143938, + "grad_norm": 0.8741101622581482, + "learning_rate": 3.673492658361677e-06, + "loss": 0.5908, + "step": 25451 + }, + { + "epoch": 0.36796049211575116, + "grad_norm": 0.9734169244766235, + "learning_rate": 3.6297900102405467e-06, + "loss": 0.5755, + "step": 25482 + }, + { + "epoch": 0.36840813261710853, + "grad_norm": 0.9409371018409729, + "learning_rate": 3.586328522034607e-06, + "loss": 0.5851, + "step": 25513 + }, + { + "epoch": 0.3688557731184659, + "grad_norm": 0.9707832336425781, + "learning_rate": 3.543108684200838e-06, + "loss": 0.5846, + "step": 25544 + }, + { + "epoch": 0.3693034136198233, + "grad_norm": 0.9187780618667603, + "learning_rate": 3.5001309844692464e-06, + "loss": 0.5863, + "step": 25575 + }, + { + "epoch": 0.3697510541211806, + "grad_norm": 0.8471042513847351, + "learning_rate": 3.4573959078373215e-06, + "loss": 0.5824, + "step": 25606 + }, + { + "epoch": 0.37019869462253796, + "grad_norm": 0.8648544549942017, + "learning_rate": 3.4149039365646063e-06, + "loss": 0.585, + "step": 25637 + }, + { + "epoch": 0.37064633512389533, + "grad_norm": 0.8514716625213623, + "learning_rate": 3.3726555501672143e-06, + "loss": 0.5853, + "step": 25668 + }, + { + "epoch": 0.3710939756252527, + "grad_norm": 0.8402500152587891, + "learning_rate": 3.33065122541244e-06, + "loss": 0.5827, + "step": 25699 + }, + { + "epoch": 0.3715416161266101, + "grad_norm": 0.9909188747406006, + "learning_rate": 3.288891436313385e-06, + "loss": 0.582, + "step": 25730 + }, + { + "epoch": 0.37198925662796745, + "grad_norm": 0.9633392691612244, + "learning_rate": 3.2473766541235963e-06, + "loss": 0.579, + "step": 25761 + }, + { + "epoch": 0.37243689712932476, + "grad_norm": 0.8810042142868042, + "learning_rate": 3.2061073473317466e-06, + "loss": 0.5849, + "step": 25792 + }, + { + "epoch": 0.37288453763068213, + "grad_norm": 0.9136834144592285, + "learning_rate": 3.1650839816563444e-06, + "loss": 0.5854, + "step": 25823 + }, + { + "epoch": 0.3733321781320395, + "grad_norm": 0.9164252877235413, + "learning_rate": 3.1243070200405093e-06, + "loss": 0.5861, + "step": 25854 + }, + { + "epoch": 0.3737798186333969, + "grad_norm": 0.8443343639373779, + "learning_rate": 3.0837769226467e-06, + "loss": 0.5806, + "step": 25885 + }, + { + "epoch": 0.37422745913475425, + "grad_norm": 0.8546344637870789, + "learning_rate": 3.0434941468515666e-06, + "loss": 0.587, + "step": 25916 + }, + { + "epoch": 0.3746750996361116, + "grad_norm": 0.8392828702926636, + "learning_rate": 3.003459147240753e-06, + "loss": 0.5838, + "step": 25947 + }, + { + "epoch": 0.37512274013746894, + "grad_norm": 0.876781165599823, + "learning_rate": 2.9636723756037875e-06, + "loss": 0.596, + "step": 25978 + }, + { + "epoch": 0.3755703806388263, + "grad_norm": 0.9352492094039917, + "learning_rate": 2.9241342809289833e-06, + "loss": 0.5832, + "step": 26009 + }, + { + "epoch": 0.3760180211401837, + "grad_norm": 0.9210870265960693, + "learning_rate": 2.8848453093983594e-06, + "loss": 0.5791, + "step": 26040 + }, + { + "epoch": 0.37646566164154105, + "grad_norm": 0.9076164960861206, + "learning_rate": 2.8458059043826257e-06, + "loss": 0.5811, + "step": 26071 + }, + { + "epoch": 0.3769133021428984, + "grad_norm": 0.8342217803001404, + "learning_rate": 2.807016506436172e-06, + "loss": 0.5822, + "step": 26102 + }, + { + "epoch": 0.37736094264425574, + "grad_norm": 0.9161880612373352, + "learning_rate": 2.7684775532920566e-06, + "loss": 0.587, + "step": 26133 + }, + { + "epoch": 0.3778085831456131, + "grad_norm": 0.8972461223602295, + "learning_rate": 2.7301894798571425e-06, + "loss": 0.5778, + "step": 26164 + }, + { + "epoch": 0.3782562236469705, + "grad_norm": 0.9267194867134094, + "learning_rate": 2.6921527182071386e-06, + "loss": 0.5824, + "step": 26195 + }, + { + "epoch": 0.37870386414832785, + "grad_norm": 0.8432844877243042, + "learning_rate": 2.6543676975817223e-06, + "loss": 0.5895, + "step": 26226 + }, + { + "epoch": 0.3791515046496852, + "grad_norm": 0.8702236413955688, + "learning_rate": 2.6168348443797175e-06, + "loss": 0.5872, + "step": 26257 + }, + { + "epoch": 0.3795991451510426, + "grad_norm": 0.9064545631408691, + "learning_rate": 2.5795545821542757e-06, + "loss": 0.5792, + "step": 26288 + }, + { + "epoch": 0.3800467856523999, + "grad_norm": 0.9529021978378296, + "learning_rate": 2.54252733160808e-06, + "loss": 0.5827, + "step": 26319 + }, + { + "epoch": 0.3804944261537573, + "grad_norm": 1.0374935865402222, + "learning_rate": 2.5057535105886294e-06, + "loss": 0.5907, + "step": 26350 + }, + { + "epoch": 0.38094206665511465, + "grad_norm": 0.8641102910041809, + "learning_rate": 2.4692335340834953e-06, + "loss": 0.5825, + "step": 26381 + }, + { + "epoch": 0.381389707156472, + "grad_norm": 0.9310511350631714, + "learning_rate": 2.432967814215639e-06, + "loss": 0.5859, + "step": 26412 + }, + { + "epoch": 0.3818373476578294, + "grad_norm": 0.8742653131484985, + "learning_rate": 2.396956760238794e-06, + "loss": 0.5831, + "step": 26443 + }, + { + "epoch": 0.3822849881591867, + "grad_norm": 0.9148630499839783, + "learning_rate": 2.361200778532796e-06, + "loss": 0.5843, + "step": 26474 + }, + { + "epoch": 0.3827326286605441, + "grad_norm": 0.7990172505378723, + "learning_rate": 2.325700272599049e-06, + "loss": 0.5867, + "step": 26505 + }, + { + "epoch": 0.38318026916190145, + "grad_norm": 1.0082364082336426, + "learning_rate": 2.2904556430559415e-06, + "loss": 0.5883, + "step": 26536 + }, + { + "epoch": 0.3836279096632588, + "grad_norm": 1.061032772064209, + "learning_rate": 2.2554672876343106e-06, + "loss": 0.5804, + "step": 26567 + }, + { + "epoch": 0.3840755501646162, + "grad_norm": 0.867182195186615, + "learning_rate": 2.220735601173002e-06, + "loss": 0.5862, + "step": 26598 + }, + { + "epoch": 0.38452319066597357, + "grad_norm": 0.9035846590995789, + "learning_rate": 2.186260975614382e-06, + "loss": 0.5798, + "step": 26629 + }, + { + "epoch": 0.3849708311673309, + "grad_norm": 0.9290494322776794, + "learning_rate": 2.1520437999999034e-06, + "loss": 0.583, + "step": 26660 + }, + { + "epoch": 0.38541847166868826, + "grad_norm": 0.8391757011413574, + "learning_rate": 2.1180844604657526e-06, + "loss": 0.5819, + "step": 26691 + }, + { + "epoch": 0.3858661121700456, + "grad_norm": 0.8569139242172241, + "learning_rate": 2.084383340238455e-06, + "loss": 0.585, + "step": 26722 + }, + { + "epoch": 0.386313752671403, + "grad_norm": 0.8917425870895386, + "learning_rate": 2.0509408196305704e-06, + "loss": 0.5914, + "step": 26753 + }, + { + "epoch": 0.38676139317276037, + "grad_norm": 0.8586505651473999, + "learning_rate": 2.017757276036403e-06, + "loss": 0.5842, + "step": 26784 + }, + { + "epoch": 0.38720903367411774, + "grad_norm": 0.8897309303283691, + "learning_rate": 1.984833083927726e-06, + "loss": 0.58, + "step": 26815 + }, + { + "epoch": 0.38765667417547506, + "grad_norm": 0.8957898616790771, + "learning_rate": 1.952168614849581e-06, + "loss": 0.5804, + "step": 26846 + }, + { + "epoch": 0.38810431467683243, + "grad_norm": 0.8361022472381592, + "learning_rate": 1.919764237416058e-06, + "loss": 0.5764, + "step": 26877 + }, + { + "epoch": 0.3885519551781898, + "grad_norm": 0.9136033058166504, + "learning_rate": 1.8876203173061463e-06, + "loss": 0.5864, + "step": 26908 + }, + { + "epoch": 0.38899959567954717, + "grad_norm": 0.8564227819442749, + "learning_rate": 1.8557372172596206e-06, + "loss": 0.5881, + "step": 26939 + }, + { + "epoch": 0.38944723618090454, + "grad_norm": 0.9318363666534424, + "learning_rate": 1.8241152970729341e-06, + "loss": 0.579, + "step": 26970 + }, + { + "epoch": 0.38989487668226186, + "grad_norm": 0.8604468703269958, + "learning_rate": 1.7927549135951572e-06, + "loss": 0.5795, + "step": 27001 + }, + { + "epoch": 0.39034251718361923, + "grad_norm": 0.9203160405158997, + "learning_rate": 1.7616564207239477e-06, + "loss": 0.5821, + "step": 27032 + }, + { + "epoch": 0.3907901576849766, + "grad_norm": 1.0079877376556396, + "learning_rate": 1.730820169401584e-06, + "loss": 0.5752, + "step": 27063 + }, + { + "epoch": 0.391237798186334, + "grad_norm": 0.8484696745872498, + "learning_rate": 1.7002465076109558e-06, + "loss": 0.5825, + "step": 27094 + }, + { + "epoch": 0.39168543868769135, + "grad_norm": 0.969445526599884, + "learning_rate": 1.6699357803716898e-06, + "loss": 0.5829, + "step": 27125 + }, + { + "epoch": 0.3921330791890487, + "grad_norm": 0.902077853679657, + "learning_rate": 1.6398883297362305e-06, + "loss": 0.5829, + "step": 27156 + }, + { + "epoch": 0.39258071969040603, + "grad_norm": 0.9344344735145569, + "learning_rate": 1.6101044947859606e-06, + "loss": 0.5901, + "step": 27187 + }, + { + "epoch": 0.3930283601917634, + "grad_norm": 0.8951054811477661, + "learning_rate": 1.5805846116274114e-06, + "loss": 0.581, + "step": 27218 + }, + { + "epoch": 0.3934760006931208, + "grad_norm": 0.8078446388244629, + "learning_rate": 1.5513290133884611e-06, + "loss": 0.5862, + "step": 27249 + }, + { + "epoch": 0.39392364119447815, + "grad_norm": 0.8421075940132141, + "learning_rate": 1.5223380302145512e-06, + "loss": 0.59, + "step": 27280 + }, + { + "epoch": 0.3943712816958355, + "grad_norm": 0.8337625861167908, + "learning_rate": 1.4936119892649925e-06, + "loss": 0.5828, + "step": 27311 + }, + { + "epoch": 0.39481892219719283, + "grad_norm": 0.984302282333374, + "learning_rate": 1.4651512147092482e-06, + "loss": 0.5824, + "step": 27342 + }, + { + "epoch": 0.3952665626985502, + "grad_norm": 0.9869680404663086, + "learning_rate": 1.4369560277232908e-06, + "loss": 0.5913, + "step": 27373 + }, + { + "epoch": 0.3957142031999076, + "grad_norm": 0.8444851040840149, + "learning_rate": 1.409026746485978e-06, + "loss": 0.5821, + "step": 27404 + }, + { + "epoch": 0.39616184370126495, + "grad_norm": 0.776551365852356, + "learning_rate": 1.3813636861754464e-06, + "loss": 0.5842, + "step": 27435 + }, + { + "epoch": 0.3966094842026223, + "grad_norm": 0.8994253277778625, + "learning_rate": 1.3539671589655773e-06, + "loss": 0.5852, + "step": 27466 + }, + { + "epoch": 0.3970571247039797, + "grad_norm": 0.9431787729263306, + "learning_rate": 1.3268374740224548e-06, + "loss": 0.5814, + "step": 27497 + }, + { + "epoch": 0.397504765205337, + "grad_norm": 0.8683516383171082, + "learning_rate": 1.2999749375008807e-06, + "loss": 0.584, + "step": 27528 + }, + { + "epoch": 0.3979524057066944, + "grad_norm": 0.8702425956726074, + "learning_rate": 1.2733798525409346e-06, + "loss": 0.5824, + "step": 27559 + }, + { + "epoch": 0.39840004620805175, + "grad_norm": 0.9975656270980835, + "learning_rate": 1.2470525192645383e-06, + "loss": 0.5791, + "step": 27590 + }, + { + "epoch": 0.3988476867094091, + "grad_norm": 0.8925862312316895, + "learning_rate": 1.2209932347720666e-06, + "loss": 0.5793, + "step": 27621 + }, + { + "epoch": 0.3992953272107665, + "grad_norm": 0.8167952299118042, + "learning_rate": 1.1952022931389972e-06, + "loss": 0.5788, + "step": 27652 + }, + { + "epoch": 0.39974296771212386, + "grad_norm": 0.8877468705177307, + "learning_rate": 1.1696799854126083e-06, + "loss": 0.5879, + "step": 27683 + }, + { + "epoch": 0.4001906082134812, + "grad_norm": 0.919375479221344, + "learning_rate": 1.1444265996086694e-06, + "loss": 0.5895, + "step": 27714 + }, + { + "epoch": 0.40063824871483855, + "grad_norm": 0.9435774087905884, + "learning_rate": 1.119442420708211e-06, + "loss": 0.5801, + "step": 27745 + }, + { + "epoch": 0.4010858892161959, + "grad_norm": 0.9303644299507141, + "learning_rate": 1.0947277306542964e-06, + "loss": 0.5887, + "step": 27776 + }, + { + "epoch": 0.4015335297175533, + "grad_norm": 0.9209316968917847, + "learning_rate": 1.0702828083488353e-06, + "loss": 0.5908, + "step": 27807 + }, + { + "epoch": 0.40198117021891067, + "grad_norm": 0.8827571272850037, + "learning_rate": 1.0461079296494647e-06, + "loss": 0.5846, + "step": 27838 + }, + { + "epoch": 0.402428810720268, + "grad_norm": 0.908743143081665, + "learning_rate": 1.0222033673663978e-06, + "loss": 0.5818, + "step": 27869 + }, + { + "epoch": 0.40287645122162535, + "grad_norm": 0.9340828061103821, + "learning_rate": 9.985693912593713e-07, + "loss": 0.5896, + "step": 27900 + }, + { + "epoch": 0.4033240917229827, + "grad_norm": 0.9275760054588318, + "learning_rate": 9.752062680346035e-07, + "loss": 0.5886, + "step": 27931 + }, + { + "epoch": 0.4037717322243401, + "grad_norm": 0.9111776947975159, + "learning_rate": 9.521142613417494e-07, + "loss": 0.5852, + "step": 27962 + }, + { + "epoch": 0.40421937272569747, + "grad_norm": 0.9355432987213135, + "learning_rate": 9.292936317709722e-07, + "loss": 0.5808, + "step": 27993 + }, + { + "epoch": 0.40466701322705484, + "grad_norm": 0.8610454201698303, + "learning_rate": 9.067446368499793e-07, + "loss": 0.5839, + "step": 28024 + }, + { + "epoch": 0.40511465372841216, + "grad_norm": 0.9698997735977173, + "learning_rate": 8.844675310411055e-07, + "loss": 0.5846, + "step": 28055 + }, + { + "epoch": 0.4055622942297695, + "grad_norm": 0.864654004573822, + "learning_rate": 8.6246256573847e-07, + "loss": 0.5768, + "step": 28086 + }, + { + "epoch": 0.4060099347311269, + "grad_norm": 0.9228002429008484, + "learning_rate": 8.407299892651127e-07, + "loss": 0.5801, + "step": 28117 + }, + { + "epoch": 0.40645757523248427, + "grad_norm": 0.9443821907043457, + "learning_rate": 8.19270046870202e-07, + "loss": 0.5826, + "step": 28148 + }, + { + "epoch": 0.40690521573384164, + "grad_norm": 0.8702989816665649, + "learning_rate": 7.980829807262752e-07, + "loss": 0.5804, + "step": 28179 + }, + { + "epoch": 0.40735285623519896, + "grad_norm": 0.922178328037262, + "learning_rate": 7.771690299264889e-07, + "loss": 0.5764, + "step": 28210 + }, + { + "epoch": 0.40780049673655633, + "grad_norm": 0.9058007001876831, + "learning_rate": 7.565284304819426e-07, + "loss": 0.5868, + "step": 28241 + }, + { + "epoch": 0.4082481372379137, + "grad_norm": 0.900319516658783, + "learning_rate": 7.361614153189922e-07, + "loss": 0.5792, + "step": 28272 + }, + { + "epoch": 0.40869577773927107, + "grad_norm": 0.9342830777168274, + "learning_rate": 7.160682142766328e-07, + "loss": 0.5865, + "step": 28303 + }, + { + "epoch": 0.40914341824062844, + "grad_norm": 0.9849163293838501, + "learning_rate": 6.962490541039091e-07, + "loss": 0.5864, + "step": 28334 + }, + { + "epoch": 0.4095910587419858, + "grad_norm": 0.9586670398712158, + "learning_rate": 6.767041584573531e-07, + "loss": 0.5846, + "step": 28365 + }, + { + "epoch": 0.41003869924334313, + "grad_norm": 0.8707197308540344, + "learning_rate": 6.574337478984532e-07, + "loss": 0.5863, + "step": 28396 + }, + { + "epoch": 0.4104863397447005, + "grad_norm": 0.861381471157074, + "learning_rate": 6.384380398911732e-07, + "loss": 0.5829, + "step": 28427 + }, + { + "epoch": 0.4109339802460579, + "grad_norm": 0.8965010643005371, + "learning_rate": 6.197172487994951e-07, + "loss": 0.5804, + "step": 28458 + }, + { + "epoch": 0.41138162074741524, + "grad_norm": 0.8720385432243347, + "learning_rate": 6.012715858850021e-07, + "loss": 0.5824, + "step": 28489 + }, + { + "epoch": 0.4118292612487726, + "grad_norm": 0.9194157123565674, + "learning_rate": 5.831012593044971e-07, + "loss": 0.5829, + "step": 28520 + }, + { + "epoch": 0.41227690175013, + "grad_norm": 0.9252064228057861, + "learning_rate": 5.652064741076435e-07, + "loss": 0.5994, + "step": 28551 + }, + { + "epoch": 0.4127245422514873, + "grad_norm": 0.8969072103500366, + "learning_rate": 5.475874322346558e-07, + "loss": 0.5831, + "step": 28582 + }, + { + "epoch": 0.4131721827528447, + "grad_norm": 0.8892102241516113, + "learning_rate": 5.30244332514035e-07, + "loss": 0.5832, + "step": 28613 + }, + { + "epoch": 0.41361982325420205, + "grad_norm": 0.8727803826332092, + "learning_rate": 5.131773706602977e-07, + "loss": 0.5826, + "step": 28644 + }, + { + "epoch": 0.4140674637555594, + "grad_norm": 0.909462034702301, + "learning_rate": 4.963867392717897e-07, + "loss": 0.5776, + "step": 28675 + }, + { + "epoch": 0.4145151042569168, + "grad_norm": 0.9155588150024414, + "learning_rate": 4.798726278285093e-07, + "loss": 0.5869, + "step": 28706 + }, + { + "epoch": 0.4149627447582741, + "grad_norm": 0.8983290195465088, + "learning_rate": 4.6363522268995097e-07, + "loss": 0.5829, + "step": 28737 + }, + { + "epoch": 0.4154103852596315, + "grad_norm": 0.8464504480361938, + "learning_rate": 4.4767470709302927e-07, + "loss": 0.5724, + "step": 28768 + }, + { + "epoch": 0.41585802576098885, + "grad_norm": 0.8773616552352905, + "learning_rate": 4.319912611499971e-07, + "loss": 0.5818, + "step": 28799 + }, + { + "epoch": 0.4163056662623462, + "grad_norm": 0.9296445250511169, + "learning_rate": 4.1658506184640564e-07, + "loss": 0.5771, + "step": 28830 + }, + { + "epoch": 0.4167533067637036, + "grad_norm": 0.9054000973701477, + "learning_rate": 4.0145628303911996e-07, + "loss": 0.5854, + "step": 28861 + }, + { + "epoch": 0.41720094726506096, + "grad_norm": 0.8678483366966248, + "learning_rate": 3.866050954543565e-07, + "loss": 0.5802, + "step": 28892 + }, + { + "epoch": 0.4176485877664183, + "grad_norm": 0.8449427485466003, + "learning_rate": 3.720316666857432e-07, + "loss": 0.5808, + "step": 28923 + }, + { + "epoch": 0.41809622826777565, + "grad_norm": 1.0177295207977295, + "learning_rate": 3.5773616119244845e-07, + "loss": 0.58, + "step": 28954 + }, + { + "epoch": 0.418543868769133, + "grad_norm": 0.9004929065704346, + "learning_rate": 3.437187402973052e-07, + "loss": 0.5885, + "step": 28985 + }, + { + "epoch": 0.4189915092704904, + "grad_norm": 0.8589534163475037, + "learning_rate": 3.2997956218500104e-07, + "loss": 0.577, + "step": 29016 + }, + { + "epoch": 0.41943914977184776, + "grad_norm": 0.8590395450592041, + "learning_rate": 3.165187819003018e-07, + "loss": 0.583, + "step": 29047 + }, + { + "epoch": 0.4198867902732051, + "grad_norm": 0.8661704063415527, + "learning_rate": 3.033365513462755e-07, + "loss": 0.5921, + "step": 29078 + }, + { + "epoch": 0.42033443077456245, + "grad_norm": 0.9449494481086731, + "learning_rate": 2.9043301928260437e-07, + "loss": 0.5846, + "step": 29109 + }, + { + "epoch": 0.4207820712759198, + "grad_norm": 0.8835760951042175, + "learning_rate": 2.7780833132389773e-07, + "loss": 0.5841, + "step": 29140 + }, + { + "epoch": 0.4212297117772772, + "grad_norm": 0.8761197328567505, + "learning_rate": 2.6546262993803473e-07, + "loss": 0.5855, + "step": 29171 + }, + { + "epoch": 0.42167735227863457, + "grad_norm": 0.9937579035758972, + "learning_rate": 2.533960544445879e-07, + "loss": 0.58, + "step": 29202 + }, + { + "epoch": 0.42212499277999194, + "grad_norm": 0.830741822719574, + "learning_rate": 2.416087410132134e-07, + "loss": 0.5828, + "step": 29233 + }, + { + "epoch": 0.42257263328134925, + "grad_norm": 0.9345104098320007, + "learning_rate": 2.301008226621465e-07, + "loss": 0.5838, + "step": 29264 + }, + { + "epoch": 0.4230202737827066, + "grad_norm": 0.8927823305130005, + "learning_rate": 2.1887242925668073e-07, + "loss": 0.5823, + "step": 29295 + }, + { + "epoch": 0.423467914284064, + "grad_norm": 0.9171820282936096, + "learning_rate": 2.0792368750770785e-07, + "loss": 0.5884, + "step": 29326 + }, + { + "epoch": 0.42391555478542137, + "grad_norm": 0.9589295387268066, + "learning_rate": 1.9725472097028851e-07, + "loss": 0.5855, + "step": 29357 + }, + { + "epoch": 0.42436319528677874, + "grad_norm": 0.9396247267723083, + "learning_rate": 1.8686565004226718e-07, + "loss": 0.5898, + "step": 29388 + }, + { + "epoch": 0.4248108357881361, + "grad_norm": 0.8725122809410095, + "learning_rate": 1.7675659196288995e-07, + "loss": 0.5878, + "step": 29419 + }, + { + "epoch": 0.4252584762894934, + "grad_norm": 0.8159589767456055, + "learning_rate": 1.6692766081150556e-07, + "loss": 0.5793, + "step": 29450 + }, + { + "epoch": 0.4257061167908508, + "grad_norm": 0.9211475253105164, + "learning_rate": 1.5737896750626647e-07, + "loss": 0.5823, + "step": 29481 + }, + { + "epoch": 0.42615375729220817, + "grad_norm": 0.8058143258094788, + "learning_rate": 1.4811061980287976e-07, + "loss": 0.5828, + "step": 29512 + }, + { + "epoch": 0.42660139779356554, + "grad_norm": 1.0279886722564697, + "learning_rate": 1.3912272229338886e-07, + "loss": 0.5832, + "step": 29543 + }, + { + "epoch": 0.4270490382949229, + "grad_norm": 0.9076089262962341, + "learning_rate": 1.3041537640499645e-07, + "loss": 0.5802, + "step": 29574 + }, + { + "epoch": 0.4274966787962802, + "grad_norm": 0.9607642889022827, + "learning_rate": 1.2198868039891564e-07, + "loss": 0.5901, + "step": 29605 + }, + { + "epoch": 0.4279443192976376, + "grad_norm": 0.9149357080459595, + "learning_rate": 1.138427293692651e-07, + "loss": 0.5801, + "step": 29636 + }, + { + "epoch": 0.42839195979899497, + "grad_norm": 0.8789470791816711, + "learning_rate": 1.0597761524199778e-07, + "loss": 0.5876, + "step": 29667 + }, + { + "epoch": 0.42883960030035234, + "grad_norm": 0.8680891394615173, + "learning_rate": 9.839342677385455e-08, + "loss": 0.5871, + "step": 29698 + }, + { + "epoch": 0.4292872408017097, + "grad_norm": 0.9469189643859863, + "learning_rate": 9.109024955137325e-08, + "loss": 0.5806, + "step": 29729 + }, + { + "epoch": 0.4297348813030671, + "grad_norm": 0.8555542230606079, + "learning_rate": 8.406816598991729e-08, + "loss": 0.5878, + "step": 29760 + }, + { + "epoch": 0.4301825218044244, + "grad_norm": 0.8864733576774597, + "learning_rate": 7.73272553327431e-08, + "loss": 0.5814, + "step": 29791 + }, + { + "epoch": 0.43063016230578177, + "grad_norm": 0.9179209470748901, + "learning_rate": 7.086759365011186e-08, + "loss": 0.5842, + "step": 29822 + }, + { + "epoch": 0.43107780280713914, + "grad_norm": 0.952071487903595, + "learning_rate": 6.468925383842639e-08, + "loss": 0.5806, + "step": 29853 + }, + { + "epoch": 0.4315254433084965, + "grad_norm": 0.9271907806396484, + "learning_rate": 5.8792305619415067e-08, + "loss": 0.5857, + "step": 29884 + }, + { + "epoch": 0.4319730838098539, + "grad_norm": 0.9237009286880493, + "learning_rate": 5.317681553933529e-08, + "loss": 0.5714, + "step": 29915 + }, + { + "epoch": 0.4324207243112112, + "grad_norm": 0.9051603078842163, + "learning_rate": 4.78428469682296e-08, + "loss": 0.5876, + "step": 29946 + }, + { + "epoch": 0.4328683648125686, + "grad_norm": 0.8672299385070801, + "learning_rate": 4.2790460099206844e-08, + "loss": 0.5837, + "step": 29977 + }, + { + "epoch": 0.43331600531392594, + "grad_norm": 0.9098496437072754, + "learning_rate": 3.801971194777043e-08, + "loss": 0.5865, + "step": 30008 + }, + { + "epoch": 0.4337636458152833, + "grad_norm": 0.838880717754364, + "learning_rate": 3.353065635115782e-08, + "loss": 0.589, + "step": 30039 + }, + { + "epoch": 0.4342112863166407, + "grad_norm": 0.8752391338348389, + "learning_rate": 2.93233439677576e-08, + "loss": 0.5921, + "step": 30070 + }, + { + "epoch": 0.43465892681799806, + "grad_norm": 0.8784201741218567, + "learning_rate": 2.539782227651555e-08, + "loss": 0.5842, + "step": 30101 + }, + { + "epoch": 0.4351065673193554, + "grad_norm": 0.8383466601371765, + "learning_rate": 2.175413557641004e-08, + "loss": 0.58, + "step": 30132 + }, + { + "epoch": 0.43555420782071275, + "grad_norm": 0.8927497267723083, + "learning_rate": 1.839232498594967e-08, + "loss": 0.5817, + "step": 30163 + }, + { + "epoch": 0.4360018483220701, + "grad_norm": 0.8814988136291504, + "learning_rate": 1.5312428442712522e-08, + "loss": 0.5815, + "step": 30194 + }, + { + "epoch": 0.4364494888234275, + "grad_norm": 0.8772262930870056, + "learning_rate": 1.2514480702913168e-08, + "loss": 0.5828, + "step": 30225 + }, + { + "epoch": 0.43689712932478486, + "grad_norm": 0.9208593368530273, + "learning_rate": 9.998513341005766e-09, + "loss": 0.5832, + "step": 30256 + }, + { + "epoch": 0.43734476982614223, + "grad_norm": 0.9014251828193665, + "learning_rate": 7.764554749345454e-09, + "loss": 0.5859, + "step": 30287 + }, + { + "epoch": 0.43779241032749955, + "grad_norm": 0.8643161654472351, + "learning_rate": 5.812630137849717e-09, + "loss": 0.5818, + "step": 30318 + }, + { + "epoch": 0.4382400508288569, + "grad_norm": 0.9332028031349182, + "learning_rate": 4.142761533723616e-09, + "loss": 0.5824, + "step": 30349 + }, + { + "epoch": 0.4386876913302143, + "grad_norm": 0.937886118888855, + "learning_rate": 2.7549677812044317e-09, + "loss": 0.5827, + "step": 30380 + }, + { + "epoch": 0.43913533183157166, + "grad_norm": 0.8592664003372192, + "learning_rate": 1.6492645413590525e-09, + "loss": 0.5874, + "step": 30411 + }, + { + "epoch": 0.43958297233292903, + "grad_norm": 0.9078693985939026, + "learning_rate": 8.256642918980096e-10, + "loss": 0.5887, + "step": 30442 + }, + { + "epoch": 0.44003061283428635, + "grad_norm": 0.9738094210624695, + "learning_rate": 2.841763270367004e-10, + "loss": 0.5885, + "step": 30473 + }, + { + "epoch": 0.4404782533356437, + "grad_norm": 0.8958234786987305, + "learning_rate": 2.480675739269245e-11, + "loss": 0.588, + "step": 30504 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 7630, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.251434749612104e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-30517/training_args.bin b/checkpoint-30517/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3ec322270d2d1a136ed4e6ec9a7cdf4837f5f542 --- /dev/null +++ b/checkpoint-30517/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4073c7e634db6217c9c50dbc3e636b8c05df4a3686548baa23c4d653fa3409e3 +size 5432 diff --git a/checkpoint-7630/config.json b/checkpoint-7630/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a8b10d032ea5b0e15388eb607d24979f38775a09 --- /dev/null +++ b/checkpoint-7630/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.47.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-7630/generation_config.json b/checkpoint-7630/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..73099b4966679656fb84b344ea08b348a6a6a507 --- /dev/null +++ b/checkpoint-7630/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.47.0.dev0" +} diff --git a/checkpoint-7630/model-00001-of-00007.safetensors b/checkpoint-7630/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ce7f182e82e329320bcb36b04cf7c74d588805c1 --- /dev/null +++ b/checkpoint-7630/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1304ccb28c7baf2e11d4151ac4ce39ec5e74ac35fe1bb64ae0c3a4fc5feae30 +size 4886466168 diff --git a/checkpoint-7630/model-00002-of-00007.safetensors b/checkpoint-7630/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1f266ba474079d34baf9b2a204bc3cecae676961 --- /dev/null +++ b/checkpoint-7630/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0901d2e156c6ec7a375c3e90ae5183970af0445d3dc0e53c031b45673775dd64 +size 4832007448 diff --git a/checkpoint-7630/model-00003-of-00007.safetensors b/checkpoint-7630/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5501e045b181a6449b2b0997f4ca9f8ce68156ff --- /dev/null +++ b/checkpoint-7630/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afa0e22bcade9527b5aeb86066e139cc30979f8492c753ff3277e6189863ab97 +size 4999813112 diff --git a/checkpoint-7630/model-00004-of-00007.safetensors b/checkpoint-7630/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0ea5edb447c5d0bf0491276907abf457c5b0f94a --- /dev/null +++ b/checkpoint-7630/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc4f5352f30ea036b0a4e24ba9b029dd94a2b30a716c81acc1c4a5e7bb47a042 +size 4999813128 diff --git a/checkpoint-7630/model-00005-of-00007.safetensors b/checkpoint-7630/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d308635d86f98645d1b129703f4ba4ecc6a84a89 --- /dev/null +++ b/checkpoint-7630/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d9450b209df4cb026b6669e618fcdb6ff4f5bd0a711fb2e502afe0b95eb3ad7 +size 4832007496 diff --git a/checkpoint-7630/model-00006-of-00007.safetensors b/checkpoint-7630/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6c02f856c6d61d591503cb8f20e2f86b84e85cbd --- /dev/null +++ b/checkpoint-7630/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ca1ae72980a5b1170f14c77aa114b6676a47644ca1ee08bb557eada38076fd1 +size 4999813120 diff --git a/checkpoint-7630/model-00007-of-00007.safetensors b/checkpoint-7630/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5af7b9622701e0303b6aa879010313980169acc3 --- /dev/null +++ b/checkpoint-7630/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2478f25f3b8db07780b95525b898a1239e19e98136f2aede1f221e3f596bb730 +size 2571158184 diff --git a/checkpoint-7630/model.safetensors.index.json b/checkpoint-7630/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-7630/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-7630/optimizer.pt b/checkpoint-7630/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..81e8e7b7562be962bdc3fe6511d7c45a77a7d64d --- /dev/null +++ b/checkpoint-7630/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e443304a650a93fae8c01bee8207cb57a93b585bccc7159711506c1f7eae0751 +size 15385036334 diff --git a/checkpoint-7630/rng_state.pth b/checkpoint-7630/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..93a52228ea16468eaab41b66555ced3eaaea002c --- /dev/null +++ b/checkpoint-7630/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69 +size 14244 diff --git a/checkpoint-7630/scheduler.pt b/checkpoint-7630/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0694ebf59305ba0ae1a85e2530a75a5934817c99 --- /dev/null +++ b/checkpoint-7630/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:534e20fd2d9d5b7f7b2cacb37e2f2890c1ef6152d939158d158e395d1f32997e +size 1064 diff --git a/checkpoint-7630/trainer_state.json b/checkpoint-7630/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c496fd173565158d577ec420f3d8d947ee3f8357 --- /dev/null +++ b/checkpoint-7630/trainer_state.json @@ -0,0 +1,1755 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1101773233986022, + "eval_steps": 500, + "global_step": 7630, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004476405013573615, + "grad_norm": 4.6696085929870605, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.9366, + "step": 31 + }, + { + "epoch": 0.000895281002714723, + "grad_norm": 4.250915050506592, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.9002, + "step": 62 + }, + { + "epoch": 0.0013429215040720846, + "grad_norm": 4.424270153045654, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.8843, + "step": 93 + }, + { + "epoch": 0.001790562005429446, + "grad_norm": 4.56964635848999, + "learning_rate": 4.062909567496723e-06, + "loss": 0.8717, + "step": 124 + }, + { + "epoch": 0.0022382025067868077, + "grad_norm": 4.051624298095703, + "learning_rate": 5.078636959370905e-06, + "loss": 0.8711, + "step": 155 + }, + { + "epoch": 0.002685843008144169, + "grad_norm": 3.98006272315979, + "learning_rate": 6.094364351245085e-06, + "loss": 0.8628, + "step": 186 + }, + { + "epoch": 0.0031334835095015307, + "grad_norm": 4.4158406257629395, + "learning_rate": 7.110091743119267e-06, + "loss": 0.871, + "step": 217 + }, + { + "epoch": 0.003581124010858892, + "grad_norm": 4.681333541870117, + "learning_rate": 8.125819134993446e-06, + "loss": 0.8593, + "step": 248 + }, + { + "epoch": 0.004028764512216254, + "grad_norm": 3.8057820796966553, + "learning_rate": 9.141546526867629e-06, + "loss": 0.8558, + "step": 279 + }, + { + "epoch": 0.0044764050135736155, + "grad_norm": 4.523633003234863, + "learning_rate": 1.015727391874181e-05, + "loss": 0.8676, + "step": 310 + }, + { + "epoch": 0.0049240455149309765, + "grad_norm": 3.7387187480926514, + "learning_rate": 1.117300131061599e-05, + "loss": 0.8585, + "step": 341 + }, + { + "epoch": 0.005371686016288338, + "grad_norm": 4.187750816345215, + "learning_rate": 1.218872870249017e-05, + "loss": 0.8592, + "step": 372 + }, + { + "epoch": 0.005819326517645699, + "grad_norm": 3.782883644104004, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.8449, + "step": 403 + }, + { + "epoch": 0.006266967019003061, + "grad_norm": 3.577796459197998, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.8418, + "step": 434 + }, + { + "epoch": 0.006714607520360423, + "grad_norm": 3.1408321857452393, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.8577, + "step": 465 + }, + { + "epoch": 0.007162248021717784, + "grad_norm": 4.090081691741943, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.8439, + "step": 496 + }, + { + "epoch": 0.007609888523075146, + "grad_norm": 2.7458200454711914, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.8468, + "step": 527 + }, + { + "epoch": 0.008057529024432507, + "grad_norm": 3.703225612640381, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.8385, + "step": 558 + }, + { + "epoch": 0.008505169525789868, + "grad_norm": 3.134650230407715, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.8418, + "step": 589 + }, + { + "epoch": 0.008952810027147231, + "grad_norm": 3.762680768966675, + "learning_rate": 2.031454783748362e-05, + "loss": 0.8312, + "step": 620 + }, + { + "epoch": 0.009400450528504592, + "grad_norm": 3.751004457473755, + "learning_rate": 2.13302752293578e-05, + "loss": 0.8251, + "step": 651 + }, + { + "epoch": 0.009848091029861953, + "grad_norm": 3.2268712520599365, + "learning_rate": 2.234600262123198e-05, + "loss": 0.8369, + "step": 682 + }, + { + "epoch": 0.010295731531219316, + "grad_norm": 3.5854289531707764, + "learning_rate": 2.336173001310616e-05, + "loss": 0.826, + "step": 713 + }, + { + "epoch": 0.010743372032576677, + "grad_norm": 3.9910435676574707, + "learning_rate": 2.437745740498034e-05, + "loss": 0.8168, + "step": 744 + }, + { + "epoch": 0.011191012533934038, + "grad_norm": 3.3059303760528564, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.8269, + "step": 775 + }, + { + "epoch": 0.011638653035291399, + "grad_norm": 3.4081811904907227, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.817, + "step": 806 + }, + { + "epoch": 0.012086293536648762, + "grad_norm": 3.2740163803100586, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.8195, + "step": 837 + }, + { + "epoch": 0.012533934038006123, + "grad_norm": 2.7206223011016846, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.8188, + "step": 868 + }, + { + "epoch": 0.012981574539363484, + "grad_norm": 2.7005629539489746, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.8127, + "step": 899 + }, + { + "epoch": 0.013429215040720846, + "grad_norm": 2.970745325088501, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.8126, + "step": 930 + }, + { + "epoch": 0.013876855542078207, + "grad_norm": 2.4761953353881836, + "learning_rate": 3.148754914809961e-05, + "loss": 0.82, + "step": 961 + }, + { + "epoch": 0.014324496043435568, + "grad_norm": 2.8555397987365723, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.8166, + "step": 992 + }, + { + "epoch": 0.01477213654479293, + "grad_norm": 2.8124194145202637, + "learning_rate": 3.351900393184797e-05, + "loss": 0.8057, + "step": 1023 + }, + { + "epoch": 0.015219777046150292, + "grad_norm": 2.353851556777954, + "learning_rate": 3.453473132372215e-05, + "loss": 0.8064, + "step": 1054 + }, + { + "epoch": 0.015667417547507653, + "grad_norm": 3.0127620697021484, + "learning_rate": 3.555045871559633e-05, + "loss": 0.8086, + "step": 1085 + }, + { + "epoch": 0.016115058048865014, + "grad_norm": 2.792686939239502, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.8152, + "step": 1116 + }, + { + "epoch": 0.016562698550222375, + "grad_norm": 2.407134532928467, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.7949, + "step": 1147 + }, + { + "epoch": 0.017010339051579736, + "grad_norm": 2.6921393871307373, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.804, + "step": 1178 + }, + { + "epoch": 0.0174579795529371, + "grad_norm": 2.3015975952148438, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.7944, + "step": 1209 + }, + { + "epoch": 0.017905620054294462, + "grad_norm": 2.8116579055786133, + "learning_rate": 4.062909567496724e-05, + "loss": 0.7977, + "step": 1240 + }, + { + "epoch": 0.018353260555651823, + "grad_norm": 2.5720036029815674, + "learning_rate": 4.164482306684142e-05, + "loss": 0.7854, + "step": 1271 + }, + { + "epoch": 0.018800901057009184, + "grad_norm": 2.0802650451660156, + "learning_rate": 4.26605504587156e-05, + "loss": 0.7892, + "step": 1302 + }, + { + "epoch": 0.019248541558366545, + "grad_norm": 2.4343624114990234, + "learning_rate": 4.367627785058978e-05, + "loss": 0.7897, + "step": 1333 + }, + { + "epoch": 0.019696182059723906, + "grad_norm": 2.509686231613159, + "learning_rate": 4.469200524246396e-05, + "loss": 0.7855, + "step": 1364 + }, + { + "epoch": 0.020143822561081267, + "grad_norm": 2.626512289047241, + "learning_rate": 4.570773263433814e-05, + "loss": 0.7873, + "step": 1395 + }, + { + "epoch": 0.02059146306243863, + "grad_norm": 2.8619399070739746, + "learning_rate": 4.672346002621232e-05, + "loss": 0.7891, + "step": 1426 + }, + { + "epoch": 0.021039103563795993, + "grad_norm": 2.724792718887329, + "learning_rate": 4.77391874180865e-05, + "loss": 0.782, + "step": 1457 + }, + { + "epoch": 0.021486744065153354, + "grad_norm": 2.6659562587738037, + "learning_rate": 4.875491480996068e-05, + "loss": 0.7856, + "step": 1488 + }, + { + "epoch": 0.021934384566510715, + "grad_norm": 2.646078586578369, + "learning_rate": 4.977064220183487e-05, + "loss": 0.7748, + "step": 1519 + }, + { + "epoch": 0.022382025067868076, + "grad_norm": 2.429288387298584, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.7722, + "step": 1550 + }, + { + "epoch": 0.022829665569225437, + "grad_norm": 1.9933409690856934, + "learning_rate": 4.999955597496219e-05, + "loss": 0.7874, + "step": 1581 + }, + { + "epoch": 0.023277306070582798, + "grad_norm": 2.314889907836914, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.7757, + "step": 1612 + }, + { + "epoch": 0.023724946571940162, + "grad_norm": 2.2891199588775635, + "learning_rate": 4.999799067923527e-05, + "loss": 0.7713, + "step": 1643 + }, + { + "epoch": 0.024172587073297523, + "grad_norm": 2.4892444610595703, + "learning_rate": 4.999678487776908e-05, + "loss": 0.7687, + "step": 1674 + }, + { + "epoch": 0.024620227574654884, + "grad_norm": 2.3015685081481934, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.7721, + "step": 1705 + }, + { + "epoch": 0.025067868076012245, + "grad_norm": 2.278954029083252, + "learning_rate": 4.999352703566763e-05, + "loss": 0.7741, + "step": 1736 + }, + { + "epoch": 0.025515508577369606, + "grad_norm": 1.7260370254516602, + "learning_rate": 4.999147503179668e-05, + "loss": 0.7681, + "step": 1767 + }, + { + "epoch": 0.025963149078726967, + "grad_norm": 2.0179309844970703, + "learning_rate": 4.998914100252672e-05, + "loss": 0.7604, + "step": 1798 + }, + { + "epoch": 0.02641078958008433, + "grad_norm": 2.53022837638855, + "learning_rate": 4.998652497419696e-05, + "loss": 0.7598, + "step": 1829 + }, + { + "epoch": 0.026858430081441693, + "grad_norm": 1.859253168106079, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.7606, + "step": 1860 + }, + { + "epoch": 0.027306070582799054, + "grad_norm": 1.759303331375122, + "learning_rate": 4.998044704162613e-05, + "loss": 0.7532, + "step": 1891 + }, + { + "epoch": 0.027753711084156415, + "grad_norm": 2.4389419555664062, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.7646, + "step": 1922 + }, + { + "epoch": 0.028201351585513776, + "grad_norm": 2.155348777770996, + "learning_rate": 4.997324150843799e-05, + "loss": 0.7569, + "step": 1953 + }, + { + "epoch": 0.028648992086871137, + "grad_norm": 2.0138537883758545, + "learning_rate": 4.99692159912661e-05, + "loss": 0.7677, + "step": 1984 + }, + { + "epoch": 0.029096632588228498, + "grad_norm": 2.5275282859802246, + "learning_rate": 4.996490869988546e-05, + "loss": 0.7519, + "step": 2015 + }, + { + "epoch": 0.02954427308958586, + "grad_norm": 1.8147333860397339, + "learning_rate": 4.996031968290326e-05, + "loss": 0.7509, + "step": 2046 + }, + { + "epoch": 0.029991913590943223, + "grad_norm": 2.1941769123077393, + "learning_rate": 4.995544899210594e-05, + "loss": 0.754, + "step": 2077 + }, + { + "epoch": 0.030439554092300584, + "grad_norm": 1.8953059911727905, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.747, + "step": 2108 + }, + { + "epoch": 0.030887194593657945, + "grad_norm": 3.3973031044006348, + "learning_rate": 4.994486281210429e-05, + "loss": 0.7513, + "step": 2139 + }, + { + "epoch": 0.031334835095015307, + "grad_norm": 2.66795015335083, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.7469, + "step": 2170 + }, + { + "epoch": 0.03178247559637267, + "grad_norm": 1.6254230737686157, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.7297, + "step": 2201 + }, + { + "epoch": 0.03223011609773003, + "grad_norm": 1.822745680809021, + "learning_rate": 4.992687246588743e-05, + "loss": 0.754, + "step": 2232 + }, + { + "epoch": 0.03267775659908739, + "grad_norm": 1.6898781061172485, + "learning_rate": 4.992031299767347e-05, + "loss": 0.7478, + "step": 2263 + }, + { + "epoch": 0.03312539710044475, + "grad_norm": 1.799280047416687, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.746, + "step": 2294 + }, + { + "epoch": 0.033573037601802115, + "grad_norm": 2.2501840591430664, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.7408, + "step": 2325 + }, + { + "epoch": 0.03402067810315947, + "grad_norm": 2.3315324783325195, + "learning_rate": 4.989894757091861e-05, + "loss": 0.7301, + "step": 2356 + }, + { + "epoch": 0.03446831860451684, + "grad_norm": 1.5820438861846924, + "learning_rate": 4.989126368919158e-05, + "loss": 0.7305, + "step": 2387 + }, + { + "epoch": 0.0349159591058742, + "grad_norm": 2.5696022510528564, + "learning_rate": 4.988329891293693e-05, + "loss": 0.7337, + "step": 2418 + }, + { + "epoch": 0.03536359960723156, + "grad_norm": 1.8880938291549683, + "learning_rate": 4.987505333203608e-05, + "loss": 0.7385, + "step": 2449 + }, + { + "epoch": 0.035811240108588924, + "grad_norm": 2.6148738861083984, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.7292, + "step": 2480 + }, + { + "epoch": 0.03625888060994628, + "grad_norm": 1.6925290822982788, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.7344, + "step": 2511 + }, + { + "epoch": 0.036706521111303646, + "grad_norm": 1.7675210237503052, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.7354, + "step": 2542 + }, + { + "epoch": 0.037154161612661, + "grad_norm": 2.1053173542022705, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.7272, + "step": 2573 + }, + { + "epoch": 0.03760180211401837, + "grad_norm": 1.9718347787857056, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.7336, + "step": 2604 + }, + { + "epoch": 0.03804944261537573, + "grad_norm": 1.5777671337127686, + "learning_rate": 4.981968838155888e-05, + "loss": 0.7182, + "step": 2635 + }, + { + "epoch": 0.03849708311673309, + "grad_norm": 1.905127763748169, + "learning_rate": 4.980947995086024e-05, + "loss": 0.7296, + "step": 2666 + }, + { + "epoch": 0.038944723618090454, + "grad_norm": 1.63962721824646, + "learning_rate": 4.979899154855234e-05, + "loss": 0.7249, + "step": 2697 + }, + { + "epoch": 0.03939236411944781, + "grad_norm": 1.584331750869751, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.7345, + "step": 2728 + }, + { + "epoch": 0.039840004620805176, + "grad_norm": 1.9111014604568481, + "learning_rate": 4.977717530570768e-05, + "loss": 0.7225, + "step": 2759 + }, + { + "epoch": 0.040287645122162534, + "grad_norm": 2.3216073513031006, + "learning_rate": 4.976584771136425e-05, + "loss": 0.7207, + "step": 2790 + }, + { + "epoch": 0.0407352856235199, + "grad_norm": 1.6002410650253296, + "learning_rate": 4.975424063779547e-05, + "loss": 0.7228, + "step": 2821 + }, + { + "epoch": 0.04118292612487726, + "grad_norm": 2.104731798171997, + "learning_rate": 4.974235421598557e-05, + "loss": 0.7127, + "step": 2852 + }, + { + "epoch": 0.04163056662623462, + "grad_norm": 1.7114660739898682, + "learning_rate": 4.973018858007122e-05, + "loss": 0.7283, + "step": 2883 + }, + { + "epoch": 0.042078207127591985, + "grad_norm": 1.948133945465088, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.7209, + "step": 2914 + }, + { + "epoch": 0.04252584762894934, + "grad_norm": 1.621764898300171, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.7272, + "step": 2945 + }, + { + "epoch": 0.04297348813030671, + "grad_norm": 1.6967558860778809, + "learning_rate": 4.969201777632205e-05, + "loss": 0.7191, + "step": 2976 + }, + { + "epoch": 0.043421128631664065, + "grad_norm": 1.6656996011734009, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.7205, + "step": 3007 + }, + { + "epoch": 0.04386876913302143, + "grad_norm": 2.151475191116333, + "learning_rate": 4.966517710419033e-05, + "loss": 0.7168, + "step": 3038 + }, + { + "epoch": 0.044316409634378794, + "grad_norm": 2.213109016418457, + "learning_rate": 4.965133917685858e-05, + "loss": 0.7139, + "step": 3069 + }, + { + "epoch": 0.04476405013573615, + "grad_norm": 1.5380377769470215, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.7237, + "step": 3100 + }, + { + "epoch": 0.045211690637093516, + "grad_norm": 2.312377452850342, + "learning_rate": 4.962282892045718e-05, + "loss": 0.7156, + "step": 3131 + }, + { + "epoch": 0.04565933113845087, + "grad_norm": 1.7220717668533325, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.7122, + "step": 3162 + }, + { + "epoch": 0.04610697163980824, + "grad_norm": 1.802856206893921, + "learning_rate": 4.959320720608049e-05, + "loss": 0.7128, + "step": 3193 + }, + { + "epoch": 0.046554612141165595, + "grad_norm": 1.6629964113235474, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.7172, + "step": 3224 + }, + { + "epoch": 0.04700225264252296, + "grad_norm": 3.440115213394165, + "learning_rate": 4.956247537083282e-05, + "loss": 0.7213, + "step": 3255 + }, + { + "epoch": 0.047449893143880324, + "grad_norm": 1.5721139907836914, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.7148, + "step": 3286 + }, + { + "epoch": 0.04789753364523768, + "grad_norm": 2.0920398235321045, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.7145, + "step": 3317 + }, + { + "epoch": 0.048345174146595046, + "grad_norm": 1.666566014289856, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.7095, + "step": 3348 + }, + { + "epoch": 0.048792814647952404, + "grad_norm": 1.8222129344940186, + "learning_rate": 4.949768693662973e-05, + "loss": 0.7138, + "step": 3379 + }, + { + "epoch": 0.04924045514930977, + "grad_norm": 1.7302964925765991, + "learning_rate": 4.948079823064559e-05, + "loss": 0.7017, + "step": 3410 + }, + { + "epoch": 0.049688095650667126, + "grad_norm": 1.7338463068008423, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6979, + "step": 3441 + }, + { + "epoch": 0.05013573615202449, + "grad_norm": 1.5637450218200684, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.7011, + "step": 3472 + }, + { + "epoch": 0.050583376653381855, + "grad_norm": 1.5632222890853882, + "learning_rate": 4.942847531574167e-05, + "loss": 0.704, + "step": 3503 + }, + { + "epoch": 0.05103101715473921, + "grad_norm": 1.588402509689331, + "learning_rate": 4.941048273452008e-05, + "loss": 0.7011, + "step": 3534 + }, + { + "epoch": 0.05147865765609658, + "grad_norm": 1.8840582370758057, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.7016, + "step": 3565 + }, + { + "epoch": 0.051926298157453935, + "grad_norm": 1.2702268362045288, + "learning_rate": 4.93736713712897e-05, + "loss": 0.7004, + "step": 3596 + }, + { + "epoch": 0.0523739386588113, + "grad_norm": 1.3812692165374756, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.7046, + "step": 3627 + }, + { + "epoch": 0.05282157916016866, + "grad_norm": 1.7257345914840698, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6976, + "step": 3658 + }, + { + "epoch": 0.05326921966152602, + "grad_norm": 1.7458925247192383, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6997, + "step": 3689 + }, + { + "epoch": 0.053716860162883386, + "grad_norm": 2.1996099948883057, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6991, + "step": 3720 + }, + { + "epoch": 0.05416450066424074, + "grad_norm": 1.6615021228790283, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.7005, + "step": 3751 + }, + { + "epoch": 0.05461214116559811, + "grad_norm": 1.6276952028274536, + "learning_rate": 4.925664290937196e-05, + "loss": 0.7097, + "step": 3782 + }, + { + "epoch": 0.055059781666955465, + "grad_norm": 1.758227825164795, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.6955, + "step": 3813 + }, + { + "epoch": 0.05550742216831283, + "grad_norm": 1.195280909538269, + "learning_rate": 4.921544116753962e-05, + "loss": 0.7073, + "step": 3844 + }, + { + "epoch": 0.05595506266967019, + "grad_norm": 1.6281015872955322, + "learning_rate": 4.919443027766935e-05, + "loss": 0.7022, + "step": 3875 + }, + { + "epoch": 0.05640270317102755, + "grad_norm": 1.3543150424957275, + "learning_rate": 4.91731463569418e-05, + "loss": 0.7036, + "step": 3906 + }, + { + "epoch": 0.056850343672384916, + "grad_norm": 2.16947078704834, + "learning_rate": 4.915158964554312e-05, + "loss": 0.7007, + "step": 3937 + }, + { + "epoch": 0.057297984173742274, + "grad_norm": 1.324578881263733, + "learning_rate": 4.912976038673786e-05, + "loss": 0.6941, + "step": 3968 + }, + { + "epoch": 0.05774562467509964, + "grad_norm": 1.9811108112335205, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6908, + "step": 3999 + }, + { + "epoch": 0.058193265176456996, + "grad_norm": 1.2975554466247559, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6936, + "step": 4030 + }, + { + "epoch": 0.05864090567781436, + "grad_norm": 1.583282232284546, + "learning_rate": 4.906263980464644e-05, + "loss": 0.698, + "step": 4061 + }, + { + "epoch": 0.05908854617917172, + "grad_norm": 1.3532944917678833, + "learning_rate": 4.903972285033178e-05, + "loss": 0.7049, + "step": 4092 + }, + { + "epoch": 0.05953618668052908, + "grad_norm": 2.1245481967926025, + "learning_rate": 4.901653461101213e-05, + "loss": 0.7016, + "step": 4123 + }, + { + "epoch": 0.05998382718188645, + "grad_norm": 1.6913797855377197, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6981, + "step": 4154 + }, + { + "epoch": 0.060431467683243804, + "grad_norm": 1.51249098777771, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6955, + "step": 4185 + }, + { + "epoch": 0.06087910818460117, + "grad_norm": 1.3880395889282227, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6934, + "step": 4216 + }, + { + "epoch": 0.061326748685958526, + "grad_norm": 1.6354159116744995, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6938, + "step": 4247 + }, + { + "epoch": 0.06177438918731589, + "grad_norm": 2.126742362976074, + "learning_rate": 4.889653340498669e-05, + "loss": 0.7003, + "step": 4278 + }, + { + "epoch": 0.06222202968867325, + "grad_norm": 1.7903707027435303, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6885, + "step": 4309 + }, + { + "epoch": 0.06266967019003061, + "grad_norm": 1.537806510925293, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6892, + "step": 4340 + }, + { + "epoch": 0.06311731069138797, + "grad_norm": 1.6445434093475342, + "learning_rate": 4.882129447892753e-05, + "loss": 0.6843, + "step": 4371 + }, + { + "epoch": 0.06356495119274534, + "grad_norm": 1.555373191833496, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6899, + "step": 4402 + }, + { + "epoch": 0.0640125916941027, + "grad_norm": 1.8370277881622314, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6813, + "step": 4433 + }, + { + "epoch": 0.06446023219546006, + "grad_norm": 1.3132514953613281, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6832, + "step": 4464 + }, + { + "epoch": 0.06490787269681741, + "grad_norm": 1.3186298608779907, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6879, + "step": 4495 + }, + { + "epoch": 0.06535551319817479, + "grad_norm": 1.4360268115997314, + "learning_rate": 4.869052379269719e-05, + "loss": 0.69, + "step": 4526 + }, + { + "epoch": 0.06580315369953214, + "grad_norm": 1.670765995979309, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6865, + "step": 4557 + }, + { + "epoch": 0.0662507942008895, + "grad_norm": 1.7548723220825195, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6852, + "step": 4588 + }, + { + "epoch": 0.06669843470224687, + "grad_norm": 1.5086426734924316, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6894, + "step": 4619 + }, + { + "epoch": 0.06714607520360423, + "grad_norm": 1.3140665292739868, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6812, + "step": 4650 + }, + { + "epoch": 0.06759371570496159, + "grad_norm": 1.4212454557418823, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6872, + "step": 4681 + }, + { + "epoch": 0.06804135620631895, + "grad_norm": 1.3034414052963257, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6901, + "step": 4712 + }, + { + "epoch": 0.06848899670767632, + "grad_norm": 1.3741438388824463, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6875, + "step": 4743 + }, + { + "epoch": 0.06893663720903367, + "grad_norm": 1.7262542247772217, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6868, + "step": 4774 + }, + { + "epoch": 0.06938427771039103, + "grad_norm": 1.3293650150299072, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6891, + "step": 4805 + }, + { + "epoch": 0.0698319182117484, + "grad_norm": 1.3448151350021362, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6765, + "step": 4836 + }, + { + "epoch": 0.07027955871310576, + "grad_norm": 2.961280584335327, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6939, + "step": 4867 + }, + { + "epoch": 0.07072719921446312, + "grad_norm": 1.8265361785888672, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.677, + "step": 4898 + }, + { + "epoch": 0.07117483971582048, + "grad_norm": 1.490349531173706, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6778, + "step": 4929 + }, + { + "epoch": 0.07162248021717785, + "grad_norm": 1.3669307231903076, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6746, + "step": 4960 + }, + { + "epoch": 0.0720701207185352, + "grad_norm": 1.3995884656906128, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6757, + "step": 4991 + }, + { + "epoch": 0.07251776121989256, + "grad_norm": 1.1217372417449951, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6832, + "step": 5022 + }, + { + "epoch": 0.07296540172124993, + "grad_norm": 1.2192097902297974, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6868, + "step": 5053 + }, + { + "epoch": 0.07341304222260729, + "grad_norm": 1.5045067071914673, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.6793, + "step": 5084 + }, + { + "epoch": 0.07386068272396465, + "grad_norm": 1.7127234935760498, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6797, + "step": 5115 + }, + { + "epoch": 0.074308323225322, + "grad_norm": 1.846561312675476, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6767, + "step": 5146 + }, + { + "epoch": 0.07475596372667938, + "grad_norm": 1.3076797723770142, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6855, + "step": 5177 + }, + { + "epoch": 0.07520360422803674, + "grad_norm": 1.4170383214950562, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.681, + "step": 5208 + }, + { + "epoch": 0.0756512447293941, + "grad_norm": 1.2504942417144775, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6778, + "step": 5239 + }, + { + "epoch": 0.07609888523075146, + "grad_norm": 1.1522283554077148, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6749, + "step": 5270 + }, + { + "epoch": 0.07654652573210882, + "grad_norm": 1.6351525783538818, + "learning_rate": 4.793722210363262e-05, + "loss": 0.6745, + "step": 5301 + }, + { + "epoch": 0.07699416623346618, + "grad_norm": 1.5093014240264893, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6747, + "step": 5332 + }, + { + "epoch": 0.07744180673482354, + "grad_norm": 1.362160563468933, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.673, + "step": 5363 + }, + { + "epoch": 0.07788944723618091, + "grad_norm": 1.2021727561950684, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6768, + "step": 5394 + }, + { + "epoch": 0.07833708773753827, + "grad_norm": 2.1543540954589844, + "learning_rate": 4.780153554146274e-05, + "loss": 0.672, + "step": 5425 + }, + { + "epoch": 0.07878472823889562, + "grad_norm": 1.882712721824646, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.6926, + "step": 5456 + }, + { + "epoch": 0.079232368740253, + "grad_norm": 1.3975650072097778, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6747, + "step": 5487 + }, + { + "epoch": 0.07968000924161035, + "grad_norm": 1.3912913799285889, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6756, + "step": 5518 + }, + { + "epoch": 0.08012764974296771, + "grad_norm": 1.7227635383605957, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6694, + "step": 5549 + }, + { + "epoch": 0.08057529024432507, + "grad_norm": 1.3151129484176636, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6715, + "step": 5580 + }, + { + "epoch": 0.08102293074568244, + "grad_norm": 1.0972425937652588, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6661, + "step": 5611 + }, + { + "epoch": 0.0814705712470398, + "grad_norm": 1.2346575260162354, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.66, + "step": 5642 + }, + { + "epoch": 0.08191821174839715, + "grad_norm": 1.4536516666412354, + "learning_rate": 4.751783684659e-05, + "loss": 0.6743, + "step": 5673 + }, + { + "epoch": 0.08236585224975453, + "grad_norm": 1.1361631155014038, + "learning_rate": 4.748122674890348e-05, + "loss": 0.6791, + "step": 5704 + }, + { + "epoch": 0.08281349275111188, + "grad_norm": 1.2605111598968506, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6797, + "step": 5735 + }, + { + "epoch": 0.08326113325246924, + "grad_norm": 1.2355903387069702, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6642, + "step": 5766 + }, + { + "epoch": 0.0837087737538266, + "grad_norm": 1.6677048206329346, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6759, + "step": 5797 + }, + { + "epoch": 0.08415641425518397, + "grad_norm": 1.4781981706619263, + "learning_rate": 4.733225355658999e-05, + "loss": 0.6707, + "step": 5828 + }, + { + "epoch": 0.08460405475654133, + "grad_norm": 1.138583779335022, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.6636, + "step": 5859 + }, + { + "epoch": 0.08505169525789869, + "grad_norm": 1.529036283493042, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6705, + "step": 5890 + }, + { + "epoch": 0.08549933575925606, + "grad_norm": 1.3216760158538818, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6714, + "step": 5921 + }, + { + "epoch": 0.08594697626061341, + "grad_norm": 1.1820168495178223, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6757, + "step": 5952 + }, + { + "epoch": 0.08639461676197077, + "grad_norm": 1.393571138381958, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6672, + "step": 5983 + }, + { + "epoch": 0.08684225726332813, + "grad_norm": 1.4574682712554932, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6655, + "step": 6014 + }, + { + "epoch": 0.0872898977646855, + "grad_norm": 1.138645887374878, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6663, + "step": 6045 + }, + { + "epoch": 0.08773753826604286, + "grad_norm": 1.7602777481079102, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6695, + "step": 6076 + }, + { + "epoch": 0.08818517876740022, + "grad_norm": 1.2323459386825562, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6636, + "step": 6107 + }, + { + "epoch": 0.08863281926875759, + "grad_norm": 1.6881431341171265, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6653, + "step": 6138 + }, + { + "epoch": 0.08908045977011494, + "grad_norm": 1.391417384147644, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6636, + "step": 6169 + }, + { + "epoch": 0.0895281002714723, + "grad_norm": 1.3066257238388062, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6777, + "step": 6200 + }, + { + "epoch": 0.08997574077282966, + "grad_norm": 1.2001326084136963, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6683, + "step": 6231 + }, + { + "epoch": 0.09042338127418703, + "grad_norm": 1.4361172914505005, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6634, + "step": 6262 + }, + { + "epoch": 0.09087102177554439, + "grad_norm": 8.04520320892334, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6752, + "step": 6293 + }, + { + "epoch": 0.09131866227690175, + "grad_norm": 1.4874210357666016, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6733, + "step": 6324 + }, + { + "epoch": 0.09176630277825912, + "grad_norm": 1.234491229057312, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.668, + "step": 6355 + }, + { + "epoch": 0.09221394327961648, + "grad_norm": 1.2088687419891357, + "learning_rate": 4.661238099862658e-05, + "loss": 0.6705, + "step": 6386 + }, + { + "epoch": 0.09266158378097383, + "grad_norm": 1.1937814950942993, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6853, + "step": 6417 + }, + { + "epoch": 0.09310922428233119, + "grad_norm": 1.5205374956130981, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.685, + "step": 6448 + }, + { + "epoch": 0.09355686478368856, + "grad_norm": 1.2221660614013672, + "learning_rate": 4.648464661063478e-05, + "loss": 0.6622, + "step": 6479 + }, + { + "epoch": 0.09400450528504592, + "grad_norm": 1.0762608051300049, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.669, + "step": 6510 + }, + { + "epoch": 0.09445214578640328, + "grad_norm": 1.4416946172714233, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.6674, + "step": 6541 + }, + { + "epoch": 0.09489978628776065, + "grad_norm": 1.559158205986023, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6681, + "step": 6572 + }, + { + "epoch": 0.095347426789118, + "grad_norm": 1.3833891153335571, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6681, + "step": 6603 + }, + { + "epoch": 0.09579506729047536, + "grad_norm": 1.6753300428390503, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6658, + "step": 6634 + }, + { + "epoch": 0.09624270779183272, + "grad_norm": 1.951198697090149, + "learning_rate": 4.622264489304762e-05, + "loss": 0.6654, + "step": 6665 + }, + { + "epoch": 0.09669034829319009, + "grad_norm": 1.2356919050216675, + "learning_rate": 4.617813681048434e-05, + "loss": 0.6651, + "step": 6696 + }, + { + "epoch": 0.09713798879454745, + "grad_norm": 1.2712593078613281, + "learning_rate": 4.61333897355256e-05, + "loss": 0.6646, + "step": 6727 + }, + { + "epoch": 0.09758562929590481, + "grad_norm": 1.1935900449752808, + "learning_rate": 4.608840417313604e-05, + "loss": 0.674, + "step": 6758 + }, + { + "epoch": 0.09803326979726218, + "grad_norm": 1.1649430990219116, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6644, + "step": 6789 + }, + { + "epoch": 0.09848091029861954, + "grad_norm": 1.4281456470489502, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6673, + "step": 6820 + }, + { + "epoch": 0.0989285507999769, + "grad_norm": 1.3064521551132202, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6584, + "step": 6851 + }, + { + "epoch": 0.09937619130133425, + "grad_norm": 1.2546554803848267, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6612, + "step": 6882 + }, + { + "epoch": 0.09982383180269162, + "grad_norm": 1.1866974830627441, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6612, + "step": 6913 + }, + { + "epoch": 0.10027147230404898, + "grad_norm": 1.6166640520095825, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6551, + "step": 6944 + }, + { + "epoch": 0.10071911280540634, + "grad_norm": 1.5471700429916382, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6607, + "step": 6975 + }, + { + "epoch": 0.10116675330676371, + "grad_norm": 1.3361026048660278, + "learning_rate": 4.571999560773736e-05, + "loss": 0.666, + "step": 7006 + }, + { + "epoch": 0.10161439380812107, + "grad_norm": 1.2938140630722046, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6547, + "step": 7037 + }, + { + "epoch": 0.10206203430947842, + "grad_norm": 1.2688400745391846, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6624, + "step": 7068 + }, + { + "epoch": 0.10250967481083578, + "grad_norm": 1.6306285858154297, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.6627, + "step": 7099 + }, + { + "epoch": 0.10295731531219315, + "grad_norm": 1.3346176147460938, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6577, + "step": 7130 + }, + { + "epoch": 0.10340495581355051, + "grad_norm": 1.0933984518051147, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6602, + "step": 7161 + }, + { + "epoch": 0.10385259631490787, + "grad_norm": 1.575859785079956, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6593, + "step": 7192 + }, + { + "epoch": 0.10430023681626524, + "grad_norm": 1.4265861511230469, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6564, + "step": 7223 + }, + { + "epoch": 0.1047478773176226, + "grad_norm": 1.737012267112732, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6593, + "step": 7254 + }, + { + "epoch": 0.10519551781897996, + "grad_norm": 1.0257115364074707, + "learning_rate": 4.528766329570536e-05, + "loss": 0.6514, + "step": 7285 + }, + { + "epoch": 0.10564315832033731, + "grad_norm": 1.5043773651123047, + "learning_rate": 4.523847434837447e-05, + "loss": 0.6635, + "step": 7316 + }, + { + "epoch": 0.10609079882169468, + "grad_norm": 1.5642234086990356, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6558, + "step": 7347 + }, + { + "epoch": 0.10653843932305204, + "grad_norm": 1.1821067333221436, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6686, + "step": 7378 + }, + { + "epoch": 0.1069860798244094, + "grad_norm": 1.5492759943008423, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6543, + "step": 7409 + }, + { + "epoch": 0.10743372032576677, + "grad_norm": 1.281914234161377, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6516, + "step": 7440 + }, + { + "epoch": 0.10788136082712413, + "grad_norm": 1.3318305015563965, + "learning_rate": 4.498911497712155e-05, + "loss": 0.656, + "step": 7471 + }, + { + "epoch": 0.10832900132848149, + "grad_norm": 1.3832449913024902, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6475, + "step": 7502 + }, + { + "epoch": 0.10877664182983884, + "grad_norm": 1.3547158241271973, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6594, + "step": 7533 + }, + { + "epoch": 0.10922428233119622, + "grad_norm": 1.4633681774139404, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.6707, + "step": 7564 + }, + { + "epoch": 0.10967192283255357, + "grad_norm": 1.1781059503555298, + "learning_rate": 4.478556380141218e-05, + "loss": 0.6626, + "step": 7595 + }, + { + "epoch": 0.11011956333391093, + "grad_norm": 1.4727883338928223, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.6535, + "step": 7626 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 7630, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.629140197116477e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-7630/training_args.bin b/checkpoint-7630/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3ec322270d2d1a136ed4e6ec9a7cdf4837f5f542 --- /dev/null +++ b/checkpoint-7630/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4073c7e634db6217c9c50dbc3e636b8c05df4a3686548baa23c4d653fa3409e3 +size 5432 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a8b10d032ea5b0e15388eb607d24979f38775a09 --- /dev/null +++ b/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.47.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..73099b4966679656fb84b344ea08b348a6a6a507 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.47.0.dev0" +} diff --git a/model-00001-of-00007.safetensors b/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..542f9d7381f168eb0b0a63a75a9adf93a5deee06 --- /dev/null +++ b/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13878f97ef55d85d9b352c717dba406c909afe1bae3e88a6a4424a428c0bccc6 +size 4886466168 diff --git a/model-00002-of-00007.safetensors b/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1f266ba474079d34baf9b2a204bc3cecae676961 --- /dev/null +++ b/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0901d2e156c6ec7a375c3e90ae5183970af0445d3dc0e53c031b45673775dd64 +size 4832007448 diff --git a/model-00003-of-00007.safetensors b/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5501e045b181a6449b2b0997f4ca9f8ce68156ff --- /dev/null +++ b/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afa0e22bcade9527b5aeb86066e139cc30979f8492c753ff3277e6189863ab97 +size 4999813112 diff --git a/model-00004-of-00007.safetensors b/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0ea5edb447c5d0bf0491276907abf457c5b0f94a --- /dev/null +++ b/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc4f5352f30ea036b0a4e24ba9b029dd94a2b30a716c81acc1c4a5e7bb47a042 +size 4999813128 diff --git a/model-00005-of-00007.safetensors b/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d308635d86f98645d1b129703f4ba4ecc6a84a89 --- /dev/null +++ b/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d9450b209df4cb026b6669e618fcdb6ff4f5bd0a711fb2e502afe0b95eb3ad7 +size 4832007496 diff --git a/model-00006-of-00007.safetensors b/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..524e69fde1febd3510b4725ea7cfc5103d79e8d1 --- /dev/null +++ b/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edf01cd5fe74bfd002b701e409ad8e68a02b64c59f6cfb2302b9427953c464f3 +size 4999813120 diff --git a/model-00007-of-00007.safetensors b/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b73e0db4c4b0097276bbc78a364a5ff57002d272 --- /dev/null +++ b/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9dd6df4fc7f009aa39a3b019a55cb97a8509bb44690419b3255311faaf9e89b +size 2571158184 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..04829afa78a2d2df203ac846968db37269b01f7f --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|end_of_text|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0df36e82e06f1188c56f572211c39d7d52f1f46e --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3ec322270d2d1a136ed4e6ec9a7cdf4837f5f542 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4073c7e634db6217c9c50dbc3e636b8c05df4a3686548baa23c4d653fa3409e3 +size 5432