diff --git a/checkpoint-12208/config.json b/checkpoint-12208/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-12208/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-12208/generation_config.json b/checkpoint-12208/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-12208/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-12208/model-00001-of-00007.safetensors b/checkpoint-12208/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..017878aa85311455e4aa67f62376e822dbe6ea4b --- /dev/null +++ b/checkpoint-12208/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7aafe91b943225fdf31015a8c2d1e89599fba12c149feb5814336f5bf7297e6 +size 4886466168 diff --git a/checkpoint-12208/model-00002-of-00007.safetensors b/checkpoint-12208/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-12208/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-12208/model-00003-of-00007.safetensors b/checkpoint-12208/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-12208/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-12208/model-00004-of-00007.safetensors b/checkpoint-12208/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-12208/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-12208/model-00005-of-00007.safetensors b/checkpoint-12208/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-12208/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-12208/model-00006-of-00007.safetensors b/checkpoint-12208/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5fafaa6437425acb1ec64cd7069f26482fcc8140 --- /dev/null +++ b/checkpoint-12208/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d46422acb709980d248e047821977a961813fdeb1677dcbfad82e069c7eccbe6 +size 4999813120 diff --git a/checkpoint-12208/model-00007-of-00007.safetensors b/checkpoint-12208/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c184e30efb693fbdf8db4125837c075579503455 --- /dev/null +++ b/checkpoint-12208/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63be41870fa1c0cf75d08e1ccd55f8e568566e71220cf22b58113a7f1a3f241c +size 2571158184 diff --git a/checkpoint-12208/model.safetensors.index.json b/checkpoint-12208/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-12208/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-12208/optimizer.pt b/checkpoint-12208/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7c20e305fc383a4145e87e85a57a569cc033685 --- /dev/null +++ b/checkpoint-12208/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:012978646538badb26719dd07cb3108bffb6ba505cf96ce3b5e1998904027ee8 +size 15385036334 diff --git a/checkpoint-12208/rng_state.pth b/checkpoint-12208/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-12208/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-12208/scheduler.pt b/checkpoint-12208/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed3b066b7a770b71a74f026fa108a814ac17f832 --- /dev/null +++ b/checkpoint-12208/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abe644ed33a3c4139223f0857a985127f3e6fbaa8c89fa14b57671b49ca52c21 +size 1064 diff --git a/checkpoint-12208/trainer_state.json b/checkpoint-12208/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..656b40a6b2fea2a1f45e0b0d93ebe1dc1f244072 --- /dev/null +++ b/checkpoint-12208/trainer_state.json @@ -0,0 +1,2784 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1620640330818783, + "eval_steps": 500, + "global_step": 12208, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00041153219409716807, + "grad_norm": 4.147768020629883, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.7978, + "step": 31 + }, + { + "epoch": 0.0008230643881943361, + "grad_norm": 3.117840051651001, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.7723, + "step": 62 + }, + { + "epoch": 0.0012345965822915042, + "grad_norm": 2.83072829246521, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7493, + "step": 93 + }, + { + "epoch": 0.0016461287763886723, + "grad_norm": 2.9491968154907227, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7352, + "step": 124 + }, + { + "epoch": 0.0020576609704858403, + "grad_norm": 2.798553228378296, + "learning_rate": 5.078636959370905e-06, + "loss": 0.7229, + "step": 155 + }, + { + "epoch": 0.0024691931645830084, + "grad_norm": 2.663148880004883, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7253, + "step": 186 + }, + { + "epoch": 0.0028807253586801765, + "grad_norm": 2.564692974090576, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7287, + "step": 217 + }, + { + "epoch": 0.0032922575527773446, + "grad_norm": 3.2043237686157227, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7205, + "step": 248 + }, + { + "epoch": 0.0037037897468745126, + "grad_norm": 2.6705269813537598, + "learning_rate": 9.141546526867629e-06, + "loss": 0.714, + "step": 279 + }, + { + "epoch": 0.004115321940971681, + "grad_norm": 3.432569980621338, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7217, + "step": 310 + }, + { + "epoch": 0.004526854135068849, + "grad_norm": 2.515808343887329, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7173, + "step": 341 + }, + { + "epoch": 0.004938386329166017, + "grad_norm": 2.6708128452301025, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7123, + "step": 372 + }, + { + "epoch": 0.005349918523263185, + "grad_norm": 2.472797155380249, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.7079, + "step": 403 + }, + { + "epoch": 0.005761450717360353, + "grad_norm": 2.486147880554199, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7146, + "step": 434 + }, + { + "epoch": 0.006172982911457521, + "grad_norm": 2.310777187347412, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7, + "step": 465 + }, + { + "epoch": 0.006584515105554689, + "grad_norm": 3.1831858158111572, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7042, + "step": 496 + }, + { + "epoch": 0.006996047299651857, + "grad_norm": 2.505427360534668, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.6991, + "step": 527 + }, + { + "epoch": 0.007407579493749025, + "grad_norm": 2.3627443313598633, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7045, + "step": 558 + }, + { + "epoch": 0.007819111687846193, + "grad_norm": 2.6250648498535156, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.704, + "step": 589 + }, + { + "epoch": 0.008230643881943361, + "grad_norm": 2.3151464462280273, + "learning_rate": 2.031454783748362e-05, + "loss": 0.6959, + "step": 620 + }, + { + "epoch": 0.00864217607604053, + "grad_norm": 2.0312647819519043, + "learning_rate": 2.13302752293578e-05, + "loss": 0.689, + "step": 651 + }, + { + "epoch": 0.009053708270137698, + "grad_norm": 2.718970775604248, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7031, + "step": 682 + }, + { + "epoch": 0.009465240464234866, + "grad_norm": 1.9124070405960083, + "learning_rate": 2.336173001310616e-05, + "loss": 0.6908, + "step": 713 + }, + { + "epoch": 0.009876772658332034, + "grad_norm": 2.368354082107544, + "learning_rate": 2.437745740498034e-05, + "loss": 0.6947, + "step": 744 + }, + { + "epoch": 0.010288304852429202, + "grad_norm": 1.8856632709503174, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.6876, + "step": 775 + }, + { + "epoch": 0.01069983704652637, + "grad_norm": 2.3155925273895264, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7024, + "step": 806 + }, + { + "epoch": 0.011111369240623538, + "grad_norm": 2.2516837120056152, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.695, + "step": 837 + }, + { + "epoch": 0.011522901434720706, + "grad_norm": 2.4444823265075684, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.6892, + "step": 868 + }, + { + "epoch": 0.011934433628817874, + "grad_norm": 2.4233832359313965, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.6871, + "step": 899 + }, + { + "epoch": 0.012345965822915042, + "grad_norm": 2.0262961387634277, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.6888, + "step": 930 + }, + { + "epoch": 0.01275749801701221, + "grad_norm": 2.3908143043518066, + "learning_rate": 3.148754914809961e-05, + "loss": 0.6997, + "step": 961 + }, + { + "epoch": 0.013169030211109378, + "grad_norm": 1.839417576789856, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.6826, + "step": 992 + }, + { + "epoch": 0.013580562405206546, + "grad_norm": 1.8175997734069824, + "learning_rate": 3.351900393184797e-05, + "loss": 0.6857, + "step": 1023 + }, + { + "epoch": 0.013992094599303714, + "grad_norm": 2.3080506324768066, + "learning_rate": 3.453473132372215e-05, + "loss": 0.6896, + "step": 1054 + }, + { + "epoch": 0.014403626793400882, + "grad_norm": 2.0574960708618164, + "learning_rate": 3.555045871559633e-05, + "loss": 0.6862, + "step": 1085 + }, + { + "epoch": 0.01481515898749805, + "grad_norm": 1.957221269607544, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.6796, + "step": 1116 + }, + { + "epoch": 0.015226691181595219, + "grad_norm": 2.2740609645843506, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.7082, + "step": 1147 + }, + { + "epoch": 0.015638223375692387, + "grad_norm": 2.1291615962982178, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.6852, + "step": 1178 + }, + { + "epoch": 0.016049755569789555, + "grad_norm": 2.2918620109558105, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.6821, + "step": 1209 + }, + { + "epoch": 0.016461287763886723, + "grad_norm": 1.9689708948135376, + "learning_rate": 4.062909567496724e-05, + "loss": 0.676, + "step": 1240 + }, + { + "epoch": 0.01687281995798389, + "grad_norm": 2.4405298233032227, + "learning_rate": 4.164482306684142e-05, + "loss": 0.6927, + "step": 1271 + }, + { + "epoch": 0.01728435215208106, + "grad_norm": 2.0007450580596924, + "learning_rate": 4.26605504587156e-05, + "loss": 0.6864, + "step": 1302 + }, + { + "epoch": 0.017695884346178227, + "grad_norm": 1.7013226747512817, + "learning_rate": 4.367627785058978e-05, + "loss": 0.6827, + "step": 1333 + }, + { + "epoch": 0.018107416540275395, + "grad_norm": 2.2202930450439453, + "learning_rate": 4.469200524246396e-05, + "loss": 0.6722, + "step": 1364 + }, + { + "epoch": 0.018518948734372563, + "grad_norm": 1.690030813217163, + "learning_rate": 4.570773263433814e-05, + "loss": 0.6807, + "step": 1395 + }, + { + "epoch": 0.01893048092846973, + "grad_norm": 1.5886859893798828, + "learning_rate": 4.672346002621232e-05, + "loss": 0.6867, + "step": 1426 + }, + { + "epoch": 0.0193420131225669, + "grad_norm": 1.7371172904968262, + "learning_rate": 4.77391874180865e-05, + "loss": 0.6832, + "step": 1457 + }, + { + "epoch": 0.019753545316664067, + "grad_norm": 1.8376264572143555, + "learning_rate": 4.875491480996068e-05, + "loss": 0.681, + "step": 1488 + }, + { + "epoch": 0.020165077510761235, + "grad_norm": 1.6280311346054077, + "learning_rate": 4.977064220183487e-05, + "loss": 0.6795, + "step": 1519 + }, + { + "epoch": 0.020576609704858403, + "grad_norm": 1.8016823530197144, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.6789, + "step": 1550 + }, + { + "epoch": 0.02098814189895557, + "grad_norm": 1.7355237007141113, + "learning_rate": 4.999955597496219e-05, + "loss": 0.6753, + "step": 1581 + }, + { + "epoch": 0.02139967409305274, + "grad_norm": 1.5687408447265625, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.6777, + "step": 1612 + }, + { + "epoch": 0.021811206287149908, + "grad_norm": 1.790453314781189, + "learning_rate": 4.999799067923527e-05, + "loss": 0.684, + "step": 1643 + }, + { + "epoch": 0.022222738481247076, + "grad_norm": 1.9073406457901, + "learning_rate": 4.999678487776908e-05, + "loss": 0.6793, + "step": 1674 + }, + { + "epoch": 0.022634270675344244, + "grad_norm": 1.5575100183486938, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.6794, + "step": 1705 + }, + { + "epoch": 0.023045802869441412, + "grad_norm": 2.100715160369873, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6753, + "step": 1736 + }, + { + "epoch": 0.02345733506353858, + "grad_norm": 1.9425480365753174, + "learning_rate": 4.999147503179668e-05, + "loss": 0.6755, + "step": 1767 + }, + { + "epoch": 0.023868867257635748, + "grad_norm": 1.5791170597076416, + "learning_rate": 4.998914100252672e-05, + "loss": 0.6754, + "step": 1798 + }, + { + "epoch": 0.024280399451732916, + "grad_norm": 1.5692673921585083, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6682, + "step": 1829 + }, + { + "epoch": 0.024691931645830084, + "grad_norm": 1.5757725238800049, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.6723, + "step": 1860 + }, + { + "epoch": 0.025103463839927252, + "grad_norm": 1.8000696897506714, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6647, + "step": 1891 + }, + { + "epoch": 0.02551499603402442, + "grad_norm": 1.682919979095459, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.669, + "step": 1922 + }, + { + "epoch": 0.02592652822812159, + "grad_norm": 1.6138657331466675, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6698, + "step": 1953 + }, + { + "epoch": 0.026338060422218756, + "grad_norm": 1.5016391277313232, + "learning_rate": 4.99692159912661e-05, + "loss": 0.6645, + "step": 1984 + }, + { + "epoch": 0.026749592616315924, + "grad_norm": 1.6850241422653198, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6769, + "step": 2015 + }, + { + "epoch": 0.027161124810413093, + "grad_norm": 1.521161437034607, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6599, + "step": 2046 + }, + { + "epoch": 0.02757265700451026, + "grad_norm": 1.8039467334747314, + "learning_rate": 4.995544899210594e-05, + "loss": 0.6693, + "step": 2077 + }, + { + "epoch": 0.02798418919860743, + "grad_norm": 1.688811182975769, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.661, + "step": 2108 + }, + { + "epoch": 0.028395721392704597, + "grad_norm": 1.669871211051941, + "learning_rate": 4.994486281210429e-05, + "loss": 0.6611, + "step": 2139 + }, + { + "epoch": 0.028807253586801765, + "grad_norm": 1.3887263536453247, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6677, + "step": 2170 + }, + { + "epoch": 0.029218785780898933, + "grad_norm": 1.3242331743240356, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6537, + "step": 2201 + }, + { + "epoch": 0.0296303179749961, + "grad_norm": 1.6638622283935547, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6653, + "step": 2232 + }, + { + "epoch": 0.03004185016909327, + "grad_norm": 1.5110372304916382, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6615, + "step": 2263 + }, + { + "epoch": 0.030453382363190437, + "grad_norm": 1.5244065523147583, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.6552, + "step": 2294 + }, + { + "epoch": 0.030864914557287605, + "grad_norm": 1.416624903678894, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6569, + "step": 2325 + }, + { + "epoch": 0.03127644675138477, + "grad_norm": 1.4467304944992065, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6613, + "step": 2356 + }, + { + "epoch": 0.03168797894548194, + "grad_norm": 1.4649864435195923, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6604, + "step": 2387 + }, + { + "epoch": 0.03209951113957911, + "grad_norm": 1.6256624460220337, + "learning_rate": 4.988329891293693e-05, + "loss": 0.655, + "step": 2418 + }, + { + "epoch": 0.03251104333367628, + "grad_norm": 1.3986462354660034, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6514, + "step": 2449 + }, + { + "epoch": 0.032922575527773446, + "grad_norm": 1.4490609169006348, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.65, + "step": 2480 + }, + { + "epoch": 0.033334107721870614, + "grad_norm": 1.4979615211486816, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.6579, + "step": 2511 + }, + { + "epoch": 0.03374563991596778, + "grad_norm": 1.6314005851745605, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6604, + "step": 2542 + }, + { + "epoch": 0.03415717211006495, + "grad_norm": 1.310655951499939, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6518, + "step": 2573 + }, + { + "epoch": 0.03456870430416212, + "grad_norm": 1.4176305532455444, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6518, + "step": 2604 + }, + { + "epoch": 0.034980236498259286, + "grad_norm": 1.6330211162567139, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6474, + "step": 2635 + }, + { + "epoch": 0.035391768692356454, + "grad_norm": 1.5854465961456299, + "learning_rate": 4.980947995086024e-05, + "loss": 0.6462, + "step": 2666 + }, + { + "epoch": 0.03580330088645362, + "grad_norm": 1.2944835424423218, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6503, + "step": 2697 + }, + { + "epoch": 0.03621483308055079, + "grad_norm": 1.4056243896484375, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6414, + "step": 2728 + }, + { + "epoch": 0.03662636527464796, + "grad_norm": 1.5649598836898804, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6449, + "step": 2759 + }, + { + "epoch": 0.037037897468745126, + "grad_norm": 1.5205659866333008, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6334, + "step": 2790 + }, + { + "epoch": 0.037449429662842294, + "grad_norm": 1.3248411417007446, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6448, + "step": 2821 + }, + { + "epoch": 0.03786096185693946, + "grad_norm": 1.2402842044830322, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6476, + "step": 2852 + }, + { + "epoch": 0.03827249405103663, + "grad_norm": 1.7939503192901611, + "learning_rate": 4.973018858007122e-05, + "loss": 0.6472, + "step": 2883 + }, + { + "epoch": 0.0386840262451338, + "grad_norm": 1.2774665355682373, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6374, + "step": 2914 + }, + { + "epoch": 0.03909555843923097, + "grad_norm": 1.6384273767471313, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6623, + "step": 2945 + }, + { + "epoch": 0.039507090633328135, + "grad_norm": 1.559270977973938, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6391, + "step": 2976 + }, + { + "epoch": 0.0399186228274253, + "grad_norm": 1.6405760049819946, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.6437, + "step": 3007 + }, + { + "epoch": 0.04033015502152247, + "grad_norm": 1.3599917888641357, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6436, + "step": 3038 + }, + { + "epoch": 0.04074168721561964, + "grad_norm": 1.3080962896347046, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6331, + "step": 3069 + }, + { + "epoch": 0.04115321940971681, + "grad_norm": 1.435195803642273, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6417, + "step": 3100 + }, + { + "epoch": 0.041564751603813975, + "grad_norm": 1.342986822128296, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6391, + "step": 3131 + }, + { + "epoch": 0.04197628379791114, + "grad_norm": 1.524418592453003, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6511, + "step": 3162 + }, + { + "epoch": 0.04238781599200831, + "grad_norm": 1.4617900848388672, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6367, + "step": 3193 + }, + { + "epoch": 0.04279934818610548, + "grad_norm": 1.3968737125396729, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.6387, + "step": 3224 + }, + { + "epoch": 0.04321088038020265, + "grad_norm": 1.258406400680542, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6356, + "step": 3255 + }, + { + "epoch": 0.043622412574299815, + "grad_norm": 1.5494844913482666, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6433, + "step": 3286 + }, + { + "epoch": 0.04403394476839698, + "grad_norm": 1.2248284816741943, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6376, + "step": 3317 + }, + { + "epoch": 0.04444547696249415, + "grad_norm": 1.4243425130844116, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6341, + "step": 3348 + }, + { + "epoch": 0.04485700915659132, + "grad_norm": 1.379364252090454, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6378, + "step": 3379 + }, + { + "epoch": 0.04526854135068849, + "grad_norm": 1.3460108041763306, + "learning_rate": 4.948079823064559e-05, + "loss": 0.639, + "step": 3410 + }, + { + "epoch": 0.045680073544785656, + "grad_norm": 1.382348656654358, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6327, + "step": 3441 + }, + { + "epoch": 0.046091605738882824, + "grad_norm": 1.40754234790802, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6234, + "step": 3472 + }, + { + "epoch": 0.04650313793297999, + "grad_norm": 1.502922773361206, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6404, + "step": 3503 + }, + { + "epoch": 0.04691467012707716, + "grad_norm": 1.2896559238433838, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6378, + "step": 3534 + }, + { + "epoch": 0.04732620232117433, + "grad_norm": 1.3114830255508423, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6373, + "step": 3565 + }, + { + "epoch": 0.047737734515271496, + "grad_norm": 1.4122483730316162, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6305, + "step": 3596 + }, + { + "epoch": 0.048149266709368664, + "grad_norm": 1.2669744491577148, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6413, + "step": 3627 + }, + { + "epoch": 0.04856079890346583, + "grad_norm": 1.2965583801269531, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6364, + "step": 3658 + }, + { + "epoch": 0.048972331097563, + "grad_norm": 1.3328967094421387, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6466, + "step": 3689 + }, + { + "epoch": 0.04938386329166017, + "grad_norm": 1.6291115283966064, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6348, + "step": 3720 + }, + { + "epoch": 0.049795395485757336, + "grad_norm": 1.3224235773086548, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6256, + "step": 3751 + }, + { + "epoch": 0.050206927679854504, + "grad_norm": 1.3253369331359863, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6303, + "step": 3782 + }, + { + "epoch": 0.05061845987395167, + "grad_norm": 1.2806897163391113, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.6403, + "step": 3813 + }, + { + "epoch": 0.05102999206804884, + "grad_norm": 1.3389384746551514, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6288, + "step": 3844 + }, + { + "epoch": 0.05144152426214601, + "grad_norm": 1.251550555229187, + "learning_rate": 4.919443027766935e-05, + "loss": 0.637, + "step": 3875 + }, + { + "epoch": 0.05185305645624318, + "grad_norm": 1.258484959602356, + "learning_rate": 4.91731463569418e-05, + "loss": 0.629, + "step": 3906 + }, + { + "epoch": 0.052264588650340345, + "grad_norm": 1.3574656248092651, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6262, + "step": 3937 + }, + { + "epoch": 0.05267612084443751, + "grad_norm": 1.327051043510437, + "learning_rate": 4.912976038673786e-05, + "loss": 0.629, + "step": 3968 + }, + { + "epoch": 0.05308765303853468, + "grad_norm": 1.3813196420669556, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6308, + "step": 3999 + }, + { + "epoch": 0.05349918523263185, + "grad_norm": 1.2064168453216553, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6321, + "step": 4030 + }, + { + "epoch": 0.05391071742672902, + "grad_norm": 1.2771285772323608, + "learning_rate": 4.906263980464644e-05, + "loss": 0.622, + "step": 4061 + }, + { + "epoch": 0.054322249620826185, + "grad_norm": 1.1788181066513062, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6268, + "step": 4092 + }, + { + "epoch": 0.05473378181492335, + "grad_norm": 1.3975950479507446, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6251, + "step": 4123 + }, + { + "epoch": 0.05514531400902052, + "grad_norm": 1.2083688974380493, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6287, + "step": 4154 + }, + { + "epoch": 0.05555684620311769, + "grad_norm": 1.2795994281768799, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6302, + "step": 4185 + }, + { + "epoch": 0.05596837839721486, + "grad_norm": 1.1056169271469116, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6256, + "step": 4216 + }, + { + "epoch": 0.056379910591312025, + "grad_norm": 1.415405511856079, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6334, + "step": 4247 + }, + { + "epoch": 0.056791442785409194, + "grad_norm": 1.2858442068099976, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6248, + "step": 4278 + }, + { + "epoch": 0.05720297497950636, + "grad_norm": 1.3914793729782104, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6229, + "step": 4309 + }, + { + "epoch": 0.05761450717360353, + "grad_norm": 1.1236270666122437, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6272, + "step": 4340 + }, + { + "epoch": 0.0580260393677007, + "grad_norm": 1.393020510673523, + "learning_rate": 4.882129447892753e-05, + "loss": 0.624, + "step": 4371 + }, + { + "epoch": 0.058437571561797866, + "grad_norm": 1.3223553895950317, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6267, + "step": 4402 + }, + { + "epoch": 0.058849103755895034, + "grad_norm": 1.2308725118637085, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6255, + "step": 4433 + }, + { + "epoch": 0.0592606359499922, + "grad_norm": 1.3741412162780762, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6246, + "step": 4464 + }, + { + "epoch": 0.05967216814408937, + "grad_norm": 1.2487592697143555, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6221, + "step": 4495 + }, + { + "epoch": 0.06008370033818654, + "grad_norm": 1.3239238262176514, + "learning_rate": 4.869052379269719e-05, + "loss": 0.621, + "step": 4526 + }, + { + "epoch": 0.060495232532283706, + "grad_norm": 1.1411608457565308, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6211, + "step": 4557 + }, + { + "epoch": 0.060906764726380874, + "grad_norm": 1.1687365770339966, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6159, + "step": 4588 + }, + { + "epoch": 0.06131829692047804, + "grad_norm": 1.25679349899292, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6254, + "step": 4619 + }, + { + "epoch": 0.06172982911457521, + "grad_norm": 1.2380110025405884, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6305, + "step": 4650 + }, + { + "epoch": 0.06214136130867238, + "grad_norm": 1.3054466247558594, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6202, + "step": 4681 + }, + { + "epoch": 0.06255289350276955, + "grad_norm": 1.3628545999526978, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6192, + "step": 4712 + }, + { + "epoch": 0.06296442569686671, + "grad_norm": 1.3576844930648804, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6213, + "step": 4743 + }, + { + "epoch": 0.06337595789096388, + "grad_norm": 1.3126753568649292, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6198, + "step": 4774 + }, + { + "epoch": 0.06378749008506104, + "grad_norm": 1.2266637086868286, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6229, + "step": 4805 + }, + { + "epoch": 0.06419902227915822, + "grad_norm": 1.4964330196380615, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6264, + "step": 4836 + }, + { + "epoch": 0.06461055447325538, + "grad_norm": 1.1138426065444946, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6258, + "step": 4867 + }, + { + "epoch": 0.06502208666735255, + "grad_norm": 1.2055357694625854, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6263, + "step": 4898 + }, + { + "epoch": 0.06543361886144972, + "grad_norm": 1.2684381008148193, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6212, + "step": 4929 + }, + { + "epoch": 0.06584515105554689, + "grad_norm": 1.2323859930038452, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6253, + "step": 4960 + }, + { + "epoch": 0.06625668324964405, + "grad_norm": 1.091818928718567, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6127, + "step": 4991 + }, + { + "epoch": 0.06666821544374123, + "grad_norm": 1.2208534479141235, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6179, + "step": 5022 + }, + { + "epoch": 0.06707974763783839, + "grad_norm": 1.3426082134246826, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6246, + "step": 5053 + }, + { + "epoch": 0.06749127983193556, + "grad_norm": 1.2961252927780151, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.6149, + "step": 5084 + }, + { + "epoch": 0.06790281202603272, + "grad_norm": 1.2990977764129639, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6303, + "step": 5115 + }, + { + "epoch": 0.0683143442201299, + "grad_norm": 1.2724366188049316, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6234, + "step": 5146 + }, + { + "epoch": 0.06872587641422706, + "grad_norm": 1.2031605243682861, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6212, + "step": 5177 + }, + { + "epoch": 0.06913740860832424, + "grad_norm": 1.2580938339233398, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6144, + "step": 5208 + }, + { + "epoch": 0.0695489408024214, + "grad_norm": 1.203706979751587, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6259, + "step": 5239 + }, + { + "epoch": 0.06996047299651857, + "grad_norm": 1.421762466430664, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6221, + "step": 5270 + }, + { + "epoch": 0.07037200519061573, + "grad_norm": 1.326038122177124, + "learning_rate": 4.793722210363262e-05, + "loss": 0.6152, + "step": 5301 + }, + { + "epoch": 0.07078353738471291, + "grad_norm": 1.361939787864685, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6206, + "step": 5332 + }, + { + "epoch": 0.07119506957881007, + "grad_norm": 1.2153042554855347, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6209, + "step": 5363 + }, + { + "epoch": 0.07160660177290724, + "grad_norm": 1.3737229108810425, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6121, + "step": 5394 + }, + { + "epoch": 0.0720181339670044, + "grad_norm": 1.1781859397888184, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6183, + "step": 5425 + }, + { + "epoch": 0.07242966616110158, + "grad_norm": 1.069472312927246, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.6117, + "step": 5456 + }, + { + "epoch": 0.07284119835519874, + "grad_norm": 1.380937099456787, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6175, + "step": 5487 + }, + { + "epoch": 0.07325273054929592, + "grad_norm": 1.1551166772842407, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6164, + "step": 5518 + }, + { + "epoch": 0.07366426274339308, + "grad_norm": 1.2816351652145386, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6168, + "step": 5549 + }, + { + "epoch": 0.07407579493749025, + "grad_norm": 1.0833333730697632, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6159, + "step": 5580 + }, + { + "epoch": 0.07448732713158741, + "grad_norm": 1.2533819675445557, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6141, + "step": 5611 + }, + { + "epoch": 0.07489885932568459, + "grad_norm": 1.1354937553405762, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.612, + "step": 5642 + }, + { + "epoch": 0.07531039151978175, + "grad_norm": 1.2579649686813354, + "learning_rate": 4.751783684659e-05, + "loss": 0.6147, + "step": 5673 + }, + { + "epoch": 0.07572192371387892, + "grad_norm": 1.2830709218978882, + "learning_rate": 4.748122674890348e-05, + "loss": 0.62, + "step": 5704 + }, + { + "epoch": 0.07613345590797609, + "grad_norm": 1.2012654542922974, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6199, + "step": 5735 + }, + { + "epoch": 0.07654498810207326, + "grad_norm": 1.2926653623580933, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6104, + "step": 5766 + }, + { + "epoch": 0.07695652029617042, + "grad_norm": 1.2043049335479736, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6261, + "step": 5797 + }, + { + "epoch": 0.0773680524902676, + "grad_norm": 1.1188119649887085, + "learning_rate": 4.733225355658999e-05, + "loss": 0.6056, + "step": 5828 + }, + { + "epoch": 0.07777958468436476, + "grad_norm": 1.1217443943023682, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.6124, + "step": 5859 + }, + { + "epoch": 0.07819111687846193, + "grad_norm": 1.1623468399047852, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6191, + "step": 5890 + }, + { + "epoch": 0.0786026490725591, + "grad_norm": 1.1147449016571045, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6091, + "step": 5921 + }, + { + "epoch": 0.07901418126665627, + "grad_norm": 1.0523818731307983, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6151, + "step": 5952 + }, + { + "epoch": 0.07942571346075343, + "grad_norm": 1.3112642765045166, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6106, + "step": 5983 + }, + { + "epoch": 0.0798372456548506, + "grad_norm": 1.1637545824050903, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6131, + "step": 6014 + }, + { + "epoch": 0.08024877784894777, + "grad_norm": 1.258862018585205, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6119, + "step": 6045 + }, + { + "epoch": 0.08066031004304494, + "grad_norm": 1.0981336832046509, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6149, + "step": 6076 + }, + { + "epoch": 0.0810718422371421, + "grad_norm": 1.1576476097106934, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6076, + "step": 6107 + }, + { + "epoch": 0.08148337443123928, + "grad_norm": 1.1076856851577759, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6105, + "step": 6138 + }, + { + "epoch": 0.08189490662533644, + "grad_norm": 14.014957427978516, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6189, + "step": 6169 + }, + { + "epoch": 0.08230643881943361, + "grad_norm": 1.1413066387176514, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6038, + "step": 6200 + }, + { + "epoch": 0.08271797101353077, + "grad_norm": 1.2316724061965942, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6125, + "step": 6231 + }, + { + "epoch": 0.08312950320762795, + "grad_norm": 1.0919370651245117, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6081, + "step": 6262 + }, + { + "epoch": 0.08354103540172511, + "grad_norm": 1.139345407485962, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6078, + "step": 6293 + }, + { + "epoch": 0.08395256759582229, + "grad_norm": 1.1303651332855225, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6071, + "step": 6324 + }, + { + "epoch": 0.08436409978991945, + "grad_norm": 1.2546321153640747, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.6092, + "step": 6355 + }, + { + "epoch": 0.08477563198401662, + "grad_norm": 1.195173740386963, + "learning_rate": 4.661238099862658e-05, + "loss": 0.5998, + "step": 6386 + }, + { + "epoch": 0.08518716417811378, + "grad_norm": 1.2116323709487915, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6099, + "step": 6417 + }, + { + "epoch": 0.08559869637221096, + "grad_norm": 1.131951928138733, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.6105, + "step": 6448 + }, + { + "epoch": 0.08601022856630812, + "grad_norm": 1.2176560163497925, + "learning_rate": 4.648464661063478e-05, + "loss": 0.6045, + "step": 6479 + }, + { + "epoch": 0.0864217607604053, + "grad_norm": 1.15361750125885, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.6067, + "step": 6510 + }, + { + "epoch": 0.08683329295450246, + "grad_norm": 1.1422772407531738, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.605, + "step": 6541 + }, + { + "epoch": 0.08724482514859963, + "grad_norm": 1.250319004058838, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6069, + "step": 6572 + }, + { + "epoch": 0.08765635734269679, + "grad_norm": 1.0947929620742798, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6099, + "step": 6603 + }, + { + "epoch": 0.08806788953679397, + "grad_norm": 1.2815848588943481, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6132, + "step": 6634 + }, + { + "epoch": 0.08847942173089113, + "grad_norm": 1.196083664894104, + "learning_rate": 4.622264489304762e-05, + "loss": 0.604, + "step": 6665 + }, + { + "epoch": 0.0888909539249883, + "grad_norm": 1.1473642587661743, + "learning_rate": 4.617813681048434e-05, + "loss": 0.6081, + "step": 6696 + }, + { + "epoch": 0.08930248611908546, + "grad_norm": 1.182445764541626, + "learning_rate": 4.61333897355256e-05, + "loss": 0.6108, + "step": 6727 + }, + { + "epoch": 0.08971401831318264, + "grad_norm": 1.2859700918197632, + "learning_rate": 4.608840417313604e-05, + "loss": 0.6074, + "step": 6758 + }, + { + "epoch": 0.0901255505072798, + "grad_norm": 1.1134952306747437, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6139, + "step": 6789 + }, + { + "epoch": 0.09053708270137698, + "grad_norm": 1.2447940111160278, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6049, + "step": 6820 + }, + { + "epoch": 0.09094861489547414, + "grad_norm": 1.042465329170227, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6113, + "step": 6851 + }, + { + "epoch": 0.09136014708957131, + "grad_norm": 1.2242035865783691, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6077, + "step": 6882 + }, + { + "epoch": 0.09177167928366847, + "grad_norm": 1.2531142234802246, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6081, + "step": 6913 + }, + { + "epoch": 0.09218321147776565, + "grad_norm": 1.194645881652832, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6158, + "step": 6944 + }, + { + "epoch": 0.09259474367186281, + "grad_norm": 1.1052149534225464, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6079, + "step": 6975 + }, + { + "epoch": 0.09300627586595998, + "grad_norm": 1.1275289058685303, + "learning_rate": 4.571999560773736e-05, + "loss": 0.6107, + "step": 7006 + }, + { + "epoch": 0.09341780806005714, + "grad_norm": 1.251237154006958, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6028, + "step": 7037 + }, + { + "epoch": 0.09382934025415432, + "grad_norm": 1.1206951141357422, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6035, + "step": 7068 + }, + { + "epoch": 0.09424087244825148, + "grad_norm": 1.2242387533187866, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.5959, + "step": 7099 + }, + { + "epoch": 0.09465240464234866, + "grad_norm": 1.0749527215957642, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6005, + "step": 7130 + }, + { + "epoch": 0.09506393683644582, + "grad_norm": 1.183052659034729, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6102, + "step": 7161 + }, + { + "epoch": 0.09547546903054299, + "grad_norm": 1.1045013666152954, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6056, + "step": 7192 + }, + { + "epoch": 0.09588700122464015, + "grad_norm": 1.2442799806594849, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6024, + "step": 7223 + }, + { + "epoch": 0.09629853341873733, + "grad_norm": 1.2083353996276855, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6062, + "step": 7254 + }, + { + "epoch": 0.09671006561283449, + "grad_norm": 1.1391205787658691, + "learning_rate": 4.528766329570536e-05, + "loss": 0.5981, + "step": 7285 + }, + { + "epoch": 0.09712159780693166, + "grad_norm": 1.1213765144348145, + "learning_rate": 4.523847434837447e-05, + "loss": 0.5964, + "step": 7316 + }, + { + "epoch": 0.09753313000102883, + "grad_norm": 1.1574114561080933, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6035, + "step": 7347 + }, + { + "epoch": 0.097944662195126, + "grad_norm": 1.0101516246795654, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6036, + "step": 7378 + }, + { + "epoch": 0.09835619438922316, + "grad_norm": 1.1439381837844849, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6049, + "step": 7409 + }, + { + "epoch": 0.09876772658332034, + "grad_norm": 1.0668189525604248, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6035, + "step": 7440 + }, + { + "epoch": 0.0991792587774175, + "grad_norm": 1.2038415670394897, + "learning_rate": 4.498911497712155e-05, + "loss": 0.6062, + "step": 7471 + }, + { + "epoch": 0.09959079097151467, + "grad_norm": 1.1965882778167725, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6041, + "step": 7502 + }, + { + "epoch": 0.10000232316561183, + "grad_norm": 1.5885149240493774, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6129, + "step": 7533 + }, + { + "epoch": 0.10041385535970901, + "grad_norm": 1.058237910270691, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.5957, + "step": 7564 + }, + { + "epoch": 0.10082538755380617, + "grad_norm": 1.2043278217315674, + "learning_rate": 4.478556380141218e-05, + "loss": 0.5943, + "step": 7595 + }, + { + "epoch": 0.10123691974790335, + "grad_norm": 1.1602888107299805, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.5971, + "step": 7626 + }, + { + "epoch": 0.1016484519420005, + "grad_norm": 1.1536785364151, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.604, + "step": 7657 + }, + { + "epoch": 0.10205998413609768, + "grad_norm": 1.1859934329986572, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6043, + "step": 7688 + }, + { + "epoch": 0.10247151633019484, + "grad_norm": 1.0551954507827759, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6008, + "step": 7719 + }, + { + "epoch": 0.10288304852429202, + "grad_norm": 1.015857458114624, + "learning_rate": 4.452610552959183e-05, + "loss": 0.5961, + "step": 7750 + }, + { + "epoch": 0.10329458071838918, + "grad_norm": 1.0759170055389404, + "learning_rate": 4.447355047201428e-05, + "loss": 0.6025, + "step": 7781 + }, + { + "epoch": 0.10370611291248635, + "grad_norm": 1.025030255317688, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.602, + "step": 7812 + }, + { + "epoch": 0.10411764510658351, + "grad_norm": 1.0884742736816406, + "learning_rate": 4.436778168330484e-05, + "loss": 0.5989, + "step": 7843 + }, + { + "epoch": 0.10452917730068069, + "grad_norm": 1.1207304000854492, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.6016, + "step": 7874 + }, + { + "epoch": 0.10494070949477785, + "grad_norm": 0.9768222570419312, + "learning_rate": 4.42611386459262e-05, + "loss": 0.605, + "step": 7905 + }, + { + "epoch": 0.10535224168887503, + "grad_norm": 1.0720235109329224, + "learning_rate": 4.420749078676133e-05, + "loss": 0.5971, + "step": 7936 + }, + { + "epoch": 0.10576377388297219, + "grad_norm": 1.1767033338546753, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.5877, + "step": 7967 + }, + { + "epoch": 0.10617530607706936, + "grad_norm": 1.3390216827392578, + "learning_rate": 4.409954541451762e-05, + "loss": 0.5973, + "step": 7998 + }, + { + "epoch": 0.10658683827116652, + "grad_norm": 1.7948216199874878, + "learning_rate": 4.404524911958764e-05, + "loss": 0.5922, + "step": 8029 + }, + { + "epoch": 0.1069983704652637, + "grad_norm": 1.177847981452942, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6002, + "step": 8060 + }, + { + "epoch": 0.10740990265936086, + "grad_norm": 1.1014128923416138, + "learning_rate": 4.393601237573607e-05, + "loss": 0.5945, + "step": 8091 + }, + { + "epoch": 0.10782143485345803, + "grad_norm": 1.067475438117981, + "learning_rate": 4.388107315953628e-05, + "loss": 0.5996, + "step": 8122 + }, + { + "epoch": 0.1082329670475552, + "grad_norm": 1.1241830587387085, + "learning_rate": 4.382592087299212e-05, + "loss": 0.5973, + "step": 8153 + }, + { + "epoch": 0.10864449924165237, + "grad_norm": 1.072426438331604, + "learning_rate": 4.377055613848964e-05, + "loss": 0.601, + "step": 8184 + }, + { + "epoch": 0.10905603143574953, + "grad_norm": 1.0138245820999146, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6034, + "step": 8215 + }, + { + "epoch": 0.1094675636298467, + "grad_norm": 0.9993209838867188, + "learning_rate": 4.365919182713416e-05, + "loss": 0.6024, + "step": 8246 + }, + { + "epoch": 0.10987909582394387, + "grad_norm": 1.157468318939209, + "learning_rate": 4.360319350701226e-05, + "loss": 0.5987, + "step": 8277 + }, + { + "epoch": 0.11029062801804104, + "grad_norm": 1.2073496580123901, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6023, + "step": 8308 + }, + { + "epoch": 0.1107021602121382, + "grad_norm": 1.0800622701644897, + "learning_rate": 4.349056769754021e-05, + "loss": 0.5993, + "step": 8339 + }, + { + "epoch": 0.11111369240623538, + "grad_norm": 1.1365702152252197, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.595, + "step": 8370 + }, + { + "epoch": 0.11152522460033254, + "grad_norm": 1.1367030143737793, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.5921, + "step": 8401 + }, + { + "epoch": 0.11193675679442971, + "grad_norm": 1.0323926210403442, + "learning_rate": 4.332006561018488e-05, + "loss": 0.5883, + "step": 8432 + }, + { + "epoch": 0.11234828898852688, + "grad_norm": 1.2352523803710938, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.5957, + "step": 8463 + }, + { + "epoch": 0.11275982118262405, + "grad_norm": 1.1763888597488403, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.5941, + "step": 8494 + }, + { + "epoch": 0.11317135337672121, + "grad_norm": 1.1331796646118164, + "learning_rate": 4.314770288177384e-05, + "loss": 0.5993, + "step": 8525 + }, + { + "epoch": 0.11358288557081839, + "grad_norm": 1.146270990371704, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6041, + "step": 8556 + }, + { + "epoch": 0.11399441776491555, + "grad_norm": 1.0433791875839233, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.5918, + "step": 8587 + }, + { + "epoch": 0.11440594995901272, + "grad_norm": 1.2013510465621948, + "learning_rate": 4.297349701798505e-05, + "loss": 0.6031, + "step": 8618 + }, + { + "epoch": 0.11481748215310988, + "grad_norm": 1.1263506412506104, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.597, + "step": 8649 + }, + { + "epoch": 0.11522901434720706, + "grad_norm": 1.1360443830490112, + "learning_rate": 4.285634454093198e-05, + "loss": 0.5909, + "step": 8680 + }, + { + "epoch": 0.11564054654130422, + "grad_norm": 1.0095175504684448, + "learning_rate": 4.279746571169086e-05, + "loss": 0.59, + "step": 8711 + }, + { + "epoch": 0.1160520787354014, + "grad_norm": 1.0894793272018433, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.5894, + "step": 8742 + }, + { + "epoch": 0.11646361092949856, + "grad_norm": 1.1603643894195557, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.5924, + "step": 8773 + }, + { + "epoch": 0.11687514312359573, + "grad_norm": 1.177464485168457, + "learning_rate": 4.261962684116106e-05, + "loss": 0.5946, + "step": 8804 + }, + { + "epoch": 0.11728667531769289, + "grad_norm": 0.9632905125617981, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.5944, + "step": 8835 + }, + { + "epoch": 0.11769820751179007, + "grad_norm": 1.0413568019866943, + "learning_rate": 4.250007230372134e-05, + "loss": 0.5844, + "step": 8866 + }, + { + "epoch": 0.11810973970588723, + "grad_norm": 1.0879931449890137, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.588, + "step": 8897 + }, + { + "epoch": 0.1185212718999844, + "grad_norm": 1.0681732892990112, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.5832, + "step": 8928 + }, + { + "epoch": 0.11893280409408157, + "grad_norm": 1.0609339475631714, + "learning_rate": 4.231926105340768e-05, + "loss": 0.5886, + "step": 8959 + }, + { + "epoch": 0.11934433628817874, + "grad_norm": 1.4936331510543823, + "learning_rate": 4.225859883654776e-05, + "loss": 0.5868, + "step": 8990 + }, + { + "epoch": 0.1197558684822759, + "grad_norm": 1.1860368251800537, + "learning_rate": 4.219774185874569e-05, + "loss": 0.5904, + "step": 9021 + }, + { + "epoch": 0.12016740067637308, + "grad_norm": 1.1657029390335083, + "learning_rate": 4.213669080676418e-05, + "loss": 0.5872, + "step": 9052 + }, + { + "epoch": 0.12057893287047024, + "grad_norm": 1.0721909999847412, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.5977, + "step": 9083 + }, + { + "epoch": 0.12099046506456741, + "grad_norm": 1.1832354068756104, + "learning_rate": 4.201400923825648e-05, + "loss": 0.5902, + "step": 9114 + }, + { + "epoch": 0.12140199725866457, + "grad_norm": 1.1306614875793457, + "learning_rate": 4.195238010617511e-05, + "loss": 0.5931, + "step": 9145 + }, + { + "epoch": 0.12181352945276175, + "grad_norm": 0.9744802713394165, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6, + "step": 9176 + }, + { + "epoch": 0.12222506164685891, + "grad_norm": 1.0286784172058105, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.5913, + "step": 9207 + }, + { + "epoch": 0.12263659384095608, + "grad_norm": 1.0559006929397583, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.5999, + "step": 9238 + }, + { + "epoch": 0.12304812603505325, + "grad_norm": 1.0805801153182983, + "learning_rate": 4.170395751189495e-05, + "loss": 0.5929, + "step": 9269 + }, + { + "epoch": 0.12345965822915042, + "grad_norm": 1.088597059249878, + "learning_rate": 4.164137885110921e-05, + "loss": 0.5873, + "step": 9300 + }, + { + "epoch": 0.12387119042324758, + "grad_norm": 1.028403639793396, + "learning_rate": 4.157861239462495e-05, + "loss": 0.5939, + "step": 9331 + }, + { + "epoch": 0.12428272261734476, + "grad_norm": 0.9916087985038757, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.5932, + "step": 9362 + }, + { + "epoch": 0.12469425481144192, + "grad_norm": 1.0268062353134155, + "learning_rate": 4.145251892991588e-05, + "loss": 0.5945, + "step": 9393 + }, + { + "epoch": 0.1251057870055391, + "grad_norm": 1.046953797340393, + "learning_rate": 4.138919334463868e-05, + "loss": 0.5923, + "step": 9424 + }, + { + "epoch": 0.12551731919963627, + "grad_norm": 1.1010463237762451, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.5898, + "step": 9455 + }, + { + "epoch": 0.12592885139373342, + "grad_norm": 1.1851146221160889, + "learning_rate": 4.126198804133398e-05, + "loss": 0.591, + "step": 9486 + }, + { + "epoch": 0.1263403835878306, + "grad_norm": 1.106885313987732, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.5893, + "step": 9517 + }, + { + "epoch": 0.12675191578192777, + "grad_norm": 1.0527287721633911, + "learning_rate": 4.113404868280107e-05, + "loss": 0.5937, + "step": 9548 + }, + { + "epoch": 0.12716344797602494, + "grad_norm": 0.9986059069633484, + "learning_rate": 4.106980553625457e-05, + "loss": 0.5961, + "step": 9579 + }, + { + "epoch": 0.1275749801701221, + "grad_norm": 1.025026798248291, + "learning_rate": 4.100538104413674e-05, + "loss": 0.5853, + "step": 9610 + }, + { + "epoch": 0.12798651236421926, + "grad_norm": 1.062345027923584, + "learning_rate": 4.09407759334692e-05, + "loss": 0.5916, + "step": 9641 + }, + { + "epoch": 0.12839804455831644, + "grad_norm": 1.0635534524917603, + "learning_rate": 4.087599093331186e-05, + "loss": 0.5919, + "step": 9672 + }, + { + "epoch": 0.1288095767524136, + "grad_norm": 1.1224849224090576, + "learning_rate": 4.081102677475462e-05, + "loss": 0.5948, + "step": 9703 + }, + { + "epoch": 0.12922110894651076, + "grad_norm": 1.113202452659607, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.5807, + "step": 9734 + }, + { + "epoch": 0.12963264114060793, + "grad_norm": 1.0213284492492676, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.5813, + "step": 9765 + }, + { + "epoch": 0.1300441733347051, + "grad_norm": 1.1083142757415771, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.5856, + "step": 9796 + }, + { + "epoch": 0.13045570552880228, + "grad_norm": 1.1317702531814575, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.586, + "step": 9827 + }, + { + "epoch": 0.13086723772289943, + "grad_norm": 0.9876610040664673, + "learning_rate": 4.048354433517794e-05, + "loss": 0.5921, + "step": 9858 + }, + { + "epoch": 0.1312787699169966, + "grad_norm": 1.085672378540039, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.5791, + "step": 9889 + }, + { + "epoch": 0.13169030211109378, + "grad_norm": 1.014452338218689, + "learning_rate": 4.035132306369438e-05, + "loss": 0.5921, + "step": 9920 + }, + { + "epoch": 0.13210183430519096, + "grad_norm": 1.059309720993042, + "learning_rate": 4.028495219804555e-05, + "loss": 0.5834, + "step": 9951 + }, + { + "epoch": 0.1325133664992881, + "grad_norm": 1.1058080196380615, + "learning_rate": 4.021840884378864e-05, + "loss": 0.5826, + "step": 9982 + }, + { + "epoch": 0.13292489869338528, + "grad_norm": 1.0567057132720947, + "learning_rate": 4.015169375185633e-05, + "loss": 0.5924, + "step": 10013 + }, + { + "epoch": 0.13333643088748245, + "grad_norm": 1.0371674299240112, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.589, + "step": 10044 + }, + { + "epoch": 0.13374796308157963, + "grad_norm": 0.909173309803009, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.5978, + "step": 10075 + }, + { + "epoch": 0.13415949527567678, + "grad_norm": 1.0078933238983154, + "learning_rate": 3.995052558835377e-05, + "loss": 0.5814, + "step": 10106 + }, + { + "epoch": 0.13457102746977395, + "grad_norm": 0.9631708264350891, + "learning_rate": 3.988313109368017e-05, + "loss": 0.587, + "step": 10137 + }, + { + "epoch": 0.13498255966387113, + "grad_norm": 1.1884409189224243, + "learning_rate": 3.981556864489504e-05, + "loss": 0.5906, + "step": 10168 + }, + { + "epoch": 0.1353940918579683, + "grad_norm": 0.9554224014282227, + "learning_rate": 3.974783900443142e-05, + "loss": 0.5888, + "step": 10199 + }, + { + "epoch": 0.13580562405206545, + "grad_norm": 1.0510096549987793, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.5828, + "step": 10230 + }, + { + "epoch": 0.13621715624616262, + "grad_norm": 0.9769448637962341, + "learning_rate": 3.961188120762596e-05, + "loss": 0.5878, + "step": 10261 + }, + { + "epoch": 0.1366286884402598, + "grad_norm": 0.9610607028007507, + "learning_rate": 3.954365458554938e-05, + "loss": 0.5863, + "step": 10292 + }, + { + "epoch": 0.13704022063435697, + "grad_norm": 1.0102349519729614, + "learning_rate": 3.947526384030751e-05, + "loss": 0.5812, + "step": 10323 + }, + { + "epoch": 0.13745175282845412, + "grad_norm": 1.138122320175171, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.5841, + "step": 10354 + }, + { + "epoch": 0.1378632850225513, + "grad_norm": 1.0107802152633667, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.5847, + "step": 10385 + }, + { + "epoch": 0.13827481721664847, + "grad_norm": 0.9923847317695618, + "learning_rate": 3.926911459260109e-05, + "loss": 0.5971, + "step": 10416 + }, + { + "epoch": 0.13868634941074565, + "grad_norm": 1.0700715780258179, + "learning_rate": 3.920007509089102e-05, + "loss": 0.5831, + "step": 10447 + }, + { + "epoch": 0.1390978816048428, + "grad_norm": 1.1638612747192383, + "learning_rate": 3.913087534326357e-05, + "loss": 0.5849, + "step": 10478 + }, + { + "epoch": 0.13950941379893997, + "grad_norm": 1.0560393333435059, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.5897, + "step": 10509 + }, + { + "epoch": 0.13992094599303714, + "grad_norm": 0.9604248404502869, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.5862, + "step": 10540 + }, + { + "epoch": 0.14033247818713432, + "grad_norm": 1.0003983974456787, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.5905, + "step": 10571 + }, + { + "epoch": 0.14074401038123147, + "grad_norm": 1.0378546714782715, + "learning_rate": 3.885248953871491e-05, + "loss": 0.5938, + "step": 10602 + }, + { + "epoch": 0.14115554257532864, + "grad_norm": 0.9967820644378662, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.5821, + "step": 10633 + }, + { + "epoch": 0.14156707476942582, + "grad_norm": 1.029188632965088, + "learning_rate": 3.871235554965218e-05, + "loss": 0.5882, + "step": 10664 + }, + { + "epoch": 0.141978606963523, + "grad_norm": 1.1272863149642944, + "learning_rate": 3.864205604623078e-05, + "loss": 0.5898, + "step": 10695 + }, + { + "epoch": 0.14239013915762014, + "grad_norm": 1.0054222345352173, + "learning_rate": 3.857160259406107e-05, + "loss": 0.5873, + "step": 10726 + }, + { + "epoch": 0.1428016713517173, + "grad_norm": 1.0327833890914917, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.5842, + "step": 10757 + }, + { + "epoch": 0.1432132035458145, + "grad_norm": 1.0088285207748413, + "learning_rate": 3.843023702543556e-05, + "loss": 0.5903, + "step": 10788 + }, + { + "epoch": 0.14362473573991166, + "grad_norm": 0.9472872018814087, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.5809, + "step": 10819 + }, + { + "epoch": 0.1440362679340088, + "grad_norm": 1.1008390188217163, + "learning_rate": 3.828826522492255e-05, + "loss": 0.5775, + "step": 10850 + }, + { + "epoch": 0.14444780012810599, + "grad_norm": 1.1251379251480103, + "learning_rate": 3.821705398930713e-05, + "loss": 0.5821, + "step": 10881 + }, + { + "epoch": 0.14485933232220316, + "grad_norm": 0.9825986623764038, + "learning_rate": 3.814569360103385e-05, + "loss": 0.5749, + "step": 10912 + }, + { + "epoch": 0.14527086451630034, + "grad_norm": 0.9303480982780457, + "learning_rate": 3.807418486539499e-05, + "loss": 0.5827, + "step": 10943 + }, + { + "epoch": 0.14568239671039748, + "grad_norm": 0.9948610663414001, + "learning_rate": 3.80025285893569e-05, + "loss": 0.598, + "step": 10974 + }, + { + "epoch": 0.14609392890449466, + "grad_norm": 1.096449851989746, + "learning_rate": 3.793072558155093e-05, + "loss": 0.5749, + "step": 11005 + }, + { + "epoch": 0.14650546109859183, + "grad_norm": 1.022290587425232, + "learning_rate": 3.785877665226426e-05, + "loss": 0.5824, + "step": 11036 + }, + { + "epoch": 0.146916993292689, + "grad_norm": 1.083853840827942, + "learning_rate": 3.778668261343079e-05, + "loss": 0.5923, + "step": 11067 + }, + { + "epoch": 0.14732852548678615, + "grad_norm": 0.919503390789032, + "learning_rate": 3.771444427862192e-05, + "loss": 0.5924, + "step": 11098 + }, + { + "epoch": 0.14774005768088333, + "grad_norm": 1.0682687759399414, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.5824, + "step": 11129 + }, + { + "epoch": 0.1481515898749805, + "grad_norm": 0.9599080085754395, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.5765, + "step": 11160 + }, + { + "epoch": 0.14856312206907768, + "grad_norm": 0.968349039554596, + "learning_rate": 3.749687165842753e-05, + "loss": 0.5932, + "step": 11191 + }, + { + "epoch": 0.14897465426317483, + "grad_norm": 0.9846999049186707, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.5791, + "step": 11222 + }, + { + "epoch": 0.149386186457272, + "grad_norm": 1.0673292875289917, + "learning_rate": 3.735111675341645e-05, + "loss": 0.5803, + "step": 11253 + }, + { + "epoch": 0.14979771865136918, + "grad_norm": 1.0111202001571655, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.5837, + "step": 11284 + }, + { + "epoch": 0.15020925084546635, + "grad_norm": 1.0955816507339478, + "learning_rate": 3.720480432728287e-05, + "loss": 0.5822, + "step": 11315 + }, + { + "epoch": 0.1506207830395635, + "grad_norm": 0.9147946238517761, + "learning_rate": 3.71314411067092e-05, + "loss": 0.5873, + "step": 11346 + }, + { + "epoch": 0.15103231523366067, + "grad_norm": 0.9630762934684753, + "learning_rate": 3.70579409844715e-05, + "loss": 0.5914, + "step": 11377 + }, + { + "epoch": 0.15144384742775785, + "grad_norm": 0.9123234748840332, + "learning_rate": 3.698430479000865e-05, + "loss": 0.5784, + "step": 11408 + }, + { + "epoch": 0.15185537962185502, + "grad_norm": 1.0654667615890503, + "learning_rate": 3.691053335429509e-05, + "loss": 0.5805, + "step": 11439 + }, + { + "epoch": 0.15226691181595217, + "grad_norm": 1.111958622932434, + "learning_rate": 3.683662750983147e-05, + "loss": 0.5868, + "step": 11470 + }, + { + "epoch": 0.15267844401004935, + "grad_norm": 1.024422526359558, + "learning_rate": 3.676258809063518e-05, + "loss": 0.5904, + "step": 11501 + }, + { + "epoch": 0.15308997620414652, + "grad_norm": 1.092668056488037, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.5885, + "step": 11532 + }, + { + "epoch": 0.1535015083982437, + "grad_norm": 0.9739312529563904, + "learning_rate": 3.661411187164166e-05, + "loss": 0.5806, + "step": 11563 + }, + { + "epoch": 0.15391304059234084, + "grad_norm": 1.098699688911438, + "learning_rate": 3.65396767473784e-05, + "loss": 0.5755, + "step": 11594 + }, + { + "epoch": 0.15432457278643802, + "grad_norm": 0.9998940825462341, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.5758, + "step": 11625 + }, + { + "epoch": 0.1547361049805352, + "grad_norm": 0.9077128767967224, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.5742, + "step": 11656 + }, + { + "epoch": 0.15514763717463237, + "grad_norm": 0.9274038076400757, + "learning_rate": 3.63155933997859e-05, + "loss": 0.589, + "step": 11687 + }, + { + "epoch": 0.15555916936872952, + "grad_norm": 0.9404906034469604, + "learning_rate": 3.624064243537758e-05, + "loss": 0.5848, + "step": 11718 + }, + { + "epoch": 0.1559707015628267, + "grad_norm": 0.9896044731140137, + "learning_rate": 3.616556462184716e-05, + "loss": 0.5917, + "step": 11749 + }, + { + "epoch": 0.15638223375692387, + "grad_norm": 1.0280394554138184, + "learning_rate": 3.609036080643755e-05, + "loss": 0.5811, + "step": 11780 + }, + { + "epoch": 0.15679376595102104, + "grad_norm": 0.9462539553642273, + "learning_rate": 3.60150318378136e-05, + "loss": 0.5782, + "step": 11811 + }, + { + "epoch": 0.1572052981451182, + "grad_norm": 1.047635555267334, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.5792, + "step": 11842 + }, + { + "epoch": 0.15761683033921536, + "grad_norm": 0.9613581299781799, + "learning_rate": 3.586400184263408e-05, + "loss": 0.5847, + "step": 11873 + }, + { + "epoch": 0.15802836253331254, + "grad_norm": 0.8944469094276428, + "learning_rate": 3.578830252043148e-05, + "loss": 0.5787, + "step": 11904 + }, + { + "epoch": 0.1584398947274097, + "grad_norm": 1.0031458139419556, + "learning_rate": 3.571248145370125e-05, + "loss": 0.5827, + "step": 11935 + }, + { + "epoch": 0.15885142692150686, + "grad_norm": 0.9425063729286194, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.5899, + "step": 11966 + }, + { + "epoch": 0.15926295911560404, + "grad_norm": 1.0098518133163452, + "learning_rate": 3.556047751054378e-05, + "loss": 0.5849, + "step": 11997 + }, + { + "epoch": 0.1596744913097012, + "grad_norm": 0.8891544342041016, + "learning_rate": 3.548429634946039e-05, + "loss": 0.5772, + "step": 12028 + }, + { + "epoch": 0.16008602350379839, + "grad_norm": 1.1580991744995117, + "learning_rate": 3.540799687451768e-05, + "loss": 0.5859, + "step": 12059 + }, + { + "epoch": 0.16049755569789553, + "grad_norm": 1.046398639678955, + "learning_rate": 3.533157994674485e-05, + "loss": 0.582, + "step": 12090 + }, + { + "epoch": 0.1609090878919927, + "grad_norm": 1.0526766777038574, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.5738, + "step": 12121 + }, + { + "epoch": 0.16132062008608988, + "grad_norm": 0.9915657043457031, + "learning_rate": 3.517839718344311e-05, + "loss": 0.5867, + "step": 12152 + }, + { + "epoch": 0.16173215228018706, + "grad_norm": 0.954210102558136, + "learning_rate": 3.510163307656086e-05, + "loss": 0.578, + "step": 12183 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.006624315386364e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12208/training_args.bin b/checkpoint-12208/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f3d87fcba007ec2448c21323b9fdb9fa6105de5f --- /dev/null +++ b/checkpoint-12208/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf5225534ee746c36dba9f28281e0b858e6e9b4a308bdaf1de4f68e79e9ee83 +size 5304 diff --git a/checkpoint-15260/config.json b/checkpoint-15260/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-15260/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-15260/generation_config.json b/checkpoint-15260/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-15260/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-15260/model-00001-of-00007.safetensors b/checkpoint-15260/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c8b85c29c5127e36e7de9bb2cd1bffaa9da289f6 --- /dev/null +++ b/checkpoint-15260/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1041ad7bc3e2d487dbcb5abb5d2444828ba8b7af4227311ded8ea9c17d5f85e0 +size 4886466168 diff --git a/checkpoint-15260/model-00002-of-00007.safetensors b/checkpoint-15260/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-15260/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-15260/model-00003-of-00007.safetensors b/checkpoint-15260/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-15260/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-15260/model-00004-of-00007.safetensors b/checkpoint-15260/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-15260/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-15260/model-00005-of-00007.safetensors b/checkpoint-15260/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-15260/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-15260/model-00006-of-00007.safetensors b/checkpoint-15260/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f527483cea747ab354097a3c5258cd186a9e9059 --- /dev/null +++ b/checkpoint-15260/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62e6de3c01ef59365296799084663b39e1a90e5d9635d63077f3baf2bdff970c +size 4999813120 diff --git a/checkpoint-15260/model-00007-of-00007.safetensors b/checkpoint-15260/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5364f87fdcec93d6dba34006c064a98108d68e7b --- /dev/null +++ b/checkpoint-15260/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b48b79e714fe0c8cd276f8bce7e9e22aed9445c9d775ae7ba28e76bfd00b4549 +size 2571158184 diff --git a/checkpoint-15260/model.safetensors.index.json b/checkpoint-15260/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-15260/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-15260/optimizer.pt b/checkpoint-15260/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..da04160eb91aa1aa16267a96646e9e65af81657d --- /dev/null +++ b/checkpoint-15260/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b189fc7ec92c48e8b419723e368a977a52262356ce9fad323467adc7754440d6 +size 15385036334 diff --git a/checkpoint-15260/rng_state.pth b/checkpoint-15260/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-15260/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-15260/scheduler.pt b/checkpoint-15260/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..38f65fadce527ecb01fb7d7fd7ef901b9b082813 --- /dev/null +++ b/checkpoint-15260/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ce335347efaebf038b8915bc0a8d2d587a4a1aa08ad0e015b4bc7cc4fba634e +size 1064 diff --git a/checkpoint-15260/trainer_state.json b/checkpoint-15260/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e205ce0fc7c1706bec1e49ae8be47eadbb9e3b09 --- /dev/null +++ b/checkpoint-15260/trainer_state.json @@ -0,0 +1,3477 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2025800413523479, + "eval_steps": 500, + "global_step": 15260, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00041153219409716807, + "grad_norm": 4.147768020629883, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.7978, + "step": 31 + }, + { + "epoch": 0.0008230643881943361, + "grad_norm": 3.117840051651001, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.7723, + "step": 62 + }, + { + "epoch": 0.0012345965822915042, + "grad_norm": 2.83072829246521, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7493, + "step": 93 + }, + { + "epoch": 0.0016461287763886723, + "grad_norm": 2.9491968154907227, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7352, + "step": 124 + }, + { + "epoch": 0.0020576609704858403, + "grad_norm": 2.798553228378296, + "learning_rate": 5.078636959370905e-06, + "loss": 0.7229, + "step": 155 + }, + { + "epoch": 0.0024691931645830084, + "grad_norm": 2.663148880004883, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7253, + "step": 186 + }, + { + "epoch": 0.0028807253586801765, + "grad_norm": 2.564692974090576, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7287, + "step": 217 + }, + { + "epoch": 0.0032922575527773446, + "grad_norm": 3.2043237686157227, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7205, + "step": 248 + }, + { + "epoch": 0.0037037897468745126, + "grad_norm": 2.6705269813537598, + "learning_rate": 9.141546526867629e-06, + "loss": 0.714, + "step": 279 + }, + { + "epoch": 0.004115321940971681, + "grad_norm": 3.432569980621338, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7217, + "step": 310 + }, + { + "epoch": 0.004526854135068849, + "grad_norm": 2.515808343887329, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7173, + "step": 341 + }, + { + "epoch": 0.004938386329166017, + "grad_norm": 2.6708128452301025, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7123, + "step": 372 + }, + { + "epoch": 0.005349918523263185, + "grad_norm": 2.472797155380249, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.7079, + "step": 403 + }, + { + "epoch": 0.005761450717360353, + "grad_norm": 2.486147880554199, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7146, + "step": 434 + }, + { + "epoch": 0.006172982911457521, + "grad_norm": 2.310777187347412, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7, + "step": 465 + }, + { + "epoch": 0.006584515105554689, + "grad_norm": 3.1831858158111572, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7042, + "step": 496 + }, + { + "epoch": 0.006996047299651857, + "grad_norm": 2.505427360534668, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.6991, + "step": 527 + }, + { + "epoch": 0.007407579493749025, + "grad_norm": 2.3627443313598633, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7045, + "step": 558 + }, + { + "epoch": 0.007819111687846193, + "grad_norm": 2.6250648498535156, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.704, + "step": 589 + }, + { + "epoch": 0.008230643881943361, + "grad_norm": 2.3151464462280273, + "learning_rate": 2.031454783748362e-05, + "loss": 0.6959, + "step": 620 + }, + { + "epoch": 0.00864217607604053, + "grad_norm": 2.0312647819519043, + "learning_rate": 2.13302752293578e-05, + "loss": 0.689, + "step": 651 + }, + { + "epoch": 0.009053708270137698, + "grad_norm": 2.718970775604248, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7031, + "step": 682 + }, + { + "epoch": 0.009465240464234866, + "grad_norm": 1.9124070405960083, + "learning_rate": 2.336173001310616e-05, + "loss": 0.6908, + "step": 713 + }, + { + "epoch": 0.009876772658332034, + "grad_norm": 2.368354082107544, + "learning_rate": 2.437745740498034e-05, + "loss": 0.6947, + "step": 744 + }, + { + "epoch": 0.010288304852429202, + "grad_norm": 1.8856632709503174, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.6876, + "step": 775 + }, + { + "epoch": 0.01069983704652637, + "grad_norm": 2.3155925273895264, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7024, + "step": 806 + }, + { + "epoch": 0.011111369240623538, + "grad_norm": 2.2516837120056152, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.695, + "step": 837 + }, + { + "epoch": 0.011522901434720706, + "grad_norm": 2.4444823265075684, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.6892, + "step": 868 + }, + { + "epoch": 0.011934433628817874, + "grad_norm": 2.4233832359313965, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.6871, + "step": 899 + }, + { + "epoch": 0.012345965822915042, + "grad_norm": 2.0262961387634277, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.6888, + "step": 930 + }, + { + "epoch": 0.01275749801701221, + "grad_norm": 2.3908143043518066, + "learning_rate": 3.148754914809961e-05, + "loss": 0.6997, + "step": 961 + }, + { + "epoch": 0.013169030211109378, + "grad_norm": 1.839417576789856, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.6826, + "step": 992 + }, + { + "epoch": 0.013580562405206546, + "grad_norm": 1.8175997734069824, + "learning_rate": 3.351900393184797e-05, + "loss": 0.6857, + "step": 1023 + }, + { + "epoch": 0.013992094599303714, + "grad_norm": 2.3080506324768066, + "learning_rate": 3.453473132372215e-05, + "loss": 0.6896, + "step": 1054 + }, + { + "epoch": 0.014403626793400882, + "grad_norm": 2.0574960708618164, + "learning_rate": 3.555045871559633e-05, + "loss": 0.6862, + "step": 1085 + }, + { + "epoch": 0.01481515898749805, + "grad_norm": 1.957221269607544, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.6796, + "step": 1116 + }, + { + "epoch": 0.015226691181595219, + "grad_norm": 2.2740609645843506, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.7082, + "step": 1147 + }, + { + "epoch": 0.015638223375692387, + "grad_norm": 2.1291615962982178, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.6852, + "step": 1178 + }, + { + "epoch": 0.016049755569789555, + "grad_norm": 2.2918620109558105, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.6821, + "step": 1209 + }, + { + "epoch": 0.016461287763886723, + "grad_norm": 1.9689708948135376, + "learning_rate": 4.062909567496724e-05, + "loss": 0.676, + "step": 1240 + }, + { + "epoch": 0.01687281995798389, + "grad_norm": 2.4405298233032227, + "learning_rate": 4.164482306684142e-05, + "loss": 0.6927, + "step": 1271 + }, + { + "epoch": 0.01728435215208106, + "grad_norm": 2.0007450580596924, + "learning_rate": 4.26605504587156e-05, + "loss": 0.6864, + "step": 1302 + }, + { + "epoch": 0.017695884346178227, + "grad_norm": 1.7013226747512817, + "learning_rate": 4.367627785058978e-05, + "loss": 0.6827, + "step": 1333 + }, + { + "epoch": 0.018107416540275395, + "grad_norm": 2.2202930450439453, + "learning_rate": 4.469200524246396e-05, + "loss": 0.6722, + "step": 1364 + }, + { + "epoch": 0.018518948734372563, + "grad_norm": 1.690030813217163, + "learning_rate": 4.570773263433814e-05, + "loss": 0.6807, + "step": 1395 + }, + { + "epoch": 0.01893048092846973, + "grad_norm": 1.5886859893798828, + "learning_rate": 4.672346002621232e-05, + "loss": 0.6867, + "step": 1426 + }, + { + "epoch": 0.0193420131225669, + "grad_norm": 1.7371172904968262, + "learning_rate": 4.77391874180865e-05, + "loss": 0.6832, + "step": 1457 + }, + { + "epoch": 0.019753545316664067, + "grad_norm": 1.8376264572143555, + "learning_rate": 4.875491480996068e-05, + "loss": 0.681, + "step": 1488 + }, + { + "epoch": 0.020165077510761235, + "grad_norm": 1.6280311346054077, + "learning_rate": 4.977064220183487e-05, + "loss": 0.6795, + "step": 1519 + }, + { + "epoch": 0.020576609704858403, + "grad_norm": 1.8016823530197144, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.6789, + "step": 1550 + }, + { + "epoch": 0.02098814189895557, + "grad_norm": 1.7355237007141113, + "learning_rate": 4.999955597496219e-05, + "loss": 0.6753, + "step": 1581 + }, + { + "epoch": 0.02139967409305274, + "grad_norm": 1.5687408447265625, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.6777, + "step": 1612 + }, + { + "epoch": 0.021811206287149908, + "grad_norm": 1.790453314781189, + "learning_rate": 4.999799067923527e-05, + "loss": 0.684, + "step": 1643 + }, + { + "epoch": 0.022222738481247076, + "grad_norm": 1.9073406457901, + "learning_rate": 4.999678487776908e-05, + "loss": 0.6793, + "step": 1674 + }, + { + "epoch": 0.022634270675344244, + "grad_norm": 1.5575100183486938, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.6794, + "step": 1705 + }, + { + "epoch": 0.023045802869441412, + "grad_norm": 2.100715160369873, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6753, + "step": 1736 + }, + { + "epoch": 0.02345733506353858, + "grad_norm": 1.9425480365753174, + "learning_rate": 4.999147503179668e-05, + "loss": 0.6755, + "step": 1767 + }, + { + "epoch": 0.023868867257635748, + "grad_norm": 1.5791170597076416, + "learning_rate": 4.998914100252672e-05, + "loss": 0.6754, + "step": 1798 + }, + { + "epoch": 0.024280399451732916, + "grad_norm": 1.5692673921585083, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6682, + "step": 1829 + }, + { + "epoch": 0.024691931645830084, + "grad_norm": 1.5757725238800049, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.6723, + "step": 1860 + }, + { + "epoch": 0.025103463839927252, + "grad_norm": 1.8000696897506714, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6647, + "step": 1891 + }, + { + "epoch": 0.02551499603402442, + "grad_norm": 1.682919979095459, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.669, + "step": 1922 + }, + { + "epoch": 0.02592652822812159, + "grad_norm": 1.6138657331466675, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6698, + "step": 1953 + }, + { + "epoch": 0.026338060422218756, + "grad_norm": 1.5016391277313232, + "learning_rate": 4.99692159912661e-05, + "loss": 0.6645, + "step": 1984 + }, + { + "epoch": 0.026749592616315924, + "grad_norm": 1.6850241422653198, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6769, + "step": 2015 + }, + { + "epoch": 0.027161124810413093, + "grad_norm": 1.521161437034607, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6599, + "step": 2046 + }, + { + "epoch": 0.02757265700451026, + "grad_norm": 1.8039467334747314, + "learning_rate": 4.995544899210594e-05, + "loss": 0.6693, + "step": 2077 + }, + { + "epoch": 0.02798418919860743, + "grad_norm": 1.688811182975769, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.661, + "step": 2108 + }, + { + "epoch": 0.028395721392704597, + "grad_norm": 1.669871211051941, + "learning_rate": 4.994486281210429e-05, + "loss": 0.6611, + "step": 2139 + }, + { + "epoch": 0.028807253586801765, + "grad_norm": 1.3887263536453247, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6677, + "step": 2170 + }, + { + "epoch": 0.029218785780898933, + "grad_norm": 1.3242331743240356, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6537, + "step": 2201 + }, + { + "epoch": 0.0296303179749961, + "grad_norm": 1.6638622283935547, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6653, + "step": 2232 + }, + { + "epoch": 0.03004185016909327, + "grad_norm": 1.5110372304916382, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6615, + "step": 2263 + }, + { + "epoch": 0.030453382363190437, + "grad_norm": 1.5244065523147583, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.6552, + "step": 2294 + }, + { + "epoch": 0.030864914557287605, + "grad_norm": 1.416624903678894, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6569, + "step": 2325 + }, + { + "epoch": 0.03127644675138477, + "grad_norm": 1.4467304944992065, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6613, + "step": 2356 + }, + { + "epoch": 0.03168797894548194, + "grad_norm": 1.4649864435195923, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6604, + "step": 2387 + }, + { + "epoch": 0.03209951113957911, + "grad_norm": 1.6256624460220337, + "learning_rate": 4.988329891293693e-05, + "loss": 0.655, + "step": 2418 + }, + { + "epoch": 0.03251104333367628, + "grad_norm": 1.3986462354660034, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6514, + "step": 2449 + }, + { + "epoch": 0.032922575527773446, + "grad_norm": 1.4490609169006348, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.65, + "step": 2480 + }, + { + "epoch": 0.033334107721870614, + "grad_norm": 1.4979615211486816, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.6579, + "step": 2511 + }, + { + "epoch": 0.03374563991596778, + "grad_norm": 1.6314005851745605, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6604, + "step": 2542 + }, + { + "epoch": 0.03415717211006495, + "grad_norm": 1.310655951499939, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6518, + "step": 2573 + }, + { + "epoch": 0.03456870430416212, + "grad_norm": 1.4176305532455444, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6518, + "step": 2604 + }, + { + "epoch": 0.034980236498259286, + "grad_norm": 1.6330211162567139, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6474, + "step": 2635 + }, + { + "epoch": 0.035391768692356454, + "grad_norm": 1.5854465961456299, + "learning_rate": 4.980947995086024e-05, + "loss": 0.6462, + "step": 2666 + }, + { + "epoch": 0.03580330088645362, + "grad_norm": 1.2944835424423218, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6503, + "step": 2697 + }, + { + "epoch": 0.03621483308055079, + "grad_norm": 1.4056243896484375, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6414, + "step": 2728 + }, + { + "epoch": 0.03662636527464796, + "grad_norm": 1.5649598836898804, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6449, + "step": 2759 + }, + { + "epoch": 0.037037897468745126, + "grad_norm": 1.5205659866333008, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6334, + "step": 2790 + }, + { + "epoch": 0.037449429662842294, + "grad_norm": 1.3248411417007446, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6448, + "step": 2821 + }, + { + "epoch": 0.03786096185693946, + "grad_norm": 1.2402842044830322, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6476, + "step": 2852 + }, + { + "epoch": 0.03827249405103663, + "grad_norm": 1.7939503192901611, + "learning_rate": 4.973018858007122e-05, + "loss": 0.6472, + "step": 2883 + }, + { + "epoch": 0.0386840262451338, + "grad_norm": 1.2774665355682373, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6374, + "step": 2914 + }, + { + "epoch": 0.03909555843923097, + "grad_norm": 1.6384273767471313, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6623, + "step": 2945 + }, + { + "epoch": 0.039507090633328135, + "grad_norm": 1.559270977973938, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6391, + "step": 2976 + }, + { + "epoch": 0.0399186228274253, + "grad_norm": 1.6405760049819946, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.6437, + "step": 3007 + }, + { + "epoch": 0.04033015502152247, + "grad_norm": 1.3599917888641357, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6436, + "step": 3038 + }, + { + "epoch": 0.04074168721561964, + "grad_norm": 1.3080962896347046, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6331, + "step": 3069 + }, + { + "epoch": 0.04115321940971681, + "grad_norm": 1.435195803642273, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6417, + "step": 3100 + }, + { + "epoch": 0.041564751603813975, + "grad_norm": 1.342986822128296, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6391, + "step": 3131 + }, + { + "epoch": 0.04197628379791114, + "grad_norm": 1.524418592453003, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6511, + "step": 3162 + }, + { + "epoch": 0.04238781599200831, + "grad_norm": 1.4617900848388672, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6367, + "step": 3193 + }, + { + "epoch": 0.04279934818610548, + "grad_norm": 1.3968737125396729, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.6387, + "step": 3224 + }, + { + "epoch": 0.04321088038020265, + "grad_norm": 1.258406400680542, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6356, + "step": 3255 + }, + { + "epoch": 0.043622412574299815, + "grad_norm": 1.5494844913482666, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6433, + "step": 3286 + }, + { + "epoch": 0.04403394476839698, + "grad_norm": 1.2248284816741943, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6376, + "step": 3317 + }, + { + "epoch": 0.04444547696249415, + "grad_norm": 1.4243425130844116, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6341, + "step": 3348 + }, + { + "epoch": 0.04485700915659132, + "grad_norm": 1.379364252090454, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6378, + "step": 3379 + }, + { + "epoch": 0.04526854135068849, + "grad_norm": 1.3460108041763306, + "learning_rate": 4.948079823064559e-05, + "loss": 0.639, + "step": 3410 + }, + { + "epoch": 0.045680073544785656, + "grad_norm": 1.382348656654358, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6327, + "step": 3441 + }, + { + "epoch": 0.046091605738882824, + "grad_norm": 1.40754234790802, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6234, + "step": 3472 + }, + { + "epoch": 0.04650313793297999, + "grad_norm": 1.502922773361206, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6404, + "step": 3503 + }, + { + "epoch": 0.04691467012707716, + "grad_norm": 1.2896559238433838, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6378, + "step": 3534 + }, + { + "epoch": 0.04732620232117433, + "grad_norm": 1.3114830255508423, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6373, + "step": 3565 + }, + { + "epoch": 0.047737734515271496, + "grad_norm": 1.4122483730316162, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6305, + "step": 3596 + }, + { + "epoch": 0.048149266709368664, + "grad_norm": 1.2669744491577148, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6413, + "step": 3627 + }, + { + "epoch": 0.04856079890346583, + "grad_norm": 1.2965583801269531, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6364, + "step": 3658 + }, + { + "epoch": 0.048972331097563, + "grad_norm": 1.3328967094421387, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6466, + "step": 3689 + }, + { + "epoch": 0.04938386329166017, + "grad_norm": 1.6291115283966064, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6348, + "step": 3720 + }, + { + "epoch": 0.049795395485757336, + "grad_norm": 1.3224235773086548, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6256, + "step": 3751 + }, + { + "epoch": 0.050206927679854504, + "grad_norm": 1.3253369331359863, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6303, + "step": 3782 + }, + { + "epoch": 0.05061845987395167, + "grad_norm": 1.2806897163391113, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.6403, + "step": 3813 + }, + { + "epoch": 0.05102999206804884, + "grad_norm": 1.3389384746551514, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6288, + "step": 3844 + }, + { + "epoch": 0.05144152426214601, + "grad_norm": 1.251550555229187, + "learning_rate": 4.919443027766935e-05, + "loss": 0.637, + "step": 3875 + }, + { + "epoch": 0.05185305645624318, + "grad_norm": 1.258484959602356, + "learning_rate": 4.91731463569418e-05, + "loss": 0.629, + "step": 3906 + }, + { + "epoch": 0.052264588650340345, + "grad_norm": 1.3574656248092651, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6262, + "step": 3937 + }, + { + "epoch": 0.05267612084443751, + "grad_norm": 1.327051043510437, + "learning_rate": 4.912976038673786e-05, + "loss": 0.629, + "step": 3968 + }, + { + "epoch": 0.05308765303853468, + "grad_norm": 1.3813196420669556, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6308, + "step": 3999 + }, + { + "epoch": 0.05349918523263185, + "grad_norm": 1.2064168453216553, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6321, + "step": 4030 + }, + { + "epoch": 0.05391071742672902, + "grad_norm": 1.2771285772323608, + "learning_rate": 4.906263980464644e-05, + "loss": 0.622, + "step": 4061 + }, + { + "epoch": 0.054322249620826185, + "grad_norm": 1.1788181066513062, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6268, + "step": 4092 + }, + { + "epoch": 0.05473378181492335, + "grad_norm": 1.3975950479507446, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6251, + "step": 4123 + }, + { + "epoch": 0.05514531400902052, + "grad_norm": 1.2083688974380493, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6287, + "step": 4154 + }, + { + "epoch": 0.05555684620311769, + "grad_norm": 1.2795994281768799, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6302, + "step": 4185 + }, + { + "epoch": 0.05596837839721486, + "grad_norm": 1.1056169271469116, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6256, + "step": 4216 + }, + { + "epoch": 0.056379910591312025, + "grad_norm": 1.415405511856079, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6334, + "step": 4247 + }, + { + "epoch": 0.056791442785409194, + "grad_norm": 1.2858442068099976, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6248, + "step": 4278 + }, + { + "epoch": 0.05720297497950636, + "grad_norm": 1.3914793729782104, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6229, + "step": 4309 + }, + { + "epoch": 0.05761450717360353, + "grad_norm": 1.1236270666122437, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6272, + "step": 4340 + }, + { + "epoch": 0.0580260393677007, + "grad_norm": 1.393020510673523, + "learning_rate": 4.882129447892753e-05, + "loss": 0.624, + "step": 4371 + }, + { + "epoch": 0.058437571561797866, + "grad_norm": 1.3223553895950317, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6267, + "step": 4402 + }, + { + "epoch": 0.058849103755895034, + "grad_norm": 1.2308725118637085, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6255, + "step": 4433 + }, + { + "epoch": 0.0592606359499922, + "grad_norm": 1.3741412162780762, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6246, + "step": 4464 + }, + { + "epoch": 0.05967216814408937, + "grad_norm": 1.2487592697143555, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6221, + "step": 4495 + }, + { + "epoch": 0.06008370033818654, + "grad_norm": 1.3239238262176514, + "learning_rate": 4.869052379269719e-05, + "loss": 0.621, + "step": 4526 + }, + { + "epoch": 0.060495232532283706, + "grad_norm": 1.1411608457565308, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6211, + "step": 4557 + }, + { + "epoch": 0.060906764726380874, + "grad_norm": 1.1687365770339966, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6159, + "step": 4588 + }, + { + "epoch": 0.06131829692047804, + "grad_norm": 1.25679349899292, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6254, + "step": 4619 + }, + { + "epoch": 0.06172982911457521, + "grad_norm": 1.2380110025405884, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6305, + "step": 4650 + }, + { + "epoch": 0.06214136130867238, + "grad_norm": 1.3054466247558594, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6202, + "step": 4681 + }, + { + "epoch": 0.06255289350276955, + "grad_norm": 1.3628545999526978, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6192, + "step": 4712 + }, + { + "epoch": 0.06296442569686671, + "grad_norm": 1.3576844930648804, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6213, + "step": 4743 + }, + { + "epoch": 0.06337595789096388, + "grad_norm": 1.3126753568649292, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6198, + "step": 4774 + }, + { + "epoch": 0.06378749008506104, + "grad_norm": 1.2266637086868286, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6229, + "step": 4805 + }, + { + "epoch": 0.06419902227915822, + "grad_norm": 1.4964330196380615, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6264, + "step": 4836 + }, + { + "epoch": 0.06461055447325538, + "grad_norm": 1.1138426065444946, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6258, + "step": 4867 + }, + { + "epoch": 0.06502208666735255, + "grad_norm": 1.2055357694625854, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6263, + "step": 4898 + }, + { + "epoch": 0.06543361886144972, + "grad_norm": 1.2684381008148193, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6212, + "step": 4929 + }, + { + "epoch": 0.06584515105554689, + "grad_norm": 1.2323859930038452, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6253, + "step": 4960 + }, + { + "epoch": 0.06625668324964405, + "grad_norm": 1.091818928718567, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6127, + "step": 4991 + }, + { + "epoch": 0.06666821544374123, + "grad_norm": 1.2208534479141235, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6179, + "step": 5022 + }, + { + "epoch": 0.06707974763783839, + "grad_norm": 1.3426082134246826, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6246, + "step": 5053 + }, + { + "epoch": 0.06749127983193556, + "grad_norm": 1.2961252927780151, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.6149, + "step": 5084 + }, + { + "epoch": 0.06790281202603272, + "grad_norm": 1.2990977764129639, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6303, + "step": 5115 + }, + { + "epoch": 0.0683143442201299, + "grad_norm": 1.2724366188049316, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6234, + "step": 5146 + }, + { + "epoch": 0.06872587641422706, + "grad_norm": 1.2031605243682861, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6212, + "step": 5177 + }, + { + "epoch": 0.06913740860832424, + "grad_norm": 1.2580938339233398, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6144, + "step": 5208 + }, + { + "epoch": 0.0695489408024214, + "grad_norm": 1.203706979751587, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6259, + "step": 5239 + }, + { + "epoch": 0.06996047299651857, + "grad_norm": 1.421762466430664, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6221, + "step": 5270 + }, + { + "epoch": 0.07037200519061573, + "grad_norm": 1.326038122177124, + "learning_rate": 4.793722210363262e-05, + "loss": 0.6152, + "step": 5301 + }, + { + "epoch": 0.07078353738471291, + "grad_norm": 1.361939787864685, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6206, + "step": 5332 + }, + { + "epoch": 0.07119506957881007, + "grad_norm": 1.2153042554855347, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6209, + "step": 5363 + }, + { + "epoch": 0.07160660177290724, + "grad_norm": 1.3737229108810425, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6121, + "step": 5394 + }, + { + "epoch": 0.0720181339670044, + "grad_norm": 1.1781859397888184, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6183, + "step": 5425 + }, + { + "epoch": 0.07242966616110158, + "grad_norm": 1.069472312927246, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.6117, + "step": 5456 + }, + { + "epoch": 0.07284119835519874, + "grad_norm": 1.380937099456787, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6175, + "step": 5487 + }, + { + "epoch": 0.07325273054929592, + "grad_norm": 1.1551166772842407, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6164, + "step": 5518 + }, + { + "epoch": 0.07366426274339308, + "grad_norm": 1.2816351652145386, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6168, + "step": 5549 + }, + { + "epoch": 0.07407579493749025, + "grad_norm": 1.0833333730697632, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6159, + "step": 5580 + }, + { + "epoch": 0.07448732713158741, + "grad_norm": 1.2533819675445557, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6141, + "step": 5611 + }, + { + "epoch": 0.07489885932568459, + "grad_norm": 1.1354937553405762, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.612, + "step": 5642 + }, + { + "epoch": 0.07531039151978175, + "grad_norm": 1.2579649686813354, + "learning_rate": 4.751783684659e-05, + "loss": 0.6147, + "step": 5673 + }, + { + "epoch": 0.07572192371387892, + "grad_norm": 1.2830709218978882, + "learning_rate": 4.748122674890348e-05, + "loss": 0.62, + "step": 5704 + }, + { + "epoch": 0.07613345590797609, + "grad_norm": 1.2012654542922974, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6199, + "step": 5735 + }, + { + "epoch": 0.07654498810207326, + "grad_norm": 1.2926653623580933, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6104, + "step": 5766 + }, + { + "epoch": 0.07695652029617042, + "grad_norm": 1.2043049335479736, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6261, + "step": 5797 + }, + { + "epoch": 0.0773680524902676, + "grad_norm": 1.1188119649887085, + "learning_rate": 4.733225355658999e-05, + "loss": 0.6056, + "step": 5828 + }, + { + "epoch": 0.07777958468436476, + "grad_norm": 1.1217443943023682, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.6124, + "step": 5859 + }, + { + "epoch": 0.07819111687846193, + "grad_norm": 1.1623468399047852, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6191, + "step": 5890 + }, + { + "epoch": 0.0786026490725591, + "grad_norm": 1.1147449016571045, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6091, + "step": 5921 + }, + { + "epoch": 0.07901418126665627, + "grad_norm": 1.0523818731307983, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6151, + "step": 5952 + }, + { + "epoch": 0.07942571346075343, + "grad_norm": 1.3112642765045166, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6106, + "step": 5983 + }, + { + "epoch": 0.0798372456548506, + "grad_norm": 1.1637545824050903, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6131, + "step": 6014 + }, + { + "epoch": 0.08024877784894777, + "grad_norm": 1.258862018585205, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6119, + "step": 6045 + }, + { + "epoch": 0.08066031004304494, + "grad_norm": 1.0981336832046509, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6149, + "step": 6076 + }, + { + "epoch": 0.0810718422371421, + "grad_norm": 1.1576476097106934, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6076, + "step": 6107 + }, + { + "epoch": 0.08148337443123928, + "grad_norm": 1.1076856851577759, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6105, + "step": 6138 + }, + { + "epoch": 0.08189490662533644, + "grad_norm": 14.014957427978516, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6189, + "step": 6169 + }, + { + "epoch": 0.08230643881943361, + "grad_norm": 1.1413066387176514, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6038, + "step": 6200 + }, + { + "epoch": 0.08271797101353077, + "grad_norm": 1.2316724061965942, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6125, + "step": 6231 + }, + { + "epoch": 0.08312950320762795, + "grad_norm": 1.0919370651245117, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6081, + "step": 6262 + }, + { + "epoch": 0.08354103540172511, + "grad_norm": 1.139345407485962, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6078, + "step": 6293 + }, + { + "epoch": 0.08395256759582229, + "grad_norm": 1.1303651332855225, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6071, + "step": 6324 + }, + { + "epoch": 0.08436409978991945, + "grad_norm": 1.2546321153640747, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.6092, + "step": 6355 + }, + { + "epoch": 0.08477563198401662, + "grad_norm": 1.195173740386963, + "learning_rate": 4.661238099862658e-05, + "loss": 0.5998, + "step": 6386 + }, + { + "epoch": 0.08518716417811378, + "grad_norm": 1.2116323709487915, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6099, + "step": 6417 + }, + { + "epoch": 0.08559869637221096, + "grad_norm": 1.131951928138733, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.6105, + "step": 6448 + }, + { + "epoch": 0.08601022856630812, + "grad_norm": 1.2176560163497925, + "learning_rate": 4.648464661063478e-05, + "loss": 0.6045, + "step": 6479 + }, + { + "epoch": 0.0864217607604053, + "grad_norm": 1.15361750125885, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.6067, + "step": 6510 + }, + { + "epoch": 0.08683329295450246, + "grad_norm": 1.1422772407531738, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.605, + "step": 6541 + }, + { + "epoch": 0.08724482514859963, + "grad_norm": 1.250319004058838, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6069, + "step": 6572 + }, + { + "epoch": 0.08765635734269679, + "grad_norm": 1.0947929620742798, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6099, + "step": 6603 + }, + { + "epoch": 0.08806788953679397, + "grad_norm": 1.2815848588943481, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6132, + "step": 6634 + }, + { + "epoch": 0.08847942173089113, + "grad_norm": 1.196083664894104, + "learning_rate": 4.622264489304762e-05, + "loss": 0.604, + "step": 6665 + }, + { + "epoch": 0.0888909539249883, + "grad_norm": 1.1473642587661743, + "learning_rate": 4.617813681048434e-05, + "loss": 0.6081, + "step": 6696 + }, + { + "epoch": 0.08930248611908546, + "grad_norm": 1.182445764541626, + "learning_rate": 4.61333897355256e-05, + "loss": 0.6108, + "step": 6727 + }, + { + "epoch": 0.08971401831318264, + "grad_norm": 1.2859700918197632, + "learning_rate": 4.608840417313604e-05, + "loss": 0.6074, + "step": 6758 + }, + { + "epoch": 0.0901255505072798, + "grad_norm": 1.1134952306747437, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6139, + "step": 6789 + }, + { + "epoch": 0.09053708270137698, + "grad_norm": 1.2447940111160278, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6049, + "step": 6820 + }, + { + "epoch": 0.09094861489547414, + "grad_norm": 1.042465329170227, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6113, + "step": 6851 + }, + { + "epoch": 0.09136014708957131, + "grad_norm": 1.2242035865783691, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6077, + "step": 6882 + }, + { + "epoch": 0.09177167928366847, + "grad_norm": 1.2531142234802246, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6081, + "step": 6913 + }, + { + "epoch": 0.09218321147776565, + "grad_norm": 1.194645881652832, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6158, + "step": 6944 + }, + { + "epoch": 0.09259474367186281, + "grad_norm": 1.1052149534225464, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6079, + "step": 6975 + }, + { + "epoch": 0.09300627586595998, + "grad_norm": 1.1275289058685303, + "learning_rate": 4.571999560773736e-05, + "loss": 0.6107, + "step": 7006 + }, + { + "epoch": 0.09341780806005714, + "grad_norm": 1.251237154006958, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6028, + "step": 7037 + }, + { + "epoch": 0.09382934025415432, + "grad_norm": 1.1206951141357422, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6035, + "step": 7068 + }, + { + "epoch": 0.09424087244825148, + "grad_norm": 1.2242387533187866, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.5959, + "step": 7099 + }, + { + "epoch": 0.09465240464234866, + "grad_norm": 1.0749527215957642, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6005, + "step": 7130 + }, + { + "epoch": 0.09506393683644582, + "grad_norm": 1.183052659034729, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6102, + "step": 7161 + }, + { + "epoch": 0.09547546903054299, + "grad_norm": 1.1045013666152954, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6056, + "step": 7192 + }, + { + "epoch": 0.09588700122464015, + "grad_norm": 1.2442799806594849, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6024, + "step": 7223 + }, + { + "epoch": 0.09629853341873733, + "grad_norm": 1.2083353996276855, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6062, + "step": 7254 + }, + { + "epoch": 0.09671006561283449, + "grad_norm": 1.1391205787658691, + "learning_rate": 4.528766329570536e-05, + "loss": 0.5981, + "step": 7285 + }, + { + "epoch": 0.09712159780693166, + "grad_norm": 1.1213765144348145, + "learning_rate": 4.523847434837447e-05, + "loss": 0.5964, + "step": 7316 + }, + { + "epoch": 0.09753313000102883, + "grad_norm": 1.1574114561080933, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6035, + "step": 7347 + }, + { + "epoch": 0.097944662195126, + "grad_norm": 1.0101516246795654, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6036, + "step": 7378 + }, + { + "epoch": 0.09835619438922316, + "grad_norm": 1.1439381837844849, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6049, + "step": 7409 + }, + { + "epoch": 0.09876772658332034, + "grad_norm": 1.0668189525604248, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6035, + "step": 7440 + }, + { + "epoch": 0.0991792587774175, + "grad_norm": 1.2038415670394897, + "learning_rate": 4.498911497712155e-05, + "loss": 0.6062, + "step": 7471 + }, + { + "epoch": 0.09959079097151467, + "grad_norm": 1.1965882778167725, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6041, + "step": 7502 + }, + { + "epoch": 0.10000232316561183, + "grad_norm": 1.5885149240493774, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6129, + "step": 7533 + }, + { + "epoch": 0.10041385535970901, + "grad_norm": 1.058237910270691, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.5957, + "step": 7564 + }, + { + "epoch": 0.10082538755380617, + "grad_norm": 1.2043278217315674, + "learning_rate": 4.478556380141218e-05, + "loss": 0.5943, + "step": 7595 + }, + { + "epoch": 0.10123691974790335, + "grad_norm": 1.1602888107299805, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.5971, + "step": 7626 + }, + { + "epoch": 0.1016484519420005, + "grad_norm": 1.1536785364151, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.604, + "step": 7657 + }, + { + "epoch": 0.10205998413609768, + "grad_norm": 1.1859934329986572, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6043, + "step": 7688 + }, + { + "epoch": 0.10247151633019484, + "grad_norm": 1.0551954507827759, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6008, + "step": 7719 + }, + { + "epoch": 0.10288304852429202, + "grad_norm": 1.015857458114624, + "learning_rate": 4.452610552959183e-05, + "loss": 0.5961, + "step": 7750 + }, + { + "epoch": 0.10329458071838918, + "grad_norm": 1.0759170055389404, + "learning_rate": 4.447355047201428e-05, + "loss": 0.6025, + "step": 7781 + }, + { + "epoch": 0.10370611291248635, + "grad_norm": 1.025030255317688, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.602, + "step": 7812 + }, + { + "epoch": 0.10411764510658351, + "grad_norm": 1.0884742736816406, + "learning_rate": 4.436778168330484e-05, + "loss": 0.5989, + "step": 7843 + }, + { + "epoch": 0.10452917730068069, + "grad_norm": 1.1207304000854492, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.6016, + "step": 7874 + }, + { + "epoch": 0.10494070949477785, + "grad_norm": 0.9768222570419312, + "learning_rate": 4.42611386459262e-05, + "loss": 0.605, + "step": 7905 + }, + { + "epoch": 0.10535224168887503, + "grad_norm": 1.0720235109329224, + "learning_rate": 4.420749078676133e-05, + "loss": 0.5971, + "step": 7936 + }, + { + "epoch": 0.10576377388297219, + "grad_norm": 1.1767033338546753, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.5877, + "step": 7967 + }, + { + "epoch": 0.10617530607706936, + "grad_norm": 1.3390216827392578, + "learning_rate": 4.409954541451762e-05, + "loss": 0.5973, + "step": 7998 + }, + { + "epoch": 0.10658683827116652, + "grad_norm": 1.7948216199874878, + "learning_rate": 4.404524911958764e-05, + "loss": 0.5922, + "step": 8029 + }, + { + "epoch": 0.1069983704652637, + "grad_norm": 1.177847981452942, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6002, + "step": 8060 + }, + { + "epoch": 0.10740990265936086, + "grad_norm": 1.1014128923416138, + "learning_rate": 4.393601237573607e-05, + "loss": 0.5945, + "step": 8091 + }, + { + "epoch": 0.10782143485345803, + "grad_norm": 1.067475438117981, + "learning_rate": 4.388107315953628e-05, + "loss": 0.5996, + "step": 8122 + }, + { + "epoch": 0.1082329670475552, + "grad_norm": 1.1241830587387085, + "learning_rate": 4.382592087299212e-05, + "loss": 0.5973, + "step": 8153 + }, + { + "epoch": 0.10864449924165237, + "grad_norm": 1.072426438331604, + "learning_rate": 4.377055613848964e-05, + "loss": 0.601, + "step": 8184 + }, + { + "epoch": 0.10905603143574953, + "grad_norm": 1.0138245820999146, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6034, + "step": 8215 + }, + { + "epoch": 0.1094675636298467, + "grad_norm": 0.9993209838867188, + "learning_rate": 4.365919182713416e-05, + "loss": 0.6024, + "step": 8246 + }, + { + "epoch": 0.10987909582394387, + "grad_norm": 1.157468318939209, + "learning_rate": 4.360319350701226e-05, + "loss": 0.5987, + "step": 8277 + }, + { + "epoch": 0.11029062801804104, + "grad_norm": 1.2073496580123901, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6023, + "step": 8308 + }, + { + "epoch": 0.1107021602121382, + "grad_norm": 1.0800622701644897, + "learning_rate": 4.349056769754021e-05, + "loss": 0.5993, + "step": 8339 + }, + { + "epoch": 0.11111369240623538, + "grad_norm": 1.1365702152252197, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.595, + "step": 8370 + }, + { + "epoch": 0.11152522460033254, + "grad_norm": 1.1367030143737793, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.5921, + "step": 8401 + }, + { + "epoch": 0.11193675679442971, + "grad_norm": 1.0323926210403442, + "learning_rate": 4.332006561018488e-05, + "loss": 0.5883, + "step": 8432 + }, + { + "epoch": 0.11234828898852688, + "grad_norm": 1.2352523803710938, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.5957, + "step": 8463 + }, + { + "epoch": 0.11275982118262405, + "grad_norm": 1.1763888597488403, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.5941, + "step": 8494 + }, + { + "epoch": 0.11317135337672121, + "grad_norm": 1.1331796646118164, + "learning_rate": 4.314770288177384e-05, + "loss": 0.5993, + "step": 8525 + }, + { + "epoch": 0.11358288557081839, + "grad_norm": 1.146270990371704, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6041, + "step": 8556 + }, + { + "epoch": 0.11399441776491555, + "grad_norm": 1.0433791875839233, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.5918, + "step": 8587 + }, + { + "epoch": 0.11440594995901272, + "grad_norm": 1.2013510465621948, + "learning_rate": 4.297349701798505e-05, + "loss": 0.6031, + "step": 8618 + }, + { + "epoch": 0.11481748215310988, + "grad_norm": 1.1263506412506104, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.597, + "step": 8649 + }, + { + "epoch": 0.11522901434720706, + "grad_norm": 1.1360443830490112, + "learning_rate": 4.285634454093198e-05, + "loss": 0.5909, + "step": 8680 + }, + { + "epoch": 0.11564054654130422, + "grad_norm": 1.0095175504684448, + "learning_rate": 4.279746571169086e-05, + "loss": 0.59, + "step": 8711 + }, + { + "epoch": 0.1160520787354014, + "grad_norm": 1.0894793272018433, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.5894, + "step": 8742 + }, + { + "epoch": 0.11646361092949856, + "grad_norm": 1.1603643894195557, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.5924, + "step": 8773 + }, + { + "epoch": 0.11687514312359573, + "grad_norm": 1.177464485168457, + "learning_rate": 4.261962684116106e-05, + "loss": 0.5946, + "step": 8804 + }, + { + "epoch": 0.11728667531769289, + "grad_norm": 0.9632905125617981, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.5944, + "step": 8835 + }, + { + "epoch": 0.11769820751179007, + "grad_norm": 1.0413568019866943, + "learning_rate": 4.250007230372134e-05, + "loss": 0.5844, + "step": 8866 + }, + { + "epoch": 0.11810973970588723, + "grad_norm": 1.0879931449890137, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.588, + "step": 8897 + }, + { + "epoch": 0.1185212718999844, + "grad_norm": 1.0681732892990112, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.5832, + "step": 8928 + }, + { + "epoch": 0.11893280409408157, + "grad_norm": 1.0609339475631714, + "learning_rate": 4.231926105340768e-05, + "loss": 0.5886, + "step": 8959 + }, + { + "epoch": 0.11934433628817874, + "grad_norm": 1.4936331510543823, + "learning_rate": 4.225859883654776e-05, + "loss": 0.5868, + "step": 8990 + }, + { + "epoch": 0.1197558684822759, + "grad_norm": 1.1860368251800537, + "learning_rate": 4.219774185874569e-05, + "loss": 0.5904, + "step": 9021 + }, + { + "epoch": 0.12016740067637308, + "grad_norm": 1.1657029390335083, + "learning_rate": 4.213669080676418e-05, + "loss": 0.5872, + "step": 9052 + }, + { + "epoch": 0.12057893287047024, + "grad_norm": 1.0721909999847412, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.5977, + "step": 9083 + }, + { + "epoch": 0.12099046506456741, + "grad_norm": 1.1832354068756104, + "learning_rate": 4.201400923825648e-05, + "loss": 0.5902, + "step": 9114 + }, + { + "epoch": 0.12140199725866457, + "grad_norm": 1.1306614875793457, + "learning_rate": 4.195238010617511e-05, + "loss": 0.5931, + "step": 9145 + }, + { + "epoch": 0.12181352945276175, + "grad_norm": 0.9744802713394165, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6, + "step": 9176 + }, + { + "epoch": 0.12222506164685891, + "grad_norm": 1.0286784172058105, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.5913, + "step": 9207 + }, + { + "epoch": 0.12263659384095608, + "grad_norm": 1.0559006929397583, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.5999, + "step": 9238 + }, + { + "epoch": 0.12304812603505325, + "grad_norm": 1.0805801153182983, + "learning_rate": 4.170395751189495e-05, + "loss": 0.5929, + "step": 9269 + }, + { + "epoch": 0.12345965822915042, + "grad_norm": 1.088597059249878, + "learning_rate": 4.164137885110921e-05, + "loss": 0.5873, + "step": 9300 + }, + { + "epoch": 0.12387119042324758, + "grad_norm": 1.028403639793396, + "learning_rate": 4.157861239462495e-05, + "loss": 0.5939, + "step": 9331 + }, + { + "epoch": 0.12428272261734476, + "grad_norm": 0.9916087985038757, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.5932, + "step": 9362 + }, + { + "epoch": 0.12469425481144192, + "grad_norm": 1.0268062353134155, + "learning_rate": 4.145251892991588e-05, + "loss": 0.5945, + "step": 9393 + }, + { + "epoch": 0.1251057870055391, + "grad_norm": 1.046953797340393, + "learning_rate": 4.138919334463868e-05, + "loss": 0.5923, + "step": 9424 + }, + { + "epoch": 0.12551731919963627, + "grad_norm": 1.1010463237762451, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.5898, + "step": 9455 + }, + { + "epoch": 0.12592885139373342, + "grad_norm": 1.1851146221160889, + "learning_rate": 4.126198804133398e-05, + "loss": 0.591, + "step": 9486 + }, + { + "epoch": 0.1263403835878306, + "grad_norm": 1.106885313987732, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.5893, + "step": 9517 + }, + { + "epoch": 0.12675191578192777, + "grad_norm": 1.0527287721633911, + "learning_rate": 4.113404868280107e-05, + "loss": 0.5937, + "step": 9548 + }, + { + "epoch": 0.12716344797602494, + "grad_norm": 0.9986059069633484, + "learning_rate": 4.106980553625457e-05, + "loss": 0.5961, + "step": 9579 + }, + { + "epoch": 0.1275749801701221, + "grad_norm": 1.025026798248291, + "learning_rate": 4.100538104413674e-05, + "loss": 0.5853, + "step": 9610 + }, + { + "epoch": 0.12798651236421926, + "grad_norm": 1.062345027923584, + "learning_rate": 4.09407759334692e-05, + "loss": 0.5916, + "step": 9641 + }, + { + "epoch": 0.12839804455831644, + "grad_norm": 1.0635534524917603, + "learning_rate": 4.087599093331186e-05, + "loss": 0.5919, + "step": 9672 + }, + { + "epoch": 0.1288095767524136, + "grad_norm": 1.1224849224090576, + "learning_rate": 4.081102677475462e-05, + "loss": 0.5948, + "step": 9703 + }, + { + "epoch": 0.12922110894651076, + "grad_norm": 1.113202452659607, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.5807, + "step": 9734 + }, + { + "epoch": 0.12963264114060793, + "grad_norm": 1.0213284492492676, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.5813, + "step": 9765 + }, + { + "epoch": 0.1300441733347051, + "grad_norm": 1.1083142757415771, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.5856, + "step": 9796 + }, + { + "epoch": 0.13045570552880228, + "grad_norm": 1.1317702531814575, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.586, + "step": 9827 + }, + { + "epoch": 0.13086723772289943, + "grad_norm": 0.9876610040664673, + "learning_rate": 4.048354433517794e-05, + "loss": 0.5921, + "step": 9858 + }, + { + "epoch": 0.1312787699169966, + "grad_norm": 1.085672378540039, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.5791, + "step": 9889 + }, + { + "epoch": 0.13169030211109378, + "grad_norm": 1.014452338218689, + "learning_rate": 4.035132306369438e-05, + "loss": 0.5921, + "step": 9920 + }, + { + "epoch": 0.13210183430519096, + "grad_norm": 1.059309720993042, + "learning_rate": 4.028495219804555e-05, + "loss": 0.5834, + "step": 9951 + }, + { + "epoch": 0.1325133664992881, + "grad_norm": 1.1058080196380615, + "learning_rate": 4.021840884378864e-05, + "loss": 0.5826, + "step": 9982 + }, + { + "epoch": 0.13292489869338528, + "grad_norm": 1.0567057132720947, + "learning_rate": 4.015169375185633e-05, + "loss": 0.5924, + "step": 10013 + }, + { + "epoch": 0.13333643088748245, + "grad_norm": 1.0371674299240112, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.589, + "step": 10044 + }, + { + "epoch": 0.13374796308157963, + "grad_norm": 0.909173309803009, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.5978, + "step": 10075 + }, + { + "epoch": 0.13415949527567678, + "grad_norm": 1.0078933238983154, + "learning_rate": 3.995052558835377e-05, + "loss": 0.5814, + "step": 10106 + }, + { + "epoch": 0.13457102746977395, + "grad_norm": 0.9631708264350891, + "learning_rate": 3.988313109368017e-05, + "loss": 0.587, + "step": 10137 + }, + { + "epoch": 0.13498255966387113, + "grad_norm": 1.1884409189224243, + "learning_rate": 3.981556864489504e-05, + "loss": 0.5906, + "step": 10168 + }, + { + "epoch": 0.1353940918579683, + "grad_norm": 0.9554224014282227, + "learning_rate": 3.974783900443142e-05, + "loss": 0.5888, + "step": 10199 + }, + { + "epoch": 0.13580562405206545, + "grad_norm": 1.0510096549987793, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.5828, + "step": 10230 + }, + { + "epoch": 0.13621715624616262, + "grad_norm": 0.9769448637962341, + "learning_rate": 3.961188120762596e-05, + "loss": 0.5878, + "step": 10261 + }, + { + "epoch": 0.1366286884402598, + "grad_norm": 0.9610607028007507, + "learning_rate": 3.954365458554938e-05, + "loss": 0.5863, + "step": 10292 + }, + { + "epoch": 0.13704022063435697, + "grad_norm": 1.0102349519729614, + "learning_rate": 3.947526384030751e-05, + "loss": 0.5812, + "step": 10323 + }, + { + "epoch": 0.13745175282845412, + "grad_norm": 1.138122320175171, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.5841, + "step": 10354 + }, + { + "epoch": 0.1378632850225513, + "grad_norm": 1.0107802152633667, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.5847, + "step": 10385 + }, + { + "epoch": 0.13827481721664847, + "grad_norm": 0.9923847317695618, + "learning_rate": 3.926911459260109e-05, + "loss": 0.5971, + "step": 10416 + }, + { + "epoch": 0.13868634941074565, + "grad_norm": 1.0700715780258179, + "learning_rate": 3.920007509089102e-05, + "loss": 0.5831, + "step": 10447 + }, + { + "epoch": 0.1390978816048428, + "grad_norm": 1.1638612747192383, + "learning_rate": 3.913087534326357e-05, + "loss": 0.5849, + "step": 10478 + }, + { + "epoch": 0.13950941379893997, + "grad_norm": 1.0560393333435059, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.5897, + "step": 10509 + }, + { + "epoch": 0.13992094599303714, + "grad_norm": 0.9604248404502869, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.5862, + "step": 10540 + }, + { + "epoch": 0.14033247818713432, + "grad_norm": 1.0003983974456787, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.5905, + "step": 10571 + }, + { + "epoch": 0.14074401038123147, + "grad_norm": 1.0378546714782715, + "learning_rate": 3.885248953871491e-05, + "loss": 0.5938, + "step": 10602 + }, + { + "epoch": 0.14115554257532864, + "grad_norm": 0.9967820644378662, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.5821, + "step": 10633 + }, + { + "epoch": 0.14156707476942582, + "grad_norm": 1.029188632965088, + "learning_rate": 3.871235554965218e-05, + "loss": 0.5882, + "step": 10664 + }, + { + "epoch": 0.141978606963523, + "grad_norm": 1.1272863149642944, + "learning_rate": 3.864205604623078e-05, + "loss": 0.5898, + "step": 10695 + }, + { + "epoch": 0.14239013915762014, + "grad_norm": 1.0054222345352173, + "learning_rate": 3.857160259406107e-05, + "loss": 0.5873, + "step": 10726 + }, + { + "epoch": 0.1428016713517173, + "grad_norm": 1.0327833890914917, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.5842, + "step": 10757 + }, + { + "epoch": 0.1432132035458145, + "grad_norm": 1.0088285207748413, + "learning_rate": 3.843023702543556e-05, + "loss": 0.5903, + "step": 10788 + }, + { + "epoch": 0.14362473573991166, + "grad_norm": 0.9472872018814087, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.5809, + "step": 10819 + }, + { + "epoch": 0.1440362679340088, + "grad_norm": 1.1008390188217163, + "learning_rate": 3.828826522492255e-05, + "loss": 0.5775, + "step": 10850 + }, + { + "epoch": 0.14444780012810599, + "grad_norm": 1.1251379251480103, + "learning_rate": 3.821705398930713e-05, + "loss": 0.5821, + "step": 10881 + }, + { + "epoch": 0.14485933232220316, + "grad_norm": 0.9825986623764038, + "learning_rate": 3.814569360103385e-05, + "loss": 0.5749, + "step": 10912 + }, + { + "epoch": 0.14527086451630034, + "grad_norm": 0.9303480982780457, + "learning_rate": 3.807418486539499e-05, + "loss": 0.5827, + "step": 10943 + }, + { + "epoch": 0.14568239671039748, + "grad_norm": 0.9948610663414001, + "learning_rate": 3.80025285893569e-05, + "loss": 0.598, + "step": 10974 + }, + { + "epoch": 0.14609392890449466, + "grad_norm": 1.096449851989746, + "learning_rate": 3.793072558155093e-05, + "loss": 0.5749, + "step": 11005 + }, + { + "epoch": 0.14650546109859183, + "grad_norm": 1.022290587425232, + "learning_rate": 3.785877665226426e-05, + "loss": 0.5824, + "step": 11036 + }, + { + "epoch": 0.146916993292689, + "grad_norm": 1.083853840827942, + "learning_rate": 3.778668261343079e-05, + "loss": 0.5923, + "step": 11067 + }, + { + "epoch": 0.14732852548678615, + "grad_norm": 0.919503390789032, + "learning_rate": 3.771444427862192e-05, + "loss": 0.5924, + "step": 11098 + }, + { + "epoch": 0.14774005768088333, + "grad_norm": 1.0682687759399414, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.5824, + "step": 11129 + }, + { + "epoch": 0.1481515898749805, + "grad_norm": 0.9599080085754395, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.5765, + "step": 11160 + }, + { + "epoch": 0.14856312206907768, + "grad_norm": 0.968349039554596, + "learning_rate": 3.749687165842753e-05, + "loss": 0.5932, + "step": 11191 + }, + { + "epoch": 0.14897465426317483, + "grad_norm": 0.9846999049186707, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.5791, + "step": 11222 + }, + { + "epoch": 0.149386186457272, + "grad_norm": 1.0673292875289917, + "learning_rate": 3.735111675341645e-05, + "loss": 0.5803, + "step": 11253 + }, + { + "epoch": 0.14979771865136918, + "grad_norm": 1.0111202001571655, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.5837, + "step": 11284 + }, + { + "epoch": 0.15020925084546635, + "grad_norm": 1.0955816507339478, + "learning_rate": 3.720480432728287e-05, + "loss": 0.5822, + "step": 11315 + }, + { + "epoch": 0.1506207830395635, + "grad_norm": 0.9147946238517761, + "learning_rate": 3.71314411067092e-05, + "loss": 0.5873, + "step": 11346 + }, + { + "epoch": 0.15103231523366067, + "grad_norm": 0.9630762934684753, + "learning_rate": 3.70579409844715e-05, + "loss": 0.5914, + "step": 11377 + }, + { + "epoch": 0.15144384742775785, + "grad_norm": 0.9123234748840332, + "learning_rate": 3.698430479000865e-05, + "loss": 0.5784, + "step": 11408 + }, + { + "epoch": 0.15185537962185502, + "grad_norm": 1.0654667615890503, + "learning_rate": 3.691053335429509e-05, + "loss": 0.5805, + "step": 11439 + }, + { + "epoch": 0.15226691181595217, + "grad_norm": 1.111958622932434, + "learning_rate": 3.683662750983147e-05, + "loss": 0.5868, + "step": 11470 + }, + { + "epoch": 0.15267844401004935, + "grad_norm": 1.024422526359558, + "learning_rate": 3.676258809063518e-05, + "loss": 0.5904, + "step": 11501 + }, + { + "epoch": 0.15308997620414652, + "grad_norm": 1.092668056488037, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.5885, + "step": 11532 + }, + { + "epoch": 0.1535015083982437, + "grad_norm": 0.9739312529563904, + "learning_rate": 3.661411187164166e-05, + "loss": 0.5806, + "step": 11563 + }, + { + "epoch": 0.15391304059234084, + "grad_norm": 1.098699688911438, + "learning_rate": 3.65396767473784e-05, + "loss": 0.5755, + "step": 11594 + }, + { + "epoch": 0.15432457278643802, + "grad_norm": 0.9998940825462341, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.5758, + "step": 11625 + }, + { + "epoch": 0.1547361049805352, + "grad_norm": 0.9077128767967224, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.5742, + "step": 11656 + }, + { + "epoch": 0.15514763717463237, + "grad_norm": 0.9274038076400757, + "learning_rate": 3.63155933997859e-05, + "loss": 0.589, + "step": 11687 + }, + { + "epoch": 0.15555916936872952, + "grad_norm": 0.9404906034469604, + "learning_rate": 3.624064243537758e-05, + "loss": 0.5848, + "step": 11718 + }, + { + "epoch": 0.1559707015628267, + "grad_norm": 0.9896044731140137, + "learning_rate": 3.616556462184716e-05, + "loss": 0.5917, + "step": 11749 + }, + { + "epoch": 0.15638223375692387, + "grad_norm": 1.0280394554138184, + "learning_rate": 3.609036080643755e-05, + "loss": 0.5811, + "step": 11780 + }, + { + "epoch": 0.15679376595102104, + "grad_norm": 0.9462539553642273, + "learning_rate": 3.60150318378136e-05, + "loss": 0.5782, + "step": 11811 + }, + { + "epoch": 0.1572052981451182, + "grad_norm": 1.047635555267334, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.5792, + "step": 11842 + }, + { + "epoch": 0.15761683033921536, + "grad_norm": 0.9613581299781799, + "learning_rate": 3.586400184263408e-05, + "loss": 0.5847, + "step": 11873 + }, + { + "epoch": 0.15802836253331254, + "grad_norm": 0.8944469094276428, + "learning_rate": 3.578830252043148e-05, + "loss": 0.5787, + "step": 11904 + }, + { + "epoch": 0.1584398947274097, + "grad_norm": 1.0031458139419556, + "learning_rate": 3.571248145370125e-05, + "loss": 0.5827, + "step": 11935 + }, + { + "epoch": 0.15885142692150686, + "grad_norm": 0.9425063729286194, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.5899, + "step": 11966 + }, + { + "epoch": 0.15926295911560404, + "grad_norm": 1.0098518133163452, + "learning_rate": 3.556047751054378e-05, + "loss": 0.5849, + "step": 11997 + }, + { + "epoch": 0.1596744913097012, + "grad_norm": 0.8891544342041016, + "learning_rate": 3.548429634946039e-05, + "loss": 0.5772, + "step": 12028 + }, + { + "epoch": 0.16008602350379839, + "grad_norm": 1.1580991744995117, + "learning_rate": 3.540799687451768e-05, + "loss": 0.5859, + "step": 12059 + }, + { + "epoch": 0.16049755569789553, + "grad_norm": 1.046398639678955, + "learning_rate": 3.533157994674485e-05, + "loss": 0.582, + "step": 12090 + }, + { + "epoch": 0.1609090878919927, + "grad_norm": 1.0526766777038574, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.5738, + "step": 12121 + }, + { + "epoch": 0.16132062008608988, + "grad_norm": 0.9915657043457031, + "learning_rate": 3.517839718344311e-05, + "loss": 0.5867, + "step": 12152 + }, + { + "epoch": 0.16173215228018706, + "grad_norm": 0.954210102558136, + "learning_rate": 3.510163307656086e-05, + "loss": 0.578, + "step": 12183 + }, + { + "epoch": 0.1621436844742842, + "grad_norm": 0.952920138835907, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.5843, + "step": 12214 + }, + { + "epoch": 0.16255521666838138, + "grad_norm": 0.9184344410896301, + "learning_rate": 3.494776374368643e-05, + "loss": 0.5748, + "step": 12245 + }, + { + "epoch": 0.16296674886247856, + "grad_norm": 0.9797667264938354, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.5788, + "step": 12276 + }, + { + "epoch": 0.16337828105657573, + "grad_norm": 0.968781054019928, + "learning_rate": 3.479344537543164e-05, + "loss": 0.5835, + "step": 12307 + }, + { + "epoch": 0.16378981325067288, + "grad_norm": 1.0632243156433105, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.5742, + "step": 12338 + }, + { + "epoch": 0.16420134544477005, + "grad_norm": 0.9538365006446838, + "learning_rate": 3.463868493762412e-05, + "loss": 0.5737, + "step": 12369 + }, + { + "epoch": 0.16461287763886723, + "grad_norm": 0.9336950778961182, + "learning_rate": 3.456114112492418e-05, + "loss": 0.5821, + "step": 12400 + }, + { + "epoch": 0.1650244098329644, + "grad_norm": 0.9611359238624573, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.5847, + "step": 12431 + }, + { + "epoch": 0.16543594202706155, + "grad_norm": 1.0813709497451782, + "learning_rate": 3.440573068727905e-05, + "loss": 0.5805, + "step": 12462 + }, + { + "epoch": 0.16584747422115872, + "grad_norm": 0.9780453443527222, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.5761, + "step": 12493 + }, + { + "epoch": 0.1662590064152559, + "grad_norm": 0.9629074335098267, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.58, + "step": 12524 + }, + { + "epoch": 0.16667053860935307, + "grad_norm": 0.9241612553596497, + "learning_rate": 3.417182116258899e-05, + "loss": 0.5732, + "step": 12555 + }, + { + "epoch": 0.16708207080345022, + "grad_norm": 1.0314536094665527, + "learning_rate": 3.409364314116074e-05, + "loss": 0.578, + "step": 12586 + }, + { + "epoch": 0.1674936029975474, + "grad_norm": 0.9907404184341431, + "learning_rate": 3.401536249920559e-05, + "loss": 0.591, + "step": 12617 + }, + { + "epoch": 0.16790513519164457, + "grad_norm": 0.9442338943481445, + "learning_rate": 3.393698012010998e-05, + "loss": 0.5754, + "step": 12648 + }, + { + "epoch": 0.16831666738574175, + "grad_norm": 0.9101460576057434, + "learning_rate": 3.385849688840839e-05, + "loss": 0.5815, + "step": 12679 + }, + { + "epoch": 0.1687281995798389, + "grad_norm": 1.029105544090271, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.5766, + "step": 12710 + }, + { + "epoch": 0.16913973177393607, + "grad_norm": 1.028542160987854, + "learning_rate": 3.370123141100578e-05, + "loss": 0.5769, + "step": 12741 + }, + { + "epoch": 0.16955126396803324, + "grad_norm": 0.9411474466323853, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.5808, + "step": 12772 + }, + { + "epoch": 0.16996279616213042, + "grad_norm": 0.9260859489440918, + "learning_rate": 3.35435731658559e-05, + "loss": 0.5683, + "step": 12803 + }, + { + "epoch": 0.17037432835622757, + "grad_norm": 0.9867289066314697, + "learning_rate": 3.346459897862552e-05, + "loss": 0.5781, + "step": 12834 + }, + { + "epoch": 0.17078586055032474, + "grad_norm": 0.9310885071754456, + "learning_rate": 3.338552926954613e-05, + "loss": 0.5876, + "step": 12865 + }, + { + "epoch": 0.17119739274442192, + "grad_norm": 1.1955841779708862, + "learning_rate": 3.330636493090868e-05, + "loss": 0.577, + "step": 12896 + }, + { + "epoch": 0.1716089249385191, + "grad_norm": 1.0715487003326416, + "learning_rate": 3.322710685607193e-05, + "loss": 0.5835, + "step": 12927 + }, + { + "epoch": 0.17202045713261624, + "grad_norm": 0.9181262850761414, + "learning_rate": 3.314775593945251e-05, + "loss": 0.5733, + "step": 12958 + }, + { + "epoch": 0.17243198932671341, + "grad_norm": 0.9595162868499756, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.5685, + "step": 12989 + }, + { + "epoch": 0.1728435215208106, + "grad_norm": 1.057614803314209, + "learning_rate": 3.298877916376047e-05, + "loss": 0.5736, + "step": 13020 + }, + { + "epoch": 0.17325505371490776, + "grad_norm": 0.9654355049133301, + "learning_rate": 3.290915509871915e-05, + "loss": 0.5782, + "step": 13051 + }, + { + "epoch": 0.1736665859090049, + "grad_norm": 1.0919837951660156, + "learning_rate": 3.282944177993753e-05, + "loss": 0.5771, + "step": 13082 + }, + { + "epoch": 0.1740781181031021, + "grad_norm": 1.064331293106079, + "learning_rate": 3.274964010696957e-05, + "loss": 0.5731, + "step": 13113 + }, + { + "epoch": 0.17448965029719926, + "grad_norm": 0.8971393704414368, + "learning_rate": 3.266975098036629e-05, + "loss": 0.5855, + "step": 13144 + }, + { + "epoch": 0.17490118249129644, + "grad_norm": 0.9515472054481506, + "learning_rate": 3.258977530166562e-05, + "loss": 0.5776, + "step": 13175 + }, + { + "epoch": 0.17531271468539358, + "grad_norm": 0.9811455607414246, + "learning_rate": 3.250971397338227e-05, + "loss": 0.5817, + "step": 13206 + }, + { + "epoch": 0.17572424687949076, + "grad_norm": 0.9187535643577576, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.5751, + "step": 13237 + }, + { + "epoch": 0.17613577907358793, + "grad_norm": 0.9396876096725464, + "learning_rate": 3.234933798294859e-05, + "loss": 0.5744, + "step": 13268 + }, + { + "epoch": 0.1765473112676851, + "grad_norm": 1.064921259880066, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.5731, + "step": 13299 + }, + { + "epoch": 0.17695884346178226, + "grad_norm": 0.946753978729248, + "learning_rate": 3.218863024832985e-05, + "loss": 0.5824, + "step": 13330 + }, + { + "epoch": 0.17737037565587943, + "grad_norm": 0.8623449206352234, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.5764, + "step": 13361 + }, + { + "epoch": 0.1777819078499766, + "grad_norm": 0.9526584148406982, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.5696, + "step": 13392 + }, + { + "epoch": 0.17819344004407378, + "grad_norm": 0.9413353800773621, + "learning_rate": 3.194696249871729e-05, + "loss": 0.5688, + "step": 13423 + }, + { + "epoch": 0.17860497223817093, + "grad_norm": 0.9612322449684143, + "learning_rate": 3.186624857814164e-05, + "loss": 0.5694, + "step": 13454 + }, + { + "epoch": 0.1790165044322681, + "grad_norm": 1.0289851427078247, + "learning_rate": 3.178545717288401e-05, + "loss": 0.5799, + "step": 13485 + }, + { + "epoch": 0.17942803662636528, + "grad_norm": 0.9124505519866943, + "learning_rate": 3.170458919466444e-05, + "loss": 0.5729, + "step": 13516 + }, + { + "epoch": 0.17983956882046245, + "grad_norm": 0.9874391555786133, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.5761, + "step": 13547 + }, + { + "epoch": 0.1802511010145596, + "grad_norm": 1.0058220624923706, + "learning_rate": 3.154262717052985e-05, + "loss": 0.58, + "step": 13578 + }, + { + "epoch": 0.18066263320865678, + "grad_norm": 0.9753491282463074, + "learning_rate": 3.146153495233426e-05, + "loss": 0.5749, + "step": 13609 + }, + { + "epoch": 0.18107416540275395, + "grad_norm": 1.0167535543441772, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.5848, + "step": 13640 + }, + { + "epoch": 0.18148569759685113, + "grad_norm": 0.9970638155937195, + "learning_rate": 3.129913267924946e-05, + "loss": 0.5743, + "step": 13671 + }, + { + "epoch": 0.18189722979094827, + "grad_norm": 1.0033198595046997, + "learning_rate": 3.121782445704782e-05, + "loss": 0.5747, + "step": 13702 + }, + { + "epoch": 0.18230876198504545, + "grad_norm": 1.0107535123825073, + "learning_rate": 3.11364460675423e-05, + "loss": 0.5725, + "step": 13733 + }, + { + "epoch": 0.18272029417914262, + "grad_norm": 0.9341425895690918, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.5726, + "step": 13764 + }, + { + "epoch": 0.1831318263732398, + "grad_norm": 1.0088342428207397, + "learning_rate": 3.097348246077728e-05, + "loss": 0.5737, + "step": 13795 + }, + { + "epoch": 0.18354335856733694, + "grad_norm": 0.8871366381645203, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.576, + "step": 13826 + }, + { + "epoch": 0.18395489076143412, + "grad_norm": 0.971552848815918, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.57, + "step": 13857 + }, + { + "epoch": 0.1843664229555313, + "grad_norm": 1.1167266368865967, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.5652, + "step": 13888 + }, + { + "epoch": 0.18477795514962847, + "grad_norm": 0.9768183827400208, + "learning_rate": 3.064675369851637e-05, + "loss": 0.5671, + "step": 13919 + }, + { + "epoch": 0.18518948734372562, + "grad_norm": 1.0097460746765137, + "learning_rate": 3.056490989455289e-05, + "loss": 0.5771, + "step": 13950 + }, + { + "epoch": 0.1856010195378228, + "grad_norm": 1.010986089706421, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.5723, + "step": 13981 + }, + { + "epoch": 0.18601255173191997, + "grad_norm": 0.8861789107322693, + "learning_rate": 3.040103481317539e-05, + "loss": 0.5798, + "step": 14012 + }, + { + "epoch": 0.18642408392601714, + "grad_norm": 0.9811322689056396, + "learning_rate": 3.03190053850694e-05, + "loss": 0.5703, + "step": 14043 + }, + { + "epoch": 0.1868356161201143, + "grad_norm": 0.9183774590492249, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.5655, + "step": 14074 + }, + { + "epoch": 0.18724714831421146, + "grad_norm": 0.9655874371528625, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.57, + "step": 14105 + }, + { + "epoch": 0.18765868050830864, + "grad_norm": 0.9735666513442993, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.5748, + "step": 14136 + }, + { + "epoch": 0.18807021270240581, + "grad_norm": 0.9084784388542175, + "learning_rate": 2.999029669712431e-05, + "loss": 0.568, + "step": 14167 + }, + { + "epoch": 0.18848174489650296, + "grad_norm": 0.8866302371025085, + "learning_rate": 2.990797641805408e-05, + "loss": 0.5768, + "step": 14198 + }, + { + "epoch": 0.18889327709060014, + "grad_norm": 0.9936773180961609, + "learning_rate": 2.982560075313704e-05, + "loss": 0.5701, + "step": 14229 + }, + { + "epoch": 0.1893048092846973, + "grad_norm": 0.9721146821975708, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.5638, + "step": 14260 + }, + { + "epoch": 0.1897163414787945, + "grad_norm": 0.921503484249115, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.5645, + "step": 14291 + }, + { + "epoch": 0.19012787367289163, + "grad_norm": 0.9194900989532471, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.5724, + "step": 14322 + }, + { + "epoch": 0.1905394058669888, + "grad_norm": 0.9940860271453857, + "learning_rate": 2.949556283611942e-05, + "loss": 0.5731, + "step": 14353 + }, + { + "epoch": 0.19095093806108598, + "grad_norm": 0.9163900017738342, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.5761, + "step": 14384 + }, + { + "epoch": 0.19136247025518316, + "grad_norm": 0.960118293762207, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.572, + "step": 14415 + }, + { + "epoch": 0.1917740024492803, + "grad_norm": 0.9942934513092041, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.5606, + "step": 14446 + }, + { + "epoch": 0.19218553464337748, + "grad_norm": 0.9903921484947205, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.5704, + "step": 14477 + }, + { + "epoch": 0.19259706683747466, + "grad_norm": 0.9734652638435364, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.5768, + "step": 14508 + }, + { + "epoch": 0.19300859903157183, + "grad_norm": 0.9251719117164612, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.565, + "step": 14539 + }, + { + "epoch": 0.19342013122566898, + "grad_norm": 0.9734142422676086, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.5702, + "step": 14570 + }, + { + "epoch": 0.19383166341976615, + "grad_norm": 0.9692136645317078, + "learning_rate": 2.883311164593017e-05, + "loss": 0.5581, + "step": 14601 + }, + { + "epoch": 0.19424319561386333, + "grad_norm": 0.9654501676559448, + "learning_rate": 2.875010077160754e-05, + "loss": 0.5667, + "step": 14632 + }, + { + "epoch": 0.1946547278079605, + "grad_norm": 1.0234603881835938, + "learning_rate": 2.866704757790741e-05, + "loss": 0.5688, + "step": 14663 + }, + { + "epoch": 0.19506626000205765, + "grad_norm": 0.9619661569595337, + "learning_rate": 2.858395300207376e-05, + "loss": 0.5713, + "step": 14694 + }, + { + "epoch": 0.19547779219615483, + "grad_norm": 1.0468534231185913, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.5702, + "step": 14725 + }, + { + "epoch": 0.195889324390252, + "grad_norm": 0.9849717617034912, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.5695, + "step": 14756 + }, + { + "epoch": 0.19630085658434918, + "grad_norm": 0.8887643218040466, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.5688, + "step": 14787 + }, + { + "epoch": 0.19671238877844632, + "grad_norm": 0.9583494067192078, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.5775, + "step": 14818 + }, + { + "epoch": 0.1971239209725435, + "grad_norm": 0.9766852259635925, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.5688, + "step": 14849 + }, + { + "epoch": 0.19753545316664067, + "grad_norm": 0.9450570940971375, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.5773, + "step": 14880 + }, + { + "epoch": 0.19794698536073785, + "grad_norm": 0.9476996064186096, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.5688, + "step": 14911 + }, + { + "epoch": 0.198358517554835, + "grad_norm": 1.0239835977554321, + "learning_rate": 2.791781925709473e-05, + "loss": 0.5717, + "step": 14942 + }, + { + "epoch": 0.19877004974893217, + "grad_norm": 0.9848045110702515, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.5691, + "step": 14973 + }, + { + "epoch": 0.19918158194302935, + "grad_norm": 1.0350494384765625, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.5808, + "step": 15004 + }, + { + "epoch": 0.19959311413712652, + "grad_norm": 0.8743448853492737, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.5694, + "step": 15035 + }, + { + "epoch": 0.20000464633122367, + "grad_norm": 0.9410389065742493, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.5739, + "step": 15066 + }, + { + "epoch": 0.20041617852532084, + "grad_norm": 1.0113860368728638, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.5689, + "step": 15097 + }, + { + "epoch": 0.20082771071941802, + "grad_norm": 0.998852550983429, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.5619, + "step": 15128 + }, + { + "epoch": 0.2012392429135152, + "grad_norm": 0.9299794435501099, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.5672, + "step": 15159 + }, + { + "epoch": 0.20165077510761234, + "grad_norm": 1.0109282732009888, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.5691, + "step": 15190 + }, + { + "epoch": 0.20206230730170952, + "grad_norm": 0.8805022835731506, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.5601, + "step": 15221 + }, + { + "epoch": 0.2024738394958067, + "grad_norm": 0.8976924419403076, + "learning_rate": 2.708224532974953e-05, + "loss": 0.5669, + "step": 15252 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1258280394232955e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-15260/training_args.bin b/checkpoint-15260/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f3d87fcba007ec2448c21323b9fdb9fa6105de5f --- /dev/null +++ b/checkpoint-15260/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf5225534ee746c36dba9f28281e0b858e6e9b4a308bdaf1de4f68e79e9ee83 +size 5304 diff --git a/checkpoint-18312/config.json b/checkpoint-18312/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-18312/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-18312/generation_config.json b/checkpoint-18312/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-18312/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-18312/model-00001-of-00007.safetensors b/checkpoint-18312/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..07b0448b0665263276020c2b0ce39bfd70f59f21 --- /dev/null +++ b/checkpoint-18312/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9da583efd4b108a0cf4092795df246eab9009dc721e00caa77c8a7b89382c481 +size 4886466168 diff --git a/checkpoint-18312/model-00002-of-00007.safetensors b/checkpoint-18312/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-18312/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-18312/model-00003-of-00007.safetensors b/checkpoint-18312/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-18312/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-18312/model-00004-of-00007.safetensors b/checkpoint-18312/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-18312/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-18312/model-00005-of-00007.safetensors b/checkpoint-18312/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-18312/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-18312/model-00006-of-00007.safetensors b/checkpoint-18312/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c2cfa5073bb79dd7b0cb18a9db2ef180cef0d34b --- /dev/null +++ b/checkpoint-18312/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f261f76e4ce233bc88ed14286ede45f5659ba8e4998ff9d64dc54aec6c834254 +size 4999813120 diff --git a/checkpoint-18312/model-00007-of-00007.safetensors b/checkpoint-18312/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dca1080f640e71ac1e6a72d642474a1bef80b9d0 --- /dev/null +++ b/checkpoint-18312/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3ce2de917797dc9fc88acfd0dbf2eb74d9a30720ea31edf94ae68250f8fc973 +size 2571158184 diff --git a/checkpoint-18312/model.safetensors.index.json b/checkpoint-18312/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-18312/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-18312/optimizer.pt b/checkpoint-18312/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7e4accdbecc0b475593efa0dcb0d0bebd1102e1 --- /dev/null +++ b/checkpoint-18312/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c60452df8a312f7d6249784451c9d60a09cda5da56d0637a615ec6e2c069545 +size 15385036334 diff --git a/checkpoint-18312/rng_state.pth b/checkpoint-18312/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-18312/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-18312/scheduler.pt b/checkpoint-18312/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..59a8b46d1ac64fc3cd4c673b6051786fee3ed26d --- /dev/null +++ b/checkpoint-18312/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0e65c3d6f29e706fd941a38280ce5628189a6998eac6d29abbeab00ad838d00 +size 1064 diff --git a/checkpoint-18312/trainer_state.json b/checkpoint-18312/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a973d6ebfdde998dc97dad1a5a92bd06cf5f41fc --- /dev/null +++ b/checkpoint-18312/trainer_state.json @@ -0,0 +1,4163 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.24309604962281747, + "eval_steps": 500, + "global_step": 18312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00041153219409716807, + "grad_norm": 4.147768020629883, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.7978, + "step": 31 + }, + { + "epoch": 0.0008230643881943361, + "grad_norm": 3.117840051651001, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.7723, + "step": 62 + }, + { + "epoch": 0.0012345965822915042, + "grad_norm": 2.83072829246521, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7493, + "step": 93 + }, + { + "epoch": 0.0016461287763886723, + "grad_norm": 2.9491968154907227, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7352, + "step": 124 + }, + { + "epoch": 0.0020576609704858403, + "grad_norm": 2.798553228378296, + "learning_rate": 5.078636959370905e-06, + "loss": 0.7229, + "step": 155 + }, + { + "epoch": 0.0024691931645830084, + "grad_norm": 2.663148880004883, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7253, + "step": 186 + }, + { + "epoch": 0.0028807253586801765, + "grad_norm": 2.564692974090576, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7287, + "step": 217 + }, + { + "epoch": 0.0032922575527773446, + "grad_norm": 3.2043237686157227, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7205, + "step": 248 + }, + { + "epoch": 0.0037037897468745126, + "grad_norm": 2.6705269813537598, + "learning_rate": 9.141546526867629e-06, + "loss": 0.714, + "step": 279 + }, + { + "epoch": 0.004115321940971681, + "grad_norm": 3.432569980621338, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7217, + "step": 310 + }, + { + "epoch": 0.004526854135068849, + "grad_norm": 2.515808343887329, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7173, + "step": 341 + }, + { + "epoch": 0.004938386329166017, + "grad_norm": 2.6708128452301025, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7123, + "step": 372 + }, + { + "epoch": 0.005349918523263185, + "grad_norm": 2.472797155380249, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.7079, + "step": 403 + }, + { + "epoch": 0.005761450717360353, + "grad_norm": 2.486147880554199, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7146, + "step": 434 + }, + { + "epoch": 0.006172982911457521, + "grad_norm": 2.310777187347412, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7, + "step": 465 + }, + { + "epoch": 0.006584515105554689, + "grad_norm": 3.1831858158111572, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7042, + "step": 496 + }, + { + "epoch": 0.006996047299651857, + "grad_norm": 2.505427360534668, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.6991, + "step": 527 + }, + { + "epoch": 0.007407579493749025, + "grad_norm": 2.3627443313598633, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7045, + "step": 558 + }, + { + "epoch": 0.007819111687846193, + "grad_norm": 2.6250648498535156, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.704, + "step": 589 + }, + { + "epoch": 0.008230643881943361, + "grad_norm": 2.3151464462280273, + "learning_rate": 2.031454783748362e-05, + "loss": 0.6959, + "step": 620 + }, + { + "epoch": 0.00864217607604053, + "grad_norm": 2.0312647819519043, + "learning_rate": 2.13302752293578e-05, + "loss": 0.689, + "step": 651 + }, + { + "epoch": 0.009053708270137698, + "grad_norm": 2.718970775604248, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7031, + "step": 682 + }, + { + "epoch": 0.009465240464234866, + "grad_norm": 1.9124070405960083, + "learning_rate": 2.336173001310616e-05, + "loss": 0.6908, + "step": 713 + }, + { + "epoch": 0.009876772658332034, + "grad_norm": 2.368354082107544, + "learning_rate": 2.437745740498034e-05, + "loss": 0.6947, + "step": 744 + }, + { + "epoch": 0.010288304852429202, + "grad_norm": 1.8856632709503174, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.6876, + "step": 775 + }, + { + "epoch": 0.01069983704652637, + "grad_norm": 2.3155925273895264, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7024, + "step": 806 + }, + { + "epoch": 0.011111369240623538, + "grad_norm": 2.2516837120056152, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.695, + "step": 837 + }, + { + "epoch": 0.011522901434720706, + "grad_norm": 2.4444823265075684, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.6892, + "step": 868 + }, + { + "epoch": 0.011934433628817874, + "grad_norm": 2.4233832359313965, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.6871, + "step": 899 + }, + { + "epoch": 0.012345965822915042, + "grad_norm": 2.0262961387634277, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.6888, + "step": 930 + }, + { + "epoch": 0.01275749801701221, + "grad_norm": 2.3908143043518066, + "learning_rate": 3.148754914809961e-05, + "loss": 0.6997, + "step": 961 + }, + { + "epoch": 0.013169030211109378, + "grad_norm": 1.839417576789856, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.6826, + "step": 992 + }, + { + "epoch": 0.013580562405206546, + "grad_norm": 1.8175997734069824, + "learning_rate": 3.351900393184797e-05, + "loss": 0.6857, + "step": 1023 + }, + { + "epoch": 0.013992094599303714, + "grad_norm": 2.3080506324768066, + "learning_rate": 3.453473132372215e-05, + "loss": 0.6896, + "step": 1054 + }, + { + "epoch": 0.014403626793400882, + "grad_norm": 2.0574960708618164, + "learning_rate": 3.555045871559633e-05, + "loss": 0.6862, + "step": 1085 + }, + { + "epoch": 0.01481515898749805, + "grad_norm": 1.957221269607544, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.6796, + "step": 1116 + }, + { + "epoch": 0.015226691181595219, + "grad_norm": 2.2740609645843506, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.7082, + "step": 1147 + }, + { + "epoch": 0.015638223375692387, + "grad_norm": 2.1291615962982178, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.6852, + "step": 1178 + }, + { + "epoch": 0.016049755569789555, + "grad_norm": 2.2918620109558105, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.6821, + "step": 1209 + }, + { + "epoch": 0.016461287763886723, + "grad_norm": 1.9689708948135376, + "learning_rate": 4.062909567496724e-05, + "loss": 0.676, + "step": 1240 + }, + { + "epoch": 0.01687281995798389, + "grad_norm": 2.4405298233032227, + "learning_rate": 4.164482306684142e-05, + "loss": 0.6927, + "step": 1271 + }, + { + "epoch": 0.01728435215208106, + "grad_norm": 2.0007450580596924, + "learning_rate": 4.26605504587156e-05, + "loss": 0.6864, + "step": 1302 + }, + { + "epoch": 0.017695884346178227, + "grad_norm": 1.7013226747512817, + "learning_rate": 4.367627785058978e-05, + "loss": 0.6827, + "step": 1333 + }, + { + "epoch": 0.018107416540275395, + "grad_norm": 2.2202930450439453, + "learning_rate": 4.469200524246396e-05, + "loss": 0.6722, + "step": 1364 + }, + { + "epoch": 0.018518948734372563, + "grad_norm": 1.690030813217163, + "learning_rate": 4.570773263433814e-05, + "loss": 0.6807, + "step": 1395 + }, + { + "epoch": 0.01893048092846973, + "grad_norm": 1.5886859893798828, + "learning_rate": 4.672346002621232e-05, + "loss": 0.6867, + "step": 1426 + }, + { + "epoch": 0.0193420131225669, + "grad_norm": 1.7371172904968262, + "learning_rate": 4.77391874180865e-05, + "loss": 0.6832, + "step": 1457 + }, + { + "epoch": 0.019753545316664067, + "grad_norm": 1.8376264572143555, + "learning_rate": 4.875491480996068e-05, + "loss": 0.681, + "step": 1488 + }, + { + "epoch": 0.020165077510761235, + "grad_norm": 1.6280311346054077, + "learning_rate": 4.977064220183487e-05, + "loss": 0.6795, + "step": 1519 + }, + { + "epoch": 0.020576609704858403, + "grad_norm": 1.8016823530197144, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.6789, + "step": 1550 + }, + { + "epoch": 0.02098814189895557, + "grad_norm": 1.7355237007141113, + "learning_rate": 4.999955597496219e-05, + "loss": 0.6753, + "step": 1581 + }, + { + "epoch": 0.02139967409305274, + "grad_norm": 1.5687408447265625, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.6777, + "step": 1612 + }, + { + "epoch": 0.021811206287149908, + "grad_norm": 1.790453314781189, + "learning_rate": 4.999799067923527e-05, + "loss": 0.684, + "step": 1643 + }, + { + "epoch": 0.022222738481247076, + "grad_norm": 1.9073406457901, + "learning_rate": 4.999678487776908e-05, + "loss": 0.6793, + "step": 1674 + }, + { + "epoch": 0.022634270675344244, + "grad_norm": 1.5575100183486938, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.6794, + "step": 1705 + }, + { + "epoch": 0.023045802869441412, + "grad_norm": 2.100715160369873, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6753, + "step": 1736 + }, + { + "epoch": 0.02345733506353858, + "grad_norm": 1.9425480365753174, + "learning_rate": 4.999147503179668e-05, + "loss": 0.6755, + "step": 1767 + }, + { + "epoch": 0.023868867257635748, + "grad_norm": 1.5791170597076416, + "learning_rate": 4.998914100252672e-05, + "loss": 0.6754, + "step": 1798 + }, + { + "epoch": 0.024280399451732916, + "grad_norm": 1.5692673921585083, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6682, + "step": 1829 + }, + { + "epoch": 0.024691931645830084, + "grad_norm": 1.5757725238800049, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.6723, + "step": 1860 + }, + { + "epoch": 0.025103463839927252, + "grad_norm": 1.8000696897506714, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6647, + "step": 1891 + }, + { + "epoch": 0.02551499603402442, + "grad_norm": 1.682919979095459, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.669, + "step": 1922 + }, + { + "epoch": 0.02592652822812159, + "grad_norm": 1.6138657331466675, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6698, + "step": 1953 + }, + { + "epoch": 0.026338060422218756, + "grad_norm": 1.5016391277313232, + "learning_rate": 4.99692159912661e-05, + "loss": 0.6645, + "step": 1984 + }, + { + "epoch": 0.026749592616315924, + "grad_norm": 1.6850241422653198, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6769, + "step": 2015 + }, + { + "epoch": 0.027161124810413093, + "grad_norm": 1.521161437034607, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6599, + "step": 2046 + }, + { + "epoch": 0.02757265700451026, + "grad_norm": 1.8039467334747314, + "learning_rate": 4.995544899210594e-05, + "loss": 0.6693, + "step": 2077 + }, + { + "epoch": 0.02798418919860743, + "grad_norm": 1.688811182975769, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.661, + "step": 2108 + }, + { + "epoch": 0.028395721392704597, + "grad_norm": 1.669871211051941, + "learning_rate": 4.994486281210429e-05, + "loss": 0.6611, + "step": 2139 + }, + { + "epoch": 0.028807253586801765, + "grad_norm": 1.3887263536453247, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6677, + "step": 2170 + }, + { + "epoch": 0.029218785780898933, + "grad_norm": 1.3242331743240356, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6537, + "step": 2201 + }, + { + "epoch": 0.0296303179749961, + "grad_norm": 1.6638622283935547, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6653, + "step": 2232 + }, + { + "epoch": 0.03004185016909327, + "grad_norm": 1.5110372304916382, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6615, + "step": 2263 + }, + { + "epoch": 0.030453382363190437, + "grad_norm": 1.5244065523147583, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.6552, + "step": 2294 + }, + { + "epoch": 0.030864914557287605, + "grad_norm": 1.416624903678894, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6569, + "step": 2325 + }, + { + "epoch": 0.03127644675138477, + "grad_norm": 1.4467304944992065, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6613, + "step": 2356 + }, + { + "epoch": 0.03168797894548194, + "grad_norm": 1.4649864435195923, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6604, + "step": 2387 + }, + { + "epoch": 0.03209951113957911, + "grad_norm": 1.6256624460220337, + "learning_rate": 4.988329891293693e-05, + "loss": 0.655, + "step": 2418 + }, + { + "epoch": 0.03251104333367628, + "grad_norm": 1.3986462354660034, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6514, + "step": 2449 + }, + { + "epoch": 0.032922575527773446, + "grad_norm": 1.4490609169006348, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.65, + "step": 2480 + }, + { + "epoch": 0.033334107721870614, + "grad_norm": 1.4979615211486816, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.6579, + "step": 2511 + }, + { + "epoch": 0.03374563991596778, + "grad_norm": 1.6314005851745605, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6604, + "step": 2542 + }, + { + "epoch": 0.03415717211006495, + "grad_norm": 1.310655951499939, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6518, + "step": 2573 + }, + { + "epoch": 0.03456870430416212, + "grad_norm": 1.4176305532455444, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6518, + "step": 2604 + }, + { + "epoch": 0.034980236498259286, + "grad_norm": 1.6330211162567139, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6474, + "step": 2635 + }, + { + "epoch": 0.035391768692356454, + "grad_norm": 1.5854465961456299, + "learning_rate": 4.980947995086024e-05, + "loss": 0.6462, + "step": 2666 + }, + { + "epoch": 0.03580330088645362, + "grad_norm": 1.2944835424423218, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6503, + "step": 2697 + }, + { + "epoch": 0.03621483308055079, + "grad_norm": 1.4056243896484375, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6414, + "step": 2728 + }, + { + "epoch": 0.03662636527464796, + "grad_norm": 1.5649598836898804, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6449, + "step": 2759 + }, + { + "epoch": 0.037037897468745126, + "grad_norm": 1.5205659866333008, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6334, + "step": 2790 + }, + { + "epoch": 0.037449429662842294, + "grad_norm": 1.3248411417007446, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6448, + "step": 2821 + }, + { + "epoch": 0.03786096185693946, + "grad_norm": 1.2402842044830322, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6476, + "step": 2852 + }, + { + "epoch": 0.03827249405103663, + "grad_norm": 1.7939503192901611, + "learning_rate": 4.973018858007122e-05, + "loss": 0.6472, + "step": 2883 + }, + { + "epoch": 0.0386840262451338, + "grad_norm": 1.2774665355682373, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6374, + "step": 2914 + }, + { + "epoch": 0.03909555843923097, + "grad_norm": 1.6384273767471313, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6623, + "step": 2945 + }, + { + "epoch": 0.039507090633328135, + "grad_norm": 1.559270977973938, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6391, + "step": 2976 + }, + { + "epoch": 0.0399186228274253, + "grad_norm": 1.6405760049819946, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.6437, + "step": 3007 + }, + { + "epoch": 0.04033015502152247, + "grad_norm": 1.3599917888641357, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6436, + "step": 3038 + }, + { + "epoch": 0.04074168721561964, + "grad_norm": 1.3080962896347046, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6331, + "step": 3069 + }, + { + "epoch": 0.04115321940971681, + "grad_norm": 1.435195803642273, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6417, + "step": 3100 + }, + { + "epoch": 0.041564751603813975, + "grad_norm": 1.342986822128296, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6391, + "step": 3131 + }, + { + "epoch": 0.04197628379791114, + "grad_norm": 1.524418592453003, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6511, + "step": 3162 + }, + { + "epoch": 0.04238781599200831, + "grad_norm": 1.4617900848388672, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6367, + "step": 3193 + }, + { + "epoch": 0.04279934818610548, + "grad_norm": 1.3968737125396729, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.6387, + "step": 3224 + }, + { + "epoch": 0.04321088038020265, + "grad_norm": 1.258406400680542, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6356, + "step": 3255 + }, + { + "epoch": 0.043622412574299815, + "grad_norm": 1.5494844913482666, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6433, + "step": 3286 + }, + { + "epoch": 0.04403394476839698, + "grad_norm": 1.2248284816741943, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6376, + "step": 3317 + }, + { + "epoch": 0.04444547696249415, + "grad_norm": 1.4243425130844116, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6341, + "step": 3348 + }, + { + "epoch": 0.04485700915659132, + "grad_norm": 1.379364252090454, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6378, + "step": 3379 + }, + { + "epoch": 0.04526854135068849, + "grad_norm": 1.3460108041763306, + "learning_rate": 4.948079823064559e-05, + "loss": 0.639, + "step": 3410 + }, + { + "epoch": 0.045680073544785656, + "grad_norm": 1.382348656654358, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6327, + "step": 3441 + }, + { + "epoch": 0.046091605738882824, + "grad_norm": 1.40754234790802, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6234, + "step": 3472 + }, + { + "epoch": 0.04650313793297999, + "grad_norm": 1.502922773361206, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6404, + "step": 3503 + }, + { + "epoch": 0.04691467012707716, + "grad_norm": 1.2896559238433838, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6378, + "step": 3534 + }, + { + "epoch": 0.04732620232117433, + "grad_norm": 1.3114830255508423, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6373, + "step": 3565 + }, + { + "epoch": 0.047737734515271496, + "grad_norm": 1.4122483730316162, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6305, + "step": 3596 + }, + { + "epoch": 0.048149266709368664, + "grad_norm": 1.2669744491577148, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6413, + "step": 3627 + }, + { + "epoch": 0.04856079890346583, + "grad_norm": 1.2965583801269531, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6364, + "step": 3658 + }, + { + "epoch": 0.048972331097563, + "grad_norm": 1.3328967094421387, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6466, + "step": 3689 + }, + { + "epoch": 0.04938386329166017, + "grad_norm": 1.6291115283966064, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6348, + "step": 3720 + }, + { + "epoch": 0.049795395485757336, + "grad_norm": 1.3224235773086548, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6256, + "step": 3751 + }, + { + "epoch": 0.050206927679854504, + "grad_norm": 1.3253369331359863, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6303, + "step": 3782 + }, + { + "epoch": 0.05061845987395167, + "grad_norm": 1.2806897163391113, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.6403, + "step": 3813 + }, + { + "epoch": 0.05102999206804884, + "grad_norm": 1.3389384746551514, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6288, + "step": 3844 + }, + { + "epoch": 0.05144152426214601, + "grad_norm": 1.251550555229187, + "learning_rate": 4.919443027766935e-05, + "loss": 0.637, + "step": 3875 + }, + { + "epoch": 0.05185305645624318, + "grad_norm": 1.258484959602356, + "learning_rate": 4.91731463569418e-05, + "loss": 0.629, + "step": 3906 + }, + { + "epoch": 0.052264588650340345, + "grad_norm": 1.3574656248092651, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6262, + "step": 3937 + }, + { + "epoch": 0.05267612084443751, + "grad_norm": 1.327051043510437, + "learning_rate": 4.912976038673786e-05, + "loss": 0.629, + "step": 3968 + }, + { + "epoch": 0.05308765303853468, + "grad_norm": 1.3813196420669556, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6308, + "step": 3999 + }, + { + "epoch": 0.05349918523263185, + "grad_norm": 1.2064168453216553, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6321, + "step": 4030 + }, + { + "epoch": 0.05391071742672902, + "grad_norm": 1.2771285772323608, + "learning_rate": 4.906263980464644e-05, + "loss": 0.622, + "step": 4061 + }, + { + "epoch": 0.054322249620826185, + "grad_norm": 1.1788181066513062, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6268, + "step": 4092 + }, + { + "epoch": 0.05473378181492335, + "grad_norm": 1.3975950479507446, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6251, + "step": 4123 + }, + { + "epoch": 0.05514531400902052, + "grad_norm": 1.2083688974380493, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6287, + "step": 4154 + }, + { + "epoch": 0.05555684620311769, + "grad_norm": 1.2795994281768799, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6302, + "step": 4185 + }, + { + "epoch": 0.05596837839721486, + "grad_norm": 1.1056169271469116, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6256, + "step": 4216 + }, + { + "epoch": 0.056379910591312025, + "grad_norm": 1.415405511856079, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6334, + "step": 4247 + }, + { + "epoch": 0.056791442785409194, + "grad_norm": 1.2858442068099976, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6248, + "step": 4278 + }, + { + "epoch": 0.05720297497950636, + "grad_norm": 1.3914793729782104, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6229, + "step": 4309 + }, + { + "epoch": 0.05761450717360353, + "grad_norm": 1.1236270666122437, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6272, + "step": 4340 + }, + { + "epoch": 0.0580260393677007, + "grad_norm": 1.393020510673523, + "learning_rate": 4.882129447892753e-05, + "loss": 0.624, + "step": 4371 + }, + { + "epoch": 0.058437571561797866, + "grad_norm": 1.3223553895950317, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6267, + "step": 4402 + }, + { + "epoch": 0.058849103755895034, + "grad_norm": 1.2308725118637085, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6255, + "step": 4433 + }, + { + "epoch": 0.0592606359499922, + "grad_norm": 1.3741412162780762, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6246, + "step": 4464 + }, + { + "epoch": 0.05967216814408937, + "grad_norm": 1.2487592697143555, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6221, + "step": 4495 + }, + { + "epoch": 0.06008370033818654, + "grad_norm": 1.3239238262176514, + "learning_rate": 4.869052379269719e-05, + "loss": 0.621, + "step": 4526 + }, + { + "epoch": 0.060495232532283706, + "grad_norm": 1.1411608457565308, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6211, + "step": 4557 + }, + { + "epoch": 0.060906764726380874, + "grad_norm": 1.1687365770339966, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6159, + "step": 4588 + }, + { + "epoch": 0.06131829692047804, + "grad_norm": 1.25679349899292, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6254, + "step": 4619 + }, + { + "epoch": 0.06172982911457521, + "grad_norm": 1.2380110025405884, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6305, + "step": 4650 + }, + { + "epoch": 0.06214136130867238, + "grad_norm": 1.3054466247558594, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6202, + "step": 4681 + }, + { + "epoch": 0.06255289350276955, + "grad_norm": 1.3628545999526978, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6192, + "step": 4712 + }, + { + "epoch": 0.06296442569686671, + "grad_norm": 1.3576844930648804, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6213, + "step": 4743 + }, + { + "epoch": 0.06337595789096388, + "grad_norm": 1.3126753568649292, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6198, + "step": 4774 + }, + { + "epoch": 0.06378749008506104, + "grad_norm": 1.2266637086868286, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6229, + "step": 4805 + }, + { + "epoch": 0.06419902227915822, + "grad_norm": 1.4964330196380615, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6264, + "step": 4836 + }, + { + "epoch": 0.06461055447325538, + "grad_norm": 1.1138426065444946, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6258, + "step": 4867 + }, + { + "epoch": 0.06502208666735255, + "grad_norm": 1.2055357694625854, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6263, + "step": 4898 + }, + { + "epoch": 0.06543361886144972, + "grad_norm": 1.2684381008148193, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6212, + "step": 4929 + }, + { + "epoch": 0.06584515105554689, + "grad_norm": 1.2323859930038452, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6253, + "step": 4960 + }, + { + "epoch": 0.06625668324964405, + "grad_norm": 1.091818928718567, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6127, + "step": 4991 + }, + { + "epoch": 0.06666821544374123, + "grad_norm": 1.2208534479141235, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6179, + "step": 5022 + }, + { + "epoch": 0.06707974763783839, + "grad_norm": 1.3426082134246826, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6246, + "step": 5053 + }, + { + "epoch": 0.06749127983193556, + "grad_norm": 1.2961252927780151, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.6149, + "step": 5084 + }, + { + "epoch": 0.06790281202603272, + "grad_norm": 1.2990977764129639, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6303, + "step": 5115 + }, + { + "epoch": 0.0683143442201299, + "grad_norm": 1.2724366188049316, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6234, + "step": 5146 + }, + { + "epoch": 0.06872587641422706, + "grad_norm": 1.2031605243682861, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6212, + "step": 5177 + }, + { + "epoch": 0.06913740860832424, + "grad_norm": 1.2580938339233398, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6144, + "step": 5208 + }, + { + "epoch": 0.0695489408024214, + "grad_norm": 1.203706979751587, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6259, + "step": 5239 + }, + { + "epoch": 0.06996047299651857, + "grad_norm": 1.421762466430664, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6221, + "step": 5270 + }, + { + "epoch": 0.07037200519061573, + "grad_norm": 1.326038122177124, + "learning_rate": 4.793722210363262e-05, + "loss": 0.6152, + "step": 5301 + }, + { + "epoch": 0.07078353738471291, + "grad_norm": 1.361939787864685, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6206, + "step": 5332 + }, + { + "epoch": 0.07119506957881007, + "grad_norm": 1.2153042554855347, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6209, + "step": 5363 + }, + { + "epoch": 0.07160660177290724, + "grad_norm": 1.3737229108810425, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6121, + "step": 5394 + }, + { + "epoch": 0.0720181339670044, + "grad_norm": 1.1781859397888184, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6183, + "step": 5425 + }, + { + "epoch": 0.07242966616110158, + "grad_norm": 1.069472312927246, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.6117, + "step": 5456 + }, + { + "epoch": 0.07284119835519874, + "grad_norm": 1.380937099456787, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6175, + "step": 5487 + }, + { + "epoch": 0.07325273054929592, + "grad_norm": 1.1551166772842407, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6164, + "step": 5518 + }, + { + "epoch": 0.07366426274339308, + "grad_norm": 1.2816351652145386, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6168, + "step": 5549 + }, + { + "epoch": 0.07407579493749025, + "grad_norm": 1.0833333730697632, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6159, + "step": 5580 + }, + { + "epoch": 0.07448732713158741, + "grad_norm": 1.2533819675445557, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6141, + "step": 5611 + }, + { + "epoch": 0.07489885932568459, + "grad_norm": 1.1354937553405762, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.612, + "step": 5642 + }, + { + "epoch": 0.07531039151978175, + "grad_norm": 1.2579649686813354, + "learning_rate": 4.751783684659e-05, + "loss": 0.6147, + "step": 5673 + }, + { + "epoch": 0.07572192371387892, + "grad_norm": 1.2830709218978882, + "learning_rate": 4.748122674890348e-05, + "loss": 0.62, + "step": 5704 + }, + { + "epoch": 0.07613345590797609, + "grad_norm": 1.2012654542922974, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6199, + "step": 5735 + }, + { + "epoch": 0.07654498810207326, + "grad_norm": 1.2926653623580933, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6104, + "step": 5766 + }, + { + "epoch": 0.07695652029617042, + "grad_norm": 1.2043049335479736, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6261, + "step": 5797 + }, + { + "epoch": 0.0773680524902676, + "grad_norm": 1.1188119649887085, + "learning_rate": 4.733225355658999e-05, + "loss": 0.6056, + "step": 5828 + }, + { + "epoch": 0.07777958468436476, + "grad_norm": 1.1217443943023682, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.6124, + "step": 5859 + }, + { + "epoch": 0.07819111687846193, + "grad_norm": 1.1623468399047852, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6191, + "step": 5890 + }, + { + "epoch": 0.0786026490725591, + "grad_norm": 1.1147449016571045, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6091, + "step": 5921 + }, + { + "epoch": 0.07901418126665627, + "grad_norm": 1.0523818731307983, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6151, + "step": 5952 + }, + { + "epoch": 0.07942571346075343, + "grad_norm": 1.3112642765045166, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6106, + "step": 5983 + }, + { + "epoch": 0.0798372456548506, + "grad_norm": 1.1637545824050903, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6131, + "step": 6014 + }, + { + "epoch": 0.08024877784894777, + "grad_norm": 1.258862018585205, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6119, + "step": 6045 + }, + { + "epoch": 0.08066031004304494, + "grad_norm": 1.0981336832046509, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6149, + "step": 6076 + }, + { + "epoch": 0.0810718422371421, + "grad_norm": 1.1576476097106934, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6076, + "step": 6107 + }, + { + "epoch": 0.08148337443123928, + "grad_norm": 1.1076856851577759, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6105, + "step": 6138 + }, + { + "epoch": 0.08189490662533644, + "grad_norm": 14.014957427978516, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6189, + "step": 6169 + }, + { + "epoch": 0.08230643881943361, + "grad_norm": 1.1413066387176514, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6038, + "step": 6200 + }, + { + "epoch": 0.08271797101353077, + "grad_norm": 1.2316724061965942, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6125, + "step": 6231 + }, + { + "epoch": 0.08312950320762795, + "grad_norm": 1.0919370651245117, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6081, + "step": 6262 + }, + { + "epoch": 0.08354103540172511, + "grad_norm": 1.139345407485962, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6078, + "step": 6293 + }, + { + "epoch": 0.08395256759582229, + "grad_norm": 1.1303651332855225, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6071, + "step": 6324 + }, + { + "epoch": 0.08436409978991945, + "grad_norm": 1.2546321153640747, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.6092, + "step": 6355 + }, + { + "epoch": 0.08477563198401662, + "grad_norm": 1.195173740386963, + "learning_rate": 4.661238099862658e-05, + "loss": 0.5998, + "step": 6386 + }, + { + "epoch": 0.08518716417811378, + "grad_norm": 1.2116323709487915, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6099, + "step": 6417 + }, + { + "epoch": 0.08559869637221096, + "grad_norm": 1.131951928138733, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.6105, + "step": 6448 + }, + { + "epoch": 0.08601022856630812, + "grad_norm": 1.2176560163497925, + "learning_rate": 4.648464661063478e-05, + "loss": 0.6045, + "step": 6479 + }, + { + "epoch": 0.0864217607604053, + "grad_norm": 1.15361750125885, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.6067, + "step": 6510 + }, + { + "epoch": 0.08683329295450246, + "grad_norm": 1.1422772407531738, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.605, + "step": 6541 + }, + { + "epoch": 0.08724482514859963, + "grad_norm": 1.250319004058838, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6069, + "step": 6572 + }, + { + "epoch": 0.08765635734269679, + "grad_norm": 1.0947929620742798, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6099, + "step": 6603 + }, + { + "epoch": 0.08806788953679397, + "grad_norm": 1.2815848588943481, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6132, + "step": 6634 + }, + { + "epoch": 0.08847942173089113, + "grad_norm": 1.196083664894104, + "learning_rate": 4.622264489304762e-05, + "loss": 0.604, + "step": 6665 + }, + { + "epoch": 0.0888909539249883, + "grad_norm": 1.1473642587661743, + "learning_rate": 4.617813681048434e-05, + "loss": 0.6081, + "step": 6696 + }, + { + "epoch": 0.08930248611908546, + "grad_norm": 1.182445764541626, + "learning_rate": 4.61333897355256e-05, + "loss": 0.6108, + "step": 6727 + }, + { + "epoch": 0.08971401831318264, + "grad_norm": 1.2859700918197632, + "learning_rate": 4.608840417313604e-05, + "loss": 0.6074, + "step": 6758 + }, + { + "epoch": 0.0901255505072798, + "grad_norm": 1.1134952306747437, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6139, + "step": 6789 + }, + { + "epoch": 0.09053708270137698, + "grad_norm": 1.2447940111160278, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6049, + "step": 6820 + }, + { + "epoch": 0.09094861489547414, + "grad_norm": 1.042465329170227, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6113, + "step": 6851 + }, + { + "epoch": 0.09136014708957131, + "grad_norm": 1.2242035865783691, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6077, + "step": 6882 + }, + { + "epoch": 0.09177167928366847, + "grad_norm": 1.2531142234802246, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6081, + "step": 6913 + }, + { + "epoch": 0.09218321147776565, + "grad_norm": 1.194645881652832, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6158, + "step": 6944 + }, + { + "epoch": 0.09259474367186281, + "grad_norm": 1.1052149534225464, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6079, + "step": 6975 + }, + { + "epoch": 0.09300627586595998, + "grad_norm": 1.1275289058685303, + "learning_rate": 4.571999560773736e-05, + "loss": 0.6107, + "step": 7006 + }, + { + "epoch": 0.09341780806005714, + "grad_norm": 1.251237154006958, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6028, + "step": 7037 + }, + { + "epoch": 0.09382934025415432, + "grad_norm": 1.1206951141357422, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6035, + "step": 7068 + }, + { + "epoch": 0.09424087244825148, + "grad_norm": 1.2242387533187866, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.5959, + "step": 7099 + }, + { + "epoch": 0.09465240464234866, + "grad_norm": 1.0749527215957642, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6005, + "step": 7130 + }, + { + "epoch": 0.09506393683644582, + "grad_norm": 1.183052659034729, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6102, + "step": 7161 + }, + { + "epoch": 0.09547546903054299, + "grad_norm": 1.1045013666152954, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6056, + "step": 7192 + }, + { + "epoch": 0.09588700122464015, + "grad_norm": 1.2442799806594849, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6024, + "step": 7223 + }, + { + "epoch": 0.09629853341873733, + "grad_norm": 1.2083353996276855, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6062, + "step": 7254 + }, + { + "epoch": 0.09671006561283449, + "grad_norm": 1.1391205787658691, + "learning_rate": 4.528766329570536e-05, + "loss": 0.5981, + "step": 7285 + }, + { + "epoch": 0.09712159780693166, + "grad_norm": 1.1213765144348145, + "learning_rate": 4.523847434837447e-05, + "loss": 0.5964, + "step": 7316 + }, + { + "epoch": 0.09753313000102883, + "grad_norm": 1.1574114561080933, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6035, + "step": 7347 + }, + { + "epoch": 0.097944662195126, + "grad_norm": 1.0101516246795654, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6036, + "step": 7378 + }, + { + "epoch": 0.09835619438922316, + "grad_norm": 1.1439381837844849, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6049, + "step": 7409 + }, + { + "epoch": 0.09876772658332034, + "grad_norm": 1.0668189525604248, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6035, + "step": 7440 + }, + { + "epoch": 0.0991792587774175, + "grad_norm": 1.2038415670394897, + "learning_rate": 4.498911497712155e-05, + "loss": 0.6062, + "step": 7471 + }, + { + "epoch": 0.09959079097151467, + "grad_norm": 1.1965882778167725, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6041, + "step": 7502 + }, + { + "epoch": 0.10000232316561183, + "grad_norm": 1.5885149240493774, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6129, + "step": 7533 + }, + { + "epoch": 0.10041385535970901, + "grad_norm": 1.058237910270691, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.5957, + "step": 7564 + }, + { + "epoch": 0.10082538755380617, + "grad_norm": 1.2043278217315674, + "learning_rate": 4.478556380141218e-05, + "loss": 0.5943, + "step": 7595 + }, + { + "epoch": 0.10123691974790335, + "grad_norm": 1.1602888107299805, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.5971, + "step": 7626 + }, + { + "epoch": 0.1016484519420005, + "grad_norm": 1.1536785364151, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.604, + "step": 7657 + }, + { + "epoch": 0.10205998413609768, + "grad_norm": 1.1859934329986572, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6043, + "step": 7688 + }, + { + "epoch": 0.10247151633019484, + "grad_norm": 1.0551954507827759, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6008, + "step": 7719 + }, + { + "epoch": 0.10288304852429202, + "grad_norm": 1.015857458114624, + "learning_rate": 4.452610552959183e-05, + "loss": 0.5961, + "step": 7750 + }, + { + "epoch": 0.10329458071838918, + "grad_norm": 1.0759170055389404, + "learning_rate": 4.447355047201428e-05, + "loss": 0.6025, + "step": 7781 + }, + { + "epoch": 0.10370611291248635, + "grad_norm": 1.025030255317688, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.602, + "step": 7812 + }, + { + "epoch": 0.10411764510658351, + "grad_norm": 1.0884742736816406, + "learning_rate": 4.436778168330484e-05, + "loss": 0.5989, + "step": 7843 + }, + { + "epoch": 0.10452917730068069, + "grad_norm": 1.1207304000854492, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.6016, + "step": 7874 + }, + { + "epoch": 0.10494070949477785, + "grad_norm": 0.9768222570419312, + "learning_rate": 4.42611386459262e-05, + "loss": 0.605, + "step": 7905 + }, + { + "epoch": 0.10535224168887503, + "grad_norm": 1.0720235109329224, + "learning_rate": 4.420749078676133e-05, + "loss": 0.5971, + "step": 7936 + }, + { + "epoch": 0.10576377388297219, + "grad_norm": 1.1767033338546753, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.5877, + "step": 7967 + }, + { + "epoch": 0.10617530607706936, + "grad_norm": 1.3390216827392578, + "learning_rate": 4.409954541451762e-05, + "loss": 0.5973, + "step": 7998 + }, + { + "epoch": 0.10658683827116652, + "grad_norm": 1.7948216199874878, + "learning_rate": 4.404524911958764e-05, + "loss": 0.5922, + "step": 8029 + }, + { + "epoch": 0.1069983704652637, + "grad_norm": 1.177847981452942, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6002, + "step": 8060 + }, + { + "epoch": 0.10740990265936086, + "grad_norm": 1.1014128923416138, + "learning_rate": 4.393601237573607e-05, + "loss": 0.5945, + "step": 8091 + }, + { + "epoch": 0.10782143485345803, + "grad_norm": 1.067475438117981, + "learning_rate": 4.388107315953628e-05, + "loss": 0.5996, + "step": 8122 + }, + { + "epoch": 0.1082329670475552, + "grad_norm": 1.1241830587387085, + "learning_rate": 4.382592087299212e-05, + "loss": 0.5973, + "step": 8153 + }, + { + "epoch": 0.10864449924165237, + "grad_norm": 1.072426438331604, + "learning_rate": 4.377055613848964e-05, + "loss": 0.601, + "step": 8184 + }, + { + "epoch": 0.10905603143574953, + "grad_norm": 1.0138245820999146, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6034, + "step": 8215 + }, + { + "epoch": 0.1094675636298467, + "grad_norm": 0.9993209838867188, + "learning_rate": 4.365919182713416e-05, + "loss": 0.6024, + "step": 8246 + }, + { + "epoch": 0.10987909582394387, + "grad_norm": 1.157468318939209, + "learning_rate": 4.360319350701226e-05, + "loss": 0.5987, + "step": 8277 + }, + { + "epoch": 0.11029062801804104, + "grad_norm": 1.2073496580123901, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6023, + "step": 8308 + }, + { + "epoch": 0.1107021602121382, + "grad_norm": 1.0800622701644897, + "learning_rate": 4.349056769754021e-05, + "loss": 0.5993, + "step": 8339 + }, + { + "epoch": 0.11111369240623538, + "grad_norm": 1.1365702152252197, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.595, + "step": 8370 + }, + { + "epoch": 0.11152522460033254, + "grad_norm": 1.1367030143737793, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.5921, + "step": 8401 + }, + { + "epoch": 0.11193675679442971, + "grad_norm": 1.0323926210403442, + "learning_rate": 4.332006561018488e-05, + "loss": 0.5883, + "step": 8432 + }, + { + "epoch": 0.11234828898852688, + "grad_norm": 1.2352523803710938, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.5957, + "step": 8463 + }, + { + "epoch": 0.11275982118262405, + "grad_norm": 1.1763888597488403, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.5941, + "step": 8494 + }, + { + "epoch": 0.11317135337672121, + "grad_norm": 1.1331796646118164, + "learning_rate": 4.314770288177384e-05, + "loss": 0.5993, + "step": 8525 + }, + { + "epoch": 0.11358288557081839, + "grad_norm": 1.146270990371704, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6041, + "step": 8556 + }, + { + "epoch": 0.11399441776491555, + "grad_norm": 1.0433791875839233, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.5918, + "step": 8587 + }, + { + "epoch": 0.11440594995901272, + "grad_norm": 1.2013510465621948, + "learning_rate": 4.297349701798505e-05, + "loss": 0.6031, + "step": 8618 + }, + { + "epoch": 0.11481748215310988, + "grad_norm": 1.1263506412506104, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.597, + "step": 8649 + }, + { + "epoch": 0.11522901434720706, + "grad_norm": 1.1360443830490112, + "learning_rate": 4.285634454093198e-05, + "loss": 0.5909, + "step": 8680 + }, + { + "epoch": 0.11564054654130422, + "grad_norm": 1.0095175504684448, + "learning_rate": 4.279746571169086e-05, + "loss": 0.59, + "step": 8711 + }, + { + "epoch": 0.1160520787354014, + "grad_norm": 1.0894793272018433, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.5894, + "step": 8742 + }, + { + "epoch": 0.11646361092949856, + "grad_norm": 1.1603643894195557, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.5924, + "step": 8773 + }, + { + "epoch": 0.11687514312359573, + "grad_norm": 1.177464485168457, + "learning_rate": 4.261962684116106e-05, + "loss": 0.5946, + "step": 8804 + }, + { + "epoch": 0.11728667531769289, + "grad_norm": 0.9632905125617981, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.5944, + "step": 8835 + }, + { + "epoch": 0.11769820751179007, + "grad_norm": 1.0413568019866943, + "learning_rate": 4.250007230372134e-05, + "loss": 0.5844, + "step": 8866 + }, + { + "epoch": 0.11810973970588723, + "grad_norm": 1.0879931449890137, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.588, + "step": 8897 + }, + { + "epoch": 0.1185212718999844, + "grad_norm": 1.0681732892990112, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.5832, + "step": 8928 + }, + { + "epoch": 0.11893280409408157, + "grad_norm": 1.0609339475631714, + "learning_rate": 4.231926105340768e-05, + "loss": 0.5886, + "step": 8959 + }, + { + "epoch": 0.11934433628817874, + "grad_norm": 1.4936331510543823, + "learning_rate": 4.225859883654776e-05, + "loss": 0.5868, + "step": 8990 + }, + { + "epoch": 0.1197558684822759, + "grad_norm": 1.1860368251800537, + "learning_rate": 4.219774185874569e-05, + "loss": 0.5904, + "step": 9021 + }, + { + "epoch": 0.12016740067637308, + "grad_norm": 1.1657029390335083, + "learning_rate": 4.213669080676418e-05, + "loss": 0.5872, + "step": 9052 + }, + { + "epoch": 0.12057893287047024, + "grad_norm": 1.0721909999847412, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.5977, + "step": 9083 + }, + { + "epoch": 0.12099046506456741, + "grad_norm": 1.1832354068756104, + "learning_rate": 4.201400923825648e-05, + "loss": 0.5902, + "step": 9114 + }, + { + "epoch": 0.12140199725866457, + "grad_norm": 1.1306614875793457, + "learning_rate": 4.195238010617511e-05, + "loss": 0.5931, + "step": 9145 + }, + { + "epoch": 0.12181352945276175, + "grad_norm": 0.9744802713394165, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6, + "step": 9176 + }, + { + "epoch": 0.12222506164685891, + "grad_norm": 1.0286784172058105, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.5913, + "step": 9207 + }, + { + "epoch": 0.12263659384095608, + "grad_norm": 1.0559006929397583, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.5999, + "step": 9238 + }, + { + "epoch": 0.12304812603505325, + "grad_norm": 1.0805801153182983, + "learning_rate": 4.170395751189495e-05, + "loss": 0.5929, + "step": 9269 + }, + { + "epoch": 0.12345965822915042, + "grad_norm": 1.088597059249878, + "learning_rate": 4.164137885110921e-05, + "loss": 0.5873, + "step": 9300 + }, + { + "epoch": 0.12387119042324758, + "grad_norm": 1.028403639793396, + "learning_rate": 4.157861239462495e-05, + "loss": 0.5939, + "step": 9331 + }, + { + "epoch": 0.12428272261734476, + "grad_norm": 0.9916087985038757, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.5932, + "step": 9362 + }, + { + "epoch": 0.12469425481144192, + "grad_norm": 1.0268062353134155, + "learning_rate": 4.145251892991588e-05, + "loss": 0.5945, + "step": 9393 + }, + { + "epoch": 0.1251057870055391, + "grad_norm": 1.046953797340393, + "learning_rate": 4.138919334463868e-05, + "loss": 0.5923, + "step": 9424 + }, + { + "epoch": 0.12551731919963627, + "grad_norm": 1.1010463237762451, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.5898, + "step": 9455 + }, + { + "epoch": 0.12592885139373342, + "grad_norm": 1.1851146221160889, + "learning_rate": 4.126198804133398e-05, + "loss": 0.591, + "step": 9486 + }, + { + "epoch": 0.1263403835878306, + "grad_norm": 1.106885313987732, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.5893, + "step": 9517 + }, + { + "epoch": 0.12675191578192777, + "grad_norm": 1.0527287721633911, + "learning_rate": 4.113404868280107e-05, + "loss": 0.5937, + "step": 9548 + }, + { + "epoch": 0.12716344797602494, + "grad_norm": 0.9986059069633484, + "learning_rate": 4.106980553625457e-05, + "loss": 0.5961, + "step": 9579 + }, + { + "epoch": 0.1275749801701221, + "grad_norm": 1.025026798248291, + "learning_rate": 4.100538104413674e-05, + "loss": 0.5853, + "step": 9610 + }, + { + "epoch": 0.12798651236421926, + "grad_norm": 1.062345027923584, + "learning_rate": 4.09407759334692e-05, + "loss": 0.5916, + "step": 9641 + }, + { + "epoch": 0.12839804455831644, + "grad_norm": 1.0635534524917603, + "learning_rate": 4.087599093331186e-05, + "loss": 0.5919, + "step": 9672 + }, + { + "epoch": 0.1288095767524136, + "grad_norm": 1.1224849224090576, + "learning_rate": 4.081102677475462e-05, + "loss": 0.5948, + "step": 9703 + }, + { + "epoch": 0.12922110894651076, + "grad_norm": 1.113202452659607, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.5807, + "step": 9734 + }, + { + "epoch": 0.12963264114060793, + "grad_norm": 1.0213284492492676, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.5813, + "step": 9765 + }, + { + "epoch": 0.1300441733347051, + "grad_norm": 1.1083142757415771, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.5856, + "step": 9796 + }, + { + "epoch": 0.13045570552880228, + "grad_norm": 1.1317702531814575, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.586, + "step": 9827 + }, + { + "epoch": 0.13086723772289943, + "grad_norm": 0.9876610040664673, + "learning_rate": 4.048354433517794e-05, + "loss": 0.5921, + "step": 9858 + }, + { + "epoch": 0.1312787699169966, + "grad_norm": 1.085672378540039, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.5791, + "step": 9889 + }, + { + "epoch": 0.13169030211109378, + "grad_norm": 1.014452338218689, + "learning_rate": 4.035132306369438e-05, + "loss": 0.5921, + "step": 9920 + }, + { + "epoch": 0.13210183430519096, + "grad_norm": 1.059309720993042, + "learning_rate": 4.028495219804555e-05, + "loss": 0.5834, + "step": 9951 + }, + { + "epoch": 0.1325133664992881, + "grad_norm": 1.1058080196380615, + "learning_rate": 4.021840884378864e-05, + "loss": 0.5826, + "step": 9982 + }, + { + "epoch": 0.13292489869338528, + "grad_norm": 1.0567057132720947, + "learning_rate": 4.015169375185633e-05, + "loss": 0.5924, + "step": 10013 + }, + { + "epoch": 0.13333643088748245, + "grad_norm": 1.0371674299240112, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.589, + "step": 10044 + }, + { + "epoch": 0.13374796308157963, + "grad_norm": 0.909173309803009, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.5978, + "step": 10075 + }, + { + "epoch": 0.13415949527567678, + "grad_norm": 1.0078933238983154, + "learning_rate": 3.995052558835377e-05, + "loss": 0.5814, + "step": 10106 + }, + { + "epoch": 0.13457102746977395, + "grad_norm": 0.9631708264350891, + "learning_rate": 3.988313109368017e-05, + "loss": 0.587, + "step": 10137 + }, + { + "epoch": 0.13498255966387113, + "grad_norm": 1.1884409189224243, + "learning_rate": 3.981556864489504e-05, + "loss": 0.5906, + "step": 10168 + }, + { + "epoch": 0.1353940918579683, + "grad_norm": 0.9554224014282227, + "learning_rate": 3.974783900443142e-05, + "loss": 0.5888, + "step": 10199 + }, + { + "epoch": 0.13580562405206545, + "grad_norm": 1.0510096549987793, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.5828, + "step": 10230 + }, + { + "epoch": 0.13621715624616262, + "grad_norm": 0.9769448637962341, + "learning_rate": 3.961188120762596e-05, + "loss": 0.5878, + "step": 10261 + }, + { + "epoch": 0.1366286884402598, + "grad_norm": 0.9610607028007507, + "learning_rate": 3.954365458554938e-05, + "loss": 0.5863, + "step": 10292 + }, + { + "epoch": 0.13704022063435697, + "grad_norm": 1.0102349519729614, + "learning_rate": 3.947526384030751e-05, + "loss": 0.5812, + "step": 10323 + }, + { + "epoch": 0.13745175282845412, + "grad_norm": 1.138122320175171, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.5841, + "step": 10354 + }, + { + "epoch": 0.1378632850225513, + "grad_norm": 1.0107802152633667, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.5847, + "step": 10385 + }, + { + "epoch": 0.13827481721664847, + "grad_norm": 0.9923847317695618, + "learning_rate": 3.926911459260109e-05, + "loss": 0.5971, + "step": 10416 + }, + { + "epoch": 0.13868634941074565, + "grad_norm": 1.0700715780258179, + "learning_rate": 3.920007509089102e-05, + "loss": 0.5831, + "step": 10447 + }, + { + "epoch": 0.1390978816048428, + "grad_norm": 1.1638612747192383, + "learning_rate": 3.913087534326357e-05, + "loss": 0.5849, + "step": 10478 + }, + { + "epoch": 0.13950941379893997, + "grad_norm": 1.0560393333435059, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.5897, + "step": 10509 + }, + { + "epoch": 0.13992094599303714, + "grad_norm": 0.9604248404502869, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.5862, + "step": 10540 + }, + { + "epoch": 0.14033247818713432, + "grad_norm": 1.0003983974456787, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.5905, + "step": 10571 + }, + { + "epoch": 0.14074401038123147, + "grad_norm": 1.0378546714782715, + "learning_rate": 3.885248953871491e-05, + "loss": 0.5938, + "step": 10602 + }, + { + "epoch": 0.14115554257532864, + "grad_norm": 0.9967820644378662, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.5821, + "step": 10633 + }, + { + "epoch": 0.14156707476942582, + "grad_norm": 1.029188632965088, + "learning_rate": 3.871235554965218e-05, + "loss": 0.5882, + "step": 10664 + }, + { + "epoch": 0.141978606963523, + "grad_norm": 1.1272863149642944, + "learning_rate": 3.864205604623078e-05, + "loss": 0.5898, + "step": 10695 + }, + { + "epoch": 0.14239013915762014, + "grad_norm": 1.0054222345352173, + "learning_rate": 3.857160259406107e-05, + "loss": 0.5873, + "step": 10726 + }, + { + "epoch": 0.1428016713517173, + "grad_norm": 1.0327833890914917, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.5842, + "step": 10757 + }, + { + "epoch": 0.1432132035458145, + "grad_norm": 1.0088285207748413, + "learning_rate": 3.843023702543556e-05, + "loss": 0.5903, + "step": 10788 + }, + { + "epoch": 0.14362473573991166, + "grad_norm": 0.9472872018814087, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.5809, + "step": 10819 + }, + { + "epoch": 0.1440362679340088, + "grad_norm": 1.1008390188217163, + "learning_rate": 3.828826522492255e-05, + "loss": 0.5775, + "step": 10850 + }, + { + "epoch": 0.14444780012810599, + "grad_norm": 1.1251379251480103, + "learning_rate": 3.821705398930713e-05, + "loss": 0.5821, + "step": 10881 + }, + { + "epoch": 0.14485933232220316, + "grad_norm": 0.9825986623764038, + "learning_rate": 3.814569360103385e-05, + "loss": 0.5749, + "step": 10912 + }, + { + "epoch": 0.14527086451630034, + "grad_norm": 0.9303480982780457, + "learning_rate": 3.807418486539499e-05, + "loss": 0.5827, + "step": 10943 + }, + { + "epoch": 0.14568239671039748, + "grad_norm": 0.9948610663414001, + "learning_rate": 3.80025285893569e-05, + "loss": 0.598, + "step": 10974 + }, + { + "epoch": 0.14609392890449466, + "grad_norm": 1.096449851989746, + "learning_rate": 3.793072558155093e-05, + "loss": 0.5749, + "step": 11005 + }, + { + "epoch": 0.14650546109859183, + "grad_norm": 1.022290587425232, + "learning_rate": 3.785877665226426e-05, + "loss": 0.5824, + "step": 11036 + }, + { + "epoch": 0.146916993292689, + "grad_norm": 1.083853840827942, + "learning_rate": 3.778668261343079e-05, + "loss": 0.5923, + "step": 11067 + }, + { + "epoch": 0.14732852548678615, + "grad_norm": 0.919503390789032, + "learning_rate": 3.771444427862192e-05, + "loss": 0.5924, + "step": 11098 + }, + { + "epoch": 0.14774005768088333, + "grad_norm": 1.0682687759399414, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.5824, + "step": 11129 + }, + { + "epoch": 0.1481515898749805, + "grad_norm": 0.9599080085754395, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.5765, + "step": 11160 + }, + { + "epoch": 0.14856312206907768, + "grad_norm": 0.968349039554596, + "learning_rate": 3.749687165842753e-05, + "loss": 0.5932, + "step": 11191 + }, + { + "epoch": 0.14897465426317483, + "grad_norm": 0.9846999049186707, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.5791, + "step": 11222 + }, + { + "epoch": 0.149386186457272, + "grad_norm": 1.0673292875289917, + "learning_rate": 3.735111675341645e-05, + "loss": 0.5803, + "step": 11253 + }, + { + "epoch": 0.14979771865136918, + "grad_norm": 1.0111202001571655, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.5837, + "step": 11284 + }, + { + "epoch": 0.15020925084546635, + "grad_norm": 1.0955816507339478, + "learning_rate": 3.720480432728287e-05, + "loss": 0.5822, + "step": 11315 + }, + { + "epoch": 0.1506207830395635, + "grad_norm": 0.9147946238517761, + "learning_rate": 3.71314411067092e-05, + "loss": 0.5873, + "step": 11346 + }, + { + "epoch": 0.15103231523366067, + "grad_norm": 0.9630762934684753, + "learning_rate": 3.70579409844715e-05, + "loss": 0.5914, + "step": 11377 + }, + { + "epoch": 0.15144384742775785, + "grad_norm": 0.9123234748840332, + "learning_rate": 3.698430479000865e-05, + "loss": 0.5784, + "step": 11408 + }, + { + "epoch": 0.15185537962185502, + "grad_norm": 1.0654667615890503, + "learning_rate": 3.691053335429509e-05, + "loss": 0.5805, + "step": 11439 + }, + { + "epoch": 0.15226691181595217, + "grad_norm": 1.111958622932434, + "learning_rate": 3.683662750983147e-05, + "loss": 0.5868, + "step": 11470 + }, + { + "epoch": 0.15267844401004935, + "grad_norm": 1.024422526359558, + "learning_rate": 3.676258809063518e-05, + "loss": 0.5904, + "step": 11501 + }, + { + "epoch": 0.15308997620414652, + "grad_norm": 1.092668056488037, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.5885, + "step": 11532 + }, + { + "epoch": 0.1535015083982437, + "grad_norm": 0.9739312529563904, + "learning_rate": 3.661411187164166e-05, + "loss": 0.5806, + "step": 11563 + }, + { + "epoch": 0.15391304059234084, + "grad_norm": 1.098699688911438, + "learning_rate": 3.65396767473784e-05, + "loss": 0.5755, + "step": 11594 + }, + { + "epoch": 0.15432457278643802, + "grad_norm": 0.9998940825462341, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.5758, + "step": 11625 + }, + { + "epoch": 0.1547361049805352, + "grad_norm": 0.9077128767967224, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.5742, + "step": 11656 + }, + { + "epoch": 0.15514763717463237, + "grad_norm": 0.9274038076400757, + "learning_rate": 3.63155933997859e-05, + "loss": 0.589, + "step": 11687 + }, + { + "epoch": 0.15555916936872952, + "grad_norm": 0.9404906034469604, + "learning_rate": 3.624064243537758e-05, + "loss": 0.5848, + "step": 11718 + }, + { + "epoch": 0.1559707015628267, + "grad_norm": 0.9896044731140137, + "learning_rate": 3.616556462184716e-05, + "loss": 0.5917, + "step": 11749 + }, + { + "epoch": 0.15638223375692387, + "grad_norm": 1.0280394554138184, + "learning_rate": 3.609036080643755e-05, + "loss": 0.5811, + "step": 11780 + }, + { + "epoch": 0.15679376595102104, + "grad_norm": 0.9462539553642273, + "learning_rate": 3.60150318378136e-05, + "loss": 0.5782, + "step": 11811 + }, + { + "epoch": 0.1572052981451182, + "grad_norm": 1.047635555267334, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.5792, + "step": 11842 + }, + { + "epoch": 0.15761683033921536, + "grad_norm": 0.9613581299781799, + "learning_rate": 3.586400184263408e-05, + "loss": 0.5847, + "step": 11873 + }, + { + "epoch": 0.15802836253331254, + "grad_norm": 0.8944469094276428, + "learning_rate": 3.578830252043148e-05, + "loss": 0.5787, + "step": 11904 + }, + { + "epoch": 0.1584398947274097, + "grad_norm": 1.0031458139419556, + "learning_rate": 3.571248145370125e-05, + "loss": 0.5827, + "step": 11935 + }, + { + "epoch": 0.15885142692150686, + "grad_norm": 0.9425063729286194, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.5899, + "step": 11966 + }, + { + "epoch": 0.15926295911560404, + "grad_norm": 1.0098518133163452, + "learning_rate": 3.556047751054378e-05, + "loss": 0.5849, + "step": 11997 + }, + { + "epoch": 0.1596744913097012, + "grad_norm": 0.8891544342041016, + "learning_rate": 3.548429634946039e-05, + "loss": 0.5772, + "step": 12028 + }, + { + "epoch": 0.16008602350379839, + "grad_norm": 1.1580991744995117, + "learning_rate": 3.540799687451768e-05, + "loss": 0.5859, + "step": 12059 + }, + { + "epoch": 0.16049755569789553, + "grad_norm": 1.046398639678955, + "learning_rate": 3.533157994674485e-05, + "loss": 0.582, + "step": 12090 + }, + { + "epoch": 0.1609090878919927, + "grad_norm": 1.0526766777038574, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.5738, + "step": 12121 + }, + { + "epoch": 0.16132062008608988, + "grad_norm": 0.9915657043457031, + "learning_rate": 3.517839718344311e-05, + "loss": 0.5867, + "step": 12152 + }, + { + "epoch": 0.16173215228018706, + "grad_norm": 0.954210102558136, + "learning_rate": 3.510163307656086e-05, + "loss": 0.578, + "step": 12183 + }, + { + "epoch": 0.1621436844742842, + "grad_norm": 0.952920138835907, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.5843, + "step": 12214 + }, + { + "epoch": 0.16255521666838138, + "grad_norm": 0.9184344410896301, + "learning_rate": 3.494776374368643e-05, + "loss": 0.5748, + "step": 12245 + }, + { + "epoch": 0.16296674886247856, + "grad_norm": 0.9797667264938354, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.5788, + "step": 12276 + }, + { + "epoch": 0.16337828105657573, + "grad_norm": 0.968781054019928, + "learning_rate": 3.479344537543164e-05, + "loss": 0.5835, + "step": 12307 + }, + { + "epoch": 0.16378981325067288, + "grad_norm": 1.0632243156433105, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.5742, + "step": 12338 + }, + { + "epoch": 0.16420134544477005, + "grad_norm": 0.9538365006446838, + "learning_rate": 3.463868493762412e-05, + "loss": 0.5737, + "step": 12369 + }, + { + "epoch": 0.16461287763886723, + "grad_norm": 0.9336950778961182, + "learning_rate": 3.456114112492418e-05, + "loss": 0.5821, + "step": 12400 + }, + { + "epoch": 0.1650244098329644, + "grad_norm": 0.9611359238624573, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.5847, + "step": 12431 + }, + { + "epoch": 0.16543594202706155, + "grad_norm": 1.0813709497451782, + "learning_rate": 3.440573068727905e-05, + "loss": 0.5805, + "step": 12462 + }, + { + "epoch": 0.16584747422115872, + "grad_norm": 0.9780453443527222, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.5761, + "step": 12493 + }, + { + "epoch": 0.1662590064152559, + "grad_norm": 0.9629074335098267, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.58, + "step": 12524 + }, + { + "epoch": 0.16667053860935307, + "grad_norm": 0.9241612553596497, + "learning_rate": 3.417182116258899e-05, + "loss": 0.5732, + "step": 12555 + }, + { + "epoch": 0.16708207080345022, + "grad_norm": 1.0314536094665527, + "learning_rate": 3.409364314116074e-05, + "loss": 0.578, + "step": 12586 + }, + { + "epoch": 0.1674936029975474, + "grad_norm": 0.9907404184341431, + "learning_rate": 3.401536249920559e-05, + "loss": 0.591, + "step": 12617 + }, + { + "epoch": 0.16790513519164457, + "grad_norm": 0.9442338943481445, + "learning_rate": 3.393698012010998e-05, + "loss": 0.5754, + "step": 12648 + }, + { + "epoch": 0.16831666738574175, + "grad_norm": 0.9101460576057434, + "learning_rate": 3.385849688840839e-05, + "loss": 0.5815, + "step": 12679 + }, + { + "epoch": 0.1687281995798389, + "grad_norm": 1.029105544090271, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.5766, + "step": 12710 + }, + { + "epoch": 0.16913973177393607, + "grad_norm": 1.028542160987854, + "learning_rate": 3.370123141100578e-05, + "loss": 0.5769, + "step": 12741 + }, + { + "epoch": 0.16955126396803324, + "grad_norm": 0.9411474466323853, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.5808, + "step": 12772 + }, + { + "epoch": 0.16996279616213042, + "grad_norm": 0.9260859489440918, + "learning_rate": 3.35435731658559e-05, + "loss": 0.5683, + "step": 12803 + }, + { + "epoch": 0.17037432835622757, + "grad_norm": 0.9867289066314697, + "learning_rate": 3.346459897862552e-05, + "loss": 0.5781, + "step": 12834 + }, + { + "epoch": 0.17078586055032474, + "grad_norm": 0.9310885071754456, + "learning_rate": 3.338552926954613e-05, + "loss": 0.5876, + "step": 12865 + }, + { + "epoch": 0.17119739274442192, + "grad_norm": 1.1955841779708862, + "learning_rate": 3.330636493090868e-05, + "loss": 0.577, + "step": 12896 + }, + { + "epoch": 0.1716089249385191, + "grad_norm": 1.0715487003326416, + "learning_rate": 3.322710685607193e-05, + "loss": 0.5835, + "step": 12927 + }, + { + "epoch": 0.17202045713261624, + "grad_norm": 0.9181262850761414, + "learning_rate": 3.314775593945251e-05, + "loss": 0.5733, + "step": 12958 + }, + { + "epoch": 0.17243198932671341, + "grad_norm": 0.9595162868499756, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.5685, + "step": 12989 + }, + { + "epoch": 0.1728435215208106, + "grad_norm": 1.057614803314209, + "learning_rate": 3.298877916376047e-05, + "loss": 0.5736, + "step": 13020 + }, + { + "epoch": 0.17325505371490776, + "grad_norm": 0.9654355049133301, + "learning_rate": 3.290915509871915e-05, + "loss": 0.5782, + "step": 13051 + }, + { + "epoch": 0.1736665859090049, + "grad_norm": 1.0919837951660156, + "learning_rate": 3.282944177993753e-05, + "loss": 0.5771, + "step": 13082 + }, + { + "epoch": 0.1740781181031021, + "grad_norm": 1.064331293106079, + "learning_rate": 3.274964010696957e-05, + "loss": 0.5731, + "step": 13113 + }, + { + "epoch": 0.17448965029719926, + "grad_norm": 0.8971393704414368, + "learning_rate": 3.266975098036629e-05, + "loss": 0.5855, + "step": 13144 + }, + { + "epoch": 0.17490118249129644, + "grad_norm": 0.9515472054481506, + "learning_rate": 3.258977530166562e-05, + "loss": 0.5776, + "step": 13175 + }, + { + "epoch": 0.17531271468539358, + "grad_norm": 0.9811455607414246, + "learning_rate": 3.250971397338227e-05, + "loss": 0.5817, + "step": 13206 + }, + { + "epoch": 0.17572424687949076, + "grad_norm": 0.9187535643577576, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.5751, + "step": 13237 + }, + { + "epoch": 0.17613577907358793, + "grad_norm": 0.9396876096725464, + "learning_rate": 3.234933798294859e-05, + "loss": 0.5744, + "step": 13268 + }, + { + "epoch": 0.1765473112676851, + "grad_norm": 1.064921259880066, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.5731, + "step": 13299 + }, + { + "epoch": 0.17695884346178226, + "grad_norm": 0.946753978729248, + "learning_rate": 3.218863024832985e-05, + "loss": 0.5824, + "step": 13330 + }, + { + "epoch": 0.17737037565587943, + "grad_norm": 0.8623449206352234, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.5764, + "step": 13361 + }, + { + "epoch": 0.1777819078499766, + "grad_norm": 0.9526584148406982, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.5696, + "step": 13392 + }, + { + "epoch": 0.17819344004407378, + "grad_norm": 0.9413353800773621, + "learning_rate": 3.194696249871729e-05, + "loss": 0.5688, + "step": 13423 + }, + { + "epoch": 0.17860497223817093, + "grad_norm": 0.9612322449684143, + "learning_rate": 3.186624857814164e-05, + "loss": 0.5694, + "step": 13454 + }, + { + "epoch": 0.1790165044322681, + "grad_norm": 1.0289851427078247, + "learning_rate": 3.178545717288401e-05, + "loss": 0.5799, + "step": 13485 + }, + { + "epoch": 0.17942803662636528, + "grad_norm": 0.9124505519866943, + "learning_rate": 3.170458919466444e-05, + "loss": 0.5729, + "step": 13516 + }, + { + "epoch": 0.17983956882046245, + "grad_norm": 0.9874391555786133, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.5761, + "step": 13547 + }, + { + "epoch": 0.1802511010145596, + "grad_norm": 1.0058220624923706, + "learning_rate": 3.154262717052985e-05, + "loss": 0.58, + "step": 13578 + }, + { + "epoch": 0.18066263320865678, + "grad_norm": 0.9753491282463074, + "learning_rate": 3.146153495233426e-05, + "loss": 0.5749, + "step": 13609 + }, + { + "epoch": 0.18107416540275395, + "grad_norm": 1.0167535543441772, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.5848, + "step": 13640 + }, + { + "epoch": 0.18148569759685113, + "grad_norm": 0.9970638155937195, + "learning_rate": 3.129913267924946e-05, + "loss": 0.5743, + "step": 13671 + }, + { + "epoch": 0.18189722979094827, + "grad_norm": 1.0033198595046997, + "learning_rate": 3.121782445704782e-05, + "loss": 0.5747, + "step": 13702 + }, + { + "epoch": 0.18230876198504545, + "grad_norm": 1.0107535123825073, + "learning_rate": 3.11364460675423e-05, + "loss": 0.5725, + "step": 13733 + }, + { + "epoch": 0.18272029417914262, + "grad_norm": 0.9341425895690918, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.5726, + "step": 13764 + }, + { + "epoch": 0.1831318263732398, + "grad_norm": 1.0088342428207397, + "learning_rate": 3.097348246077728e-05, + "loss": 0.5737, + "step": 13795 + }, + { + "epoch": 0.18354335856733694, + "grad_norm": 0.8871366381645203, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.576, + "step": 13826 + }, + { + "epoch": 0.18395489076143412, + "grad_norm": 0.971552848815918, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.57, + "step": 13857 + }, + { + "epoch": 0.1843664229555313, + "grad_norm": 1.1167266368865967, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.5652, + "step": 13888 + }, + { + "epoch": 0.18477795514962847, + "grad_norm": 0.9768183827400208, + "learning_rate": 3.064675369851637e-05, + "loss": 0.5671, + "step": 13919 + }, + { + "epoch": 0.18518948734372562, + "grad_norm": 1.0097460746765137, + "learning_rate": 3.056490989455289e-05, + "loss": 0.5771, + "step": 13950 + }, + { + "epoch": 0.1856010195378228, + "grad_norm": 1.010986089706421, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.5723, + "step": 13981 + }, + { + "epoch": 0.18601255173191997, + "grad_norm": 0.8861789107322693, + "learning_rate": 3.040103481317539e-05, + "loss": 0.5798, + "step": 14012 + }, + { + "epoch": 0.18642408392601714, + "grad_norm": 0.9811322689056396, + "learning_rate": 3.03190053850694e-05, + "loss": 0.5703, + "step": 14043 + }, + { + "epoch": 0.1868356161201143, + "grad_norm": 0.9183774590492249, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.5655, + "step": 14074 + }, + { + "epoch": 0.18724714831421146, + "grad_norm": 0.9655874371528625, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.57, + "step": 14105 + }, + { + "epoch": 0.18765868050830864, + "grad_norm": 0.9735666513442993, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.5748, + "step": 14136 + }, + { + "epoch": 0.18807021270240581, + "grad_norm": 0.9084784388542175, + "learning_rate": 2.999029669712431e-05, + "loss": 0.568, + "step": 14167 + }, + { + "epoch": 0.18848174489650296, + "grad_norm": 0.8866302371025085, + "learning_rate": 2.990797641805408e-05, + "loss": 0.5768, + "step": 14198 + }, + { + "epoch": 0.18889327709060014, + "grad_norm": 0.9936773180961609, + "learning_rate": 2.982560075313704e-05, + "loss": 0.5701, + "step": 14229 + }, + { + "epoch": 0.1893048092846973, + "grad_norm": 0.9721146821975708, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.5638, + "step": 14260 + }, + { + "epoch": 0.1897163414787945, + "grad_norm": 0.921503484249115, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.5645, + "step": 14291 + }, + { + "epoch": 0.19012787367289163, + "grad_norm": 0.9194900989532471, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.5724, + "step": 14322 + }, + { + "epoch": 0.1905394058669888, + "grad_norm": 0.9940860271453857, + "learning_rate": 2.949556283611942e-05, + "loss": 0.5731, + "step": 14353 + }, + { + "epoch": 0.19095093806108598, + "grad_norm": 0.9163900017738342, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.5761, + "step": 14384 + }, + { + "epoch": 0.19136247025518316, + "grad_norm": 0.960118293762207, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.572, + "step": 14415 + }, + { + "epoch": 0.1917740024492803, + "grad_norm": 0.9942934513092041, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.5606, + "step": 14446 + }, + { + "epoch": 0.19218553464337748, + "grad_norm": 0.9903921484947205, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.5704, + "step": 14477 + }, + { + "epoch": 0.19259706683747466, + "grad_norm": 0.9734652638435364, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.5768, + "step": 14508 + }, + { + "epoch": 0.19300859903157183, + "grad_norm": 0.9251719117164612, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.565, + "step": 14539 + }, + { + "epoch": 0.19342013122566898, + "grad_norm": 0.9734142422676086, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.5702, + "step": 14570 + }, + { + "epoch": 0.19383166341976615, + "grad_norm": 0.9692136645317078, + "learning_rate": 2.883311164593017e-05, + "loss": 0.5581, + "step": 14601 + }, + { + "epoch": 0.19424319561386333, + "grad_norm": 0.9654501676559448, + "learning_rate": 2.875010077160754e-05, + "loss": 0.5667, + "step": 14632 + }, + { + "epoch": 0.1946547278079605, + "grad_norm": 1.0234603881835938, + "learning_rate": 2.866704757790741e-05, + "loss": 0.5688, + "step": 14663 + }, + { + "epoch": 0.19506626000205765, + "grad_norm": 0.9619661569595337, + "learning_rate": 2.858395300207376e-05, + "loss": 0.5713, + "step": 14694 + }, + { + "epoch": 0.19547779219615483, + "grad_norm": 1.0468534231185913, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.5702, + "step": 14725 + }, + { + "epoch": 0.195889324390252, + "grad_norm": 0.9849717617034912, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.5695, + "step": 14756 + }, + { + "epoch": 0.19630085658434918, + "grad_norm": 0.8887643218040466, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.5688, + "step": 14787 + }, + { + "epoch": 0.19671238877844632, + "grad_norm": 0.9583494067192078, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.5775, + "step": 14818 + }, + { + "epoch": 0.1971239209725435, + "grad_norm": 0.9766852259635925, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.5688, + "step": 14849 + }, + { + "epoch": 0.19753545316664067, + "grad_norm": 0.9450570940971375, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.5773, + "step": 14880 + }, + { + "epoch": 0.19794698536073785, + "grad_norm": 0.9476996064186096, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.5688, + "step": 14911 + }, + { + "epoch": 0.198358517554835, + "grad_norm": 1.0239835977554321, + "learning_rate": 2.791781925709473e-05, + "loss": 0.5717, + "step": 14942 + }, + { + "epoch": 0.19877004974893217, + "grad_norm": 0.9848045110702515, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.5691, + "step": 14973 + }, + { + "epoch": 0.19918158194302935, + "grad_norm": 1.0350494384765625, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.5808, + "step": 15004 + }, + { + "epoch": 0.19959311413712652, + "grad_norm": 0.8743448853492737, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.5694, + "step": 15035 + }, + { + "epoch": 0.20000464633122367, + "grad_norm": 0.9410389065742493, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.5739, + "step": 15066 + }, + { + "epoch": 0.20041617852532084, + "grad_norm": 1.0113860368728638, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.5689, + "step": 15097 + }, + { + "epoch": 0.20082771071941802, + "grad_norm": 0.998852550983429, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.5619, + "step": 15128 + }, + { + "epoch": 0.2012392429135152, + "grad_norm": 0.9299794435501099, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.5672, + "step": 15159 + }, + { + "epoch": 0.20165077510761234, + "grad_norm": 1.0109282732009888, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.5691, + "step": 15190 + }, + { + "epoch": 0.20206230730170952, + "grad_norm": 0.8805022835731506, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.5601, + "step": 15221 + }, + { + "epoch": 0.2024738394958067, + "grad_norm": 0.8976924419403076, + "learning_rate": 2.708224532974953e-05, + "loss": 0.5669, + "step": 15252 + }, + { + "epoch": 0.20288537168990387, + "grad_norm": 0.8639585375785828, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.5713, + "step": 15283 + }, + { + "epoch": 0.203296903884001, + "grad_norm": 0.9253800511360168, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.5634, + "step": 15314 + }, + { + "epoch": 0.2037084360780982, + "grad_norm": 0.8547073006629944, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.5705, + "step": 15345 + }, + { + "epoch": 0.20411996827219536, + "grad_norm": 0.8723642230033875, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.573, + "step": 15376 + }, + { + "epoch": 0.20453150046629254, + "grad_norm": 0.9164481163024902, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.5645, + "step": 15407 + }, + { + "epoch": 0.20494303266038968, + "grad_norm": 0.9538819193840027, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.5677, + "step": 15438 + }, + { + "epoch": 0.20535456485448686, + "grad_norm": 0.8995161652565002, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.5683, + "step": 15469 + }, + { + "epoch": 0.20576609704858403, + "grad_norm": 0.9026926755905151, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.5694, + "step": 15500 + }, + { + "epoch": 0.2061776292426812, + "grad_norm": 0.9095093011856079, + "learning_rate": 2.632819298478939e-05, + "loss": 0.5722, + "step": 15531 + }, + { + "epoch": 0.20658916143677836, + "grad_norm": 0.874626636505127, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.5652, + "step": 15562 + }, + { + "epoch": 0.20700069363087553, + "grad_norm": 1.0359785556793213, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.57, + "step": 15593 + }, + { + "epoch": 0.2074122258249727, + "grad_norm": 0.9145928621292114, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.5637, + "step": 15624 + }, + { + "epoch": 0.20782375801906988, + "grad_norm": 1.020246982574463, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.5619, + "step": 15655 + }, + { + "epoch": 0.20823529021316703, + "grad_norm": 0.8766633868217468, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.5631, + "step": 15686 + }, + { + "epoch": 0.2086468224072642, + "grad_norm": 0.9841639399528503, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.5598, + "step": 15717 + }, + { + "epoch": 0.20905835460136138, + "grad_norm": 0.8983998894691467, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.5723, + "step": 15748 + }, + { + "epoch": 0.20946988679545855, + "grad_norm": 0.8868324756622314, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.5801, + "step": 15779 + }, + { + "epoch": 0.2098814189895557, + "grad_norm": 0.9000539183616638, + "learning_rate": 2.557292666450159e-05, + "loss": 0.5672, + "step": 15810 + }, + { + "epoch": 0.21029295118365288, + "grad_norm": 0.9193928837776184, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.5658, + "step": 15841 + }, + { + "epoch": 0.21070448337775005, + "grad_norm": 0.9424473643302917, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.5685, + "step": 15872 + }, + { + "epoch": 0.21111601557184723, + "grad_norm": 0.9552715420722961, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.5725, + "step": 15903 + }, + { + "epoch": 0.21152754776594437, + "grad_norm": 0.8888420462608337, + "learning_rate": 2.52370435981567e-05, + "loss": 0.5649, + "step": 15934 + }, + { + "epoch": 0.21193907996004155, + "grad_norm": 0.906830370426178, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.5692, + "step": 15965 + }, + { + "epoch": 0.21235061215413872, + "grad_norm": 0.8939186334609985, + "learning_rate": 2.506908222441045e-05, + "loss": 0.5665, + "step": 15996 + }, + { + "epoch": 0.2127621443482359, + "grad_norm": 1.0149410963058472, + "learning_rate": 2.498509989430187e-05, + "loss": 0.5632, + "step": 16027 + }, + { + "epoch": 0.21317367654233305, + "grad_norm": 0.963056206703186, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.5656, + "step": 16058 + }, + { + "epoch": 0.21358520873643022, + "grad_norm": 0.8071532249450684, + "learning_rate": 2.481713668624899e-05, + "loss": 0.5667, + "step": 16089 + }, + { + "epoch": 0.2139967409305274, + "grad_norm": 0.9192640781402588, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.5709, + "step": 16120 + }, + { + "epoch": 0.21440827312462457, + "grad_norm": 0.84633868932724, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.5658, + "step": 16151 + }, + { + "epoch": 0.21481980531872172, + "grad_norm": 0.8883370757102966, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.5683, + "step": 16182 + }, + { + "epoch": 0.2152313375128189, + "grad_norm": 0.8919095396995544, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.568, + "step": 16213 + }, + { + "epoch": 0.21564286970691607, + "grad_norm": 0.9360633492469788, + "learning_rate": 2.439728136286796e-05, + "loss": 0.565, + "step": 16244 + }, + { + "epoch": 0.21605440190101324, + "grad_norm": 0.9496976733207703, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.5627, + "step": 16275 + }, + { + "epoch": 0.2164659340951104, + "grad_norm": 0.9771477580070496, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.5614, + "step": 16306 + }, + { + "epoch": 0.21687746628920757, + "grad_norm": 0.931249737739563, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.5649, + "step": 16337 + }, + { + "epoch": 0.21728899848330474, + "grad_norm": 0.9592285752296448, + "learning_rate": 2.406151384602059e-05, + "loss": 0.5646, + "step": 16368 + }, + { + "epoch": 0.21770053067740192, + "grad_norm": 0.9159988164901733, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.5672, + "step": 16399 + }, + { + "epoch": 0.21811206287149906, + "grad_norm": 0.97376549243927, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.5607, + "step": 16430 + }, + { + "epoch": 0.21852359506559624, + "grad_norm": 0.8469638824462891, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.5597, + "step": 16461 + }, + { + "epoch": 0.2189351272596934, + "grad_norm": 1.030610203742981, + "learning_rate": 2.372591577780202e-05, + "loss": 0.5625, + "step": 16492 + }, + { + "epoch": 0.2193466594537906, + "grad_norm": 0.9524822235107422, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.5621, + "step": 16523 + }, + { + "epoch": 0.21975819164788774, + "grad_norm": 0.9608604311943054, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.5672, + "step": 16554 + }, + { + "epoch": 0.2201697238419849, + "grad_norm": 0.9253712296485901, + "learning_rate": 2.347436487983929e-05, + "loss": 0.5609, + "step": 16585 + }, + { + "epoch": 0.22058125603608209, + "grad_norm": 0.923402726650238, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.5654, + "step": 16616 + }, + { + "epoch": 0.22099278823017926, + "grad_norm": 1.0030287504196167, + "learning_rate": 2.330674878704035e-05, + "loss": 0.5576, + "step": 16647 + }, + { + "epoch": 0.2214043204242764, + "grad_norm": 0.9106613993644714, + "learning_rate": 2.322296892997561e-05, + "loss": 0.5653, + "step": 16678 + }, + { + "epoch": 0.22181585261837358, + "grad_norm": 1.0058101415634155, + "learning_rate": 2.313920912646497e-05, + "loss": 0.5749, + "step": 16709 + }, + { + "epoch": 0.22222738481247076, + "grad_norm": 0.931086540222168, + "learning_rate": 2.305547032172643e-05, + "loss": 0.5637, + "step": 16740 + }, + { + "epoch": 0.22263891700656793, + "grad_norm": 0.9743716716766357, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.5718, + "step": 16771 + }, + { + "epoch": 0.22305044920066508, + "grad_norm": 0.8751611709594727, + "learning_rate": 2.288805948824212e-05, + "loss": 0.5581, + "step": 16802 + }, + { + "epoch": 0.22346198139476225, + "grad_norm": 0.867038905620575, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.5664, + "step": 16833 + }, + { + "epoch": 0.22387351358885943, + "grad_norm": 0.8663344383239746, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.559, + "step": 16864 + }, + { + "epoch": 0.2242850457829566, + "grad_norm": 0.984854519367218, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.5626, + "step": 16895 + }, + { + "epoch": 0.22469657797705375, + "grad_norm": 0.9031103849411011, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.5666, + "step": 16926 + }, + { + "epoch": 0.22510811017115093, + "grad_norm": 0.8782587647438049, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.5501, + "step": 16957 + }, + { + "epoch": 0.2255196423652481, + "grad_norm": 1.0644887685775757, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.5604, + "step": 16988 + }, + { + "epoch": 0.22593117455934528, + "grad_norm": 0.8691216111183167, + "learning_rate": 2.230292185905114e-05, + "loss": 0.5649, + "step": 17019 + }, + { + "epoch": 0.22634270675344242, + "grad_norm": 0.9518167972564697, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.5598, + "step": 17050 + }, + { + "epoch": 0.2267542389475396, + "grad_norm": 0.889673113822937, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.5626, + "step": 17081 + }, + { + "epoch": 0.22716577114163677, + "grad_norm": 0.9073772430419922, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.5607, + "step": 17112 + }, + { + "epoch": 0.22757730333573395, + "grad_norm": 0.9674621820449829, + "learning_rate": 2.196920634473666e-05, + "loss": 0.5651, + "step": 17143 + }, + { + "epoch": 0.2279888355298311, + "grad_norm": 0.8547524809837341, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.5636, + "step": 17174 + }, + { + "epoch": 0.22840036772392827, + "grad_norm": 1.00649893283844, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.5597, + "step": 17205 + }, + { + "epoch": 0.22881189991802545, + "grad_norm": 0.9329107999801636, + "learning_rate": 2.171927553439363e-05, + "loss": 0.5705, + "step": 17236 + }, + { + "epoch": 0.22922343211212262, + "grad_norm": 1.0364869832992554, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.56, + "step": 17267 + }, + { + "epoch": 0.22963496430621977, + "grad_norm": 0.898383617401123, + "learning_rate": 2.155283853988844e-05, + "loss": 0.5634, + "step": 17298 + }, + { + "epoch": 0.23004649650031694, + "grad_norm": 0.903266429901123, + "learning_rate": 2.146967792431106e-05, + "loss": 0.5692, + "step": 17329 + }, + { + "epoch": 0.23045802869441412, + "grad_norm": 0.835216224193573, + "learning_rate": 2.138655714793849e-05, + "loss": 0.5619, + "step": 17360 + }, + { + "epoch": 0.2308695608885113, + "grad_norm": 0.9033771753311157, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.5661, + "step": 17391 + }, + { + "epoch": 0.23128109308260844, + "grad_norm": 0.8425393104553223, + "learning_rate": 2.122043886437421e-05, + "loss": 0.5607, + "step": 17422 + }, + { + "epoch": 0.23169262527670562, + "grad_norm": 0.8765662908554077, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.5654, + "step": 17453 + }, + { + "epoch": 0.2321041574708028, + "grad_norm": 0.8663944602012634, + "learning_rate": 2.105449118766347e-05, + "loss": 0.5644, + "step": 17484 + }, + { + "epoch": 0.23251568966489997, + "grad_norm": 0.9837983846664429, + "learning_rate": 2.097158366805287e-05, + "loss": 0.5665, + "step": 17515 + }, + { + "epoch": 0.2329272218589971, + "grad_norm": 0.9082325100898743, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.5671, + "step": 17546 + }, + { + "epoch": 0.2333387540530943, + "grad_norm": 0.9680993556976318, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.5602, + "step": 17577 + }, + { + "epoch": 0.23375028624719146, + "grad_norm": 0.9881089925765991, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.5682, + "step": 17608 + }, + { + "epoch": 0.23416181844128864, + "grad_norm": 0.8630657196044922, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.5646, + "step": 17639 + }, + { + "epoch": 0.23457335063538579, + "grad_norm": 0.8421202301979065, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.564, + "step": 17670 + }, + { + "epoch": 0.23498488282948296, + "grad_norm": 0.8951789736747742, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.5594, + "step": 17701 + }, + { + "epoch": 0.23539641502358014, + "grad_norm": 1.0024628639221191, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.5541, + "step": 17732 + }, + { + "epoch": 0.2358079472176773, + "grad_norm": 0.8807896971702576, + "learning_rate": 2.031003855589343e-05, + "loss": 0.5605, + "step": 17763 + }, + { + "epoch": 0.23621947941177446, + "grad_norm": 0.9723889827728271, + "learning_rate": 2.022757379528727e-05, + "loss": 0.559, + "step": 17794 + }, + { + "epoch": 0.23663101160587163, + "grad_norm": 0.9422227740287781, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.5594, + "step": 17825 + }, + { + "epoch": 0.2370425437999688, + "grad_norm": 0.9309141039848328, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.5621, + "step": 17856 + }, + { + "epoch": 0.23745407599406598, + "grad_norm": 0.8761610388755798, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.5665, + "step": 17887 + }, + { + "epoch": 0.23786560818816313, + "grad_norm": 0.8991973400115967, + "learning_rate": 1.989826261153015e-05, + "loss": 0.5726, + "step": 17918 + }, + { + "epoch": 0.2382771403822603, + "grad_norm": 0.8879802227020264, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.5598, + "step": 17949 + }, + { + "epoch": 0.23868867257635748, + "grad_norm": 0.9235663414001465, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.5637, + "step": 17980 + }, + { + "epoch": 0.23910020477045466, + "grad_norm": 0.9140569567680359, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.5571, + "step": 18011 + }, + { + "epoch": 0.2395117369645518, + "grad_norm": 0.933430016040802, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.561, + "step": 18042 + }, + { + "epoch": 0.23992326915864898, + "grad_norm": 0.838374674320221, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.5634, + "step": 18073 + }, + { + "epoch": 0.24033480135274615, + "grad_norm": 0.9295237064361572, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.5604, + "step": 18104 + }, + { + "epoch": 0.24074633354684333, + "grad_norm": 0.880237340927124, + "learning_rate": 1.932422022132275e-05, + "loss": 0.5609, + "step": 18135 + }, + { + "epoch": 0.24115786574094047, + "grad_norm": 0.9782423973083496, + "learning_rate": 1.924246297341414e-05, + "loss": 0.5479, + "step": 18166 + }, + { + "epoch": 0.24156939793503765, + "grad_norm": 0.97150719165802, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.5624, + "step": 18197 + }, + { + "epoch": 0.24198093012913482, + "grad_norm": 0.9634605050086975, + "learning_rate": 1.907914431855156e-05, + "loss": 0.5661, + "step": 18228 + }, + { + "epoch": 0.242392462323232, + "grad_norm": 0.8706396222114563, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.5582, + "step": 18259 + }, + { + "epoch": 0.24280399451732915, + "grad_norm": 0.9348079562187195, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.5661, + "step": 18290 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3509936473079546e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-18312/training_args.bin b/checkpoint-18312/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f3d87fcba007ec2448c21323b9fdb9fa6105de5f --- /dev/null +++ b/checkpoint-18312/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf5225534ee746c36dba9f28281e0b858e6e9b4a308bdaf1de4f68e79e9ee83 +size 5304 diff --git a/checkpoint-21364/config.json b/checkpoint-21364/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-21364/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-21364/generation_config.json b/checkpoint-21364/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-21364/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-21364/model-00001-of-00007.safetensors b/checkpoint-21364/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ab9c460dfb4c869ced38bab991dccdeeb118f5b0 --- /dev/null +++ b/checkpoint-21364/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b23c1daf15a450da6427a6c1c9869383ab8e27cc46ad11eed8fb2a3f2c16d06 +size 4886466168 diff --git a/checkpoint-21364/model-00002-of-00007.safetensors b/checkpoint-21364/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-21364/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-21364/model-00003-of-00007.safetensors b/checkpoint-21364/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-21364/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-21364/model-00004-of-00007.safetensors b/checkpoint-21364/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-21364/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-21364/model-00005-of-00007.safetensors b/checkpoint-21364/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-21364/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-21364/model-00006-of-00007.safetensors b/checkpoint-21364/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..04aa56d960f644786ff178ee528f9702b082f6cf --- /dev/null +++ b/checkpoint-21364/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e664bd156e77688ba38833095fb733eba5bab41914ca81de79bec40cc8c572d0 +size 4999813120 diff --git a/checkpoint-21364/model-00007-of-00007.safetensors b/checkpoint-21364/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1e0a8ee9863f15eb898df4ae2ccbe9a811198925 --- /dev/null +++ b/checkpoint-21364/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18f7c2692522d62a6a5c6ef359cabb26f618e15a76a999501b6277b7e7091f79 +size 2571158184 diff --git a/checkpoint-21364/model.safetensors.index.json b/checkpoint-21364/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-21364/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-21364/optimizer.pt b/checkpoint-21364/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f61b4fa32fedba7bd690be044258c1cc55764703 --- /dev/null +++ b/checkpoint-21364/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ba8d424ee3d06bcc736be9227d46d207b3f30fa504829d880cb9fa79caa7271 +size 15385036334 diff --git a/checkpoint-21364/rng_state.pth b/checkpoint-21364/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-21364/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-21364/scheduler.pt b/checkpoint-21364/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..17783d26dc88c55a75e7564f8dcbad9eacfa9913 --- /dev/null +++ b/checkpoint-21364/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2827eb82750c76bd3279b469098a24605426f9a47a96b155384bcef2e3f4fe20 +size 1064 diff --git a/checkpoint-21364/trainer_state.json b/checkpoint-21364/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a397d2c28703216d65e396c574ceb481cec3215b --- /dev/null +++ b/checkpoint-21364/trainer_state.json @@ -0,0 +1,4856 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.28361205789328703, + "eval_steps": 500, + "global_step": 21364, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00041153219409716807, + "grad_norm": 4.147768020629883, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.7978, + "step": 31 + }, + { + "epoch": 0.0008230643881943361, + "grad_norm": 3.117840051651001, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.7723, + "step": 62 + }, + { + "epoch": 0.0012345965822915042, + "grad_norm": 2.83072829246521, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7493, + "step": 93 + }, + { + "epoch": 0.0016461287763886723, + "grad_norm": 2.9491968154907227, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7352, + "step": 124 + }, + { + "epoch": 0.0020576609704858403, + "grad_norm": 2.798553228378296, + "learning_rate": 5.078636959370905e-06, + "loss": 0.7229, + "step": 155 + }, + { + "epoch": 0.0024691931645830084, + "grad_norm": 2.663148880004883, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7253, + "step": 186 + }, + { + "epoch": 0.0028807253586801765, + "grad_norm": 2.564692974090576, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7287, + "step": 217 + }, + { + "epoch": 0.0032922575527773446, + "grad_norm": 3.2043237686157227, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7205, + "step": 248 + }, + { + "epoch": 0.0037037897468745126, + "grad_norm": 2.6705269813537598, + "learning_rate": 9.141546526867629e-06, + "loss": 0.714, + "step": 279 + }, + { + "epoch": 0.004115321940971681, + "grad_norm": 3.432569980621338, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7217, + "step": 310 + }, + { + "epoch": 0.004526854135068849, + "grad_norm": 2.515808343887329, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7173, + "step": 341 + }, + { + "epoch": 0.004938386329166017, + "grad_norm": 2.6708128452301025, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7123, + "step": 372 + }, + { + "epoch": 0.005349918523263185, + "grad_norm": 2.472797155380249, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.7079, + "step": 403 + }, + { + "epoch": 0.005761450717360353, + "grad_norm": 2.486147880554199, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7146, + "step": 434 + }, + { + "epoch": 0.006172982911457521, + "grad_norm": 2.310777187347412, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7, + "step": 465 + }, + { + "epoch": 0.006584515105554689, + "grad_norm": 3.1831858158111572, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7042, + "step": 496 + }, + { + "epoch": 0.006996047299651857, + "grad_norm": 2.505427360534668, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.6991, + "step": 527 + }, + { + "epoch": 0.007407579493749025, + "grad_norm": 2.3627443313598633, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7045, + "step": 558 + }, + { + "epoch": 0.007819111687846193, + "grad_norm": 2.6250648498535156, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.704, + "step": 589 + }, + { + "epoch": 0.008230643881943361, + "grad_norm": 2.3151464462280273, + "learning_rate": 2.031454783748362e-05, + "loss": 0.6959, + "step": 620 + }, + { + "epoch": 0.00864217607604053, + "grad_norm": 2.0312647819519043, + "learning_rate": 2.13302752293578e-05, + "loss": 0.689, + "step": 651 + }, + { + "epoch": 0.009053708270137698, + "grad_norm": 2.718970775604248, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7031, + "step": 682 + }, + { + "epoch": 0.009465240464234866, + "grad_norm": 1.9124070405960083, + "learning_rate": 2.336173001310616e-05, + "loss": 0.6908, + "step": 713 + }, + { + "epoch": 0.009876772658332034, + "grad_norm": 2.368354082107544, + "learning_rate": 2.437745740498034e-05, + "loss": 0.6947, + "step": 744 + }, + { + "epoch": 0.010288304852429202, + "grad_norm": 1.8856632709503174, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.6876, + "step": 775 + }, + { + "epoch": 0.01069983704652637, + "grad_norm": 2.3155925273895264, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7024, + "step": 806 + }, + { + "epoch": 0.011111369240623538, + "grad_norm": 2.2516837120056152, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.695, + "step": 837 + }, + { + "epoch": 0.011522901434720706, + "grad_norm": 2.4444823265075684, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.6892, + "step": 868 + }, + { + "epoch": 0.011934433628817874, + "grad_norm": 2.4233832359313965, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.6871, + "step": 899 + }, + { + "epoch": 0.012345965822915042, + "grad_norm": 2.0262961387634277, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.6888, + "step": 930 + }, + { + "epoch": 0.01275749801701221, + "grad_norm": 2.3908143043518066, + "learning_rate": 3.148754914809961e-05, + "loss": 0.6997, + "step": 961 + }, + { + "epoch": 0.013169030211109378, + "grad_norm": 1.839417576789856, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.6826, + "step": 992 + }, + { + "epoch": 0.013580562405206546, + "grad_norm": 1.8175997734069824, + "learning_rate": 3.351900393184797e-05, + "loss": 0.6857, + "step": 1023 + }, + { + "epoch": 0.013992094599303714, + "grad_norm": 2.3080506324768066, + "learning_rate": 3.453473132372215e-05, + "loss": 0.6896, + "step": 1054 + }, + { + "epoch": 0.014403626793400882, + "grad_norm": 2.0574960708618164, + "learning_rate": 3.555045871559633e-05, + "loss": 0.6862, + "step": 1085 + }, + { + "epoch": 0.01481515898749805, + "grad_norm": 1.957221269607544, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.6796, + "step": 1116 + }, + { + "epoch": 0.015226691181595219, + "grad_norm": 2.2740609645843506, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.7082, + "step": 1147 + }, + { + "epoch": 0.015638223375692387, + "grad_norm": 2.1291615962982178, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.6852, + "step": 1178 + }, + { + "epoch": 0.016049755569789555, + "grad_norm": 2.2918620109558105, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.6821, + "step": 1209 + }, + { + "epoch": 0.016461287763886723, + "grad_norm": 1.9689708948135376, + "learning_rate": 4.062909567496724e-05, + "loss": 0.676, + "step": 1240 + }, + { + "epoch": 0.01687281995798389, + "grad_norm": 2.4405298233032227, + "learning_rate": 4.164482306684142e-05, + "loss": 0.6927, + "step": 1271 + }, + { + "epoch": 0.01728435215208106, + "grad_norm": 2.0007450580596924, + "learning_rate": 4.26605504587156e-05, + "loss": 0.6864, + "step": 1302 + }, + { + "epoch": 0.017695884346178227, + "grad_norm": 1.7013226747512817, + "learning_rate": 4.367627785058978e-05, + "loss": 0.6827, + "step": 1333 + }, + { + "epoch": 0.018107416540275395, + "grad_norm": 2.2202930450439453, + "learning_rate": 4.469200524246396e-05, + "loss": 0.6722, + "step": 1364 + }, + { + "epoch": 0.018518948734372563, + "grad_norm": 1.690030813217163, + "learning_rate": 4.570773263433814e-05, + "loss": 0.6807, + "step": 1395 + }, + { + "epoch": 0.01893048092846973, + "grad_norm": 1.5886859893798828, + "learning_rate": 4.672346002621232e-05, + "loss": 0.6867, + "step": 1426 + }, + { + "epoch": 0.0193420131225669, + "grad_norm": 1.7371172904968262, + "learning_rate": 4.77391874180865e-05, + "loss": 0.6832, + "step": 1457 + }, + { + "epoch": 0.019753545316664067, + "grad_norm": 1.8376264572143555, + "learning_rate": 4.875491480996068e-05, + "loss": 0.681, + "step": 1488 + }, + { + "epoch": 0.020165077510761235, + "grad_norm": 1.6280311346054077, + "learning_rate": 4.977064220183487e-05, + "loss": 0.6795, + "step": 1519 + }, + { + "epoch": 0.020576609704858403, + "grad_norm": 1.8016823530197144, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.6789, + "step": 1550 + }, + { + "epoch": 0.02098814189895557, + "grad_norm": 1.7355237007141113, + "learning_rate": 4.999955597496219e-05, + "loss": 0.6753, + "step": 1581 + }, + { + "epoch": 0.02139967409305274, + "grad_norm": 1.5687408447265625, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.6777, + "step": 1612 + }, + { + "epoch": 0.021811206287149908, + "grad_norm": 1.790453314781189, + "learning_rate": 4.999799067923527e-05, + "loss": 0.684, + "step": 1643 + }, + { + "epoch": 0.022222738481247076, + "grad_norm": 1.9073406457901, + "learning_rate": 4.999678487776908e-05, + "loss": 0.6793, + "step": 1674 + }, + { + "epoch": 0.022634270675344244, + "grad_norm": 1.5575100183486938, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.6794, + "step": 1705 + }, + { + "epoch": 0.023045802869441412, + "grad_norm": 2.100715160369873, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6753, + "step": 1736 + }, + { + "epoch": 0.02345733506353858, + "grad_norm": 1.9425480365753174, + "learning_rate": 4.999147503179668e-05, + "loss": 0.6755, + "step": 1767 + }, + { + "epoch": 0.023868867257635748, + "grad_norm": 1.5791170597076416, + "learning_rate": 4.998914100252672e-05, + "loss": 0.6754, + "step": 1798 + }, + { + "epoch": 0.024280399451732916, + "grad_norm": 1.5692673921585083, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6682, + "step": 1829 + }, + { + "epoch": 0.024691931645830084, + "grad_norm": 1.5757725238800049, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.6723, + "step": 1860 + }, + { + "epoch": 0.025103463839927252, + "grad_norm": 1.8000696897506714, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6647, + "step": 1891 + }, + { + "epoch": 0.02551499603402442, + "grad_norm": 1.682919979095459, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.669, + "step": 1922 + }, + { + "epoch": 0.02592652822812159, + "grad_norm": 1.6138657331466675, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6698, + "step": 1953 + }, + { + "epoch": 0.026338060422218756, + "grad_norm": 1.5016391277313232, + "learning_rate": 4.99692159912661e-05, + "loss": 0.6645, + "step": 1984 + }, + { + "epoch": 0.026749592616315924, + "grad_norm": 1.6850241422653198, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6769, + "step": 2015 + }, + { + "epoch": 0.027161124810413093, + "grad_norm": 1.521161437034607, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6599, + "step": 2046 + }, + { + "epoch": 0.02757265700451026, + "grad_norm": 1.8039467334747314, + "learning_rate": 4.995544899210594e-05, + "loss": 0.6693, + "step": 2077 + }, + { + "epoch": 0.02798418919860743, + "grad_norm": 1.688811182975769, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.661, + "step": 2108 + }, + { + "epoch": 0.028395721392704597, + "grad_norm": 1.669871211051941, + "learning_rate": 4.994486281210429e-05, + "loss": 0.6611, + "step": 2139 + }, + { + "epoch": 0.028807253586801765, + "grad_norm": 1.3887263536453247, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6677, + "step": 2170 + }, + { + "epoch": 0.029218785780898933, + "grad_norm": 1.3242331743240356, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6537, + "step": 2201 + }, + { + "epoch": 0.0296303179749961, + "grad_norm": 1.6638622283935547, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6653, + "step": 2232 + }, + { + "epoch": 0.03004185016909327, + "grad_norm": 1.5110372304916382, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6615, + "step": 2263 + }, + { + "epoch": 0.030453382363190437, + "grad_norm": 1.5244065523147583, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.6552, + "step": 2294 + }, + { + "epoch": 0.030864914557287605, + "grad_norm": 1.416624903678894, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6569, + "step": 2325 + }, + { + "epoch": 0.03127644675138477, + "grad_norm": 1.4467304944992065, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6613, + "step": 2356 + }, + { + "epoch": 0.03168797894548194, + "grad_norm": 1.4649864435195923, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6604, + "step": 2387 + }, + { + "epoch": 0.03209951113957911, + "grad_norm": 1.6256624460220337, + "learning_rate": 4.988329891293693e-05, + "loss": 0.655, + "step": 2418 + }, + { + "epoch": 0.03251104333367628, + "grad_norm": 1.3986462354660034, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6514, + "step": 2449 + }, + { + "epoch": 0.032922575527773446, + "grad_norm": 1.4490609169006348, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.65, + "step": 2480 + }, + { + "epoch": 0.033334107721870614, + "grad_norm": 1.4979615211486816, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.6579, + "step": 2511 + }, + { + "epoch": 0.03374563991596778, + "grad_norm": 1.6314005851745605, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6604, + "step": 2542 + }, + { + "epoch": 0.03415717211006495, + "grad_norm": 1.310655951499939, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6518, + "step": 2573 + }, + { + "epoch": 0.03456870430416212, + "grad_norm": 1.4176305532455444, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6518, + "step": 2604 + }, + { + "epoch": 0.034980236498259286, + "grad_norm": 1.6330211162567139, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6474, + "step": 2635 + }, + { + "epoch": 0.035391768692356454, + "grad_norm": 1.5854465961456299, + "learning_rate": 4.980947995086024e-05, + "loss": 0.6462, + "step": 2666 + }, + { + "epoch": 0.03580330088645362, + "grad_norm": 1.2944835424423218, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6503, + "step": 2697 + }, + { + "epoch": 0.03621483308055079, + "grad_norm": 1.4056243896484375, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6414, + "step": 2728 + }, + { + "epoch": 0.03662636527464796, + "grad_norm": 1.5649598836898804, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6449, + "step": 2759 + }, + { + "epoch": 0.037037897468745126, + "grad_norm": 1.5205659866333008, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6334, + "step": 2790 + }, + { + "epoch": 0.037449429662842294, + "grad_norm": 1.3248411417007446, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6448, + "step": 2821 + }, + { + "epoch": 0.03786096185693946, + "grad_norm": 1.2402842044830322, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6476, + "step": 2852 + }, + { + "epoch": 0.03827249405103663, + "grad_norm": 1.7939503192901611, + "learning_rate": 4.973018858007122e-05, + "loss": 0.6472, + "step": 2883 + }, + { + "epoch": 0.0386840262451338, + "grad_norm": 1.2774665355682373, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6374, + "step": 2914 + }, + { + "epoch": 0.03909555843923097, + "grad_norm": 1.6384273767471313, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6623, + "step": 2945 + }, + { + "epoch": 0.039507090633328135, + "grad_norm": 1.559270977973938, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6391, + "step": 2976 + }, + { + "epoch": 0.0399186228274253, + "grad_norm": 1.6405760049819946, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.6437, + "step": 3007 + }, + { + "epoch": 0.04033015502152247, + "grad_norm": 1.3599917888641357, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6436, + "step": 3038 + }, + { + "epoch": 0.04074168721561964, + "grad_norm": 1.3080962896347046, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6331, + "step": 3069 + }, + { + "epoch": 0.04115321940971681, + "grad_norm": 1.435195803642273, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6417, + "step": 3100 + }, + { + "epoch": 0.041564751603813975, + "grad_norm": 1.342986822128296, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6391, + "step": 3131 + }, + { + "epoch": 0.04197628379791114, + "grad_norm": 1.524418592453003, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6511, + "step": 3162 + }, + { + "epoch": 0.04238781599200831, + "grad_norm": 1.4617900848388672, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6367, + "step": 3193 + }, + { + "epoch": 0.04279934818610548, + "grad_norm": 1.3968737125396729, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.6387, + "step": 3224 + }, + { + "epoch": 0.04321088038020265, + "grad_norm": 1.258406400680542, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6356, + "step": 3255 + }, + { + "epoch": 0.043622412574299815, + "grad_norm": 1.5494844913482666, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6433, + "step": 3286 + }, + { + "epoch": 0.04403394476839698, + "grad_norm": 1.2248284816741943, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6376, + "step": 3317 + }, + { + "epoch": 0.04444547696249415, + "grad_norm": 1.4243425130844116, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6341, + "step": 3348 + }, + { + "epoch": 0.04485700915659132, + "grad_norm": 1.379364252090454, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6378, + "step": 3379 + }, + { + "epoch": 0.04526854135068849, + "grad_norm": 1.3460108041763306, + "learning_rate": 4.948079823064559e-05, + "loss": 0.639, + "step": 3410 + }, + { + "epoch": 0.045680073544785656, + "grad_norm": 1.382348656654358, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6327, + "step": 3441 + }, + { + "epoch": 0.046091605738882824, + "grad_norm": 1.40754234790802, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6234, + "step": 3472 + }, + { + "epoch": 0.04650313793297999, + "grad_norm": 1.502922773361206, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6404, + "step": 3503 + }, + { + "epoch": 0.04691467012707716, + "grad_norm": 1.2896559238433838, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6378, + "step": 3534 + }, + { + "epoch": 0.04732620232117433, + "grad_norm": 1.3114830255508423, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6373, + "step": 3565 + }, + { + "epoch": 0.047737734515271496, + "grad_norm": 1.4122483730316162, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6305, + "step": 3596 + }, + { + "epoch": 0.048149266709368664, + "grad_norm": 1.2669744491577148, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6413, + "step": 3627 + }, + { + "epoch": 0.04856079890346583, + "grad_norm": 1.2965583801269531, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6364, + "step": 3658 + }, + { + "epoch": 0.048972331097563, + "grad_norm": 1.3328967094421387, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6466, + "step": 3689 + }, + { + "epoch": 0.04938386329166017, + "grad_norm": 1.6291115283966064, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6348, + "step": 3720 + }, + { + "epoch": 0.049795395485757336, + "grad_norm": 1.3224235773086548, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6256, + "step": 3751 + }, + { + "epoch": 0.050206927679854504, + "grad_norm": 1.3253369331359863, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6303, + "step": 3782 + }, + { + "epoch": 0.05061845987395167, + "grad_norm": 1.2806897163391113, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.6403, + "step": 3813 + }, + { + "epoch": 0.05102999206804884, + "grad_norm": 1.3389384746551514, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6288, + "step": 3844 + }, + { + "epoch": 0.05144152426214601, + "grad_norm": 1.251550555229187, + "learning_rate": 4.919443027766935e-05, + "loss": 0.637, + "step": 3875 + }, + { + "epoch": 0.05185305645624318, + "grad_norm": 1.258484959602356, + "learning_rate": 4.91731463569418e-05, + "loss": 0.629, + "step": 3906 + }, + { + "epoch": 0.052264588650340345, + "grad_norm": 1.3574656248092651, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6262, + "step": 3937 + }, + { + "epoch": 0.05267612084443751, + "grad_norm": 1.327051043510437, + "learning_rate": 4.912976038673786e-05, + "loss": 0.629, + "step": 3968 + }, + { + "epoch": 0.05308765303853468, + "grad_norm": 1.3813196420669556, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6308, + "step": 3999 + }, + { + "epoch": 0.05349918523263185, + "grad_norm": 1.2064168453216553, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6321, + "step": 4030 + }, + { + "epoch": 0.05391071742672902, + "grad_norm": 1.2771285772323608, + "learning_rate": 4.906263980464644e-05, + "loss": 0.622, + "step": 4061 + }, + { + "epoch": 0.054322249620826185, + "grad_norm": 1.1788181066513062, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6268, + "step": 4092 + }, + { + "epoch": 0.05473378181492335, + "grad_norm": 1.3975950479507446, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6251, + "step": 4123 + }, + { + "epoch": 0.05514531400902052, + "grad_norm": 1.2083688974380493, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6287, + "step": 4154 + }, + { + "epoch": 0.05555684620311769, + "grad_norm": 1.2795994281768799, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6302, + "step": 4185 + }, + { + "epoch": 0.05596837839721486, + "grad_norm": 1.1056169271469116, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6256, + "step": 4216 + }, + { + "epoch": 0.056379910591312025, + "grad_norm": 1.415405511856079, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6334, + "step": 4247 + }, + { + "epoch": 0.056791442785409194, + "grad_norm": 1.2858442068099976, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6248, + "step": 4278 + }, + { + "epoch": 0.05720297497950636, + "grad_norm": 1.3914793729782104, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6229, + "step": 4309 + }, + { + "epoch": 0.05761450717360353, + "grad_norm": 1.1236270666122437, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6272, + "step": 4340 + }, + { + "epoch": 0.0580260393677007, + "grad_norm": 1.393020510673523, + "learning_rate": 4.882129447892753e-05, + "loss": 0.624, + "step": 4371 + }, + { + "epoch": 0.058437571561797866, + "grad_norm": 1.3223553895950317, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6267, + "step": 4402 + }, + { + "epoch": 0.058849103755895034, + "grad_norm": 1.2308725118637085, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6255, + "step": 4433 + }, + { + "epoch": 0.0592606359499922, + "grad_norm": 1.3741412162780762, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6246, + "step": 4464 + }, + { + "epoch": 0.05967216814408937, + "grad_norm": 1.2487592697143555, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6221, + "step": 4495 + }, + { + "epoch": 0.06008370033818654, + "grad_norm": 1.3239238262176514, + "learning_rate": 4.869052379269719e-05, + "loss": 0.621, + "step": 4526 + }, + { + "epoch": 0.060495232532283706, + "grad_norm": 1.1411608457565308, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6211, + "step": 4557 + }, + { + "epoch": 0.060906764726380874, + "grad_norm": 1.1687365770339966, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6159, + "step": 4588 + }, + { + "epoch": 0.06131829692047804, + "grad_norm": 1.25679349899292, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6254, + "step": 4619 + }, + { + "epoch": 0.06172982911457521, + "grad_norm": 1.2380110025405884, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6305, + "step": 4650 + }, + { + "epoch": 0.06214136130867238, + "grad_norm": 1.3054466247558594, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6202, + "step": 4681 + }, + { + "epoch": 0.06255289350276955, + "grad_norm": 1.3628545999526978, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6192, + "step": 4712 + }, + { + "epoch": 0.06296442569686671, + "grad_norm": 1.3576844930648804, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6213, + "step": 4743 + }, + { + "epoch": 0.06337595789096388, + "grad_norm": 1.3126753568649292, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6198, + "step": 4774 + }, + { + "epoch": 0.06378749008506104, + "grad_norm": 1.2266637086868286, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6229, + "step": 4805 + }, + { + "epoch": 0.06419902227915822, + "grad_norm": 1.4964330196380615, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6264, + "step": 4836 + }, + { + "epoch": 0.06461055447325538, + "grad_norm": 1.1138426065444946, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6258, + "step": 4867 + }, + { + "epoch": 0.06502208666735255, + "grad_norm": 1.2055357694625854, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6263, + "step": 4898 + }, + { + "epoch": 0.06543361886144972, + "grad_norm": 1.2684381008148193, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6212, + "step": 4929 + }, + { + "epoch": 0.06584515105554689, + "grad_norm": 1.2323859930038452, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6253, + "step": 4960 + }, + { + "epoch": 0.06625668324964405, + "grad_norm": 1.091818928718567, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6127, + "step": 4991 + }, + { + "epoch": 0.06666821544374123, + "grad_norm": 1.2208534479141235, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6179, + "step": 5022 + }, + { + "epoch": 0.06707974763783839, + "grad_norm": 1.3426082134246826, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6246, + "step": 5053 + }, + { + "epoch": 0.06749127983193556, + "grad_norm": 1.2961252927780151, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.6149, + "step": 5084 + }, + { + "epoch": 0.06790281202603272, + "grad_norm": 1.2990977764129639, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6303, + "step": 5115 + }, + { + "epoch": 0.0683143442201299, + "grad_norm": 1.2724366188049316, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6234, + "step": 5146 + }, + { + "epoch": 0.06872587641422706, + "grad_norm": 1.2031605243682861, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6212, + "step": 5177 + }, + { + "epoch": 0.06913740860832424, + "grad_norm": 1.2580938339233398, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6144, + "step": 5208 + }, + { + "epoch": 0.0695489408024214, + "grad_norm": 1.203706979751587, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6259, + "step": 5239 + }, + { + "epoch": 0.06996047299651857, + "grad_norm": 1.421762466430664, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6221, + "step": 5270 + }, + { + "epoch": 0.07037200519061573, + "grad_norm": 1.326038122177124, + "learning_rate": 4.793722210363262e-05, + "loss": 0.6152, + "step": 5301 + }, + { + "epoch": 0.07078353738471291, + "grad_norm": 1.361939787864685, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6206, + "step": 5332 + }, + { + "epoch": 0.07119506957881007, + "grad_norm": 1.2153042554855347, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6209, + "step": 5363 + }, + { + "epoch": 0.07160660177290724, + "grad_norm": 1.3737229108810425, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6121, + "step": 5394 + }, + { + "epoch": 0.0720181339670044, + "grad_norm": 1.1781859397888184, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6183, + "step": 5425 + }, + { + "epoch": 0.07242966616110158, + "grad_norm": 1.069472312927246, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.6117, + "step": 5456 + }, + { + "epoch": 0.07284119835519874, + "grad_norm": 1.380937099456787, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6175, + "step": 5487 + }, + { + "epoch": 0.07325273054929592, + "grad_norm": 1.1551166772842407, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6164, + "step": 5518 + }, + { + "epoch": 0.07366426274339308, + "grad_norm": 1.2816351652145386, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6168, + "step": 5549 + }, + { + "epoch": 0.07407579493749025, + "grad_norm": 1.0833333730697632, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6159, + "step": 5580 + }, + { + "epoch": 0.07448732713158741, + "grad_norm": 1.2533819675445557, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6141, + "step": 5611 + }, + { + "epoch": 0.07489885932568459, + "grad_norm": 1.1354937553405762, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.612, + "step": 5642 + }, + { + "epoch": 0.07531039151978175, + "grad_norm": 1.2579649686813354, + "learning_rate": 4.751783684659e-05, + "loss": 0.6147, + "step": 5673 + }, + { + "epoch": 0.07572192371387892, + "grad_norm": 1.2830709218978882, + "learning_rate": 4.748122674890348e-05, + "loss": 0.62, + "step": 5704 + }, + { + "epoch": 0.07613345590797609, + "grad_norm": 1.2012654542922974, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6199, + "step": 5735 + }, + { + "epoch": 0.07654498810207326, + "grad_norm": 1.2926653623580933, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6104, + "step": 5766 + }, + { + "epoch": 0.07695652029617042, + "grad_norm": 1.2043049335479736, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6261, + "step": 5797 + }, + { + "epoch": 0.0773680524902676, + "grad_norm": 1.1188119649887085, + "learning_rate": 4.733225355658999e-05, + "loss": 0.6056, + "step": 5828 + }, + { + "epoch": 0.07777958468436476, + "grad_norm": 1.1217443943023682, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.6124, + "step": 5859 + }, + { + "epoch": 0.07819111687846193, + "grad_norm": 1.1623468399047852, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6191, + "step": 5890 + }, + { + "epoch": 0.0786026490725591, + "grad_norm": 1.1147449016571045, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6091, + "step": 5921 + }, + { + "epoch": 0.07901418126665627, + "grad_norm": 1.0523818731307983, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6151, + "step": 5952 + }, + { + "epoch": 0.07942571346075343, + "grad_norm": 1.3112642765045166, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6106, + "step": 5983 + }, + { + "epoch": 0.0798372456548506, + "grad_norm": 1.1637545824050903, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6131, + "step": 6014 + }, + { + "epoch": 0.08024877784894777, + "grad_norm": 1.258862018585205, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6119, + "step": 6045 + }, + { + "epoch": 0.08066031004304494, + "grad_norm": 1.0981336832046509, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6149, + "step": 6076 + }, + { + "epoch": 0.0810718422371421, + "grad_norm": 1.1576476097106934, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6076, + "step": 6107 + }, + { + "epoch": 0.08148337443123928, + "grad_norm": 1.1076856851577759, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6105, + "step": 6138 + }, + { + "epoch": 0.08189490662533644, + "grad_norm": 14.014957427978516, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6189, + "step": 6169 + }, + { + "epoch": 0.08230643881943361, + "grad_norm": 1.1413066387176514, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6038, + "step": 6200 + }, + { + "epoch": 0.08271797101353077, + "grad_norm": 1.2316724061965942, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6125, + "step": 6231 + }, + { + "epoch": 0.08312950320762795, + "grad_norm": 1.0919370651245117, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6081, + "step": 6262 + }, + { + "epoch": 0.08354103540172511, + "grad_norm": 1.139345407485962, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6078, + "step": 6293 + }, + { + "epoch": 0.08395256759582229, + "grad_norm": 1.1303651332855225, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6071, + "step": 6324 + }, + { + "epoch": 0.08436409978991945, + "grad_norm": 1.2546321153640747, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.6092, + "step": 6355 + }, + { + "epoch": 0.08477563198401662, + "grad_norm": 1.195173740386963, + "learning_rate": 4.661238099862658e-05, + "loss": 0.5998, + "step": 6386 + }, + { + "epoch": 0.08518716417811378, + "grad_norm": 1.2116323709487915, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6099, + "step": 6417 + }, + { + "epoch": 0.08559869637221096, + "grad_norm": 1.131951928138733, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.6105, + "step": 6448 + }, + { + "epoch": 0.08601022856630812, + "grad_norm": 1.2176560163497925, + "learning_rate": 4.648464661063478e-05, + "loss": 0.6045, + "step": 6479 + }, + { + "epoch": 0.0864217607604053, + "grad_norm": 1.15361750125885, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.6067, + "step": 6510 + }, + { + "epoch": 0.08683329295450246, + "grad_norm": 1.1422772407531738, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.605, + "step": 6541 + }, + { + "epoch": 0.08724482514859963, + "grad_norm": 1.250319004058838, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6069, + "step": 6572 + }, + { + "epoch": 0.08765635734269679, + "grad_norm": 1.0947929620742798, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6099, + "step": 6603 + }, + { + "epoch": 0.08806788953679397, + "grad_norm": 1.2815848588943481, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6132, + "step": 6634 + }, + { + "epoch": 0.08847942173089113, + "grad_norm": 1.196083664894104, + "learning_rate": 4.622264489304762e-05, + "loss": 0.604, + "step": 6665 + }, + { + "epoch": 0.0888909539249883, + "grad_norm": 1.1473642587661743, + "learning_rate": 4.617813681048434e-05, + "loss": 0.6081, + "step": 6696 + }, + { + "epoch": 0.08930248611908546, + "grad_norm": 1.182445764541626, + "learning_rate": 4.61333897355256e-05, + "loss": 0.6108, + "step": 6727 + }, + { + "epoch": 0.08971401831318264, + "grad_norm": 1.2859700918197632, + "learning_rate": 4.608840417313604e-05, + "loss": 0.6074, + "step": 6758 + }, + { + "epoch": 0.0901255505072798, + "grad_norm": 1.1134952306747437, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6139, + "step": 6789 + }, + { + "epoch": 0.09053708270137698, + "grad_norm": 1.2447940111160278, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6049, + "step": 6820 + }, + { + "epoch": 0.09094861489547414, + "grad_norm": 1.042465329170227, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6113, + "step": 6851 + }, + { + "epoch": 0.09136014708957131, + "grad_norm": 1.2242035865783691, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6077, + "step": 6882 + }, + { + "epoch": 0.09177167928366847, + "grad_norm": 1.2531142234802246, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6081, + "step": 6913 + }, + { + "epoch": 0.09218321147776565, + "grad_norm": 1.194645881652832, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6158, + "step": 6944 + }, + { + "epoch": 0.09259474367186281, + "grad_norm": 1.1052149534225464, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6079, + "step": 6975 + }, + { + "epoch": 0.09300627586595998, + "grad_norm": 1.1275289058685303, + "learning_rate": 4.571999560773736e-05, + "loss": 0.6107, + "step": 7006 + }, + { + "epoch": 0.09341780806005714, + "grad_norm": 1.251237154006958, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6028, + "step": 7037 + }, + { + "epoch": 0.09382934025415432, + "grad_norm": 1.1206951141357422, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6035, + "step": 7068 + }, + { + "epoch": 0.09424087244825148, + "grad_norm": 1.2242387533187866, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.5959, + "step": 7099 + }, + { + "epoch": 0.09465240464234866, + "grad_norm": 1.0749527215957642, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6005, + "step": 7130 + }, + { + "epoch": 0.09506393683644582, + "grad_norm": 1.183052659034729, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6102, + "step": 7161 + }, + { + "epoch": 0.09547546903054299, + "grad_norm": 1.1045013666152954, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6056, + "step": 7192 + }, + { + "epoch": 0.09588700122464015, + "grad_norm": 1.2442799806594849, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6024, + "step": 7223 + }, + { + "epoch": 0.09629853341873733, + "grad_norm": 1.2083353996276855, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6062, + "step": 7254 + }, + { + "epoch": 0.09671006561283449, + "grad_norm": 1.1391205787658691, + "learning_rate": 4.528766329570536e-05, + "loss": 0.5981, + "step": 7285 + }, + { + "epoch": 0.09712159780693166, + "grad_norm": 1.1213765144348145, + "learning_rate": 4.523847434837447e-05, + "loss": 0.5964, + "step": 7316 + }, + { + "epoch": 0.09753313000102883, + "grad_norm": 1.1574114561080933, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6035, + "step": 7347 + }, + { + "epoch": 0.097944662195126, + "grad_norm": 1.0101516246795654, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6036, + "step": 7378 + }, + { + "epoch": 0.09835619438922316, + "grad_norm": 1.1439381837844849, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6049, + "step": 7409 + }, + { + "epoch": 0.09876772658332034, + "grad_norm": 1.0668189525604248, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6035, + "step": 7440 + }, + { + "epoch": 0.0991792587774175, + "grad_norm": 1.2038415670394897, + "learning_rate": 4.498911497712155e-05, + "loss": 0.6062, + "step": 7471 + }, + { + "epoch": 0.09959079097151467, + "grad_norm": 1.1965882778167725, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6041, + "step": 7502 + }, + { + "epoch": 0.10000232316561183, + "grad_norm": 1.5885149240493774, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6129, + "step": 7533 + }, + { + "epoch": 0.10041385535970901, + "grad_norm": 1.058237910270691, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.5957, + "step": 7564 + }, + { + "epoch": 0.10082538755380617, + "grad_norm": 1.2043278217315674, + "learning_rate": 4.478556380141218e-05, + "loss": 0.5943, + "step": 7595 + }, + { + "epoch": 0.10123691974790335, + "grad_norm": 1.1602888107299805, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.5971, + "step": 7626 + }, + { + "epoch": 0.1016484519420005, + "grad_norm": 1.1536785364151, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.604, + "step": 7657 + }, + { + "epoch": 0.10205998413609768, + "grad_norm": 1.1859934329986572, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6043, + "step": 7688 + }, + { + "epoch": 0.10247151633019484, + "grad_norm": 1.0551954507827759, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6008, + "step": 7719 + }, + { + "epoch": 0.10288304852429202, + "grad_norm": 1.015857458114624, + "learning_rate": 4.452610552959183e-05, + "loss": 0.5961, + "step": 7750 + }, + { + "epoch": 0.10329458071838918, + "grad_norm": 1.0759170055389404, + "learning_rate": 4.447355047201428e-05, + "loss": 0.6025, + "step": 7781 + }, + { + "epoch": 0.10370611291248635, + "grad_norm": 1.025030255317688, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.602, + "step": 7812 + }, + { + "epoch": 0.10411764510658351, + "grad_norm": 1.0884742736816406, + "learning_rate": 4.436778168330484e-05, + "loss": 0.5989, + "step": 7843 + }, + { + "epoch": 0.10452917730068069, + "grad_norm": 1.1207304000854492, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.6016, + "step": 7874 + }, + { + "epoch": 0.10494070949477785, + "grad_norm": 0.9768222570419312, + "learning_rate": 4.42611386459262e-05, + "loss": 0.605, + "step": 7905 + }, + { + "epoch": 0.10535224168887503, + "grad_norm": 1.0720235109329224, + "learning_rate": 4.420749078676133e-05, + "loss": 0.5971, + "step": 7936 + }, + { + "epoch": 0.10576377388297219, + "grad_norm": 1.1767033338546753, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.5877, + "step": 7967 + }, + { + "epoch": 0.10617530607706936, + "grad_norm": 1.3390216827392578, + "learning_rate": 4.409954541451762e-05, + "loss": 0.5973, + "step": 7998 + }, + { + "epoch": 0.10658683827116652, + "grad_norm": 1.7948216199874878, + "learning_rate": 4.404524911958764e-05, + "loss": 0.5922, + "step": 8029 + }, + { + "epoch": 0.1069983704652637, + "grad_norm": 1.177847981452942, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6002, + "step": 8060 + }, + { + "epoch": 0.10740990265936086, + "grad_norm": 1.1014128923416138, + "learning_rate": 4.393601237573607e-05, + "loss": 0.5945, + "step": 8091 + }, + { + "epoch": 0.10782143485345803, + "grad_norm": 1.067475438117981, + "learning_rate": 4.388107315953628e-05, + "loss": 0.5996, + "step": 8122 + }, + { + "epoch": 0.1082329670475552, + "grad_norm": 1.1241830587387085, + "learning_rate": 4.382592087299212e-05, + "loss": 0.5973, + "step": 8153 + }, + { + "epoch": 0.10864449924165237, + "grad_norm": 1.072426438331604, + "learning_rate": 4.377055613848964e-05, + "loss": 0.601, + "step": 8184 + }, + { + "epoch": 0.10905603143574953, + "grad_norm": 1.0138245820999146, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6034, + "step": 8215 + }, + { + "epoch": 0.1094675636298467, + "grad_norm": 0.9993209838867188, + "learning_rate": 4.365919182713416e-05, + "loss": 0.6024, + "step": 8246 + }, + { + "epoch": 0.10987909582394387, + "grad_norm": 1.157468318939209, + "learning_rate": 4.360319350701226e-05, + "loss": 0.5987, + "step": 8277 + }, + { + "epoch": 0.11029062801804104, + "grad_norm": 1.2073496580123901, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6023, + "step": 8308 + }, + { + "epoch": 0.1107021602121382, + "grad_norm": 1.0800622701644897, + "learning_rate": 4.349056769754021e-05, + "loss": 0.5993, + "step": 8339 + }, + { + "epoch": 0.11111369240623538, + "grad_norm": 1.1365702152252197, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.595, + "step": 8370 + }, + { + "epoch": 0.11152522460033254, + "grad_norm": 1.1367030143737793, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.5921, + "step": 8401 + }, + { + "epoch": 0.11193675679442971, + "grad_norm": 1.0323926210403442, + "learning_rate": 4.332006561018488e-05, + "loss": 0.5883, + "step": 8432 + }, + { + "epoch": 0.11234828898852688, + "grad_norm": 1.2352523803710938, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.5957, + "step": 8463 + }, + { + "epoch": 0.11275982118262405, + "grad_norm": 1.1763888597488403, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.5941, + "step": 8494 + }, + { + "epoch": 0.11317135337672121, + "grad_norm": 1.1331796646118164, + "learning_rate": 4.314770288177384e-05, + "loss": 0.5993, + "step": 8525 + }, + { + "epoch": 0.11358288557081839, + "grad_norm": 1.146270990371704, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6041, + "step": 8556 + }, + { + "epoch": 0.11399441776491555, + "grad_norm": 1.0433791875839233, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.5918, + "step": 8587 + }, + { + "epoch": 0.11440594995901272, + "grad_norm": 1.2013510465621948, + "learning_rate": 4.297349701798505e-05, + "loss": 0.6031, + "step": 8618 + }, + { + "epoch": 0.11481748215310988, + "grad_norm": 1.1263506412506104, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.597, + "step": 8649 + }, + { + "epoch": 0.11522901434720706, + "grad_norm": 1.1360443830490112, + "learning_rate": 4.285634454093198e-05, + "loss": 0.5909, + "step": 8680 + }, + { + "epoch": 0.11564054654130422, + "grad_norm": 1.0095175504684448, + "learning_rate": 4.279746571169086e-05, + "loss": 0.59, + "step": 8711 + }, + { + "epoch": 0.1160520787354014, + "grad_norm": 1.0894793272018433, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.5894, + "step": 8742 + }, + { + "epoch": 0.11646361092949856, + "grad_norm": 1.1603643894195557, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.5924, + "step": 8773 + }, + { + "epoch": 0.11687514312359573, + "grad_norm": 1.177464485168457, + "learning_rate": 4.261962684116106e-05, + "loss": 0.5946, + "step": 8804 + }, + { + "epoch": 0.11728667531769289, + "grad_norm": 0.9632905125617981, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.5944, + "step": 8835 + }, + { + "epoch": 0.11769820751179007, + "grad_norm": 1.0413568019866943, + "learning_rate": 4.250007230372134e-05, + "loss": 0.5844, + "step": 8866 + }, + { + "epoch": 0.11810973970588723, + "grad_norm": 1.0879931449890137, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.588, + "step": 8897 + }, + { + "epoch": 0.1185212718999844, + "grad_norm": 1.0681732892990112, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.5832, + "step": 8928 + }, + { + "epoch": 0.11893280409408157, + "grad_norm": 1.0609339475631714, + "learning_rate": 4.231926105340768e-05, + "loss": 0.5886, + "step": 8959 + }, + { + "epoch": 0.11934433628817874, + "grad_norm": 1.4936331510543823, + "learning_rate": 4.225859883654776e-05, + "loss": 0.5868, + "step": 8990 + }, + { + "epoch": 0.1197558684822759, + "grad_norm": 1.1860368251800537, + "learning_rate": 4.219774185874569e-05, + "loss": 0.5904, + "step": 9021 + }, + { + "epoch": 0.12016740067637308, + "grad_norm": 1.1657029390335083, + "learning_rate": 4.213669080676418e-05, + "loss": 0.5872, + "step": 9052 + }, + { + "epoch": 0.12057893287047024, + "grad_norm": 1.0721909999847412, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.5977, + "step": 9083 + }, + { + "epoch": 0.12099046506456741, + "grad_norm": 1.1832354068756104, + "learning_rate": 4.201400923825648e-05, + "loss": 0.5902, + "step": 9114 + }, + { + "epoch": 0.12140199725866457, + "grad_norm": 1.1306614875793457, + "learning_rate": 4.195238010617511e-05, + "loss": 0.5931, + "step": 9145 + }, + { + "epoch": 0.12181352945276175, + "grad_norm": 0.9744802713394165, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6, + "step": 9176 + }, + { + "epoch": 0.12222506164685891, + "grad_norm": 1.0286784172058105, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.5913, + "step": 9207 + }, + { + "epoch": 0.12263659384095608, + "grad_norm": 1.0559006929397583, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.5999, + "step": 9238 + }, + { + "epoch": 0.12304812603505325, + "grad_norm": 1.0805801153182983, + "learning_rate": 4.170395751189495e-05, + "loss": 0.5929, + "step": 9269 + }, + { + "epoch": 0.12345965822915042, + "grad_norm": 1.088597059249878, + "learning_rate": 4.164137885110921e-05, + "loss": 0.5873, + "step": 9300 + }, + { + "epoch": 0.12387119042324758, + "grad_norm": 1.028403639793396, + "learning_rate": 4.157861239462495e-05, + "loss": 0.5939, + "step": 9331 + }, + { + "epoch": 0.12428272261734476, + "grad_norm": 0.9916087985038757, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.5932, + "step": 9362 + }, + { + "epoch": 0.12469425481144192, + "grad_norm": 1.0268062353134155, + "learning_rate": 4.145251892991588e-05, + "loss": 0.5945, + "step": 9393 + }, + { + "epoch": 0.1251057870055391, + "grad_norm": 1.046953797340393, + "learning_rate": 4.138919334463868e-05, + "loss": 0.5923, + "step": 9424 + }, + { + "epoch": 0.12551731919963627, + "grad_norm": 1.1010463237762451, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.5898, + "step": 9455 + }, + { + "epoch": 0.12592885139373342, + "grad_norm": 1.1851146221160889, + "learning_rate": 4.126198804133398e-05, + "loss": 0.591, + "step": 9486 + }, + { + "epoch": 0.1263403835878306, + "grad_norm": 1.106885313987732, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.5893, + "step": 9517 + }, + { + "epoch": 0.12675191578192777, + "grad_norm": 1.0527287721633911, + "learning_rate": 4.113404868280107e-05, + "loss": 0.5937, + "step": 9548 + }, + { + "epoch": 0.12716344797602494, + "grad_norm": 0.9986059069633484, + "learning_rate": 4.106980553625457e-05, + "loss": 0.5961, + "step": 9579 + }, + { + "epoch": 0.1275749801701221, + "grad_norm": 1.025026798248291, + "learning_rate": 4.100538104413674e-05, + "loss": 0.5853, + "step": 9610 + }, + { + "epoch": 0.12798651236421926, + "grad_norm": 1.062345027923584, + "learning_rate": 4.09407759334692e-05, + "loss": 0.5916, + "step": 9641 + }, + { + "epoch": 0.12839804455831644, + "grad_norm": 1.0635534524917603, + "learning_rate": 4.087599093331186e-05, + "loss": 0.5919, + "step": 9672 + }, + { + "epoch": 0.1288095767524136, + "grad_norm": 1.1224849224090576, + "learning_rate": 4.081102677475462e-05, + "loss": 0.5948, + "step": 9703 + }, + { + "epoch": 0.12922110894651076, + "grad_norm": 1.113202452659607, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.5807, + "step": 9734 + }, + { + "epoch": 0.12963264114060793, + "grad_norm": 1.0213284492492676, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.5813, + "step": 9765 + }, + { + "epoch": 0.1300441733347051, + "grad_norm": 1.1083142757415771, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.5856, + "step": 9796 + }, + { + "epoch": 0.13045570552880228, + "grad_norm": 1.1317702531814575, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.586, + "step": 9827 + }, + { + "epoch": 0.13086723772289943, + "grad_norm": 0.9876610040664673, + "learning_rate": 4.048354433517794e-05, + "loss": 0.5921, + "step": 9858 + }, + { + "epoch": 0.1312787699169966, + "grad_norm": 1.085672378540039, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.5791, + "step": 9889 + }, + { + "epoch": 0.13169030211109378, + "grad_norm": 1.014452338218689, + "learning_rate": 4.035132306369438e-05, + "loss": 0.5921, + "step": 9920 + }, + { + "epoch": 0.13210183430519096, + "grad_norm": 1.059309720993042, + "learning_rate": 4.028495219804555e-05, + "loss": 0.5834, + "step": 9951 + }, + { + "epoch": 0.1325133664992881, + "grad_norm": 1.1058080196380615, + "learning_rate": 4.021840884378864e-05, + "loss": 0.5826, + "step": 9982 + }, + { + "epoch": 0.13292489869338528, + "grad_norm": 1.0567057132720947, + "learning_rate": 4.015169375185633e-05, + "loss": 0.5924, + "step": 10013 + }, + { + "epoch": 0.13333643088748245, + "grad_norm": 1.0371674299240112, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.589, + "step": 10044 + }, + { + "epoch": 0.13374796308157963, + "grad_norm": 0.909173309803009, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.5978, + "step": 10075 + }, + { + "epoch": 0.13415949527567678, + "grad_norm": 1.0078933238983154, + "learning_rate": 3.995052558835377e-05, + "loss": 0.5814, + "step": 10106 + }, + { + "epoch": 0.13457102746977395, + "grad_norm": 0.9631708264350891, + "learning_rate": 3.988313109368017e-05, + "loss": 0.587, + "step": 10137 + }, + { + "epoch": 0.13498255966387113, + "grad_norm": 1.1884409189224243, + "learning_rate": 3.981556864489504e-05, + "loss": 0.5906, + "step": 10168 + }, + { + "epoch": 0.1353940918579683, + "grad_norm": 0.9554224014282227, + "learning_rate": 3.974783900443142e-05, + "loss": 0.5888, + "step": 10199 + }, + { + "epoch": 0.13580562405206545, + "grad_norm": 1.0510096549987793, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.5828, + "step": 10230 + }, + { + "epoch": 0.13621715624616262, + "grad_norm": 0.9769448637962341, + "learning_rate": 3.961188120762596e-05, + "loss": 0.5878, + "step": 10261 + }, + { + "epoch": 0.1366286884402598, + "grad_norm": 0.9610607028007507, + "learning_rate": 3.954365458554938e-05, + "loss": 0.5863, + "step": 10292 + }, + { + "epoch": 0.13704022063435697, + "grad_norm": 1.0102349519729614, + "learning_rate": 3.947526384030751e-05, + "loss": 0.5812, + "step": 10323 + }, + { + "epoch": 0.13745175282845412, + "grad_norm": 1.138122320175171, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.5841, + "step": 10354 + }, + { + "epoch": 0.1378632850225513, + "grad_norm": 1.0107802152633667, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.5847, + "step": 10385 + }, + { + "epoch": 0.13827481721664847, + "grad_norm": 0.9923847317695618, + "learning_rate": 3.926911459260109e-05, + "loss": 0.5971, + "step": 10416 + }, + { + "epoch": 0.13868634941074565, + "grad_norm": 1.0700715780258179, + "learning_rate": 3.920007509089102e-05, + "loss": 0.5831, + "step": 10447 + }, + { + "epoch": 0.1390978816048428, + "grad_norm": 1.1638612747192383, + "learning_rate": 3.913087534326357e-05, + "loss": 0.5849, + "step": 10478 + }, + { + "epoch": 0.13950941379893997, + "grad_norm": 1.0560393333435059, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.5897, + "step": 10509 + }, + { + "epoch": 0.13992094599303714, + "grad_norm": 0.9604248404502869, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.5862, + "step": 10540 + }, + { + "epoch": 0.14033247818713432, + "grad_norm": 1.0003983974456787, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.5905, + "step": 10571 + }, + { + "epoch": 0.14074401038123147, + "grad_norm": 1.0378546714782715, + "learning_rate": 3.885248953871491e-05, + "loss": 0.5938, + "step": 10602 + }, + { + "epoch": 0.14115554257532864, + "grad_norm": 0.9967820644378662, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.5821, + "step": 10633 + }, + { + "epoch": 0.14156707476942582, + "grad_norm": 1.029188632965088, + "learning_rate": 3.871235554965218e-05, + "loss": 0.5882, + "step": 10664 + }, + { + "epoch": 0.141978606963523, + "grad_norm": 1.1272863149642944, + "learning_rate": 3.864205604623078e-05, + "loss": 0.5898, + "step": 10695 + }, + { + "epoch": 0.14239013915762014, + "grad_norm": 1.0054222345352173, + "learning_rate": 3.857160259406107e-05, + "loss": 0.5873, + "step": 10726 + }, + { + "epoch": 0.1428016713517173, + "grad_norm": 1.0327833890914917, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.5842, + "step": 10757 + }, + { + "epoch": 0.1432132035458145, + "grad_norm": 1.0088285207748413, + "learning_rate": 3.843023702543556e-05, + "loss": 0.5903, + "step": 10788 + }, + { + "epoch": 0.14362473573991166, + "grad_norm": 0.9472872018814087, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.5809, + "step": 10819 + }, + { + "epoch": 0.1440362679340088, + "grad_norm": 1.1008390188217163, + "learning_rate": 3.828826522492255e-05, + "loss": 0.5775, + "step": 10850 + }, + { + "epoch": 0.14444780012810599, + "grad_norm": 1.1251379251480103, + "learning_rate": 3.821705398930713e-05, + "loss": 0.5821, + "step": 10881 + }, + { + "epoch": 0.14485933232220316, + "grad_norm": 0.9825986623764038, + "learning_rate": 3.814569360103385e-05, + "loss": 0.5749, + "step": 10912 + }, + { + "epoch": 0.14527086451630034, + "grad_norm": 0.9303480982780457, + "learning_rate": 3.807418486539499e-05, + "loss": 0.5827, + "step": 10943 + }, + { + "epoch": 0.14568239671039748, + "grad_norm": 0.9948610663414001, + "learning_rate": 3.80025285893569e-05, + "loss": 0.598, + "step": 10974 + }, + { + "epoch": 0.14609392890449466, + "grad_norm": 1.096449851989746, + "learning_rate": 3.793072558155093e-05, + "loss": 0.5749, + "step": 11005 + }, + { + "epoch": 0.14650546109859183, + "grad_norm": 1.022290587425232, + "learning_rate": 3.785877665226426e-05, + "loss": 0.5824, + "step": 11036 + }, + { + "epoch": 0.146916993292689, + "grad_norm": 1.083853840827942, + "learning_rate": 3.778668261343079e-05, + "loss": 0.5923, + "step": 11067 + }, + { + "epoch": 0.14732852548678615, + "grad_norm": 0.919503390789032, + "learning_rate": 3.771444427862192e-05, + "loss": 0.5924, + "step": 11098 + }, + { + "epoch": 0.14774005768088333, + "grad_norm": 1.0682687759399414, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.5824, + "step": 11129 + }, + { + "epoch": 0.1481515898749805, + "grad_norm": 0.9599080085754395, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.5765, + "step": 11160 + }, + { + "epoch": 0.14856312206907768, + "grad_norm": 0.968349039554596, + "learning_rate": 3.749687165842753e-05, + "loss": 0.5932, + "step": 11191 + }, + { + "epoch": 0.14897465426317483, + "grad_norm": 0.9846999049186707, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.5791, + "step": 11222 + }, + { + "epoch": 0.149386186457272, + "grad_norm": 1.0673292875289917, + "learning_rate": 3.735111675341645e-05, + "loss": 0.5803, + "step": 11253 + }, + { + "epoch": 0.14979771865136918, + "grad_norm": 1.0111202001571655, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.5837, + "step": 11284 + }, + { + "epoch": 0.15020925084546635, + "grad_norm": 1.0955816507339478, + "learning_rate": 3.720480432728287e-05, + "loss": 0.5822, + "step": 11315 + }, + { + "epoch": 0.1506207830395635, + "grad_norm": 0.9147946238517761, + "learning_rate": 3.71314411067092e-05, + "loss": 0.5873, + "step": 11346 + }, + { + "epoch": 0.15103231523366067, + "grad_norm": 0.9630762934684753, + "learning_rate": 3.70579409844715e-05, + "loss": 0.5914, + "step": 11377 + }, + { + "epoch": 0.15144384742775785, + "grad_norm": 0.9123234748840332, + "learning_rate": 3.698430479000865e-05, + "loss": 0.5784, + "step": 11408 + }, + { + "epoch": 0.15185537962185502, + "grad_norm": 1.0654667615890503, + "learning_rate": 3.691053335429509e-05, + "loss": 0.5805, + "step": 11439 + }, + { + "epoch": 0.15226691181595217, + "grad_norm": 1.111958622932434, + "learning_rate": 3.683662750983147e-05, + "loss": 0.5868, + "step": 11470 + }, + { + "epoch": 0.15267844401004935, + "grad_norm": 1.024422526359558, + "learning_rate": 3.676258809063518e-05, + "loss": 0.5904, + "step": 11501 + }, + { + "epoch": 0.15308997620414652, + "grad_norm": 1.092668056488037, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.5885, + "step": 11532 + }, + { + "epoch": 0.1535015083982437, + "grad_norm": 0.9739312529563904, + "learning_rate": 3.661411187164166e-05, + "loss": 0.5806, + "step": 11563 + }, + { + "epoch": 0.15391304059234084, + "grad_norm": 1.098699688911438, + "learning_rate": 3.65396767473784e-05, + "loss": 0.5755, + "step": 11594 + }, + { + "epoch": 0.15432457278643802, + "grad_norm": 0.9998940825462341, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.5758, + "step": 11625 + }, + { + "epoch": 0.1547361049805352, + "grad_norm": 0.9077128767967224, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.5742, + "step": 11656 + }, + { + "epoch": 0.15514763717463237, + "grad_norm": 0.9274038076400757, + "learning_rate": 3.63155933997859e-05, + "loss": 0.589, + "step": 11687 + }, + { + "epoch": 0.15555916936872952, + "grad_norm": 0.9404906034469604, + "learning_rate": 3.624064243537758e-05, + "loss": 0.5848, + "step": 11718 + }, + { + "epoch": 0.1559707015628267, + "grad_norm": 0.9896044731140137, + "learning_rate": 3.616556462184716e-05, + "loss": 0.5917, + "step": 11749 + }, + { + "epoch": 0.15638223375692387, + "grad_norm": 1.0280394554138184, + "learning_rate": 3.609036080643755e-05, + "loss": 0.5811, + "step": 11780 + }, + { + "epoch": 0.15679376595102104, + "grad_norm": 0.9462539553642273, + "learning_rate": 3.60150318378136e-05, + "loss": 0.5782, + "step": 11811 + }, + { + "epoch": 0.1572052981451182, + "grad_norm": 1.047635555267334, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.5792, + "step": 11842 + }, + { + "epoch": 0.15761683033921536, + "grad_norm": 0.9613581299781799, + "learning_rate": 3.586400184263408e-05, + "loss": 0.5847, + "step": 11873 + }, + { + "epoch": 0.15802836253331254, + "grad_norm": 0.8944469094276428, + "learning_rate": 3.578830252043148e-05, + "loss": 0.5787, + "step": 11904 + }, + { + "epoch": 0.1584398947274097, + "grad_norm": 1.0031458139419556, + "learning_rate": 3.571248145370125e-05, + "loss": 0.5827, + "step": 11935 + }, + { + "epoch": 0.15885142692150686, + "grad_norm": 0.9425063729286194, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.5899, + "step": 11966 + }, + { + "epoch": 0.15926295911560404, + "grad_norm": 1.0098518133163452, + "learning_rate": 3.556047751054378e-05, + "loss": 0.5849, + "step": 11997 + }, + { + "epoch": 0.1596744913097012, + "grad_norm": 0.8891544342041016, + "learning_rate": 3.548429634946039e-05, + "loss": 0.5772, + "step": 12028 + }, + { + "epoch": 0.16008602350379839, + "grad_norm": 1.1580991744995117, + "learning_rate": 3.540799687451768e-05, + "loss": 0.5859, + "step": 12059 + }, + { + "epoch": 0.16049755569789553, + "grad_norm": 1.046398639678955, + "learning_rate": 3.533157994674485e-05, + "loss": 0.582, + "step": 12090 + }, + { + "epoch": 0.1609090878919927, + "grad_norm": 1.0526766777038574, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.5738, + "step": 12121 + }, + { + "epoch": 0.16132062008608988, + "grad_norm": 0.9915657043457031, + "learning_rate": 3.517839718344311e-05, + "loss": 0.5867, + "step": 12152 + }, + { + "epoch": 0.16173215228018706, + "grad_norm": 0.954210102558136, + "learning_rate": 3.510163307656086e-05, + "loss": 0.578, + "step": 12183 + }, + { + "epoch": 0.1621436844742842, + "grad_norm": 0.952920138835907, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.5843, + "step": 12214 + }, + { + "epoch": 0.16255521666838138, + "grad_norm": 0.9184344410896301, + "learning_rate": 3.494776374368643e-05, + "loss": 0.5748, + "step": 12245 + }, + { + "epoch": 0.16296674886247856, + "grad_norm": 0.9797667264938354, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.5788, + "step": 12276 + }, + { + "epoch": 0.16337828105657573, + "grad_norm": 0.968781054019928, + "learning_rate": 3.479344537543164e-05, + "loss": 0.5835, + "step": 12307 + }, + { + "epoch": 0.16378981325067288, + "grad_norm": 1.0632243156433105, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.5742, + "step": 12338 + }, + { + "epoch": 0.16420134544477005, + "grad_norm": 0.9538365006446838, + "learning_rate": 3.463868493762412e-05, + "loss": 0.5737, + "step": 12369 + }, + { + "epoch": 0.16461287763886723, + "grad_norm": 0.9336950778961182, + "learning_rate": 3.456114112492418e-05, + "loss": 0.5821, + "step": 12400 + }, + { + "epoch": 0.1650244098329644, + "grad_norm": 0.9611359238624573, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.5847, + "step": 12431 + }, + { + "epoch": 0.16543594202706155, + "grad_norm": 1.0813709497451782, + "learning_rate": 3.440573068727905e-05, + "loss": 0.5805, + "step": 12462 + }, + { + "epoch": 0.16584747422115872, + "grad_norm": 0.9780453443527222, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.5761, + "step": 12493 + }, + { + "epoch": 0.1662590064152559, + "grad_norm": 0.9629074335098267, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.58, + "step": 12524 + }, + { + "epoch": 0.16667053860935307, + "grad_norm": 0.9241612553596497, + "learning_rate": 3.417182116258899e-05, + "loss": 0.5732, + "step": 12555 + }, + { + "epoch": 0.16708207080345022, + "grad_norm": 1.0314536094665527, + "learning_rate": 3.409364314116074e-05, + "loss": 0.578, + "step": 12586 + }, + { + "epoch": 0.1674936029975474, + "grad_norm": 0.9907404184341431, + "learning_rate": 3.401536249920559e-05, + "loss": 0.591, + "step": 12617 + }, + { + "epoch": 0.16790513519164457, + "grad_norm": 0.9442338943481445, + "learning_rate": 3.393698012010998e-05, + "loss": 0.5754, + "step": 12648 + }, + { + "epoch": 0.16831666738574175, + "grad_norm": 0.9101460576057434, + "learning_rate": 3.385849688840839e-05, + "loss": 0.5815, + "step": 12679 + }, + { + "epoch": 0.1687281995798389, + "grad_norm": 1.029105544090271, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.5766, + "step": 12710 + }, + { + "epoch": 0.16913973177393607, + "grad_norm": 1.028542160987854, + "learning_rate": 3.370123141100578e-05, + "loss": 0.5769, + "step": 12741 + }, + { + "epoch": 0.16955126396803324, + "grad_norm": 0.9411474466323853, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.5808, + "step": 12772 + }, + { + "epoch": 0.16996279616213042, + "grad_norm": 0.9260859489440918, + "learning_rate": 3.35435731658559e-05, + "loss": 0.5683, + "step": 12803 + }, + { + "epoch": 0.17037432835622757, + "grad_norm": 0.9867289066314697, + "learning_rate": 3.346459897862552e-05, + "loss": 0.5781, + "step": 12834 + }, + { + "epoch": 0.17078586055032474, + "grad_norm": 0.9310885071754456, + "learning_rate": 3.338552926954613e-05, + "loss": 0.5876, + "step": 12865 + }, + { + "epoch": 0.17119739274442192, + "grad_norm": 1.1955841779708862, + "learning_rate": 3.330636493090868e-05, + "loss": 0.577, + "step": 12896 + }, + { + "epoch": 0.1716089249385191, + "grad_norm": 1.0715487003326416, + "learning_rate": 3.322710685607193e-05, + "loss": 0.5835, + "step": 12927 + }, + { + "epoch": 0.17202045713261624, + "grad_norm": 0.9181262850761414, + "learning_rate": 3.314775593945251e-05, + "loss": 0.5733, + "step": 12958 + }, + { + "epoch": 0.17243198932671341, + "grad_norm": 0.9595162868499756, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.5685, + "step": 12989 + }, + { + "epoch": 0.1728435215208106, + "grad_norm": 1.057614803314209, + "learning_rate": 3.298877916376047e-05, + "loss": 0.5736, + "step": 13020 + }, + { + "epoch": 0.17325505371490776, + "grad_norm": 0.9654355049133301, + "learning_rate": 3.290915509871915e-05, + "loss": 0.5782, + "step": 13051 + }, + { + "epoch": 0.1736665859090049, + "grad_norm": 1.0919837951660156, + "learning_rate": 3.282944177993753e-05, + "loss": 0.5771, + "step": 13082 + }, + { + "epoch": 0.1740781181031021, + "grad_norm": 1.064331293106079, + "learning_rate": 3.274964010696957e-05, + "loss": 0.5731, + "step": 13113 + }, + { + "epoch": 0.17448965029719926, + "grad_norm": 0.8971393704414368, + "learning_rate": 3.266975098036629e-05, + "loss": 0.5855, + "step": 13144 + }, + { + "epoch": 0.17490118249129644, + "grad_norm": 0.9515472054481506, + "learning_rate": 3.258977530166562e-05, + "loss": 0.5776, + "step": 13175 + }, + { + "epoch": 0.17531271468539358, + "grad_norm": 0.9811455607414246, + "learning_rate": 3.250971397338227e-05, + "loss": 0.5817, + "step": 13206 + }, + { + "epoch": 0.17572424687949076, + "grad_norm": 0.9187535643577576, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.5751, + "step": 13237 + }, + { + "epoch": 0.17613577907358793, + "grad_norm": 0.9396876096725464, + "learning_rate": 3.234933798294859e-05, + "loss": 0.5744, + "step": 13268 + }, + { + "epoch": 0.1765473112676851, + "grad_norm": 1.064921259880066, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.5731, + "step": 13299 + }, + { + "epoch": 0.17695884346178226, + "grad_norm": 0.946753978729248, + "learning_rate": 3.218863024832985e-05, + "loss": 0.5824, + "step": 13330 + }, + { + "epoch": 0.17737037565587943, + "grad_norm": 0.8623449206352234, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.5764, + "step": 13361 + }, + { + "epoch": 0.1777819078499766, + "grad_norm": 0.9526584148406982, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.5696, + "step": 13392 + }, + { + "epoch": 0.17819344004407378, + "grad_norm": 0.9413353800773621, + "learning_rate": 3.194696249871729e-05, + "loss": 0.5688, + "step": 13423 + }, + { + "epoch": 0.17860497223817093, + "grad_norm": 0.9612322449684143, + "learning_rate": 3.186624857814164e-05, + "loss": 0.5694, + "step": 13454 + }, + { + "epoch": 0.1790165044322681, + "grad_norm": 1.0289851427078247, + "learning_rate": 3.178545717288401e-05, + "loss": 0.5799, + "step": 13485 + }, + { + "epoch": 0.17942803662636528, + "grad_norm": 0.9124505519866943, + "learning_rate": 3.170458919466444e-05, + "loss": 0.5729, + "step": 13516 + }, + { + "epoch": 0.17983956882046245, + "grad_norm": 0.9874391555786133, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.5761, + "step": 13547 + }, + { + "epoch": 0.1802511010145596, + "grad_norm": 1.0058220624923706, + "learning_rate": 3.154262717052985e-05, + "loss": 0.58, + "step": 13578 + }, + { + "epoch": 0.18066263320865678, + "grad_norm": 0.9753491282463074, + "learning_rate": 3.146153495233426e-05, + "loss": 0.5749, + "step": 13609 + }, + { + "epoch": 0.18107416540275395, + "grad_norm": 1.0167535543441772, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.5848, + "step": 13640 + }, + { + "epoch": 0.18148569759685113, + "grad_norm": 0.9970638155937195, + "learning_rate": 3.129913267924946e-05, + "loss": 0.5743, + "step": 13671 + }, + { + "epoch": 0.18189722979094827, + "grad_norm": 1.0033198595046997, + "learning_rate": 3.121782445704782e-05, + "loss": 0.5747, + "step": 13702 + }, + { + "epoch": 0.18230876198504545, + "grad_norm": 1.0107535123825073, + "learning_rate": 3.11364460675423e-05, + "loss": 0.5725, + "step": 13733 + }, + { + "epoch": 0.18272029417914262, + "grad_norm": 0.9341425895690918, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.5726, + "step": 13764 + }, + { + "epoch": 0.1831318263732398, + "grad_norm": 1.0088342428207397, + "learning_rate": 3.097348246077728e-05, + "loss": 0.5737, + "step": 13795 + }, + { + "epoch": 0.18354335856733694, + "grad_norm": 0.8871366381645203, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.576, + "step": 13826 + }, + { + "epoch": 0.18395489076143412, + "grad_norm": 0.971552848815918, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.57, + "step": 13857 + }, + { + "epoch": 0.1843664229555313, + "grad_norm": 1.1167266368865967, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.5652, + "step": 13888 + }, + { + "epoch": 0.18477795514962847, + "grad_norm": 0.9768183827400208, + "learning_rate": 3.064675369851637e-05, + "loss": 0.5671, + "step": 13919 + }, + { + "epoch": 0.18518948734372562, + "grad_norm": 1.0097460746765137, + "learning_rate": 3.056490989455289e-05, + "loss": 0.5771, + "step": 13950 + }, + { + "epoch": 0.1856010195378228, + "grad_norm": 1.010986089706421, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.5723, + "step": 13981 + }, + { + "epoch": 0.18601255173191997, + "grad_norm": 0.8861789107322693, + "learning_rate": 3.040103481317539e-05, + "loss": 0.5798, + "step": 14012 + }, + { + "epoch": 0.18642408392601714, + "grad_norm": 0.9811322689056396, + "learning_rate": 3.03190053850694e-05, + "loss": 0.5703, + "step": 14043 + }, + { + "epoch": 0.1868356161201143, + "grad_norm": 0.9183774590492249, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.5655, + "step": 14074 + }, + { + "epoch": 0.18724714831421146, + "grad_norm": 0.9655874371528625, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.57, + "step": 14105 + }, + { + "epoch": 0.18765868050830864, + "grad_norm": 0.9735666513442993, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.5748, + "step": 14136 + }, + { + "epoch": 0.18807021270240581, + "grad_norm": 0.9084784388542175, + "learning_rate": 2.999029669712431e-05, + "loss": 0.568, + "step": 14167 + }, + { + "epoch": 0.18848174489650296, + "grad_norm": 0.8866302371025085, + "learning_rate": 2.990797641805408e-05, + "loss": 0.5768, + "step": 14198 + }, + { + "epoch": 0.18889327709060014, + "grad_norm": 0.9936773180961609, + "learning_rate": 2.982560075313704e-05, + "loss": 0.5701, + "step": 14229 + }, + { + "epoch": 0.1893048092846973, + "grad_norm": 0.9721146821975708, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.5638, + "step": 14260 + }, + { + "epoch": 0.1897163414787945, + "grad_norm": 0.921503484249115, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.5645, + "step": 14291 + }, + { + "epoch": 0.19012787367289163, + "grad_norm": 0.9194900989532471, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.5724, + "step": 14322 + }, + { + "epoch": 0.1905394058669888, + "grad_norm": 0.9940860271453857, + "learning_rate": 2.949556283611942e-05, + "loss": 0.5731, + "step": 14353 + }, + { + "epoch": 0.19095093806108598, + "grad_norm": 0.9163900017738342, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.5761, + "step": 14384 + }, + { + "epoch": 0.19136247025518316, + "grad_norm": 0.960118293762207, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.572, + "step": 14415 + }, + { + "epoch": 0.1917740024492803, + "grad_norm": 0.9942934513092041, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.5606, + "step": 14446 + }, + { + "epoch": 0.19218553464337748, + "grad_norm": 0.9903921484947205, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.5704, + "step": 14477 + }, + { + "epoch": 0.19259706683747466, + "grad_norm": 0.9734652638435364, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.5768, + "step": 14508 + }, + { + "epoch": 0.19300859903157183, + "grad_norm": 0.9251719117164612, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.565, + "step": 14539 + }, + { + "epoch": 0.19342013122566898, + "grad_norm": 0.9734142422676086, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.5702, + "step": 14570 + }, + { + "epoch": 0.19383166341976615, + "grad_norm": 0.9692136645317078, + "learning_rate": 2.883311164593017e-05, + "loss": 0.5581, + "step": 14601 + }, + { + "epoch": 0.19424319561386333, + "grad_norm": 0.9654501676559448, + "learning_rate": 2.875010077160754e-05, + "loss": 0.5667, + "step": 14632 + }, + { + "epoch": 0.1946547278079605, + "grad_norm": 1.0234603881835938, + "learning_rate": 2.866704757790741e-05, + "loss": 0.5688, + "step": 14663 + }, + { + "epoch": 0.19506626000205765, + "grad_norm": 0.9619661569595337, + "learning_rate": 2.858395300207376e-05, + "loss": 0.5713, + "step": 14694 + }, + { + "epoch": 0.19547779219615483, + "grad_norm": 1.0468534231185913, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.5702, + "step": 14725 + }, + { + "epoch": 0.195889324390252, + "grad_norm": 0.9849717617034912, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.5695, + "step": 14756 + }, + { + "epoch": 0.19630085658434918, + "grad_norm": 0.8887643218040466, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.5688, + "step": 14787 + }, + { + "epoch": 0.19671238877844632, + "grad_norm": 0.9583494067192078, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.5775, + "step": 14818 + }, + { + "epoch": 0.1971239209725435, + "grad_norm": 0.9766852259635925, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.5688, + "step": 14849 + }, + { + "epoch": 0.19753545316664067, + "grad_norm": 0.9450570940971375, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.5773, + "step": 14880 + }, + { + "epoch": 0.19794698536073785, + "grad_norm": 0.9476996064186096, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.5688, + "step": 14911 + }, + { + "epoch": 0.198358517554835, + "grad_norm": 1.0239835977554321, + "learning_rate": 2.791781925709473e-05, + "loss": 0.5717, + "step": 14942 + }, + { + "epoch": 0.19877004974893217, + "grad_norm": 0.9848045110702515, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.5691, + "step": 14973 + }, + { + "epoch": 0.19918158194302935, + "grad_norm": 1.0350494384765625, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.5808, + "step": 15004 + }, + { + "epoch": 0.19959311413712652, + "grad_norm": 0.8743448853492737, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.5694, + "step": 15035 + }, + { + "epoch": 0.20000464633122367, + "grad_norm": 0.9410389065742493, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.5739, + "step": 15066 + }, + { + "epoch": 0.20041617852532084, + "grad_norm": 1.0113860368728638, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.5689, + "step": 15097 + }, + { + "epoch": 0.20082771071941802, + "grad_norm": 0.998852550983429, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.5619, + "step": 15128 + }, + { + "epoch": 0.2012392429135152, + "grad_norm": 0.9299794435501099, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.5672, + "step": 15159 + }, + { + "epoch": 0.20165077510761234, + "grad_norm": 1.0109282732009888, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.5691, + "step": 15190 + }, + { + "epoch": 0.20206230730170952, + "grad_norm": 0.8805022835731506, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.5601, + "step": 15221 + }, + { + "epoch": 0.2024738394958067, + "grad_norm": 0.8976924419403076, + "learning_rate": 2.708224532974953e-05, + "loss": 0.5669, + "step": 15252 + }, + { + "epoch": 0.20288537168990387, + "grad_norm": 0.8639585375785828, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.5713, + "step": 15283 + }, + { + "epoch": 0.203296903884001, + "grad_norm": 0.9253800511360168, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.5634, + "step": 15314 + }, + { + "epoch": 0.2037084360780982, + "grad_norm": 0.8547073006629944, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.5705, + "step": 15345 + }, + { + "epoch": 0.20411996827219536, + "grad_norm": 0.8723642230033875, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.573, + "step": 15376 + }, + { + "epoch": 0.20453150046629254, + "grad_norm": 0.9164481163024902, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.5645, + "step": 15407 + }, + { + "epoch": 0.20494303266038968, + "grad_norm": 0.9538819193840027, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.5677, + "step": 15438 + }, + { + "epoch": 0.20535456485448686, + "grad_norm": 0.8995161652565002, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.5683, + "step": 15469 + }, + { + "epoch": 0.20576609704858403, + "grad_norm": 0.9026926755905151, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.5694, + "step": 15500 + }, + { + "epoch": 0.2061776292426812, + "grad_norm": 0.9095093011856079, + "learning_rate": 2.632819298478939e-05, + "loss": 0.5722, + "step": 15531 + }, + { + "epoch": 0.20658916143677836, + "grad_norm": 0.874626636505127, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.5652, + "step": 15562 + }, + { + "epoch": 0.20700069363087553, + "grad_norm": 1.0359785556793213, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.57, + "step": 15593 + }, + { + "epoch": 0.2074122258249727, + "grad_norm": 0.9145928621292114, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.5637, + "step": 15624 + }, + { + "epoch": 0.20782375801906988, + "grad_norm": 1.020246982574463, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.5619, + "step": 15655 + }, + { + "epoch": 0.20823529021316703, + "grad_norm": 0.8766633868217468, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.5631, + "step": 15686 + }, + { + "epoch": 0.2086468224072642, + "grad_norm": 0.9841639399528503, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.5598, + "step": 15717 + }, + { + "epoch": 0.20905835460136138, + "grad_norm": 0.8983998894691467, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.5723, + "step": 15748 + }, + { + "epoch": 0.20946988679545855, + "grad_norm": 0.8868324756622314, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.5801, + "step": 15779 + }, + { + "epoch": 0.2098814189895557, + "grad_norm": 0.9000539183616638, + "learning_rate": 2.557292666450159e-05, + "loss": 0.5672, + "step": 15810 + }, + { + "epoch": 0.21029295118365288, + "grad_norm": 0.9193928837776184, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.5658, + "step": 15841 + }, + { + "epoch": 0.21070448337775005, + "grad_norm": 0.9424473643302917, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.5685, + "step": 15872 + }, + { + "epoch": 0.21111601557184723, + "grad_norm": 0.9552715420722961, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.5725, + "step": 15903 + }, + { + "epoch": 0.21152754776594437, + "grad_norm": 0.8888420462608337, + "learning_rate": 2.52370435981567e-05, + "loss": 0.5649, + "step": 15934 + }, + { + "epoch": 0.21193907996004155, + "grad_norm": 0.906830370426178, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.5692, + "step": 15965 + }, + { + "epoch": 0.21235061215413872, + "grad_norm": 0.8939186334609985, + "learning_rate": 2.506908222441045e-05, + "loss": 0.5665, + "step": 15996 + }, + { + "epoch": 0.2127621443482359, + "grad_norm": 1.0149410963058472, + "learning_rate": 2.498509989430187e-05, + "loss": 0.5632, + "step": 16027 + }, + { + "epoch": 0.21317367654233305, + "grad_norm": 0.963056206703186, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.5656, + "step": 16058 + }, + { + "epoch": 0.21358520873643022, + "grad_norm": 0.8071532249450684, + "learning_rate": 2.481713668624899e-05, + "loss": 0.5667, + "step": 16089 + }, + { + "epoch": 0.2139967409305274, + "grad_norm": 0.9192640781402588, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.5709, + "step": 16120 + }, + { + "epoch": 0.21440827312462457, + "grad_norm": 0.84633868932724, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.5658, + "step": 16151 + }, + { + "epoch": 0.21481980531872172, + "grad_norm": 0.8883370757102966, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.5683, + "step": 16182 + }, + { + "epoch": 0.2152313375128189, + "grad_norm": 0.8919095396995544, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.568, + "step": 16213 + }, + { + "epoch": 0.21564286970691607, + "grad_norm": 0.9360633492469788, + "learning_rate": 2.439728136286796e-05, + "loss": 0.565, + "step": 16244 + }, + { + "epoch": 0.21605440190101324, + "grad_norm": 0.9496976733207703, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.5627, + "step": 16275 + }, + { + "epoch": 0.2164659340951104, + "grad_norm": 0.9771477580070496, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.5614, + "step": 16306 + }, + { + "epoch": 0.21687746628920757, + "grad_norm": 0.931249737739563, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.5649, + "step": 16337 + }, + { + "epoch": 0.21728899848330474, + "grad_norm": 0.9592285752296448, + "learning_rate": 2.406151384602059e-05, + "loss": 0.5646, + "step": 16368 + }, + { + "epoch": 0.21770053067740192, + "grad_norm": 0.9159988164901733, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.5672, + "step": 16399 + }, + { + "epoch": 0.21811206287149906, + "grad_norm": 0.97376549243927, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.5607, + "step": 16430 + }, + { + "epoch": 0.21852359506559624, + "grad_norm": 0.8469638824462891, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.5597, + "step": 16461 + }, + { + "epoch": 0.2189351272596934, + "grad_norm": 1.030610203742981, + "learning_rate": 2.372591577780202e-05, + "loss": 0.5625, + "step": 16492 + }, + { + "epoch": 0.2193466594537906, + "grad_norm": 0.9524822235107422, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.5621, + "step": 16523 + }, + { + "epoch": 0.21975819164788774, + "grad_norm": 0.9608604311943054, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.5672, + "step": 16554 + }, + { + "epoch": 0.2201697238419849, + "grad_norm": 0.9253712296485901, + "learning_rate": 2.347436487983929e-05, + "loss": 0.5609, + "step": 16585 + }, + { + "epoch": 0.22058125603608209, + "grad_norm": 0.923402726650238, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.5654, + "step": 16616 + }, + { + "epoch": 0.22099278823017926, + "grad_norm": 1.0030287504196167, + "learning_rate": 2.330674878704035e-05, + "loss": 0.5576, + "step": 16647 + }, + { + "epoch": 0.2214043204242764, + "grad_norm": 0.9106613993644714, + "learning_rate": 2.322296892997561e-05, + "loss": 0.5653, + "step": 16678 + }, + { + "epoch": 0.22181585261837358, + "grad_norm": 1.0058101415634155, + "learning_rate": 2.313920912646497e-05, + "loss": 0.5749, + "step": 16709 + }, + { + "epoch": 0.22222738481247076, + "grad_norm": 0.931086540222168, + "learning_rate": 2.305547032172643e-05, + "loss": 0.5637, + "step": 16740 + }, + { + "epoch": 0.22263891700656793, + "grad_norm": 0.9743716716766357, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.5718, + "step": 16771 + }, + { + "epoch": 0.22305044920066508, + "grad_norm": 0.8751611709594727, + "learning_rate": 2.288805948824212e-05, + "loss": 0.5581, + "step": 16802 + }, + { + "epoch": 0.22346198139476225, + "grad_norm": 0.867038905620575, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.5664, + "step": 16833 + }, + { + "epoch": 0.22387351358885943, + "grad_norm": 0.8663344383239746, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.559, + "step": 16864 + }, + { + "epoch": 0.2242850457829566, + "grad_norm": 0.984854519367218, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.5626, + "step": 16895 + }, + { + "epoch": 0.22469657797705375, + "grad_norm": 0.9031103849411011, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.5666, + "step": 16926 + }, + { + "epoch": 0.22510811017115093, + "grad_norm": 0.8782587647438049, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.5501, + "step": 16957 + }, + { + "epoch": 0.2255196423652481, + "grad_norm": 1.0644887685775757, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.5604, + "step": 16988 + }, + { + "epoch": 0.22593117455934528, + "grad_norm": 0.8691216111183167, + "learning_rate": 2.230292185905114e-05, + "loss": 0.5649, + "step": 17019 + }, + { + "epoch": 0.22634270675344242, + "grad_norm": 0.9518167972564697, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.5598, + "step": 17050 + }, + { + "epoch": 0.2267542389475396, + "grad_norm": 0.889673113822937, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.5626, + "step": 17081 + }, + { + "epoch": 0.22716577114163677, + "grad_norm": 0.9073772430419922, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.5607, + "step": 17112 + }, + { + "epoch": 0.22757730333573395, + "grad_norm": 0.9674621820449829, + "learning_rate": 2.196920634473666e-05, + "loss": 0.5651, + "step": 17143 + }, + { + "epoch": 0.2279888355298311, + "grad_norm": 0.8547524809837341, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.5636, + "step": 17174 + }, + { + "epoch": 0.22840036772392827, + "grad_norm": 1.00649893283844, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.5597, + "step": 17205 + }, + { + "epoch": 0.22881189991802545, + "grad_norm": 0.9329107999801636, + "learning_rate": 2.171927553439363e-05, + "loss": 0.5705, + "step": 17236 + }, + { + "epoch": 0.22922343211212262, + "grad_norm": 1.0364869832992554, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.56, + "step": 17267 + }, + { + "epoch": 0.22963496430621977, + "grad_norm": 0.898383617401123, + "learning_rate": 2.155283853988844e-05, + "loss": 0.5634, + "step": 17298 + }, + { + "epoch": 0.23004649650031694, + "grad_norm": 0.903266429901123, + "learning_rate": 2.146967792431106e-05, + "loss": 0.5692, + "step": 17329 + }, + { + "epoch": 0.23045802869441412, + "grad_norm": 0.835216224193573, + "learning_rate": 2.138655714793849e-05, + "loss": 0.5619, + "step": 17360 + }, + { + "epoch": 0.2308695608885113, + "grad_norm": 0.9033771753311157, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.5661, + "step": 17391 + }, + { + "epoch": 0.23128109308260844, + "grad_norm": 0.8425393104553223, + "learning_rate": 2.122043886437421e-05, + "loss": 0.5607, + "step": 17422 + }, + { + "epoch": 0.23169262527670562, + "grad_norm": 0.8765662908554077, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.5654, + "step": 17453 + }, + { + "epoch": 0.2321041574708028, + "grad_norm": 0.8663944602012634, + "learning_rate": 2.105449118766347e-05, + "loss": 0.5644, + "step": 17484 + }, + { + "epoch": 0.23251568966489997, + "grad_norm": 0.9837983846664429, + "learning_rate": 2.097158366805287e-05, + "loss": 0.5665, + "step": 17515 + }, + { + "epoch": 0.2329272218589971, + "grad_norm": 0.9082325100898743, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.5671, + "step": 17546 + }, + { + "epoch": 0.2333387540530943, + "grad_norm": 0.9680993556976318, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.5602, + "step": 17577 + }, + { + "epoch": 0.23375028624719146, + "grad_norm": 0.9881089925765991, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.5682, + "step": 17608 + }, + { + "epoch": 0.23416181844128864, + "grad_norm": 0.8630657196044922, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.5646, + "step": 17639 + }, + { + "epoch": 0.23457335063538579, + "grad_norm": 0.8421202301979065, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.564, + "step": 17670 + }, + { + "epoch": 0.23498488282948296, + "grad_norm": 0.8951789736747742, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.5594, + "step": 17701 + }, + { + "epoch": 0.23539641502358014, + "grad_norm": 1.0024628639221191, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.5541, + "step": 17732 + }, + { + "epoch": 0.2358079472176773, + "grad_norm": 0.8807896971702576, + "learning_rate": 2.031003855589343e-05, + "loss": 0.5605, + "step": 17763 + }, + { + "epoch": 0.23621947941177446, + "grad_norm": 0.9723889827728271, + "learning_rate": 2.022757379528727e-05, + "loss": 0.559, + "step": 17794 + }, + { + "epoch": 0.23663101160587163, + "grad_norm": 0.9422227740287781, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.5594, + "step": 17825 + }, + { + "epoch": 0.2370425437999688, + "grad_norm": 0.9309141039848328, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.5621, + "step": 17856 + }, + { + "epoch": 0.23745407599406598, + "grad_norm": 0.8761610388755798, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.5665, + "step": 17887 + }, + { + "epoch": 0.23786560818816313, + "grad_norm": 0.8991973400115967, + "learning_rate": 1.989826261153015e-05, + "loss": 0.5726, + "step": 17918 + }, + { + "epoch": 0.2382771403822603, + "grad_norm": 0.8879802227020264, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.5598, + "step": 17949 + }, + { + "epoch": 0.23868867257635748, + "grad_norm": 0.9235663414001465, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.5637, + "step": 17980 + }, + { + "epoch": 0.23910020477045466, + "grad_norm": 0.9140569567680359, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.5571, + "step": 18011 + }, + { + "epoch": 0.2395117369645518, + "grad_norm": 0.933430016040802, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.561, + "step": 18042 + }, + { + "epoch": 0.23992326915864898, + "grad_norm": 0.838374674320221, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.5634, + "step": 18073 + }, + { + "epoch": 0.24033480135274615, + "grad_norm": 0.9295237064361572, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.5604, + "step": 18104 + }, + { + "epoch": 0.24074633354684333, + "grad_norm": 0.880237340927124, + "learning_rate": 1.932422022132275e-05, + "loss": 0.5609, + "step": 18135 + }, + { + "epoch": 0.24115786574094047, + "grad_norm": 0.9782423973083496, + "learning_rate": 1.924246297341414e-05, + "loss": 0.5479, + "step": 18166 + }, + { + "epoch": 0.24156939793503765, + "grad_norm": 0.97150719165802, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.5624, + "step": 18197 + }, + { + "epoch": 0.24198093012913482, + "grad_norm": 0.9634605050086975, + "learning_rate": 1.907914431855156e-05, + "loss": 0.5661, + "step": 18228 + }, + { + "epoch": 0.242392462323232, + "grad_norm": 0.8706396222114563, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.5582, + "step": 18259 + }, + { + "epoch": 0.24280399451732915, + "grad_norm": 0.9348079562187195, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.5661, + "step": 18290 + }, + { + "epoch": 0.24321552671142632, + "grad_norm": 0.8249440789222717, + "learning_rate": 1.883466975572098e-05, + "loss": 0.5589, + "step": 18321 + }, + { + "epoch": 0.2436270589055235, + "grad_norm": 0.9206597208976746, + "learning_rate": 1.8753316159214312e-05, + "loss": 0.5583, + "step": 18352 + }, + { + "epoch": 0.24403859109962067, + "grad_norm": 0.8377333879470825, + "learning_rate": 1.8672033055685766e-05, + "loss": 0.5571, + "step": 18383 + }, + { + "epoch": 0.24445012329371782, + "grad_norm": 0.9113277792930603, + "learning_rate": 1.8590821362404116e-05, + "loss": 0.5633, + "step": 18414 + }, + { + "epoch": 0.244861655487815, + "grad_norm": 0.9409834742546082, + "learning_rate": 1.8509681995832294e-05, + "loss": 0.5588, + "step": 18445 + }, + { + "epoch": 0.24527318768191217, + "grad_norm": 0.9693152904510498, + "learning_rate": 1.8428615871617004e-05, + "loss": 0.5621, + "step": 18476 + }, + { + "epoch": 0.24568471987600934, + "grad_norm": 0.9358701705932617, + "learning_rate": 1.8347623904578448e-05, + "loss": 0.5654, + "step": 18507 + }, + { + "epoch": 0.2460962520701065, + "grad_norm": 0.9669011831283569, + "learning_rate": 1.8266707008699975e-05, + "loss": 0.5605, + "step": 18538 + }, + { + "epoch": 0.24650778426420367, + "grad_norm": 0.9862536191940308, + "learning_rate": 1.818586609711774e-05, + "loss": 0.5625, + "step": 18569 + }, + { + "epoch": 0.24691931645830084, + "grad_norm": 1.069492220878601, + "learning_rate": 1.8105102082110462e-05, + "loss": 0.5632, + "step": 18600 + }, + { + "epoch": 0.24733084865239802, + "grad_norm": 0.9141196608543396, + "learning_rate": 1.8024415875089058e-05, + "loss": 0.5599, + "step": 18631 + }, + { + "epoch": 0.24774238084649516, + "grad_norm": 0.8525174856185913, + "learning_rate": 1.7943808386586407e-05, + "loss": 0.5598, + "step": 18662 + }, + { + "epoch": 0.24815391304059234, + "grad_norm": 0.9469859600067139, + "learning_rate": 1.7863280526247073e-05, + "loss": 0.5554, + "step": 18693 + }, + { + "epoch": 0.2485654452346895, + "grad_norm": 0.9280170202255249, + "learning_rate": 1.7782833202817003e-05, + "loss": 0.5626, + "step": 18724 + }, + { + "epoch": 0.2489769774287867, + "grad_norm": 0.868431806564331, + "learning_rate": 1.7702467324133327e-05, + "loss": 0.5636, + "step": 18755 + }, + { + "epoch": 0.24938850962288384, + "grad_norm": 0.9638091921806335, + "learning_rate": 1.7622183797114042e-05, + "loss": 0.5561, + "step": 18786 + }, + { + "epoch": 0.249800041816981, + "grad_norm": 0.9236721396446228, + "learning_rate": 1.7541983527747838e-05, + "loss": 0.5645, + "step": 18817 + }, + { + "epoch": 0.2502115740110782, + "grad_norm": 0.8757562041282654, + "learning_rate": 1.746186742108387e-05, + "loss": 0.5549, + "step": 18848 + }, + { + "epoch": 0.25062310620517536, + "grad_norm": 0.9709885120391846, + "learning_rate": 1.73818363812215e-05, + "loss": 0.5613, + "step": 18879 + }, + { + "epoch": 0.25103463839927254, + "grad_norm": 0.9142551422119141, + "learning_rate": 1.7301891311300153e-05, + "loss": 0.5452, + "step": 18910 + }, + { + "epoch": 0.2514461705933697, + "grad_norm": 1.4749113321304321, + "learning_rate": 1.7222033113489055e-05, + "loss": 0.556, + "step": 18941 + }, + { + "epoch": 0.25185770278746683, + "grad_norm": 0.8948887586593628, + "learning_rate": 1.7142262688977127e-05, + "loss": 0.5551, + "step": 18972 + }, + { + "epoch": 0.252269234981564, + "grad_norm": 0.8812825679779053, + "learning_rate": 1.7062580937962764e-05, + "loss": 0.5549, + "step": 19003 + }, + { + "epoch": 0.2526807671756612, + "grad_norm": 0.8759215474128723, + "learning_rate": 1.698298875964369e-05, + "loss": 0.5517, + "step": 19034 + }, + { + "epoch": 0.25309229936975836, + "grad_norm": 0.8355596661567688, + "learning_rate": 1.690348705220684e-05, + "loss": 0.5651, + "step": 19065 + }, + { + "epoch": 0.25350383156385553, + "grad_norm": 0.9597409963607788, + "learning_rate": 1.6824076712818156e-05, + "loss": 0.5636, + "step": 19096 + }, + { + "epoch": 0.2539153637579527, + "grad_norm": 0.9418185949325562, + "learning_rate": 1.6744758637612533e-05, + "loss": 0.5551, + "step": 19127 + }, + { + "epoch": 0.2543268959520499, + "grad_norm": 0.9069491028785706, + "learning_rate": 1.6665533721683664e-05, + "loss": 0.5628, + "step": 19158 + }, + { + "epoch": 0.25473842814614706, + "grad_norm": 0.8908203840255737, + "learning_rate": 1.6586402859073974e-05, + "loss": 0.561, + "step": 19189 + }, + { + "epoch": 0.2551499603402442, + "grad_norm": 0.8831518888473511, + "learning_rate": 1.6507366942764463e-05, + "loss": 0.5595, + "step": 19220 + }, + { + "epoch": 0.25556149253434135, + "grad_norm": 1.0363459587097168, + "learning_rate": 1.6428426864664732e-05, + "loss": 0.5555, + "step": 19251 + }, + { + "epoch": 0.2559730247284385, + "grad_norm": 0.8746747970581055, + "learning_rate": 1.6349583515602816e-05, + "loss": 0.5581, + "step": 19282 + }, + { + "epoch": 0.2563845569225357, + "grad_norm": 0.7980934381484985, + "learning_rate": 1.6270837785315208e-05, + "loss": 0.5537, + "step": 19313 + }, + { + "epoch": 0.2567960891166329, + "grad_norm": 0.851966142654419, + "learning_rate": 1.619219056243676e-05, + "loss": 0.5513, + "step": 19344 + }, + { + "epoch": 0.25720762131073005, + "grad_norm": 0.9124501347541809, + "learning_rate": 1.6113642734490698e-05, + "loss": 0.5549, + "step": 19375 + }, + { + "epoch": 0.2576191535048272, + "grad_norm": 1.0416783094406128, + "learning_rate": 1.6035195187878577e-05, + "loss": 0.5596, + "step": 19406 + }, + { + "epoch": 0.2580306856989244, + "grad_norm": 0.9024292826652527, + "learning_rate": 1.5956848807870305e-05, + "loss": 0.5506, + "step": 19437 + }, + { + "epoch": 0.2584422178930215, + "grad_norm": 0.9234741926193237, + "learning_rate": 1.587860447859413e-05, + "loss": 0.5549, + "step": 19468 + }, + { + "epoch": 0.2588537500871187, + "grad_norm": 0.8676049113273621, + "learning_rate": 1.5800463083026686e-05, + "loss": 0.5563, + "step": 19499 + }, + { + "epoch": 0.25926528228121587, + "grad_norm": 0.9481212496757507, + "learning_rate": 1.572242550298298e-05, + "loss": 0.5578, + "step": 19530 + }, + { + "epoch": 0.25967681447531304, + "grad_norm": 0.8709908723831177, + "learning_rate": 1.56444926191065e-05, + "loss": 0.5568, + "step": 19561 + }, + { + "epoch": 0.2600883466694102, + "grad_norm": 0.938412606716156, + "learning_rate": 1.5566665310859257e-05, + "loss": 0.5568, + "step": 19592 + }, + { + "epoch": 0.2604998788635074, + "grad_norm": 0.8912078142166138, + "learning_rate": 1.5488944456511846e-05, + "loss": 0.552, + "step": 19623 + }, + { + "epoch": 0.26091141105760457, + "grad_norm": 1.1832647323608398, + "learning_rate": 1.5411330933133546e-05, + "loss": 0.5581, + "step": 19654 + }, + { + "epoch": 0.26132294325170174, + "grad_norm": 0.9237463474273682, + "learning_rate": 1.533382561658241e-05, + "loss": 0.5592, + "step": 19685 + }, + { + "epoch": 0.26173447544579886, + "grad_norm": 0.878738522529602, + "learning_rate": 1.525642938149541e-05, + "loss": 0.5557, + "step": 19716 + }, + { + "epoch": 0.26214600763989604, + "grad_norm": 0.9652629494667053, + "learning_rate": 1.5179143101278536e-05, + "loss": 0.5542, + "step": 19747 + }, + { + "epoch": 0.2625575398339932, + "grad_norm": 0.9157405495643616, + "learning_rate": 1.5101967648096955e-05, + "loss": 0.5609, + "step": 19778 + }, + { + "epoch": 0.2629690720280904, + "grad_norm": 0.840957760810852, + "learning_rate": 1.5024903892865172e-05, + "loss": 0.5521, + "step": 19809 + }, + { + "epoch": 0.26338060422218756, + "grad_norm": 0.8824605941772461, + "learning_rate": 1.4947952705237184e-05, + "loss": 0.5539, + "step": 19840 + }, + { + "epoch": 0.26379213641628474, + "grad_norm": 0.9319818615913391, + "learning_rate": 1.4871114953596682e-05, + "loss": 0.559, + "step": 19871 + }, + { + "epoch": 0.2642036686103819, + "grad_norm": 0.8822436332702637, + "learning_rate": 1.4794391505047256e-05, + "loss": 0.5565, + "step": 19902 + }, + { + "epoch": 0.2646152008044791, + "grad_norm": 0.8802869915962219, + "learning_rate": 1.4717783225402596e-05, + "loss": 0.5603, + "step": 19933 + }, + { + "epoch": 0.2650267329985762, + "grad_norm": 0.913989245891571, + "learning_rate": 1.4641290979176735e-05, + "loss": 0.5519, + "step": 19964 + }, + { + "epoch": 0.2654382651926734, + "grad_norm": 0.8885793089866638, + "learning_rate": 1.4564915629574246e-05, + "loss": 0.5567, + "step": 19995 + }, + { + "epoch": 0.26584979738677056, + "grad_norm": 0.8809658885002136, + "learning_rate": 1.4488658038480601e-05, + "loss": 0.5516, + "step": 20026 + }, + { + "epoch": 0.26626132958086773, + "grad_norm": 0.9053296446800232, + "learning_rate": 1.4412519066452323e-05, + "loss": 0.5573, + "step": 20057 + }, + { + "epoch": 0.2666728617749649, + "grad_norm": 0.8977755904197693, + "learning_rate": 1.4336499572707373e-05, + "loss": 0.5496, + "step": 20088 + }, + { + "epoch": 0.2670843939690621, + "grad_norm": 0.935563325881958, + "learning_rate": 1.4260600415115433e-05, + "loss": 0.558, + "step": 20119 + }, + { + "epoch": 0.26749592616315926, + "grad_norm": 1.0321307182312012, + "learning_rate": 1.4184822450188137e-05, + "loss": 0.557, + "step": 20150 + }, + { + "epoch": 0.26790745835725643, + "grad_norm": 0.8926151990890503, + "learning_rate": 1.410916653306954e-05, + "loss": 0.556, + "step": 20181 + }, + { + "epoch": 0.26831899055135355, + "grad_norm": 0.9870996475219727, + "learning_rate": 1.403363351752639e-05, + "loss": 0.5522, + "step": 20212 + }, + { + "epoch": 0.26873052274545073, + "grad_norm": 0.8782408237457275, + "learning_rate": 1.3958224255938485e-05, + "loss": 0.548, + "step": 20243 + }, + { + "epoch": 0.2691420549395479, + "grad_norm": 0.887537956237793, + "learning_rate": 1.388293959928911e-05, + "loss": 0.5548, + "step": 20274 + }, + { + "epoch": 0.2695535871336451, + "grad_norm": 0.9209414720535278, + "learning_rate": 1.3807780397155379e-05, + "loss": 0.5531, + "step": 20305 + }, + { + "epoch": 0.26996511932774225, + "grad_norm": 0.8398643732070923, + "learning_rate": 1.3732747497698655e-05, + "loss": 0.5582, + "step": 20336 + }, + { + "epoch": 0.27037665152183943, + "grad_norm": 0.9261983036994934, + "learning_rate": 1.3657841747655038e-05, + "loss": 0.5534, + "step": 20367 + }, + { + "epoch": 0.2707881837159366, + "grad_norm": 0.9387017488479614, + "learning_rate": 1.3583063992325706e-05, + "loss": 0.5536, + "step": 20398 + }, + { + "epoch": 0.2711997159100338, + "grad_norm": 0.9599831700325012, + "learning_rate": 1.3508415075567496e-05, + "loss": 0.552, + "step": 20429 + }, + { + "epoch": 0.2716112481041309, + "grad_norm": 0.8976027965545654, + "learning_rate": 1.343389583978327e-05, + "loss": 0.5582, + "step": 20460 + }, + { + "epoch": 0.2720227802982281, + "grad_norm": 0.890676736831665, + "learning_rate": 1.3359507125912468e-05, + "loss": 0.5597, + "step": 20491 + }, + { + "epoch": 0.27243431249232525, + "grad_norm": 0.8950179219245911, + "learning_rate": 1.3285249773421627e-05, + "loss": 0.5526, + "step": 20522 + }, + { + "epoch": 0.2728458446864224, + "grad_norm": 0.9863470792770386, + "learning_rate": 1.3211124620294884e-05, + "loss": 0.5497, + "step": 20553 + }, + { + "epoch": 0.2732573768805196, + "grad_norm": 0.9474931359291077, + "learning_rate": 1.313713250302451e-05, + "loss": 0.5506, + "step": 20584 + }, + { + "epoch": 0.2736689090746168, + "grad_norm": 0.9262164831161499, + "learning_rate": 1.3063274256601479e-05, + "loss": 0.5517, + "step": 20615 + }, + { + "epoch": 0.27408044126871395, + "grad_norm": 0.8490736484527588, + "learning_rate": 1.2989550714506086e-05, + "loss": 0.546, + "step": 20646 + }, + { + "epoch": 0.2744919734628111, + "grad_norm": 0.8594829440116882, + "learning_rate": 1.291596270869846e-05, + "loss": 0.554, + "step": 20677 + }, + { + "epoch": 0.27490350565690824, + "grad_norm": 0.8383352756500244, + "learning_rate": 1.284251106960927e-05, + "loss": 0.5513, + "step": 20708 + }, + { + "epoch": 0.2753150378510054, + "grad_norm": 0.8765247464179993, + "learning_rate": 1.2769196626130263e-05, + "loss": 0.5462, + "step": 20739 + }, + { + "epoch": 0.2757265700451026, + "grad_norm": 0.856604814529419, + "learning_rate": 1.2696020205604969e-05, + "loss": 0.562, + "step": 20770 + }, + { + "epoch": 0.27613810223919977, + "grad_norm": 0.8549590706825256, + "learning_rate": 1.2622982633819359e-05, + "loss": 0.5518, + "step": 20801 + }, + { + "epoch": 0.27654963443329694, + "grad_norm": 0.9898308515548706, + "learning_rate": 1.2550084734992484e-05, + "loss": 0.5499, + "step": 20832 + }, + { + "epoch": 0.2769611666273941, + "grad_norm": 0.8919757604598999, + "learning_rate": 1.247732733176724e-05, + "loss": 0.55, + "step": 20863 + }, + { + "epoch": 0.2773726988214913, + "grad_norm": 0.8670758008956909, + "learning_rate": 1.2404711245201044e-05, + "loss": 0.5549, + "step": 20894 + }, + { + "epoch": 0.27778423101558847, + "grad_norm": 0.822809636592865, + "learning_rate": 1.2332237294756535e-05, + "loss": 0.5445, + "step": 20925 + }, + { + "epoch": 0.2781957632096856, + "grad_norm": 0.8837505578994751, + "learning_rate": 1.225990629829241e-05, + "loss": 0.5543, + "step": 20956 + }, + { + "epoch": 0.27860729540378276, + "grad_norm": 0.8370216488838196, + "learning_rate": 1.2187719072054136e-05, + "loss": 0.5463, + "step": 20987 + }, + { + "epoch": 0.27901882759787994, + "grad_norm": 0.8596381545066833, + "learning_rate": 1.2115676430664735e-05, + "loss": 0.556, + "step": 21018 + }, + { + "epoch": 0.2794303597919771, + "grad_norm": 0.9435930848121643, + "learning_rate": 1.2043779187115647e-05, + "loss": 0.5539, + "step": 21049 + }, + { + "epoch": 0.2798418919860743, + "grad_norm": 0.8696517944335938, + "learning_rate": 1.1972028152757476e-05, + "loss": 0.5573, + "step": 21080 + }, + { + "epoch": 0.28025342418017146, + "grad_norm": 0.9277540445327759, + "learning_rate": 1.1900424137290889e-05, + "loss": 0.5557, + "step": 21111 + }, + { + "epoch": 0.28066495637426864, + "grad_norm": 0.8744814395904541, + "learning_rate": 1.1828967948757482e-05, + "loss": 0.5568, + "step": 21142 + }, + { + "epoch": 0.2810764885683658, + "grad_norm": 1.0164190530776978, + "learning_rate": 1.175766039353062e-05, + "loss": 0.5433, + "step": 21173 + }, + { + "epoch": 0.28148802076246293, + "grad_norm": 0.8906095623970032, + "learning_rate": 1.1686502276306382e-05, + "loss": 0.5493, + "step": 21204 + }, + { + "epoch": 0.2818995529565601, + "grad_norm": 0.8932943940162659, + "learning_rate": 1.1615494400094445e-05, + "loss": 0.551, + "step": 21235 + }, + { + "epoch": 0.2823110851506573, + "grad_norm": 0.9328072667121887, + "learning_rate": 1.1544637566209029e-05, + "loss": 0.5543, + "step": 21266 + }, + { + "epoch": 0.28272261734475446, + "grad_norm": 0.8685097694396973, + "learning_rate": 1.1473932574259886e-05, + "loss": 0.5618, + "step": 21297 + }, + { + "epoch": 0.28313414953885163, + "grad_norm": 0.8566640615463257, + "learning_rate": 1.1403380222143247e-05, + "loss": 0.5512, + "step": 21328 + }, + { + "epoch": 0.2835456817329488, + "grad_norm": 0.8968601226806641, + "learning_rate": 1.1332981306032808e-05, + "loss": 0.5429, + "step": 21359 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.5761592551926137e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-21364/training_args.bin b/checkpoint-21364/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f3d87fcba007ec2448c21323b9fdb9fa6105de5f --- /dev/null +++ b/checkpoint-21364/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf5225534ee746c36dba9f28281e0b858e6e9b4a308bdaf1de4f68e79e9ee83 +size 5304 diff --git a/checkpoint-24416/config.json b/checkpoint-24416/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-24416/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-24416/generation_config.json b/checkpoint-24416/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-24416/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-24416/model-00001-of-00007.safetensors b/checkpoint-24416/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cdbed4d3239a68ea0b209e38eb26ae033aa9ba7d --- /dev/null +++ b/checkpoint-24416/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1511b3d4cc5e4615e892556c13be0bbf0d9e86d9ef5549fd8049ad7ff3ede008 +size 4886466168 diff --git a/checkpoint-24416/model-00002-of-00007.safetensors b/checkpoint-24416/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-24416/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-24416/model-00003-of-00007.safetensors b/checkpoint-24416/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-24416/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-24416/model-00004-of-00007.safetensors b/checkpoint-24416/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-24416/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-24416/model-00005-of-00007.safetensors b/checkpoint-24416/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-24416/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-24416/model-00006-of-00007.safetensors b/checkpoint-24416/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6d587b6f58acef2c065ed481c8663726c23f1582 --- /dev/null +++ b/checkpoint-24416/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:054a698052f9f725f89503e99e7e756932e103360fff352d91dd515bf36ac461 +size 4999813120 diff --git a/checkpoint-24416/model-00007-of-00007.safetensors b/checkpoint-24416/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..23fa8af7fb34b78f6d44be107fc1ab27a3664879 --- /dev/null +++ b/checkpoint-24416/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39a8da4a2f8c30c2c33f490beac6c46601b5e08b72d40d62ba649404f52def6c +size 2571158184 diff --git a/checkpoint-24416/model.safetensors.index.json b/checkpoint-24416/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-24416/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-24416/optimizer.pt b/checkpoint-24416/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c351bd822387c3d0b06340a3771626b6a1d03f1 --- /dev/null +++ b/checkpoint-24416/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0b3a5b6bf00b472a0ae777a9ec727d31ad77720b511d537929c88ef8d052cb6 +size 15385036334 diff --git a/checkpoint-24416/rng_state.pth b/checkpoint-24416/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-24416/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-24416/scheduler.pt b/checkpoint-24416/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c42ab2446b20c095538f06fcf92f01ac58007a07 --- /dev/null +++ b/checkpoint-24416/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:719f421c0e2563868e52a38d7c300a4ceee2dbf15648505f514dae6bb8a5e723 +size 1064 diff --git a/checkpoint-24416/trainer_state.json b/checkpoint-24416/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5079f732a3b7f70c84b950dc5ffbd7fcd76f53a8 --- /dev/null +++ b/checkpoint-24416/trainer_state.json @@ -0,0 +1,5542 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3241280661637566, + "eval_steps": 500, + "global_step": 24416, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00041153219409716807, + "grad_norm": 4.147768020629883, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.7978, + "step": 31 + }, + { + "epoch": 0.0008230643881943361, + "grad_norm": 3.117840051651001, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.7723, + "step": 62 + }, + { + "epoch": 0.0012345965822915042, + "grad_norm": 2.83072829246521, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7493, + "step": 93 + }, + { + "epoch": 0.0016461287763886723, + "grad_norm": 2.9491968154907227, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7352, + "step": 124 + }, + { + "epoch": 0.0020576609704858403, + "grad_norm": 2.798553228378296, + "learning_rate": 5.078636959370905e-06, + "loss": 0.7229, + "step": 155 + }, + { + "epoch": 0.0024691931645830084, + "grad_norm": 2.663148880004883, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7253, + "step": 186 + }, + { + "epoch": 0.0028807253586801765, + "grad_norm": 2.564692974090576, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7287, + "step": 217 + }, + { + "epoch": 0.0032922575527773446, + "grad_norm": 3.2043237686157227, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7205, + "step": 248 + }, + { + "epoch": 0.0037037897468745126, + "grad_norm": 2.6705269813537598, + "learning_rate": 9.141546526867629e-06, + "loss": 0.714, + "step": 279 + }, + { + "epoch": 0.004115321940971681, + "grad_norm": 3.432569980621338, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7217, + "step": 310 + }, + { + "epoch": 0.004526854135068849, + "grad_norm": 2.515808343887329, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7173, + "step": 341 + }, + { + "epoch": 0.004938386329166017, + "grad_norm": 2.6708128452301025, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7123, + "step": 372 + }, + { + "epoch": 0.005349918523263185, + "grad_norm": 2.472797155380249, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.7079, + "step": 403 + }, + { + "epoch": 0.005761450717360353, + "grad_norm": 2.486147880554199, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7146, + "step": 434 + }, + { + "epoch": 0.006172982911457521, + "grad_norm": 2.310777187347412, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7, + "step": 465 + }, + { + "epoch": 0.006584515105554689, + "grad_norm": 3.1831858158111572, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7042, + "step": 496 + }, + { + "epoch": 0.006996047299651857, + "grad_norm": 2.505427360534668, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.6991, + "step": 527 + }, + { + "epoch": 0.007407579493749025, + "grad_norm": 2.3627443313598633, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7045, + "step": 558 + }, + { + "epoch": 0.007819111687846193, + "grad_norm": 2.6250648498535156, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.704, + "step": 589 + }, + { + "epoch": 0.008230643881943361, + "grad_norm": 2.3151464462280273, + "learning_rate": 2.031454783748362e-05, + "loss": 0.6959, + "step": 620 + }, + { + "epoch": 0.00864217607604053, + "grad_norm": 2.0312647819519043, + "learning_rate": 2.13302752293578e-05, + "loss": 0.689, + "step": 651 + }, + { + "epoch": 0.009053708270137698, + "grad_norm": 2.718970775604248, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7031, + "step": 682 + }, + { + "epoch": 0.009465240464234866, + "grad_norm": 1.9124070405960083, + "learning_rate": 2.336173001310616e-05, + "loss": 0.6908, + "step": 713 + }, + { + "epoch": 0.009876772658332034, + "grad_norm": 2.368354082107544, + "learning_rate": 2.437745740498034e-05, + "loss": 0.6947, + "step": 744 + }, + { + "epoch": 0.010288304852429202, + "grad_norm": 1.8856632709503174, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.6876, + "step": 775 + }, + { + "epoch": 0.01069983704652637, + "grad_norm": 2.3155925273895264, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7024, + "step": 806 + }, + { + "epoch": 0.011111369240623538, + "grad_norm": 2.2516837120056152, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.695, + "step": 837 + }, + { + "epoch": 0.011522901434720706, + "grad_norm": 2.4444823265075684, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.6892, + "step": 868 + }, + { + "epoch": 0.011934433628817874, + "grad_norm": 2.4233832359313965, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.6871, + "step": 899 + }, + { + "epoch": 0.012345965822915042, + "grad_norm": 2.0262961387634277, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.6888, + "step": 930 + }, + { + "epoch": 0.01275749801701221, + "grad_norm": 2.3908143043518066, + "learning_rate": 3.148754914809961e-05, + "loss": 0.6997, + "step": 961 + }, + { + "epoch": 0.013169030211109378, + "grad_norm": 1.839417576789856, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.6826, + "step": 992 + }, + { + "epoch": 0.013580562405206546, + "grad_norm": 1.8175997734069824, + "learning_rate": 3.351900393184797e-05, + "loss": 0.6857, + "step": 1023 + }, + { + "epoch": 0.013992094599303714, + "grad_norm": 2.3080506324768066, + "learning_rate": 3.453473132372215e-05, + "loss": 0.6896, + "step": 1054 + }, + { + "epoch": 0.014403626793400882, + "grad_norm": 2.0574960708618164, + "learning_rate": 3.555045871559633e-05, + "loss": 0.6862, + "step": 1085 + }, + { + "epoch": 0.01481515898749805, + "grad_norm": 1.957221269607544, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.6796, + "step": 1116 + }, + { + "epoch": 0.015226691181595219, + "grad_norm": 2.2740609645843506, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.7082, + "step": 1147 + }, + { + "epoch": 0.015638223375692387, + "grad_norm": 2.1291615962982178, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.6852, + "step": 1178 + }, + { + "epoch": 0.016049755569789555, + "grad_norm": 2.2918620109558105, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.6821, + "step": 1209 + }, + { + "epoch": 0.016461287763886723, + "grad_norm": 1.9689708948135376, + "learning_rate": 4.062909567496724e-05, + "loss": 0.676, + "step": 1240 + }, + { + "epoch": 0.01687281995798389, + "grad_norm": 2.4405298233032227, + "learning_rate": 4.164482306684142e-05, + "loss": 0.6927, + "step": 1271 + }, + { + "epoch": 0.01728435215208106, + "grad_norm": 2.0007450580596924, + "learning_rate": 4.26605504587156e-05, + "loss": 0.6864, + "step": 1302 + }, + { + "epoch": 0.017695884346178227, + "grad_norm": 1.7013226747512817, + "learning_rate": 4.367627785058978e-05, + "loss": 0.6827, + "step": 1333 + }, + { + "epoch": 0.018107416540275395, + "grad_norm": 2.2202930450439453, + "learning_rate": 4.469200524246396e-05, + "loss": 0.6722, + "step": 1364 + }, + { + "epoch": 0.018518948734372563, + "grad_norm": 1.690030813217163, + "learning_rate": 4.570773263433814e-05, + "loss": 0.6807, + "step": 1395 + }, + { + "epoch": 0.01893048092846973, + "grad_norm": 1.5886859893798828, + "learning_rate": 4.672346002621232e-05, + "loss": 0.6867, + "step": 1426 + }, + { + "epoch": 0.0193420131225669, + "grad_norm": 1.7371172904968262, + "learning_rate": 4.77391874180865e-05, + "loss": 0.6832, + "step": 1457 + }, + { + "epoch": 0.019753545316664067, + "grad_norm": 1.8376264572143555, + "learning_rate": 4.875491480996068e-05, + "loss": 0.681, + "step": 1488 + }, + { + "epoch": 0.020165077510761235, + "grad_norm": 1.6280311346054077, + "learning_rate": 4.977064220183487e-05, + "loss": 0.6795, + "step": 1519 + }, + { + "epoch": 0.020576609704858403, + "grad_norm": 1.8016823530197144, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.6789, + "step": 1550 + }, + { + "epoch": 0.02098814189895557, + "grad_norm": 1.7355237007141113, + "learning_rate": 4.999955597496219e-05, + "loss": 0.6753, + "step": 1581 + }, + { + "epoch": 0.02139967409305274, + "grad_norm": 1.5687408447265625, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.6777, + "step": 1612 + }, + { + "epoch": 0.021811206287149908, + "grad_norm": 1.790453314781189, + "learning_rate": 4.999799067923527e-05, + "loss": 0.684, + "step": 1643 + }, + { + "epoch": 0.022222738481247076, + "grad_norm": 1.9073406457901, + "learning_rate": 4.999678487776908e-05, + "loss": 0.6793, + "step": 1674 + }, + { + "epoch": 0.022634270675344244, + "grad_norm": 1.5575100183486938, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.6794, + "step": 1705 + }, + { + "epoch": 0.023045802869441412, + "grad_norm": 2.100715160369873, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6753, + "step": 1736 + }, + { + "epoch": 0.02345733506353858, + "grad_norm": 1.9425480365753174, + "learning_rate": 4.999147503179668e-05, + "loss": 0.6755, + "step": 1767 + }, + { + "epoch": 0.023868867257635748, + "grad_norm": 1.5791170597076416, + "learning_rate": 4.998914100252672e-05, + "loss": 0.6754, + "step": 1798 + }, + { + "epoch": 0.024280399451732916, + "grad_norm": 1.5692673921585083, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6682, + "step": 1829 + }, + { + "epoch": 0.024691931645830084, + "grad_norm": 1.5757725238800049, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.6723, + "step": 1860 + }, + { + "epoch": 0.025103463839927252, + "grad_norm": 1.8000696897506714, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6647, + "step": 1891 + }, + { + "epoch": 0.02551499603402442, + "grad_norm": 1.682919979095459, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.669, + "step": 1922 + }, + { + "epoch": 0.02592652822812159, + "grad_norm": 1.6138657331466675, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6698, + "step": 1953 + }, + { + "epoch": 0.026338060422218756, + "grad_norm": 1.5016391277313232, + "learning_rate": 4.99692159912661e-05, + "loss": 0.6645, + "step": 1984 + }, + { + "epoch": 0.026749592616315924, + "grad_norm": 1.6850241422653198, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6769, + "step": 2015 + }, + { + "epoch": 0.027161124810413093, + "grad_norm": 1.521161437034607, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6599, + "step": 2046 + }, + { + "epoch": 0.02757265700451026, + "grad_norm": 1.8039467334747314, + "learning_rate": 4.995544899210594e-05, + "loss": 0.6693, + "step": 2077 + }, + { + "epoch": 0.02798418919860743, + "grad_norm": 1.688811182975769, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.661, + "step": 2108 + }, + { + "epoch": 0.028395721392704597, + "grad_norm": 1.669871211051941, + "learning_rate": 4.994486281210429e-05, + "loss": 0.6611, + "step": 2139 + }, + { + "epoch": 0.028807253586801765, + "grad_norm": 1.3887263536453247, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6677, + "step": 2170 + }, + { + "epoch": 0.029218785780898933, + "grad_norm": 1.3242331743240356, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6537, + "step": 2201 + }, + { + "epoch": 0.0296303179749961, + "grad_norm": 1.6638622283935547, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6653, + "step": 2232 + }, + { + "epoch": 0.03004185016909327, + "grad_norm": 1.5110372304916382, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6615, + "step": 2263 + }, + { + "epoch": 0.030453382363190437, + "grad_norm": 1.5244065523147583, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.6552, + "step": 2294 + }, + { + "epoch": 0.030864914557287605, + "grad_norm": 1.416624903678894, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6569, + "step": 2325 + }, + { + "epoch": 0.03127644675138477, + "grad_norm": 1.4467304944992065, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6613, + "step": 2356 + }, + { + "epoch": 0.03168797894548194, + "grad_norm": 1.4649864435195923, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6604, + "step": 2387 + }, + { + "epoch": 0.03209951113957911, + "grad_norm": 1.6256624460220337, + "learning_rate": 4.988329891293693e-05, + "loss": 0.655, + "step": 2418 + }, + { + "epoch": 0.03251104333367628, + "grad_norm": 1.3986462354660034, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6514, + "step": 2449 + }, + { + "epoch": 0.032922575527773446, + "grad_norm": 1.4490609169006348, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.65, + "step": 2480 + }, + { + "epoch": 0.033334107721870614, + "grad_norm": 1.4979615211486816, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.6579, + "step": 2511 + }, + { + "epoch": 0.03374563991596778, + "grad_norm": 1.6314005851745605, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6604, + "step": 2542 + }, + { + "epoch": 0.03415717211006495, + "grad_norm": 1.310655951499939, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6518, + "step": 2573 + }, + { + "epoch": 0.03456870430416212, + "grad_norm": 1.4176305532455444, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6518, + "step": 2604 + }, + { + "epoch": 0.034980236498259286, + "grad_norm": 1.6330211162567139, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6474, + "step": 2635 + }, + { + "epoch": 0.035391768692356454, + "grad_norm": 1.5854465961456299, + "learning_rate": 4.980947995086024e-05, + "loss": 0.6462, + "step": 2666 + }, + { + "epoch": 0.03580330088645362, + "grad_norm": 1.2944835424423218, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6503, + "step": 2697 + }, + { + "epoch": 0.03621483308055079, + "grad_norm": 1.4056243896484375, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6414, + "step": 2728 + }, + { + "epoch": 0.03662636527464796, + "grad_norm": 1.5649598836898804, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6449, + "step": 2759 + }, + { + "epoch": 0.037037897468745126, + "grad_norm": 1.5205659866333008, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6334, + "step": 2790 + }, + { + "epoch": 0.037449429662842294, + "grad_norm": 1.3248411417007446, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6448, + "step": 2821 + }, + { + "epoch": 0.03786096185693946, + "grad_norm": 1.2402842044830322, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6476, + "step": 2852 + }, + { + "epoch": 0.03827249405103663, + "grad_norm": 1.7939503192901611, + "learning_rate": 4.973018858007122e-05, + "loss": 0.6472, + "step": 2883 + }, + { + "epoch": 0.0386840262451338, + "grad_norm": 1.2774665355682373, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6374, + "step": 2914 + }, + { + "epoch": 0.03909555843923097, + "grad_norm": 1.6384273767471313, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6623, + "step": 2945 + }, + { + "epoch": 0.039507090633328135, + "grad_norm": 1.559270977973938, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6391, + "step": 2976 + }, + { + "epoch": 0.0399186228274253, + "grad_norm": 1.6405760049819946, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.6437, + "step": 3007 + }, + { + "epoch": 0.04033015502152247, + "grad_norm": 1.3599917888641357, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6436, + "step": 3038 + }, + { + "epoch": 0.04074168721561964, + "grad_norm": 1.3080962896347046, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6331, + "step": 3069 + }, + { + "epoch": 0.04115321940971681, + "grad_norm": 1.435195803642273, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6417, + "step": 3100 + }, + { + "epoch": 0.041564751603813975, + "grad_norm": 1.342986822128296, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6391, + "step": 3131 + }, + { + "epoch": 0.04197628379791114, + "grad_norm": 1.524418592453003, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6511, + "step": 3162 + }, + { + "epoch": 0.04238781599200831, + "grad_norm": 1.4617900848388672, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6367, + "step": 3193 + }, + { + "epoch": 0.04279934818610548, + "grad_norm": 1.3968737125396729, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.6387, + "step": 3224 + }, + { + "epoch": 0.04321088038020265, + "grad_norm": 1.258406400680542, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6356, + "step": 3255 + }, + { + "epoch": 0.043622412574299815, + "grad_norm": 1.5494844913482666, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6433, + "step": 3286 + }, + { + "epoch": 0.04403394476839698, + "grad_norm": 1.2248284816741943, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6376, + "step": 3317 + }, + { + "epoch": 0.04444547696249415, + "grad_norm": 1.4243425130844116, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6341, + "step": 3348 + }, + { + "epoch": 0.04485700915659132, + "grad_norm": 1.379364252090454, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6378, + "step": 3379 + }, + { + "epoch": 0.04526854135068849, + "grad_norm": 1.3460108041763306, + "learning_rate": 4.948079823064559e-05, + "loss": 0.639, + "step": 3410 + }, + { + "epoch": 0.045680073544785656, + "grad_norm": 1.382348656654358, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6327, + "step": 3441 + }, + { + "epoch": 0.046091605738882824, + "grad_norm": 1.40754234790802, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6234, + "step": 3472 + }, + { + "epoch": 0.04650313793297999, + "grad_norm": 1.502922773361206, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6404, + "step": 3503 + }, + { + "epoch": 0.04691467012707716, + "grad_norm": 1.2896559238433838, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6378, + "step": 3534 + }, + { + "epoch": 0.04732620232117433, + "grad_norm": 1.3114830255508423, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6373, + "step": 3565 + }, + { + "epoch": 0.047737734515271496, + "grad_norm": 1.4122483730316162, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6305, + "step": 3596 + }, + { + "epoch": 0.048149266709368664, + "grad_norm": 1.2669744491577148, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6413, + "step": 3627 + }, + { + "epoch": 0.04856079890346583, + "grad_norm": 1.2965583801269531, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6364, + "step": 3658 + }, + { + "epoch": 0.048972331097563, + "grad_norm": 1.3328967094421387, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6466, + "step": 3689 + }, + { + "epoch": 0.04938386329166017, + "grad_norm": 1.6291115283966064, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6348, + "step": 3720 + }, + { + "epoch": 0.049795395485757336, + "grad_norm": 1.3224235773086548, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6256, + "step": 3751 + }, + { + "epoch": 0.050206927679854504, + "grad_norm": 1.3253369331359863, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6303, + "step": 3782 + }, + { + "epoch": 0.05061845987395167, + "grad_norm": 1.2806897163391113, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.6403, + "step": 3813 + }, + { + "epoch": 0.05102999206804884, + "grad_norm": 1.3389384746551514, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6288, + "step": 3844 + }, + { + "epoch": 0.05144152426214601, + "grad_norm": 1.251550555229187, + "learning_rate": 4.919443027766935e-05, + "loss": 0.637, + "step": 3875 + }, + { + "epoch": 0.05185305645624318, + "grad_norm": 1.258484959602356, + "learning_rate": 4.91731463569418e-05, + "loss": 0.629, + "step": 3906 + }, + { + "epoch": 0.052264588650340345, + "grad_norm": 1.3574656248092651, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6262, + "step": 3937 + }, + { + "epoch": 0.05267612084443751, + "grad_norm": 1.327051043510437, + "learning_rate": 4.912976038673786e-05, + "loss": 0.629, + "step": 3968 + }, + { + "epoch": 0.05308765303853468, + "grad_norm": 1.3813196420669556, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6308, + "step": 3999 + }, + { + "epoch": 0.05349918523263185, + "grad_norm": 1.2064168453216553, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6321, + "step": 4030 + }, + { + "epoch": 0.05391071742672902, + "grad_norm": 1.2771285772323608, + "learning_rate": 4.906263980464644e-05, + "loss": 0.622, + "step": 4061 + }, + { + "epoch": 0.054322249620826185, + "grad_norm": 1.1788181066513062, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6268, + "step": 4092 + }, + { + "epoch": 0.05473378181492335, + "grad_norm": 1.3975950479507446, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6251, + "step": 4123 + }, + { + "epoch": 0.05514531400902052, + "grad_norm": 1.2083688974380493, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6287, + "step": 4154 + }, + { + "epoch": 0.05555684620311769, + "grad_norm": 1.2795994281768799, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6302, + "step": 4185 + }, + { + "epoch": 0.05596837839721486, + "grad_norm": 1.1056169271469116, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6256, + "step": 4216 + }, + { + "epoch": 0.056379910591312025, + "grad_norm": 1.415405511856079, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6334, + "step": 4247 + }, + { + "epoch": 0.056791442785409194, + "grad_norm": 1.2858442068099976, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6248, + "step": 4278 + }, + { + "epoch": 0.05720297497950636, + "grad_norm": 1.3914793729782104, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6229, + "step": 4309 + }, + { + "epoch": 0.05761450717360353, + "grad_norm": 1.1236270666122437, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6272, + "step": 4340 + }, + { + "epoch": 0.0580260393677007, + "grad_norm": 1.393020510673523, + "learning_rate": 4.882129447892753e-05, + "loss": 0.624, + "step": 4371 + }, + { + "epoch": 0.058437571561797866, + "grad_norm": 1.3223553895950317, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6267, + "step": 4402 + }, + { + "epoch": 0.058849103755895034, + "grad_norm": 1.2308725118637085, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6255, + "step": 4433 + }, + { + "epoch": 0.0592606359499922, + "grad_norm": 1.3741412162780762, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6246, + "step": 4464 + }, + { + "epoch": 0.05967216814408937, + "grad_norm": 1.2487592697143555, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6221, + "step": 4495 + }, + { + "epoch": 0.06008370033818654, + "grad_norm": 1.3239238262176514, + "learning_rate": 4.869052379269719e-05, + "loss": 0.621, + "step": 4526 + }, + { + "epoch": 0.060495232532283706, + "grad_norm": 1.1411608457565308, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6211, + "step": 4557 + }, + { + "epoch": 0.060906764726380874, + "grad_norm": 1.1687365770339966, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6159, + "step": 4588 + }, + { + "epoch": 0.06131829692047804, + "grad_norm": 1.25679349899292, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6254, + "step": 4619 + }, + { + "epoch": 0.06172982911457521, + "grad_norm": 1.2380110025405884, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6305, + "step": 4650 + }, + { + "epoch": 0.06214136130867238, + "grad_norm": 1.3054466247558594, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6202, + "step": 4681 + }, + { + "epoch": 0.06255289350276955, + "grad_norm": 1.3628545999526978, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6192, + "step": 4712 + }, + { + "epoch": 0.06296442569686671, + "grad_norm": 1.3576844930648804, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6213, + "step": 4743 + }, + { + "epoch": 0.06337595789096388, + "grad_norm": 1.3126753568649292, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6198, + "step": 4774 + }, + { + "epoch": 0.06378749008506104, + "grad_norm": 1.2266637086868286, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6229, + "step": 4805 + }, + { + "epoch": 0.06419902227915822, + "grad_norm": 1.4964330196380615, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6264, + "step": 4836 + }, + { + "epoch": 0.06461055447325538, + "grad_norm": 1.1138426065444946, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6258, + "step": 4867 + }, + { + "epoch": 0.06502208666735255, + "grad_norm": 1.2055357694625854, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6263, + "step": 4898 + }, + { + "epoch": 0.06543361886144972, + "grad_norm": 1.2684381008148193, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6212, + "step": 4929 + }, + { + "epoch": 0.06584515105554689, + "grad_norm": 1.2323859930038452, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6253, + "step": 4960 + }, + { + "epoch": 0.06625668324964405, + "grad_norm": 1.091818928718567, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6127, + "step": 4991 + }, + { + "epoch": 0.06666821544374123, + "grad_norm": 1.2208534479141235, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6179, + "step": 5022 + }, + { + "epoch": 0.06707974763783839, + "grad_norm": 1.3426082134246826, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6246, + "step": 5053 + }, + { + "epoch": 0.06749127983193556, + "grad_norm": 1.2961252927780151, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.6149, + "step": 5084 + }, + { + "epoch": 0.06790281202603272, + "grad_norm": 1.2990977764129639, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6303, + "step": 5115 + }, + { + "epoch": 0.0683143442201299, + "grad_norm": 1.2724366188049316, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6234, + "step": 5146 + }, + { + "epoch": 0.06872587641422706, + "grad_norm": 1.2031605243682861, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6212, + "step": 5177 + }, + { + "epoch": 0.06913740860832424, + "grad_norm": 1.2580938339233398, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6144, + "step": 5208 + }, + { + "epoch": 0.0695489408024214, + "grad_norm": 1.203706979751587, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6259, + "step": 5239 + }, + { + "epoch": 0.06996047299651857, + "grad_norm": 1.421762466430664, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6221, + "step": 5270 + }, + { + "epoch": 0.07037200519061573, + "grad_norm": 1.326038122177124, + "learning_rate": 4.793722210363262e-05, + "loss": 0.6152, + "step": 5301 + }, + { + "epoch": 0.07078353738471291, + "grad_norm": 1.361939787864685, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6206, + "step": 5332 + }, + { + "epoch": 0.07119506957881007, + "grad_norm": 1.2153042554855347, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6209, + "step": 5363 + }, + { + "epoch": 0.07160660177290724, + "grad_norm": 1.3737229108810425, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6121, + "step": 5394 + }, + { + "epoch": 0.0720181339670044, + "grad_norm": 1.1781859397888184, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6183, + "step": 5425 + }, + { + "epoch": 0.07242966616110158, + "grad_norm": 1.069472312927246, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.6117, + "step": 5456 + }, + { + "epoch": 0.07284119835519874, + "grad_norm": 1.380937099456787, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6175, + "step": 5487 + }, + { + "epoch": 0.07325273054929592, + "grad_norm": 1.1551166772842407, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6164, + "step": 5518 + }, + { + "epoch": 0.07366426274339308, + "grad_norm": 1.2816351652145386, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6168, + "step": 5549 + }, + { + "epoch": 0.07407579493749025, + "grad_norm": 1.0833333730697632, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6159, + "step": 5580 + }, + { + "epoch": 0.07448732713158741, + "grad_norm": 1.2533819675445557, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6141, + "step": 5611 + }, + { + "epoch": 0.07489885932568459, + "grad_norm": 1.1354937553405762, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.612, + "step": 5642 + }, + { + "epoch": 0.07531039151978175, + "grad_norm": 1.2579649686813354, + "learning_rate": 4.751783684659e-05, + "loss": 0.6147, + "step": 5673 + }, + { + "epoch": 0.07572192371387892, + "grad_norm": 1.2830709218978882, + "learning_rate": 4.748122674890348e-05, + "loss": 0.62, + "step": 5704 + }, + { + "epoch": 0.07613345590797609, + "grad_norm": 1.2012654542922974, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6199, + "step": 5735 + }, + { + "epoch": 0.07654498810207326, + "grad_norm": 1.2926653623580933, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6104, + "step": 5766 + }, + { + "epoch": 0.07695652029617042, + "grad_norm": 1.2043049335479736, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6261, + "step": 5797 + }, + { + "epoch": 0.0773680524902676, + "grad_norm": 1.1188119649887085, + "learning_rate": 4.733225355658999e-05, + "loss": 0.6056, + "step": 5828 + }, + { + "epoch": 0.07777958468436476, + "grad_norm": 1.1217443943023682, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.6124, + "step": 5859 + }, + { + "epoch": 0.07819111687846193, + "grad_norm": 1.1623468399047852, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6191, + "step": 5890 + }, + { + "epoch": 0.0786026490725591, + "grad_norm": 1.1147449016571045, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6091, + "step": 5921 + }, + { + "epoch": 0.07901418126665627, + "grad_norm": 1.0523818731307983, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6151, + "step": 5952 + }, + { + "epoch": 0.07942571346075343, + "grad_norm": 1.3112642765045166, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6106, + "step": 5983 + }, + { + "epoch": 0.0798372456548506, + "grad_norm": 1.1637545824050903, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6131, + "step": 6014 + }, + { + "epoch": 0.08024877784894777, + "grad_norm": 1.258862018585205, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6119, + "step": 6045 + }, + { + "epoch": 0.08066031004304494, + "grad_norm": 1.0981336832046509, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6149, + "step": 6076 + }, + { + "epoch": 0.0810718422371421, + "grad_norm": 1.1576476097106934, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6076, + "step": 6107 + }, + { + "epoch": 0.08148337443123928, + "grad_norm": 1.1076856851577759, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6105, + "step": 6138 + }, + { + "epoch": 0.08189490662533644, + "grad_norm": 14.014957427978516, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6189, + "step": 6169 + }, + { + "epoch": 0.08230643881943361, + "grad_norm": 1.1413066387176514, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6038, + "step": 6200 + }, + { + "epoch": 0.08271797101353077, + "grad_norm": 1.2316724061965942, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6125, + "step": 6231 + }, + { + "epoch": 0.08312950320762795, + "grad_norm": 1.0919370651245117, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6081, + "step": 6262 + }, + { + "epoch": 0.08354103540172511, + "grad_norm": 1.139345407485962, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6078, + "step": 6293 + }, + { + "epoch": 0.08395256759582229, + "grad_norm": 1.1303651332855225, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6071, + "step": 6324 + }, + { + "epoch": 0.08436409978991945, + "grad_norm": 1.2546321153640747, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.6092, + "step": 6355 + }, + { + "epoch": 0.08477563198401662, + "grad_norm": 1.195173740386963, + "learning_rate": 4.661238099862658e-05, + "loss": 0.5998, + "step": 6386 + }, + { + "epoch": 0.08518716417811378, + "grad_norm": 1.2116323709487915, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6099, + "step": 6417 + }, + { + "epoch": 0.08559869637221096, + "grad_norm": 1.131951928138733, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.6105, + "step": 6448 + }, + { + "epoch": 0.08601022856630812, + "grad_norm": 1.2176560163497925, + "learning_rate": 4.648464661063478e-05, + "loss": 0.6045, + "step": 6479 + }, + { + "epoch": 0.0864217607604053, + "grad_norm": 1.15361750125885, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.6067, + "step": 6510 + }, + { + "epoch": 0.08683329295450246, + "grad_norm": 1.1422772407531738, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.605, + "step": 6541 + }, + { + "epoch": 0.08724482514859963, + "grad_norm": 1.250319004058838, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6069, + "step": 6572 + }, + { + "epoch": 0.08765635734269679, + "grad_norm": 1.0947929620742798, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6099, + "step": 6603 + }, + { + "epoch": 0.08806788953679397, + "grad_norm": 1.2815848588943481, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6132, + "step": 6634 + }, + { + "epoch": 0.08847942173089113, + "grad_norm": 1.196083664894104, + "learning_rate": 4.622264489304762e-05, + "loss": 0.604, + "step": 6665 + }, + { + "epoch": 0.0888909539249883, + "grad_norm": 1.1473642587661743, + "learning_rate": 4.617813681048434e-05, + "loss": 0.6081, + "step": 6696 + }, + { + "epoch": 0.08930248611908546, + "grad_norm": 1.182445764541626, + "learning_rate": 4.61333897355256e-05, + "loss": 0.6108, + "step": 6727 + }, + { + "epoch": 0.08971401831318264, + "grad_norm": 1.2859700918197632, + "learning_rate": 4.608840417313604e-05, + "loss": 0.6074, + "step": 6758 + }, + { + "epoch": 0.0901255505072798, + "grad_norm": 1.1134952306747437, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6139, + "step": 6789 + }, + { + "epoch": 0.09053708270137698, + "grad_norm": 1.2447940111160278, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6049, + "step": 6820 + }, + { + "epoch": 0.09094861489547414, + "grad_norm": 1.042465329170227, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6113, + "step": 6851 + }, + { + "epoch": 0.09136014708957131, + "grad_norm": 1.2242035865783691, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6077, + "step": 6882 + }, + { + "epoch": 0.09177167928366847, + "grad_norm": 1.2531142234802246, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6081, + "step": 6913 + }, + { + "epoch": 0.09218321147776565, + "grad_norm": 1.194645881652832, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6158, + "step": 6944 + }, + { + "epoch": 0.09259474367186281, + "grad_norm": 1.1052149534225464, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6079, + "step": 6975 + }, + { + "epoch": 0.09300627586595998, + "grad_norm": 1.1275289058685303, + "learning_rate": 4.571999560773736e-05, + "loss": 0.6107, + "step": 7006 + }, + { + "epoch": 0.09341780806005714, + "grad_norm": 1.251237154006958, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6028, + "step": 7037 + }, + { + "epoch": 0.09382934025415432, + "grad_norm": 1.1206951141357422, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6035, + "step": 7068 + }, + { + "epoch": 0.09424087244825148, + "grad_norm": 1.2242387533187866, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.5959, + "step": 7099 + }, + { + "epoch": 0.09465240464234866, + "grad_norm": 1.0749527215957642, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6005, + "step": 7130 + }, + { + "epoch": 0.09506393683644582, + "grad_norm": 1.183052659034729, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6102, + "step": 7161 + }, + { + "epoch": 0.09547546903054299, + "grad_norm": 1.1045013666152954, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6056, + "step": 7192 + }, + { + "epoch": 0.09588700122464015, + "grad_norm": 1.2442799806594849, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6024, + "step": 7223 + }, + { + "epoch": 0.09629853341873733, + "grad_norm": 1.2083353996276855, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6062, + "step": 7254 + }, + { + "epoch": 0.09671006561283449, + "grad_norm": 1.1391205787658691, + "learning_rate": 4.528766329570536e-05, + "loss": 0.5981, + "step": 7285 + }, + { + "epoch": 0.09712159780693166, + "grad_norm": 1.1213765144348145, + "learning_rate": 4.523847434837447e-05, + "loss": 0.5964, + "step": 7316 + }, + { + "epoch": 0.09753313000102883, + "grad_norm": 1.1574114561080933, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6035, + "step": 7347 + }, + { + "epoch": 0.097944662195126, + "grad_norm": 1.0101516246795654, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6036, + "step": 7378 + }, + { + "epoch": 0.09835619438922316, + "grad_norm": 1.1439381837844849, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6049, + "step": 7409 + }, + { + "epoch": 0.09876772658332034, + "grad_norm": 1.0668189525604248, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6035, + "step": 7440 + }, + { + "epoch": 0.0991792587774175, + "grad_norm": 1.2038415670394897, + "learning_rate": 4.498911497712155e-05, + "loss": 0.6062, + "step": 7471 + }, + { + "epoch": 0.09959079097151467, + "grad_norm": 1.1965882778167725, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6041, + "step": 7502 + }, + { + "epoch": 0.10000232316561183, + "grad_norm": 1.5885149240493774, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6129, + "step": 7533 + }, + { + "epoch": 0.10041385535970901, + "grad_norm": 1.058237910270691, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.5957, + "step": 7564 + }, + { + "epoch": 0.10082538755380617, + "grad_norm": 1.2043278217315674, + "learning_rate": 4.478556380141218e-05, + "loss": 0.5943, + "step": 7595 + }, + { + "epoch": 0.10123691974790335, + "grad_norm": 1.1602888107299805, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.5971, + "step": 7626 + }, + { + "epoch": 0.1016484519420005, + "grad_norm": 1.1536785364151, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.604, + "step": 7657 + }, + { + "epoch": 0.10205998413609768, + "grad_norm": 1.1859934329986572, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6043, + "step": 7688 + }, + { + "epoch": 0.10247151633019484, + "grad_norm": 1.0551954507827759, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6008, + "step": 7719 + }, + { + "epoch": 0.10288304852429202, + "grad_norm": 1.015857458114624, + "learning_rate": 4.452610552959183e-05, + "loss": 0.5961, + "step": 7750 + }, + { + "epoch": 0.10329458071838918, + "grad_norm": 1.0759170055389404, + "learning_rate": 4.447355047201428e-05, + "loss": 0.6025, + "step": 7781 + }, + { + "epoch": 0.10370611291248635, + "grad_norm": 1.025030255317688, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.602, + "step": 7812 + }, + { + "epoch": 0.10411764510658351, + "grad_norm": 1.0884742736816406, + "learning_rate": 4.436778168330484e-05, + "loss": 0.5989, + "step": 7843 + }, + { + "epoch": 0.10452917730068069, + "grad_norm": 1.1207304000854492, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.6016, + "step": 7874 + }, + { + "epoch": 0.10494070949477785, + "grad_norm": 0.9768222570419312, + "learning_rate": 4.42611386459262e-05, + "loss": 0.605, + "step": 7905 + }, + { + "epoch": 0.10535224168887503, + "grad_norm": 1.0720235109329224, + "learning_rate": 4.420749078676133e-05, + "loss": 0.5971, + "step": 7936 + }, + { + "epoch": 0.10576377388297219, + "grad_norm": 1.1767033338546753, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.5877, + "step": 7967 + }, + { + "epoch": 0.10617530607706936, + "grad_norm": 1.3390216827392578, + "learning_rate": 4.409954541451762e-05, + "loss": 0.5973, + "step": 7998 + }, + { + "epoch": 0.10658683827116652, + "grad_norm": 1.7948216199874878, + "learning_rate": 4.404524911958764e-05, + "loss": 0.5922, + "step": 8029 + }, + { + "epoch": 0.1069983704652637, + "grad_norm": 1.177847981452942, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6002, + "step": 8060 + }, + { + "epoch": 0.10740990265936086, + "grad_norm": 1.1014128923416138, + "learning_rate": 4.393601237573607e-05, + "loss": 0.5945, + "step": 8091 + }, + { + "epoch": 0.10782143485345803, + "grad_norm": 1.067475438117981, + "learning_rate": 4.388107315953628e-05, + "loss": 0.5996, + "step": 8122 + }, + { + "epoch": 0.1082329670475552, + "grad_norm": 1.1241830587387085, + "learning_rate": 4.382592087299212e-05, + "loss": 0.5973, + "step": 8153 + }, + { + "epoch": 0.10864449924165237, + "grad_norm": 1.072426438331604, + "learning_rate": 4.377055613848964e-05, + "loss": 0.601, + "step": 8184 + }, + { + "epoch": 0.10905603143574953, + "grad_norm": 1.0138245820999146, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6034, + "step": 8215 + }, + { + "epoch": 0.1094675636298467, + "grad_norm": 0.9993209838867188, + "learning_rate": 4.365919182713416e-05, + "loss": 0.6024, + "step": 8246 + }, + { + "epoch": 0.10987909582394387, + "grad_norm": 1.157468318939209, + "learning_rate": 4.360319350701226e-05, + "loss": 0.5987, + "step": 8277 + }, + { + "epoch": 0.11029062801804104, + "grad_norm": 1.2073496580123901, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6023, + "step": 8308 + }, + { + "epoch": 0.1107021602121382, + "grad_norm": 1.0800622701644897, + "learning_rate": 4.349056769754021e-05, + "loss": 0.5993, + "step": 8339 + }, + { + "epoch": 0.11111369240623538, + "grad_norm": 1.1365702152252197, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.595, + "step": 8370 + }, + { + "epoch": 0.11152522460033254, + "grad_norm": 1.1367030143737793, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.5921, + "step": 8401 + }, + { + "epoch": 0.11193675679442971, + "grad_norm": 1.0323926210403442, + "learning_rate": 4.332006561018488e-05, + "loss": 0.5883, + "step": 8432 + }, + { + "epoch": 0.11234828898852688, + "grad_norm": 1.2352523803710938, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.5957, + "step": 8463 + }, + { + "epoch": 0.11275982118262405, + "grad_norm": 1.1763888597488403, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.5941, + "step": 8494 + }, + { + "epoch": 0.11317135337672121, + "grad_norm": 1.1331796646118164, + "learning_rate": 4.314770288177384e-05, + "loss": 0.5993, + "step": 8525 + }, + { + "epoch": 0.11358288557081839, + "grad_norm": 1.146270990371704, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6041, + "step": 8556 + }, + { + "epoch": 0.11399441776491555, + "grad_norm": 1.0433791875839233, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.5918, + "step": 8587 + }, + { + "epoch": 0.11440594995901272, + "grad_norm": 1.2013510465621948, + "learning_rate": 4.297349701798505e-05, + "loss": 0.6031, + "step": 8618 + }, + { + "epoch": 0.11481748215310988, + "grad_norm": 1.1263506412506104, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.597, + "step": 8649 + }, + { + "epoch": 0.11522901434720706, + "grad_norm": 1.1360443830490112, + "learning_rate": 4.285634454093198e-05, + "loss": 0.5909, + "step": 8680 + }, + { + "epoch": 0.11564054654130422, + "grad_norm": 1.0095175504684448, + "learning_rate": 4.279746571169086e-05, + "loss": 0.59, + "step": 8711 + }, + { + "epoch": 0.1160520787354014, + "grad_norm": 1.0894793272018433, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.5894, + "step": 8742 + }, + { + "epoch": 0.11646361092949856, + "grad_norm": 1.1603643894195557, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.5924, + "step": 8773 + }, + { + "epoch": 0.11687514312359573, + "grad_norm": 1.177464485168457, + "learning_rate": 4.261962684116106e-05, + "loss": 0.5946, + "step": 8804 + }, + { + "epoch": 0.11728667531769289, + "grad_norm": 0.9632905125617981, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.5944, + "step": 8835 + }, + { + "epoch": 0.11769820751179007, + "grad_norm": 1.0413568019866943, + "learning_rate": 4.250007230372134e-05, + "loss": 0.5844, + "step": 8866 + }, + { + "epoch": 0.11810973970588723, + "grad_norm": 1.0879931449890137, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.588, + "step": 8897 + }, + { + "epoch": 0.1185212718999844, + "grad_norm": 1.0681732892990112, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.5832, + "step": 8928 + }, + { + "epoch": 0.11893280409408157, + "grad_norm": 1.0609339475631714, + "learning_rate": 4.231926105340768e-05, + "loss": 0.5886, + "step": 8959 + }, + { + "epoch": 0.11934433628817874, + "grad_norm": 1.4936331510543823, + "learning_rate": 4.225859883654776e-05, + "loss": 0.5868, + "step": 8990 + }, + { + "epoch": 0.1197558684822759, + "grad_norm": 1.1860368251800537, + "learning_rate": 4.219774185874569e-05, + "loss": 0.5904, + "step": 9021 + }, + { + "epoch": 0.12016740067637308, + "grad_norm": 1.1657029390335083, + "learning_rate": 4.213669080676418e-05, + "loss": 0.5872, + "step": 9052 + }, + { + "epoch": 0.12057893287047024, + "grad_norm": 1.0721909999847412, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.5977, + "step": 9083 + }, + { + "epoch": 0.12099046506456741, + "grad_norm": 1.1832354068756104, + "learning_rate": 4.201400923825648e-05, + "loss": 0.5902, + "step": 9114 + }, + { + "epoch": 0.12140199725866457, + "grad_norm": 1.1306614875793457, + "learning_rate": 4.195238010617511e-05, + "loss": 0.5931, + "step": 9145 + }, + { + "epoch": 0.12181352945276175, + "grad_norm": 0.9744802713394165, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6, + "step": 9176 + }, + { + "epoch": 0.12222506164685891, + "grad_norm": 1.0286784172058105, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.5913, + "step": 9207 + }, + { + "epoch": 0.12263659384095608, + "grad_norm": 1.0559006929397583, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.5999, + "step": 9238 + }, + { + "epoch": 0.12304812603505325, + "grad_norm": 1.0805801153182983, + "learning_rate": 4.170395751189495e-05, + "loss": 0.5929, + "step": 9269 + }, + { + "epoch": 0.12345965822915042, + "grad_norm": 1.088597059249878, + "learning_rate": 4.164137885110921e-05, + "loss": 0.5873, + "step": 9300 + }, + { + "epoch": 0.12387119042324758, + "grad_norm": 1.028403639793396, + "learning_rate": 4.157861239462495e-05, + "loss": 0.5939, + "step": 9331 + }, + { + "epoch": 0.12428272261734476, + "grad_norm": 0.9916087985038757, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.5932, + "step": 9362 + }, + { + "epoch": 0.12469425481144192, + "grad_norm": 1.0268062353134155, + "learning_rate": 4.145251892991588e-05, + "loss": 0.5945, + "step": 9393 + }, + { + "epoch": 0.1251057870055391, + "grad_norm": 1.046953797340393, + "learning_rate": 4.138919334463868e-05, + "loss": 0.5923, + "step": 9424 + }, + { + "epoch": 0.12551731919963627, + "grad_norm": 1.1010463237762451, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.5898, + "step": 9455 + }, + { + "epoch": 0.12592885139373342, + "grad_norm": 1.1851146221160889, + "learning_rate": 4.126198804133398e-05, + "loss": 0.591, + "step": 9486 + }, + { + "epoch": 0.1263403835878306, + "grad_norm": 1.106885313987732, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.5893, + "step": 9517 + }, + { + "epoch": 0.12675191578192777, + "grad_norm": 1.0527287721633911, + "learning_rate": 4.113404868280107e-05, + "loss": 0.5937, + "step": 9548 + }, + { + "epoch": 0.12716344797602494, + "grad_norm": 0.9986059069633484, + "learning_rate": 4.106980553625457e-05, + "loss": 0.5961, + "step": 9579 + }, + { + "epoch": 0.1275749801701221, + "grad_norm": 1.025026798248291, + "learning_rate": 4.100538104413674e-05, + "loss": 0.5853, + "step": 9610 + }, + { + "epoch": 0.12798651236421926, + "grad_norm": 1.062345027923584, + "learning_rate": 4.09407759334692e-05, + "loss": 0.5916, + "step": 9641 + }, + { + "epoch": 0.12839804455831644, + "grad_norm": 1.0635534524917603, + "learning_rate": 4.087599093331186e-05, + "loss": 0.5919, + "step": 9672 + }, + { + "epoch": 0.1288095767524136, + "grad_norm": 1.1224849224090576, + "learning_rate": 4.081102677475462e-05, + "loss": 0.5948, + "step": 9703 + }, + { + "epoch": 0.12922110894651076, + "grad_norm": 1.113202452659607, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.5807, + "step": 9734 + }, + { + "epoch": 0.12963264114060793, + "grad_norm": 1.0213284492492676, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.5813, + "step": 9765 + }, + { + "epoch": 0.1300441733347051, + "grad_norm": 1.1083142757415771, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.5856, + "step": 9796 + }, + { + "epoch": 0.13045570552880228, + "grad_norm": 1.1317702531814575, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.586, + "step": 9827 + }, + { + "epoch": 0.13086723772289943, + "grad_norm": 0.9876610040664673, + "learning_rate": 4.048354433517794e-05, + "loss": 0.5921, + "step": 9858 + }, + { + "epoch": 0.1312787699169966, + "grad_norm": 1.085672378540039, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.5791, + "step": 9889 + }, + { + "epoch": 0.13169030211109378, + "grad_norm": 1.014452338218689, + "learning_rate": 4.035132306369438e-05, + "loss": 0.5921, + "step": 9920 + }, + { + "epoch": 0.13210183430519096, + "grad_norm": 1.059309720993042, + "learning_rate": 4.028495219804555e-05, + "loss": 0.5834, + "step": 9951 + }, + { + "epoch": 0.1325133664992881, + "grad_norm": 1.1058080196380615, + "learning_rate": 4.021840884378864e-05, + "loss": 0.5826, + "step": 9982 + }, + { + "epoch": 0.13292489869338528, + "grad_norm": 1.0567057132720947, + "learning_rate": 4.015169375185633e-05, + "loss": 0.5924, + "step": 10013 + }, + { + "epoch": 0.13333643088748245, + "grad_norm": 1.0371674299240112, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.589, + "step": 10044 + }, + { + "epoch": 0.13374796308157963, + "grad_norm": 0.909173309803009, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.5978, + "step": 10075 + }, + { + "epoch": 0.13415949527567678, + "grad_norm": 1.0078933238983154, + "learning_rate": 3.995052558835377e-05, + "loss": 0.5814, + "step": 10106 + }, + { + "epoch": 0.13457102746977395, + "grad_norm": 0.9631708264350891, + "learning_rate": 3.988313109368017e-05, + "loss": 0.587, + "step": 10137 + }, + { + "epoch": 0.13498255966387113, + "grad_norm": 1.1884409189224243, + "learning_rate": 3.981556864489504e-05, + "loss": 0.5906, + "step": 10168 + }, + { + "epoch": 0.1353940918579683, + "grad_norm": 0.9554224014282227, + "learning_rate": 3.974783900443142e-05, + "loss": 0.5888, + "step": 10199 + }, + { + "epoch": 0.13580562405206545, + "grad_norm": 1.0510096549987793, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.5828, + "step": 10230 + }, + { + "epoch": 0.13621715624616262, + "grad_norm": 0.9769448637962341, + "learning_rate": 3.961188120762596e-05, + "loss": 0.5878, + "step": 10261 + }, + { + "epoch": 0.1366286884402598, + "grad_norm": 0.9610607028007507, + "learning_rate": 3.954365458554938e-05, + "loss": 0.5863, + "step": 10292 + }, + { + "epoch": 0.13704022063435697, + "grad_norm": 1.0102349519729614, + "learning_rate": 3.947526384030751e-05, + "loss": 0.5812, + "step": 10323 + }, + { + "epoch": 0.13745175282845412, + "grad_norm": 1.138122320175171, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.5841, + "step": 10354 + }, + { + "epoch": 0.1378632850225513, + "grad_norm": 1.0107802152633667, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.5847, + "step": 10385 + }, + { + "epoch": 0.13827481721664847, + "grad_norm": 0.9923847317695618, + "learning_rate": 3.926911459260109e-05, + "loss": 0.5971, + "step": 10416 + }, + { + "epoch": 0.13868634941074565, + "grad_norm": 1.0700715780258179, + "learning_rate": 3.920007509089102e-05, + "loss": 0.5831, + "step": 10447 + }, + { + "epoch": 0.1390978816048428, + "grad_norm": 1.1638612747192383, + "learning_rate": 3.913087534326357e-05, + "loss": 0.5849, + "step": 10478 + }, + { + "epoch": 0.13950941379893997, + "grad_norm": 1.0560393333435059, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.5897, + "step": 10509 + }, + { + "epoch": 0.13992094599303714, + "grad_norm": 0.9604248404502869, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.5862, + "step": 10540 + }, + { + "epoch": 0.14033247818713432, + "grad_norm": 1.0003983974456787, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.5905, + "step": 10571 + }, + { + "epoch": 0.14074401038123147, + "grad_norm": 1.0378546714782715, + "learning_rate": 3.885248953871491e-05, + "loss": 0.5938, + "step": 10602 + }, + { + "epoch": 0.14115554257532864, + "grad_norm": 0.9967820644378662, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.5821, + "step": 10633 + }, + { + "epoch": 0.14156707476942582, + "grad_norm": 1.029188632965088, + "learning_rate": 3.871235554965218e-05, + "loss": 0.5882, + "step": 10664 + }, + { + "epoch": 0.141978606963523, + "grad_norm": 1.1272863149642944, + "learning_rate": 3.864205604623078e-05, + "loss": 0.5898, + "step": 10695 + }, + { + "epoch": 0.14239013915762014, + "grad_norm": 1.0054222345352173, + "learning_rate": 3.857160259406107e-05, + "loss": 0.5873, + "step": 10726 + }, + { + "epoch": 0.1428016713517173, + "grad_norm": 1.0327833890914917, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.5842, + "step": 10757 + }, + { + "epoch": 0.1432132035458145, + "grad_norm": 1.0088285207748413, + "learning_rate": 3.843023702543556e-05, + "loss": 0.5903, + "step": 10788 + }, + { + "epoch": 0.14362473573991166, + "grad_norm": 0.9472872018814087, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.5809, + "step": 10819 + }, + { + "epoch": 0.1440362679340088, + "grad_norm": 1.1008390188217163, + "learning_rate": 3.828826522492255e-05, + "loss": 0.5775, + "step": 10850 + }, + { + "epoch": 0.14444780012810599, + "grad_norm": 1.1251379251480103, + "learning_rate": 3.821705398930713e-05, + "loss": 0.5821, + "step": 10881 + }, + { + "epoch": 0.14485933232220316, + "grad_norm": 0.9825986623764038, + "learning_rate": 3.814569360103385e-05, + "loss": 0.5749, + "step": 10912 + }, + { + "epoch": 0.14527086451630034, + "grad_norm": 0.9303480982780457, + "learning_rate": 3.807418486539499e-05, + "loss": 0.5827, + "step": 10943 + }, + { + "epoch": 0.14568239671039748, + "grad_norm": 0.9948610663414001, + "learning_rate": 3.80025285893569e-05, + "loss": 0.598, + "step": 10974 + }, + { + "epoch": 0.14609392890449466, + "grad_norm": 1.096449851989746, + "learning_rate": 3.793072558155093e-05, + "loss": 0.5749, + "step": 11005 + }, + { + "epoch": 0.14650546109859183, + "grad_norm": 1.022290587425232, + "learning_rate": 3.785877665226426e-05, + "loss": 0.5824, + "step": 11036 + }, + { + "epoch": 0.146916993292689, + "grad_norm": 1.083853840827942, + "learning_rate": 3.778668261343079e-05, + "loss": 0.5923, + "step": 11067 + }, + { + "epoch": 0.14732852548678615, + "grad_norm": 0.919503390789032, + "learning_rate": 3.771444427862192e-05, + "loss": 0.5924, + "step": 11098 + }, + { + "epoch": 0.14774005768088333, + "grad_norm": 1.0682687759399414, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.5824, + "step": 11129 + }, + { + "epoch": 0.1481515898749805, + "grad_norm": 0.9599080085754395, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.5765, + "step": 11160 + }, + { + "epoch": 0.14856312206907768, + "grad_norm": 0.968349039554596, + "learning_rate": 3.749687165842753e-05, + "loss": 0.5932, + "step": 11191 + }, + { + "epoch": 0.14897465426317483, + "grad_norm": 0.9846999049186707, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.5791, + "step": 11222 + }, + { + "epoch": 0.149386186457272, + "grad_norm": 1.0673292875289917, + "learning_rate": 3.735111675341645e-05, + "loss": 0.5803, + "step": 11253 + }, + { + "epoch": 0.14979771865136918, + "grad_norm": 1.0111202001571655, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.5837, + "step": 11284 + }, + { + "epoch": 0.15020925084546635, + "grad_norm": 1.0955816507339478, + "learning_rate": 3.720480432728287e-05, + "loss": 0.5822, + "step": 11315 + }, + { + "epoch": 0.1506207830395635, + "grad_norm": 0.9147946238517761, + "learning_rate": 3.71314411067092e-05, + "loss": 0.5873, + "step": 11346 + }, + { + "epoch": 0.15103231523366067, + "grad_norm": 0.9630762934684753, + "learning_rate": 3.70579409844715e-05, + "loss": 0.5914, + "step": 11377 + }, + { + "epoch": 0.15144384742775785, + "grad_norm": 0.9123234748840332, + "learning_rate": 3.698430479000865e-05, + "loss": 0.5784, + "step": 11408 + }, + { + "epoch": 0.15185537962185502, + "grad_norm": 1.0654667615890503, + "learning_rate": 3.691053335429509e-05, + "loss": 0.5805, + "step": 11439 + }, + { + "epoch": 0.15226691181595217, + "grad_norm": 1.111958622932434, + "learning_rate": 3.683662750983147e-05, + "loss": 0.5868, + "step": 11470 + }, + { + "epoch": 0.15267844401004935, + "grad_norm": 1.024422526359558, + "learning_rate": 3.676258809063518e-05, + "loss": 0.5904, + "step": 11501 + }, + { + "epoch": 0.15308997620414652, + "grad_norm": 1.092668056488037, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.5885, + "step": 11532 + }, + { + "epoch": 0.1535015083982437, + "grad_norm": 0.9739312529563904, + "learning_rate": 3.661411187164166e-05, + "loss": 0.5806, + "step": 11563 + }, + { + "epoch": 0.15391304059234084, + "grad_norm": 1.098699688911438, + "learning_rate": 3.65396767473784e-05, + "loss": 0.5755, + "step": 11594 + }, + { + "epoch": 0.15432457278643802, + "grad_norm": 0.9998940825462341, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.5758, + "step": 11625 + }, + { + "epoch": 0.1547361049805352, + "grad_norm": 0.9077128767967224, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.5742, + "step": 11656 + }, + { + "epoch": 0.15514763717463237, + "grad_norm": 0.9274038076400757, + "learning_rate": 3.63155933997859e-05, + "loss": 0.589, + "step": 11687 + }, + { + "epoch": 0.15555916936872952, + "grad_norm": 0.9404906034469604, + "learning_rate": 3.624064243537758e-05, + "loss": 0.5848, + "step": 11718 + }, + { + "epoch": 0.1559707015628267, + "grad_norm": 0.9896044731140137, + "learning_rate": 3.616556462184716e-05, + "loss": 0.5917, + "step": 11749 + }, + { + "epoch": 0.15638223375692387, + "grad_norm": 1.0280394554138184, + "learning_rate": 3.609036080643755e-05, + "loss": 0.5811, + "step": 11780 + }, + { + "epoch": 0.15679376595102104, + "grad_norm": 0.9462539553642273, + "learning_rate": 3.60150318378136e-05, + "loss": 0.5782, + "step": 11811 + }, + { + "epoch": 0.1572052981451182, + "grad_norm": 1.047635555267334, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.5792, + "step": 11842 + }, + { + "epoch": 0.15761683033921536, + "grad_norm": 0.9613581299781799, + "learning_rate": 3.586400184263408e-05, + "loss": 0.5847, + "step": 11873 + }, + { + "epoch": 0.15802836253331254, + "grad_norm": 0.8944469094276428, + "learning_rate": 3.578830252043148e-05, + "loss": 0.5787, + "step": 11904 + }, + { + "epoch": 0.1584398947274097, + "grad_norm": 1.0031458139419556, + "learning_rate": 3.571248145370125e-05, + "loss": 0.5827, + "step": 11935 + }, + { + "epoch": 0.15885142692150686, + "grad_norm": 0.9425063729286194, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.5899, + "step": 11966 + }, + { + "epoch": 0.15926295911560404, + "grad_norm": 1.0098518133163452, + "learning_rate": 3.556047751054378e-05, + "loss": 0.5849, + "step": 11997 + }, + { + "epoch": 0.1596744913097012, + "grad_norm": 0.8891544342041016, + "learning_rate": 3.548429634946039e-05, + "loss": 0.5772, + "step": 12028 + }, + { + "epoch": 0.16008602350379839, + "grad_norm": 1.1580991744995117, + "learning_rate": 3.540799687451768e-05, + "loss": 0.5859, + "step": 12059 + }, + { + "epoch": 0.16049755569789553, + "grad_norm": 1.046398639678955, + "learning_rate": 3.533157994674485e-05, + "loss": 0.582, + "step": 12090 + }, + { + "epoch": 0.1609090878919927, + "grad_norm": 1.0526766777038574, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.5738, + "step": 12121 + }, + { + "epoch": 0.16132062008608988, + "grad_norm": 0.9915657043457031, + "learning_rate": 3.517839718344311e-05, + "loss": 0.5867, + "step": 12152 + }, + { + "epoch": 0.16173215228018706, + "grad_norm": 0.954210102558136, + "learning_rate": 3.510163307656086e-05, + "loss": 0.578, + "step": 12183 + }, + { + "epoch": 0.1621436844742842, + "grad_norm": 0.952920138835907, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.5843, + "step": 12214 + }, + { + "epoch": 0.16255521666838138, + "grad_norm": 0.9184344410896301, + "learning_rate": 3.494776374368643e-05, + "loss": 0.5748, + "step": 12245 + }, + { + "epoch": 0.16296674886247856, + "grad_norm": 0.9797667264938354, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.5788, + "step": 12276 + }, + { + "epoch": 0.16337828105657573, + "grad_norm": 0.968781054019928, + "learning_rate": 3.479344537543164e-05, + "loss": 0.5835, + "step": 12307 + }, + { + "epoch": 0.16378981325067288, + "grad_norm": 1.0632243156433105, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.5742, + "step": 12338 + }, + { + "epoch": 0.16420134544477005, + "grad_norm": 0.9538365006446838, + "learning_rate": 3.463868493762412e-05, + "loss": 0.5737, + "step": 12369 + }, + { + "epoch": 0.16461287763886723, + "grad_norm": 0.9336950778961182, + "learning_rate": 3.456114112492418e-05, + "loss": 0.5821, + "step": 12400 + }, + { + "epoch": 0.1650244098329644, + "grad_norm": 0.9611359238624573, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.5847, + "step": 12431 + }, + { + "epoch": 0.16543594202706155, + "grad_norm": 1.0813709497451782, + "learning_rate": 3.440573068727905e-05, + "loss": 0.5805, + "step": 12462 + }, + { + "epoch": 0.16584747422115872, + "grad_norm": 0.9780453443527222, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.5761, + "step": 12493 + }, + { + "epoch": 0.1662590064152559, + "grad_norm": 0.9629074335098267, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.58, + "step": 12524 + }, + { + "epoch": 0.16667053860935307, + "grad_norm": 0.9241612553596497, + "learning_rate": 3.417182116258899e-05, + "loss": 0.5732, + "step": 12555 + }, + { + "epoch": 0.16708207080345022, + "grad_norm": 1.0314536094665527, + "learning_rate": 3.409364314116074e-05, + "loss": 0.578, + "step": 12586 + }, + { + "epoch": 0.1674936029975474, + "grad_norm": 0.9907404184341431, + "learning_rate": 3.401536249920559e-05, + "loss": 0.591, + "step": 12617 + }, + { + "epoch": 0.16790513519164457, + "grad_norm": 0.9442338943481445, + "learning_rate": 3.393698012010998e-05, + "loss": 0.5754, + "step": 12648 + }, + { + "epoch": 0.16831666738574175, + "grad_norm": 0.9101460576057434, + "learning_rate": 3.385849688840839e-05, + "loss": 0.5815, + "step": 12679 + }, + { + "epoch": 0.1687281995798389, + "grad_norm": 1.029105544090271, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.5766, + "step": 12710 + }, + { + "epoch": 0.16913973177393607, + "grad_norm": 1.028542160987854, + "learning_rate": 3.370123141100578e-05, + "loss": 0.5769, + "step": 12741 + }, + { + "epoch": 0.16955126396803324, + "grad_norm": 0.9411474466323853, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.5808, + "step": 12772 + }, + { + "epoch": 0.16996279616213042, + "grad_norm": 0.9260859489440918, + "learning_rate": 3.35435731658559e-05, + "loss": 0.5683, + "step": 12803 + }, + { + "epoch": 0.17037432835622757, + "grad_norm": 0.9867289066314697, + "learning_rate": 3.346459897862552e-05, + "loss": 0.5781, + "step": 12834 + }, + { + "epoch": 0.17078586055032474, + "grad_norm": 0.9310885071754456, + "learning_rate": 3.338552926954613e-05, + "loss": 0.5876, + "step": 12865 + }, + { + "epoch": 0.17119739274442192, + "grad_norm": 1.1955841779708862, + "learning_rate": 3.330636493090868e-05, + "loss": 0.577, + "step": 12896 + }, + { + "epoch": 0.1716089249385191, + "grad_norm": 1.0715487003326416, + "learning_rate": 3.322710685607193e-05, + "loss": 0.5835, + "step": 12927 + }, + { + "epoch": 0.17202045713261624, + "grad_norm": 0.9181262850761414, + "learning_rate": 3.314775593945251e-05, + "loss": 0.5733, + "step": 12958 + }, + { + "epoch": 0.17243198932671341, + "grad_norm": 0.9595162868499756, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.5685, + "step": 12989 + }, + { + "epoch": 0.1728435215208106, + "grad_norm": 1.057614803314209, + "learning_rate": 3.298877916376047e-05, + "loss": 0.5736, + "step": 13020 + }, + { + "epoch": 0.17325505371490776, + "grad_norm": 0.9654355049133301, + "learning_rate": 3.290915509871915e-05, + "loss": 0.5782, + "step": 13051 + }, + { + "epoch": 0.1736665859090049, + "grad_norm": 1.0919837951660156, + "learning_rate": 3.282944177993753e-05, + "loss": 0.5771, + "step": 13082 + }, + { + "epoch": 0.1740781181031021, + "grad_norm": 1.064331293106079, + "learning_rate": 3.274964010696957e-05, + "loss": 0.5731, + "step": 13113 + }, + { + "epoch": 0.17448965029719926, + "grad_norm": 0.8971393704414368, + "learning_rate": 3.266975098036629e-05, + "loss": 0.5855, + "step": 13144 + }, + { + "epoch": 0.17490118249129644, + "grad_norm": 0.9515472054481506, + "learning_rate": 3.258977530166562e-05, + "loss": 0.5776, + "step": 13175 + }, + { + "epoch": 0.17531271468539358, + "grad_norm": 0.9811455607414246, + "learning_rate": 3.250971397338227e-05, + "loss": 0.5817, + "step": 13206 + }, + { + "epoch": 0.17572424687949076, + "grad_norm": 0.9187535643577576, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.5751, + "step": 13237 + }, + { + "epoch": 0.17613577907358793, + "grad_norm": 0.9396876096725464, + "learning_rate": 3.234933798294859e-05, + "loss": 0.5744, + "step": 13268 + }, + { + "epoch": 0.1765473112676851, + "grad_norm": 1.064921259880066, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.5731, + "step": 13299 + }, + { + "epoch": 0.17695884346178226, + "grad_norm": 0.946753978729248, + "learning_rate": 3.218863024832985e-05, + "loss": 0.5824, + "step": 13330 + }, + { + "epoch": 0.17737037565587943, + "grad_norm": 0.8623449206352234, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.5764, + "step": 13361 + }, + { + "epoch": 0.1777819078499766, + "grad_norm": 0.9526584148406982, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.5696, + "step": 13392 + }, + { + "epoch": 0.17819344004407378, + "grad_norm": 0.9413353800773621, + "learning_rate": 3.194696249871729e-05, + "loss": 0.5688, + "step": 13423 + }, + { + "epoch": 0.17860497223817093, + "grad_norm": 0.9612322449684143, + "learning_rate": 3.186624857814164e-05, + "loss": 0.5694, + "step": 13454 + }, + { + "epoch": 0.1790165044322681, + "grad_norm": 1.0289851427078247, + "learning_rate": 3.178545717288401e-05, + "loss": 0.5799, + "step": 13485 + }, + { + "epoch": 0.17942803662636528, + "grad_norm": 0.9124505519866943, + "learning_rate": 3.170458919466444e-05, + "loss": 0.5729, + "step": 13516 + }, + { + "epoch": 0.17983956882046245, + "grad_norm": 0.9874391555786133, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.5761, + "step": 13547 + }, + { + "epoch": 0.1802511010145596, + "grad_norm": 1.0058220624923706, + "learning_rate": 3.154262717052985e-05, + "loss": 0.58, + "step": 13578 + }, + { + "epoch": 0.18066263320865678, + "grad_norm": 0.9753491282463074, + "learning_rate": 3.146153495233426e-05, + "loss": 0.5749, + "step": 13609 + }, + { + "epoch": 0.18107416540275395, + "grad_norm": 1.0167535543441772, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.5848, + "step": 13640 + }, + { + "epoch": 0.18148569759685113, + "grad_norm": 0.9970638155937195, + "learning_rate": 3.129913267924946e-05, + "loss": 0.5743, + "step": 13671 + }, + { + "epoch": 0.18189722979094827, + "grad_norm": 1.0033198595046997, + "learning_rate": 3.121782445704782e-05, + "loss": 0.5747, + "step": 13702 + }, + { + "epoch": 0.18230876198504545, + "grad_norm": 1.0107535123825073, + "learning_rate": 3.11364460675423e-05, + "loss": 0.5725, + "step": 13733 + }, + { + "epoch": 0.18272029417914262, + "grad_norm": 0.9341425895690918, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.5726, + "step": 13764 + }, + { + "epoch": 0.1831318263732398, + "grad_norm": 1.0088342428207397, + "learning_rate": 3.097348246077728e-05, + "loss": 0.5737, + "step": 13795 + }, + { + "epoch": 0.18354335856733694, + "grad_norm": 0.8871366381645203, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.576, + "step": 13826 + }, + { + "epoch": 0.18395489076143412, + "grad_norm": 0.971552848815918, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.57, + "step": 13857 + }, + { + "epoch": 0.1843664229555313, + "grad_norm": 1.1167266368865967, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.5652, + "step": 13888 + }, + { + "epoch": 0.18477795514962847, + "grad_norm": 0.9768183827400208, + "learning_rate": 3.064675369851637e-05, + "loss": 0.5671, + "step": 13919 + }, + { + "epoch": 0.18518948734372562, + "grad_norm": 1.0097460746765137, + "learning_rate": 3.056490989455289e-05, + "loss": 0.5771, + "step": 13950 + }, + { + "epoch": 0.1856010195378228, + "grad_norm": 1.010986089706421, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.5723, + "step": 13981 + }, + { + "epoch": 0.18601255173191997, + "grad_norm": 0.8861789107322693, + "learning_rate": 3.040103481317539e-05, + "loss": 0.5798, + "step": 14012 + }, + { + "epoch": 0.18642408392601714, + "grad_norm": 0.9811322689056396, + "learning_rate": 3.03190053850694e-05, + "loss": 0.5703, + "step": 14043 + }, + { + "epoch": 0.1868356161201143, + "grad_norm": 0.9183774590492249, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.5655, + "step": 14074 + }, + { + "epoch": 0.18724714831421146, + "grad_norm": 0.9655874371528625, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.57, + "step": 14105 + }, + { + "epoch": 0.18765868050830864, + "grad_norm": 0.9735666513442993, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.5748, + "step": 14136 + }, + { + "epoch": 0.18807021270240581, + "grad_norm": 0.9084784388542175, + "learning_rate": 2.999029669712431e-05, + "loss": 0.568, + "step": 14167 + }, + { + "epoch": 0.18848174489650296, + "grad_norm": 0.8866302371025085, + "learning_rate": 2.990797641805408e-05, + "loss": 0.5768, + "step": 14198 + }, + { + "epoch": 0.18889327709060014, + "grad_norm": 0.9936773180961609, + "learning_rate": 2.982560075313704e-05, + "loss": 0.5701, + "step": 14229 + }, + { + "epoch": 0.1893048092846973, + "grad_norm": 0.9721146821975708, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.5638, + "step": 14260 + }, + { + "epoch": 0.1897163414787945, + "grad_norm": 0.921503484249115, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.5645, + "step": 14291 + }, + { + "epoch": 0.19012787367289163, + "grad_norm": 0.9194900989532471, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.5724, + "step": 14322 + }, + { + "epoch": 0.1905394058669888, + "grad_norm": 0.9940860271453857, + "learning_rate": 2.949556283611942e-05, + "loss": 0.5731, + "step": 14353 + }, + { + "epoch": 0.19095093806108598, + "grad_norm": 0.9163900017738342, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.5761, + "step": 14384 + }, + { + "epoch": 0.19136247025518316, + "grad_norm": 0.960118293762207, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.572, + "step": 14415 + }, + { + "epoch": 0.1917740024492803, + "grad_norm": 0.9942934513092041, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.5606, + "step": 14446 + }, + { + "epoch": 0.19218553464337748, + "grad_norm": 0.9903921484947205, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.5704, + "step": 14477 + }, + { + "epoch": 0.19259706683747466, + "grad_norm": 0.9734652638435364, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.5768, + "step": 14508 + }, + { + "epoch": 0.19300859903157183, + "grad_norm": 0.9251719117164612, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.565, + "step": 14539 + }, + { + "epoch": 0.19342013122566898, + "grad_norm": 0.9734142422676086, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.5702, + "step": 14570 + }, + { + "epoch": 0.19383166341976615, + "grad_norm": 0.9692136645317078, + "learning_rate": 2.883311164593017e-05, + "loss": 0.5581, + "step": 14601 + }, + { + "epoch": 0.19424319561386333, + "grad_norm": 0.9654501676559448, + "learning_rate": 2.875010077160754e-05, + "loss": 0.5667, + "step": 14632 + }, + { + "epoch": 0.1946547278079605, + "grad_norm": 1.0234603881835938, + "learning_rate": 2.866704757790741e-05, + "loss": 0.5688, + "step": 14663 + }, + { + "epoch": 0.19506626000205765, + "grad_norm": 0.9619661569595337, + "learning_rate": 2.858395300207376e-05, + "loss": 0.5713, + "step": 14694 + }, + { + "epoch": 0.19547779219615483, + "grad_norm": 1.0468534231185913, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.5702, + "step": 14725 + }, + { + "epoch": 0.195889324390252, + "grad_norm": 0.9849717617034912, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.5695, + "step": 14756 + }, + { + "epoch": 0.19630085658434918, + "grad_norm": 0.8887643218040466, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.5688, + "step": 14787 + }, + { + "epoch": 0.19671238877844632, + "grad_norm": 0.9583494067192078, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.5775, + "step": 14818 + }, + { + "epoch": 0.1971239209725435, + "grad_norm": 0.9766852259635925, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.5688, + "step": 14849 + }, + { + "epoch": 0.19753545316664067, + "grad_norm": 0.9450570940971375, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.5773, + "step": 14880 + }, + { + "epoch": 0.19794698536073785, + "grad_norm": 0.9476996064186096, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.5688, + "step": 14911 + }, + { + "epoch": 0.198358517554835, + "grad_norm": 1.0239835977554321, + "learning_rate": 2.791781925709473e-05, + "loss": 0.5717, + "step": 14942 + }, + { + "epoch": 0.19877004974893217, + "grad_norm": 0.9848045110702515, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.5691, + "step": 14973 + }, + { + "epoch": 0.19918158194302935, + "grad_norm": 1.0350494384765625, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.5808, + "step": 15004 + }, + { + "epoch": 0.19959311413712652, + "grad_norm": 0.8743448853492737, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.5694, + "step": 15035 + }, + { + "epoch": 0.20000464633122367, + "grad_norm": 0.9410389065742493, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.5739, + "step": 15066 + }, + { + "epoch": 0.20041617852532084, + "grad_norm": 1.0113860368728638, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.5689, + "step": 15097 + }, + { + "epoch": 0.20082771071941802, + "grad_norm": 0.998852550983429, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.5619, + "step": 15128 + }, + { + "epoch": 0.2012392429135152, + "grad_norm": 0.9299794435501099, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.5672, + "step": 15159 + }, + { + "epoch": 0.20165077510761234, + "grad_norm": 1.0109282732009888, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.5691, + "step": 15190 + }, + { + "epoch": 0.20206230730170952, + "grad_norm": 0.8805022835731506, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.5601, + "step": 15221 + }, + { + "epoch": 0.2024738394958067, + "grad_norm": 0.8976924419403076, + "learning_rate": 2.708224532974953e-05, + "loss": 0.5669, + "step": 15252 + }, + { + "epoch": 0.20288537168990387, + "grad_norm": 0.8639585375785828, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.5713, + "step": 15283 + }, + { + "epoch": 0.203296903884001, + "grad_norm": 0.9253800511360168, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.5634, + "step": 15314 + }, + { + "epoch": 0.2037084360780982, + "grad_norm": 0.8547073006629944, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.5705, + "step": 15345 + }, + { + "epoch": 0.20411996827219536, + "grad_norm": 0.8723642230033875, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.573, + "step": 15376 + }, + { + "epoch": 0.20453150046629254, + "grad_norm": 0.9164481163024902, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.5645, + "step": 15407 + }, + { + "epoch": 0.20494303266038968, + "grad_norm": 0.9538819193840027, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.5677, + "step": 15438 + }, + { + "epoch": 0.20535456485448686, + "grad_norm": 0.8995161652565002, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.5683, + "step": 15469 + }, + { + "epoch": 0.20576609704858403, + "grad_norm": 0.9026926755905151, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.5694, + "step": 15500 + }, + { + "epoch": 0.2061776292426812, + "grad_norm": 0.9095093011856079, + "learning_rate": 2.632819298478939e-05, + "loss": 0.5722, + "step": 15531 + }, + { + "epoch": 0.20658916143677836, + "grad_norm": 0.874626636505127, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.5652, + "step": 15562 + }, + { + "epoch": 0.20700069363087553, + "grad_norm": 1.0359785556793213, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.57, + "step": 15593 + }, + { + "epoch": 0.2074122258249727, + "grad_norm": 0.9145928621292114, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.5637, + "step": 15624 + }, + { + "epoch": 0.20782375801906988, + "grad_norm": 1.020246982574463, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.5619, + "step": 15655 + }, + { + "epoch": 0.20823529021316703, + "grad_norm": 0.8766633868217468, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.5631, + "step": 15686 + }, + { + "epoch": 0.2086468224072642, + "grad_norm": 0.9841639399528503, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.5598, + "step": 15717 + }, + { + "epoch": 0.20905835460136138, + "grad_norm": 0.8983998894691467, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.5723, + "step": 15748 + }, + { + "epoch": 0.20946988679545855, + "grad_norm": 0.8868324756622314, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.5801, + "step": 15779 + }, + { + "epoch": 0.2098814189895557, + "grad_norm": 0.9000539183616638, + "learning_rate": 2.557292666450159e-05, + "loss": 0.5672, + "step": 15810 + }, + { + "epoch": 0.21029295118365288, + "grad_norm": 0.9193928837776184, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.5658, + "step": 15841 + }, + { + "epoch": 0.21070448337775005, + "grad_norm": 0.9424473643302917, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.5685, + "step": 15872 + }, + { + "epoch": 0.21111601557184723, + "grad_norm": 0.9552715420722961, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.5725, + "step": 15903 + }, + { + "epoch": 0.21152754776594437, + "grad_norm": 0.8888420462608337, + "learning_rate": 2.52370435981567e-05, + "loss": 0.5649, + "step": 15934 + }, + { + "epoch": 0.21193907996004155, + "grad_norm": 0.906830370426178, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.5692, + "step": 15965 + }, + { + "epoch": 0.21235061215413872, + "grad_norm": 0.8939186334609985, + "learning_rate": 2.506908222441045e-05, + "loss": 0.5665, + "step": 15996 + }, + { + "epoch": 0.2127621443482359, + "grad_norm": 1.0149410963058472, + "learning_rate": 2.498509989430187e-05, + "loss": 0.5632, + "step": 16027 + }, + { + "epoch": 0.21317367654233305, + "grad_norm": 0.963056206703186, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.5656, + "step": 16058 + }, + { + "epoch": 0.21358520873643022, + "grad_norm": 0.8071532249450684, + "learning_rate": 2.481713668624899e-05, + "loss": 0.5667, + "step": 16089 + }, + { + "epoch": 0.2139967409305274, + "grad_norm": 0.9192640781402588, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.5709, + "step": 16120 + }, + { + "epoch": 0.21440827312462457, + "grad_norm": 0.84633868932724, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.5658, + "step": 16151 + }, + { + "epoch": 0.21481980531872172, + "grad_norm": 0.8883370757102966, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.5683, + "step": 16182 + }, + { + "epoch": 0.2152313375128189, + "grad_norm": 0.8919095396995544, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.568, + "step": 16213 + }, + { + "epoch": 0.21564286970691607, + "grad_norm": 0.9360633492469788, + "learning_rate": 2.439728136286796e-05, + "loss": 0.565, + "step": 16244 + }, + { + "epoch": 0.21605440190101324, + "grad_norm": 0.9496976733207703, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.5627, + "step": 16275 + }, + { + "epoch": 0.2164659340951104, + "grad_norm": 0.9771477580070496, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.5614, + "step": 16306 + }, + { + "epoch": 0.21687746628920757, + "grad_norm": 0.931249737739563, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.5649, + "step": 16337 + }, + { + "epoch": 0.21728899848330474, + "grad_norm": 0.9592285752296448, + "learning_rate": 2.406151384602059e-05, + "loss": 0.5646, + "step": 16368 + }, + { + "epoch": 0.21770053067740192, + "grad_norm": 0.9159988164901733, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.5672, + "step": 16399 + }, + { + "epoch": 0.21811206287149906, + "grad_norm": 0.97376549243927, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.5607, + "step": 16430 + }, + { + "epoch": 0.21852359506559624, + "grad_norm": 0.8469638824462891, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.5597, + "step": 16461 + }, + { + "epoch": 0.2189351272596934, + "grad_norm": 1.030610203742981, + "learning_rate": 2.372591577780202e-05, + "loss": 0.5625, + "step": 16492 + }, + { + "epoch": 0.2193466594537906, + "grad_norm": 0.9524822235107422, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.5621, + "step": 16523 + }, + { + "epoch": 0.21975819164788774, + "grad_norm": 0.9608604311943054, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.5672, + "step": 16554 + }, + { + "epoch": 0.2201697238419849, + "grad_norm": 0.9253712296485901, + "learning_rate": 2.347436487983929e-05, + "loss": 0.5609, + "step": 16585 + }, + { + "epoch": 0.22058125603608209, + "grad_norm": 0.923402726650238, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.5654, + "step": 16616 + }, + { + "epoch": 0.22099278823017926, + "grad_norm": 1.0030287504196167, + "learning_rate": 2.330674878704035e-05, + "loss": 0.5576, + "step": 16647 + }, + { + "epoch": 0.2214043204242764, + "grad_norm": 0.9106613993644714, + "learning_rate": 2.322296892997561e-05, + "loss": 0.5653, + "step": 16678 + }, + { + "epoch": 0.22181585261837358, + "grad_norm": 1.0058101415634155, + "learning_rate": 2.313920912646497e-05, + "loss": 0.5749, + "step": 16709 + }, + { + "epoch": 0.22222738481247076, + "grad_norm": 0.931086540222168, + "learning_rate": 2.305547032172643e-05, + "loss": 0.5637, + "step": 16740 + }, + { + "epoch": 0.22263891700656793, + "grad_norm": 0.9743716716766357, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.5718, + "step": 16771 + }, + { + "epoch": 0.22305044920066508, + "grad_norm": 0.8751611709594727, + "learning_rate": 2.288805948824212e-05, + "loss": 0.5581, + "step": 16802 + }, + { + "epoch": 0.22346198139476225, + "grad_norm": 0.867038905620575, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.5664, + "step": 16833 + }, + { + "epoch": 0.22387351358885943, + "grad_norm": 0.8663344383239746, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.559, + "step": 16864 + }, + { + "epoch": 0.2242850457829566, + "grad_norm": 0.984854519367218, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.5626, + "step": 16895 + }, + { + "epoch": 0.22469657797705375, + "grad_norm": 0.9031103849411011, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.5666, + "step": 16926 + }, + { + "epoch": 0.22510811017115093, + "grad_norm": 0.8782587647438049, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.5501, + "step": 16957 + }, + { + "epoch": 0.2255196423652481, + "grad_norm": 1.0644887685775757, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.5604, + "step": 16988 + }, + { + "epoch": 0.22593117455934528, + "grad_norm": 0.8691216111183167, + "learning_rate": 2.230292185905114e-05, + "loss": 0.5649, + "step": 17019 + }, + { + "epoch": 0.22634270675344242, + "grad_norm": 0.9518167972564697, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.5598, + "step": 17050 + }, + { + "epoch": 0.2267542389475396, + "grad_norm": 0.889673113822937, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.5626, + "step": 17081 + }, + { + "epoch": 0.22716577114163677, + "grad_norm": 0.9073772430419922, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.5607, + "step": 17112 + }, + { + "epoch": 0.22757730333573395, + "grad_norm": 0.9674621820449829, + "learning_rate": 2.196920634473666e-05, + "loss": 0.5651, + "step": 17143 + }, + { + "epoch": 0.2279888355298311, + "grad_norm": 0.8547524809837341, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.5636, + "step": 17174 + }, + { + "epoch": 0.22840036772392827, + "grad_norm": 1.00649893283844, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.5597, + "step": 17205 + }, + { + "epoch": 0.22881189991802545, + "grad_norm": 0.9329107999801636, + "learning_rate": 2.171927553439363e-05, + "loss": 0.5705, + "step": 17236 + }, + { + "epoch": 0.22922343211212262, + "grad_norm": 1.0364869832992554, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.56, + "step": 17267 + }, + { + "epoch": 0.22963496430621977, + "grad_norm": 0.898383617401123, + "learning_rate": 2.155283853988844e-05, + "loss": 0.5634, + "step": 17298 + }, + { + "epoch": 0.23004649650031694, + "grad_norm": 0.903266429901123, + "learning_rate": 2.146967792431106e-05, + "loss": 0.5692, + "step": 17329 + }, + { + "epoch": 0.23045802869441412, + "grad_norm": 0.835216224193573, + "learning_rate": 2.138655714793849e-05, + "loss": 0.5619, + "step": 17360 + }, + { + "epoch": 0.2308695608885113, + "grad_norm": 0.9033771753311157, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.5661, + "step": 17391 + }, + { + "epoch": 0.23128109308260844, + "grad_norm": 0.8425393104553223, + "learning_rate": 2.122043886437421e-05, + "loss": 0.5607, + "step": 17422 + }, + { + "epoch": 0.23169262527670562, + "grad_norm": 0.8765662908554077, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.5654, + "step": 17453 + }, + { + "epoch": 0.2321041574708028, + "grad_norm": 0.8663944602012634, + "learning_rate": 2.105449118766347e-05, + "loss": 0.5644, + "step": 17484 + }, + { + "epoch": 0.23251568966489997, + "grad_norm": 0.9837983846664429, + "learning_rate": 2.097158366805287e-05, + "loss": 0.5665, + "step": 17515 + }, + { + "epoch": 0.2329272218589971, + "grad_norm": 0.9082325100898743, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.5671, + "step": 17546 + }, + { + "epoch": 0.2333387540530943, + "grad_norm": 0.9680993556976318, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.5602, + "step": 17577 + }, + { + "epoch": 0.23375028624719146, + "grad_norm": 0.9881089925765991, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.5682, + "step": 17608 + }, + { + "epoch": 0.23416181844128864, + "grad_norm": 0.8630657196044922, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.5646, + "step": 17639 + }, + { + "epoch": 0.23457335063538579, + "grad_norm": 0.8421202301979065, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.564, + "step": 17670 + }, + { + "epoch": 0.23498488282948296, + "grad_norm": 0.8951789736747742, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.5594, + "step": 17701 + }, + { + "epoch": 0.23539641502358014, + "grad_norm": 1.0024628639221191, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.5541, + "step": 17732 + }, + { + "epoch": 0.2358079472176773, + "grad_norm": 0.8807896971702576, + "learning_rate": 2.031003855589343e-05, + "loss": 0.5605, + "step": 17763 + }, + { + "epoch": 0.23621947941177446, + "grad_norm": 0.9723889827728271, + "learning_rate": 2.022757379528727e-05, + "loss": 0.559, + "step": 17794 + }, + { + "epoch": 0.23663101160587163, + "grad_norm": 0.9422227740287781, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.5594, + "step": 17825 + }, + { + "epoch": 0.2370425437999688, + "grad_norm": 0.9309141039848328, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.5621, + "step": 17856 + }, + { + "epoch": 0.23745407599406598, + "grad_norm": 0.8761610388755798, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.5665, + "step": 17887 + }, + { + "epoch": 0.23786560818816313, + "grad_norm": 0.8991973400115967, + "learning_rate": 1.989826261153015e-05, + "loss": 0.5726, + "step": 17918 + }, + { + "epoch": 0.2382771403822603, + "grad_norm": 0.8879802227020264, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.5598, + "step": 17949 + }, + { + "epoch": 0.23868867257635748, + "grad_norm": 0.9235663414001465, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.5637, + "step": 17980 + }, + { + "epoch": 0.23910020477045466, + "grad_norm": 0.9140569567680359, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.5571, + "step": 18011 + }, + { + "epoch": 0.2395117369645518, + "grad_norm": 0.933430016040802, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.561, + "step": 18042 + }, + { + "epoch": 0.23992326915864898, + "grad_norm": 0.838374674320221, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.5634, + "step": 18073 + }, + { + "epoch": 0.24033480135274615, + "grad_norm": 0.9295237064361572, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.5604, + "step": 18104 + }, + { + "epoch": 0.24074633354684333, + "grad_norm": 0.880237340927124, + "learning_rate": 1.932422022132275e-05, + "loss": 0.5609, + "step": 18135 + }, + { + "epoch": 0.24115786574094047, + "grad_norm": 0.9782423973083496, + "learning_rate": 1.924246297341414e-05, + "loss": 0.5479, + "step": 18166 + }, + { + "epoch": 0.24156939793503765, + "grad_norm": 0.97150719165802, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.5624, + "step": 18197 + }, + { + "epoch": 0.24198093012913482, + "grad_norm": 0.9634605050086975, + "learning_rate": 1.907914431855156e-05, + "loss": 0.5661, + "step": 18228 + }, + { + "epoch": 0.242392462323232, + "grad_norm": 0.8706396222114563, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.5582, + "step": 18259 + }, + { + "epoch": 0.24280399451732915, + "grad_norm": 0.9348079562187195, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.5661, + "step": 18290 + }, + { + "epoch": 0.24321552671142632, + "grad_norm": 0.8249440789222717, + "learning_rate": 1.883466975572098e-05, + "loss": 0.5589, + "step": 18321 + }, + { + "epoch": 0.2436270589055235, + "grad_norm": 0.9206597208976746, + "learning_rate": 1.8753316159214312e-05, + "loss": 0.5583, + "step": 18352 + }, + { + "epoch": 0.24403859109962067, + "grad_norm": 0.8377333879470825, + "learning_rate": 1.8672033055685766e-05, + "loss": 0.5571, + "step": 18383 + }, + { + "epoch": 0.24445012329371782, + "grad_norm": 0.9113277792930603, + "learning_rate": 1.8590821362404116e-05, + "loss": 0.5633, + "step": 18414 + }, + { + "epoch": 0.244861655487815, + "grad_norm": 0.9409834742546082, + "learning_rate": 1.8509681995832294e-05, + "loss": 0.5588, + "step": 18445 + }, + { + "epoch": 0.24527318768191217, + "grad_norm": 0.9693152904510498, + "learning_rate": 1.8428615871617004e-05, + "loss": 0.5621, + "step": 18476 + }, + { + "epoch": 0.24568471987600934, + "grad_norm": 0.9358701705932617, + "learning_rate": 1.8347623904578448e-05, + "loss": 0.5654, + "step": 18507 + }, + { + "epoch": 0.2460962520701065, + "grad_norm": 0.9669011831283569, + "learning_rate": 1.8266707008699975e-05, + "loss": 0.5605, + "step": 18538 + }, + { + "epoch": 0.24650778426420367, + "grad_norm": 0.9862536191940308, + "learning_rate": 1.818586609711774e-05, + "loss": 0.5625, + "step": 18569 + }, + { + "epoch": 0.24691931645830084, + "grad_norm": 1.069492220878601, + "learning_rate": 1.8105102082110462e-05, + "loss": 0.5632, + "step": 18600 + }, + { + "epoch": 0.24733084865239802, + "grad_norm": 0.9141196608543396, + "learning_rate": 1.8024415875089058e-05, + "loss": 0.5599, + "step": 18631 + }, + { + "epoch": 0.24774238084649516, + "grad_norm": 0.8525174856185913, + "learning_rate": 1.7943808386586407e-05, + "loss": 0.5598, + "step": 18662 + }, + { + "epoch": 0.24815391304059234, + "grad_norm": 0.9469859600067139, + "learning_rate": 1.7863280526247073e-05, + "loss": 0.5554, + "step": 18693 + }, + { + "epoch": 0.2485654452346895, + "grad_norm": 0.9280170202255249, + "learning_rate": 1.7782833202817003e-05, + "loss": 0.5626, + "step": 18724 + }, + { + "epoch": 0.2489769774287867, + "grad_norm": 0.868431806564331, + "learning_rate": 1.7702467324133327e-05, + "loss": 0.5636, + "step": 18755 + }, + { + "epoch": 0.24938850962288384, + "grad_norm": 0.9638091921806335, + "learning_rate": 1.7622183797114042e-05, + "loss": 0.5561, + "step": 18786 + }, + { + "epoch": 0.249800041816981, + "grad_norm": 0.9236721396446228, + "learning_rate": 1.7541983527747838e-05, + "loss": 0.5645, + "step": 18817 + }, + { + "epoch": 0.2502115740110782, + "grad_norm": 0.8757562041282654, + "learning_rate": 1.746186742108387e-05, + "loss": 0.5549, + "step": 18848 + }, + { + "epoch": 0.25062310620517536, + "grad_norm": 0.9709885120391846, + "learning_rate": 1.73818363812215e-05, + "loss": 0.5613, + "step": 18879 + }, + { + "epoch": 0.25103463839927254, + "grad_norm": 0.9142551422119141, + "learning_rate": 1.7301891311300153e-05, + "loss": 0.5452, + "step": 18910 + }, + { + "epoch": 0.2514461705933697, + "grad_norm": 1.4749113321304321, + "learning_rate": 1.7222033113489055e-05, + "loss": 0.556, + "step": 18941 + }, + { + "epoch": 0.25185770278746683, + "grad_norm": 0.8948887586593628, + "learning_rate": 1.7142262688977127e-05, + "loss": 0.5551, + "step": 18972 + }, + { + "epoch": 0.252269234981564, + "grad_norm": 0.8812825679779053, + "learning_rate": 1.7062580937962764e-05, + "loss": 0.5549, + "step": 19003 + }, + { + "epoch": 0.2526807671756612, + "grad_norm": 0.8759215474128723, + "learning_rate": 1.698298875964369e-05, + "loss": 0.5517, + "step": 19034 + }, + { + "epoch": 0.25309229936975836, + "grad_norm": 0.8355596661567688, + "learning_rate": 1.690348705220684e-05, + "loss": 0.5651, + "step": 19065 + }, + { + "epoch": 0.25350383156385553, + "grad_norm": 0.9597409963607788, + "learning_rate": 1.6824076712818156e-05, + "loss": 0.5636, + "step": 19096 + }, + { + "epoch": 0.2539153637579527, + "grad_norm": 0.9418185949325562, + "learning_rate": 1.6744758637612533e-05, + "loss": 0.5551, + "step": 19127 + }, + { + "epoch": 0.2543268959520499, + "grad_norm": 0.9069491028785706, + "learning_rate": 1.6665533721683664e-05, + "loss": 0.5628, + "step": 19158 + }, + { + "epoch": 0.25473842814614706, + "grad_norm": 0.8908203840255737, + "learning_rate": 1.6586402859073974e-05, + "loss": 0.561, + "step": 19189 + }, + { + "epoch": 0.2551499603402442, + "grad_norm": 0.8831518888473511, + "learning_rate": 1.6507366942764463e-05, + "loss": 0.5595, + "step": 19220 + }, + { + "epoch": 0.25556149253434135, + "grad_norm": 1.0363459587097168, + "learning_rate": 1.6428426864664732e-05, + "loss": 0.5555, + "step": 19251 + }, + { + "epoch": 0.2559730247284385, + "grad_norm": 0.8746747970581055, + "learning_rate": 1.6349583515602816e-05, + "loss": 0.5581, + "step": 19282 + }, + { + "epoch": 0.2563845569225357, + "grad_norm": 0.7980934381484985, + "learning_rate": 1.6270837785315208e-05, + "loss": 0.5537, + "step": 19313 + }, + { + "epoch": 0.2567960891166329, + "grad_norm": 0.851966142654419, + "learning_rate": 1.619219056243676e-05, + "loss": 0.5513, + "step": 19344 + }, + { + "epoch": 0.25720762131073005, + "grad_norm": 0.9124501347541809, + "learning_rate": 1.6113642734490698e-05, + "loss": 0.5549, + "step": 19375 + }, + { + "epoch": 0.2576191535048272, + "grad_norm": 1.0416783094406128, + "learning_rate": 1.6035195187878577e-05, + "loss": 0.5596, + "step": 19406 + }, + { + "epoch": 0.2580306856989244, + "grad_norm": 0.9024292826652527, + "learning_rate": 1.5956848807870305e-05, + "loss": 0.5506, + "step": 19437 + }, + { + "epoch": 0.2584422178930215, + "grad_norm": 0.9234741926193237, + "learning_rate": 1.587860447859413e-05, + "loss": 0.5549, + "step": 19468 + }, + { + "epoch": 0.2588537500871187, + "grad_norm": 0.8676049113273621, + "learning_rate": 1.5800463083026686e-05, + "loss": 0.5563, + "step": 19499 + }, + { + "epoch": 0.25926528228121587, + "grad_norm": 0.9481212496757507, + "learning_rate": 1.572242550298298e-05, + "loss": 0.5578, + "step": 19530 + }, + { + "epoch": 0.25967681447531304, + "grad_norm": 0.8709908723831177, + "learning_rate": 1.56444926191065e-05, + "loss": 0.5568, + "step": 19561 + }, + { + "epoch": 0.2600883466694102, + "grad_norm": 0.938412606716156, + "learning_rate": 1.5566665310859257e-05, + "loss": 0.5568, + "step": 19592 + }, + { + "epoch": 0.2604998788635074, + "grad_norm": 0.8912078142166138, + "learning_rate": 1.5488944456511846e-05, + "loss": 0.552, + "step": 19623 + }, + { + "epoch": 0.26091141105760457, + "grad_norm": 1.1832647323608398, + "learning_rate": 1.5411330933133546e-05, + "loss": 0.5581, + "step": 19654 + }, + { + "epoch": 0.26132294325170174, + "grad_norm": 0.9237463474273682, + "learning_rate": 1.533382561658241e-05, + "loss": 0.5592, + "step": 19685 + }, + { + "epoch": 0.26173447544579886, + "grad_norm": 0.878738522529602, + "learning_rate": 1.525642938149541e-05, + "loss": 0.5557, + "step": 19716 + }, + { + "epoch": 0.26214600763989604, + "grad_norm": 0.9652629494667053, + "learning_rate": 1.5179143101278536e-05, + "loss": 0.5542, + "step": 19747 + }, + { + "epoch": 0.2625575398339932, + "grad_norm": 0.9157405495643616, + "learning_rate": 1.5101967648096955e-05, + "loss": 0.5609, + "step": 19778 + }, + { + "epoch": 0.2629690720280904, + "grad_norm": 0.840957760810852, + "learning_rate": 1.5024903892865172e-05, + "loss": 0.5521, + "step": 19809 + }, + { + "epoch": 0.26338060422218756, + "grad_norm": 0.8824605941772461, + "learning_rate": 1.4947952705237184e-05, + "loss": 0.5539, + "step": 19840 + }, + { + "epoch": 0.26379213641628474, + "grad_norm": 0.9319818615913391, + "learning_rate": 1.4871114953596682e-05, + "loss": 0.559, + "step": 19871 + }, + { + "epoch": 0.2642036686103819, + "grad_norm": 0.8822436332702637, + "learning_rate": 1.4794391505047256e-05, + "loss": 0.5565, + "step": 19902 + }, + { + "epoch": 0.2646152008044791, + "grad_norm": 0.8802869915962219, + "learning_rate": 1.4717783225402596e-05, + "loss": 0.5603, + "step": 19933 + }, + { + "epoch": 0.2650267329985762, + "grad_norm": 0.913989245891571, + "learning_rate": 1.4641290979176735e-05, + "loss": 0.5519, + "step": 19964 + }, + { + "epoch": 0.2654382651926734, + "grad_norm": 0.8885793089866638, + "learning_rate": 1.4564915629574246e-05, + "loss": 0.5567, + "step": 19995 + }, + { + "epoch": 0.26584979738677056, + "grad_norm": 0.8809658885002136, + "learning_rate": 1.4488658038480601e-05, + "loss": 0.5516, + "step": 20026 + }, + { + "epoch": 0.26626132958086773, + "grad_norm": 0.9053296446800232, + "learning_rate": 1.4412519066452323e-05, + "loss": 0.5573, + "step": 20057 + }, + { + "epoch": 0.2666728617749649, + "grad_norm": 0.8977755904197693, + "learning_rate": 1.4336499572707373e-05, + "loss": 0.5496, + "step": 20088 + }, + { + "epoch": 0.2670843939690621, + "grad_norm": 0.935563325881958, + "learning_rate": 1.4260600415115433e-05, + "loss": 0.558, + "step": 20119 + }, + { + "epoch": 0.26749592616315926, + "grad_norm": 1.0321307182312012, + "learning_rate": 1.4184822450188137e-05, + "loss": 0.557, + "step": 20150 + }, + { + "epoch": 0.26790745835725643, + "grad_norm": 0.8926151990890503, + "learning_rate": 1.410916653306954e-05, + "loss": 0.556, + "step": 20181 + }, + { + "epoch": 0.26831899055135355, + "grad_norm": 0.9870996475219727, + "learning_rate": 1.403363351752639e-05, + "loss": 0.5522, + "step": 20212 + }, + { + "epoch": 0.26873052274545073, + "grad_norm": 0.8782408237457275, + "learning_rate": 1.3958224255938485e-05, + "loss": 0.548, + "step": 20243 + }, + { + "epoch": 0.2691420549395479, + "grad_norm": 0.887537956237793, + "learning_rate": 1.388293959928911e-05, + "loss": 0.5548, + "step": 20274 + }, + { + "epoch": 0.2695535871336451, + "grad_norm": 0.9209414720535278, + "learning_rate": 1.3807780397155379e-05, + "loss": 0.5531, + "step": 20305 + }, + { + "epoch": 0.26996511932774225, + "grad_norm": 0.8398643732070923, + "learning_rate": 1.3732747497698655e-05, + "loss": 0.5582, + "step": 20336 + }, + { + "epoch": 0.27037665152183943, + "grad_norm": 0.9261983036994934, + "learning_rate": 1.3657841747655038e-05, + "loss": 0.5534, + "step": 20367 + }, + { + "epoch": 0.2707881837159366, + "grad_norm": 0.9387017488479614, + "learning_rate": 1.3583063992325706e-05, + "loss": 0.5536, + "step": 20398 + }, + { + "epoch": 0.2711997159100338, + "grad_norm": 0.9599831700325012, + "learning_rate": 1.3508415075567496e-05, + "loss": 0.552, + "step": 20429 + }, + { + "epoch": 0.2716112481041309, + "grad_norm": 0.8976027965545654, + "learning_rate": 1.343389583978327e-05, + "loss": 0.5582, + "step": 20460 + }, + { + "epoch": 0.2720227802982281, + "grad_norm": 0.890676736831665, + "learning_rate": 1.3359507125912468e-05, + "loss": 0.5597, + "step": 20491 + }, + { + "epoch": 0.27243431249232525, + "grad_norm": 0.8950179219245911, + "learning_rate": 1.3285249773421627e-05, + "loss": 0.5526, + "step": 20522 + }, + { + "epoch": 0.2728458446864224, + "grad_norm": 0.9863470792770386, + "learning_rate": 1.3211124620294884e-05, + "loss": 0.5497, + "step": 20553 + }, + { + "epoch": 0.2732573768805196, + "grad_norm": 0.9474931359291077, + "learning_rate": 1.313713250302451e-05, + "loss": 0.5506, + "step": 20584 + }, + { + "epoch": 0.2736689090746168, + "grad_norm": 0.9262164831161499, + "learning_rate": 1.3063274256601479e-05, + "loss": 0.5517, + "step": 20615 + }, + { + "epoch": 0.27408044126871395, + "grad_norm": 0.8490736484527588, + "learning_rate": 1.2989550714506086e-05, + "loss": 0.546, + "step": 20646 + }, + { + "epoch": 0.2744919734628111, + "grad_norm": 0.8594829440116882, + "learning_rate": 1.291596270869846e-05, + "loss": 0.554, + "step": 20677 + }, + { + "epoch": 0.27490350565690824, + "grad_norm": 0.8383352756500244, + "learning_rate": 1.284251106960927e-05, + "loss": 0.5513, + "step": 20708 + }, + { + "epoch": 0.2753150378510054, + "grad_norm": 0.8765247464179993, + "learning_rate": 1.2769196626130263e-05, + "loss": 0.5462, + "step": 20739 + }, + { + "epoch": 0.2757265700451026, + "grad_norm": 0.856604814529419, + "learning_rate": 1.2696020205604969e-05, + "loss": 0.562, + "step": 20770 + }, + { + "epoch": 0.27613810223919977, + "grad_norm": 0.8549590706825256, + "learning_rate": 1.2622982633819359e-05, + "loss": 0.5518, + "step": 20801 + }, + { + "epoch": 0.27654963443329694, + "grad_norm": 0.9898308515548706, + "learning_rate": 1.2550084734992484e-05, + "loss": 0.5499, + "step": 20832 + }, + { + "epoch": 0.2769611666273941, + "grad_norm": 0.8919757604598999, + "learning_rate": 1.247732733176724e-05, + "loss": 0.55, + "step": 20863 + }, + { + "epoch": 0.2773726988214913, + "grad_norm": 0.8670758008956909, + "learning_rate": 1.2404711245201044e-05, + "loss": 0.5549, + "step": 20894 + }, + { + "epoch": 0.27778423101558847, + "grad_norm": 0.822809636592865, + "learning_rate": 1.2332237294756535e-05, + "loss": 0.5445, + "step": 20925 + }, + { + "epoch": 0.2781957632096856, + "grad_norm": 0.8837505578994751, + "learning_rate": 1.225990629829241e-05, + "loss": 0.5543, + "step": 20956 + }, + { + "epoch": 0.27860729540378276, + "grad_norm": 0.8370216488838196, + "learning_rate": 1.2187719072054136e-05, + "loss": 0.5463, + "step": 20987 + }, + { + "epoch": 0.27901882759787994, + "grad_norm": 0.8596381545066833, + "learning_rate": 1.2115676430664735e-05, + "loss": 0.556, + "step": 21018 + }, + { + "epoch": 0.2794303597919771, + "grad_norm": 0.9435930848121643, + "learning_rate": 1.2043779187115647e-05, + "loss": 0.5539, + "step": 21049 + }, + { + "epoch": 0.2798418919860743, + "grad_norm": 0.8696517944335938, + "learning_rate": 1.1972028152757476e-05, + "loss": 0.5573, + "step": 21080 + }, + { + "epoch": 0.28025342418017146, + "grad_norm": 0.9277540445327759, + "learning_rate": 1.1900424137290889e-05, + "loss": 0.5557, + "step": 21111 + }, + { + "epoch": 0.28066495637426864, + "grad_norm": 0.8744814395904541, + "learning_rate": 1.1828967948757482e-05, + "loss": 0.5568, + "step": 21142 + }, + { + "epoch": 0.2810764885683658, + "grad_norm": 1.0164190530776978, + "learning_rate": 1.175766039353062e-05, + "loss": 0.5433, + "step": 21173 + }, + { + "epoch": 0.28148802076246293, + "grad_norm": 0.8906095623970032, + "learning_rate": 1.1686502276306382e-05, + "loss": 0.5493, + "step": 21204 + }, + { + "epoch": 0.2818995529565601, + "grad_norm": 0.8932943940162659, + "learning_rate": 1.1615494400094445e-05, + "loss": 0.551, + "step": 21235 + }, + { + "epoch": 0.2823110851506573, + "grad_norm": 0.9328072667121887, + "learning_rate": 1.1544637566209029e-05, + "loss": 0.5543, + "step": 21266 + }, + { + "epoch": 0.28272261734475446, + "grad_norm": 0.8685097694396973, + "learning_rate": 1.1473932574259886e-05, + "loss": 0.5618, + "step": 21297 + }, + { + "epoch": 0.28313414953885163, + "grad_norm": 0.8566640615463257, + "learning_rate": 1.1403380222143247e-05, + "loss": 0.5512, + "step": 21328 + }, + { + "epoch": 0.2835456817329488, + "grad_norm": 0.8968601226806641, + "learning_rate": 1.1332981306032808e-05, + "loss": 0.5429, + "step": 21359 + }, + { + "epoch": 0.283957213927046, + "grad_norm": 0.8937885761260986, + "learning_rate": 1.1262736620370762e-05, + "loss": 0.5522, + "step": 21390 + }, + { + "epoch": 0.28436874612114316, + "grad_norm": 0.9389865398406982, + "learning_rate": 1.1192646957858854e-05, + "loss": 0.5566, + "step": 21421 + }, + { + "epoch": 0.2847802783152403, + "grad_norm": 0.9788251519203186, + "learning_rate": 1.1122713109449381e-05, + "loss": 0.5532, + "step": 21452 + }, + { + "epoch": 0.28519181050933745, + "grad_norm": 0.8652181029319763, + "learning_rate": 1.105293586433634e-05, + "loss": 0.5524, + "step": 21483 + }, + { + "epoch": 0.2856033427034346, + "grad_norm": 0.9210936427116394, + "learning_rate": 1.0983316009946446e-05, + "loss": 0.5529, + "step": 21514 + }, + { + "epoch": 0.2860148748975318, + "grad_norm": 0.9828045964241028, + "learning_rate": 1.0913854331930282e-05, + "loss": 0.5513, + "step": 21545 + }, + { + "epoch": 0.286426407091629, + "grad_norm": 0.868962287902832, + "learning_rate": 1.0844551614153456e-05, + "loss": 0.5467, + "step": 21576 + }, + { + "epoch": 0.28683793928572615, + "grad_norm": 0.8329687714576721, + "learning_rate": 1.0775408638687725e-05, + "loss": 0.5542, + "step": 21607 + }, + { + "epoch": 0.2872494714798233, + "grad_norm": 0.7887142300605774, + "learning_rate": 1.0706426185802165e-05, + "loss": 0.5417, + "step": 21638 + }, + { + "epoch": 0.2876610036739205, + "grad_norm": 0.8512480854988098, + "learning_rate": 1.0637605033954371e-05, + "loss": 0.5538, + "step": 21669 + }, + { + "epoch": 0.2880725358680176, + "grad_norm": 0.9043695330619812, + "learning_rate": 1.05689459597817e-05, + "loss": 0.5494, + "step": 21700 + }, + { + "epoch": 0.2884840680621148, + "grad_norm": 0.9565821886062622, + "learning_rate": 1.050044973809246e-05, + "loss": 0.5489, + "step": 21731 + }, + { + "epoch": 0.28889560025621197, + "grad_norm": 0.8471581935882568, + "learning_rate": 1.043211714185722e-05, + "loss": 0.5512, + "step": 21762 + }, + { + "epoch": 0.28930713245030915, + "grad_norm": 0.9377114176750183, + "learning_rate": 1.036394894220003e-05, + "loss": 0.5526, + "step": 21793 + }, + { + "epoch": 0.2897186646444063, + "grad_norm": 0.9441999793052673, + "learning_rate": 1.0295945908389751e-05, + "loss": 0.5503, + "step": 21824 + }, + { + "epoch": 0.2901301968385035, + "grad_norm": 0.9086009860038757, + "learning_rate": 1.0228108807831393e-05, + "loss": 0.545, + "step": 21855 + }, + { + "epoch": 0.29054172903260067, + "grad_norm": 0.889674961566925, + "learning_rate": 1.01604384060574e-05, + "loss": 0.5569, + "step": 21886 + }, + { + "epoch": 0.29095326122669785, + "grad_norm": 0.8675930500030518, + "learning_rate": 1.009293546671907e-05, + "loss": 0.5547, + "step": 21917 + }, + { + "epoch": 0.29136479342079497, + "grad_norm": 0.8342081904411316, + "learning_rate": 1.002560075157791e-05, + "loss": 0.5461, + "step": 21948 + }, + { + "epoch": 0.29177632561489214, + "grad_norm": 0.9048583507537842, + "learning_rate": 9.958435020496995e-06, + "loss": 0.5457, + "step": 21979 + }, + { + "epoch": 0.2921878578089893, + "grad_norm": 0.9375602602958679, + "learning_rate": 9.89143903143249e-06, + "loss": 0.5568, + "step": 22010 + }, + { + "epoch": 0.2925993900030865, + "grad_norm": 0.8803778886795044, + "learning_rate": 9.824613540425038e-06, + "loss": 0.5503, + "step": 22041 + }, + { + "epoch": 0.29301092219718367, + "grad_norm": 0.8693305850028992, + "learning_rate": 9.757959301591197e-06, + "loss": 0.5485, + "step": 22072 + }, + { + "epoch": 0.29342245439128084, + "grad_norm": 0.8868476748466492, + "learning_rate": 9.691477067115017e-06, + "loss": 0.5588, + "step": 22103 + }, + { + "epoch": 0.293833986585378, + "grad_norm": 0.8572340607643127, + "learning_rate": 9.625167587239467e-06, + "loss": 0.5507, + "step": 22134 + }, + { + "epoch": 0.2942455187794752, + "grad_norm": 0.922905445098877, + "learning_rate": 9.559031610258007e-06, + "loss": 0.5478, + "step": 22165 + }, + { + "epoch": 0.2946570509735723, + "grad_norm": 0.8140031695365906, + "learning_rate": 9.493069882506164e-06, + "loss": 0.5467, + "step": 22196 + }, + { + "epoch": 0.2950685831676695, + "grad_norm": 0.8945645093917847, + "learning_rate": 9.427283148353056e-06, + "loss": 0.5478, + "step": 22227 + }, + { + "epoch": 0.29548011536176666, + "grad_norm": 0.8615440726280212, + "learning_rate": 9.361672150193052e-06, + "loss": 0.5522, + "step": 22258 + }, + { + "epoch": 0.29589164755586383, + "grad_norm": 0.8636476993560791, + "learning_rate": 9.29623762843734e-06, + "loss": 0.548, + "step": 22289 + }, + { + "epoch": 0.296303179749961, + "grad_norm": 0.842241108417511, + "learning_rate": 9.230980321505594e-06, + "loss": 0.5467, + "step": 22320 + }, + { + "epoch": 0.2967147119440582, + "grad_norm": 0.8380717039108276, + "learning_rate": 9.165900965817668e-06, + "loss": 0.5438, + "step": 22351 + }, + { + "epoch": 0.29712624413815536, + "grad_norm": 0.8693488240242004, + "learning_rate": 9.101000295785245e-06, + "loss": 0.5523, + "step": 22382 + }, + { + "epoch": 0.29753777633225253, + "grad_norm": 0.9079211354255676, + "learning_rate": 9.036279043803565e-06, + "loss": 0.5499, + "step": 22413 + }, + { + "epoch": 0.29794930852634965, + "grad_norm": 0.7630789875984192, + "learning_rate": 8.971737940243147e-06, + "loss": 0.5529, + "step": 22444 + }, + { + "epoch": 0.29836084072044683, + "grad_norm": 0.886512279510498, + "learning_rate": 8.907377713441592e-06, + "loss": 0.5463, + "step": 22475 + }, + { + "epoch": 0.298772372914544, + "grad_norm": 0.7641420364379883, + "learning_rate": 8.843199089695293e-06, + "loss": 0.5389, + "step": 22506 + }, + { + "epoch": 0.2991839051086412, + "grad_norm": 0.8912153244018555, + "learning_rate": 8.779202793251311e-06, + "loss": 0.5457, + "step": 22537 + }, + { + "epoch": 0.29959543730273835, + "grad_norm": 0.9104102849960327, + "learning_rate": 8.715389546299149e-06, + "loss": 0.5482, + "step": 22568 + }, + { + "epoch": 0.30000696949683553, + "grad_norm": 0.9206966161727905, + "learning_rate": 8.651760068962617e-06, + "loss": 0.5481, + "step": 22599 + }, + { + "epoch": 0.3004185016909327, + "grad_norm": 0.8885296583175659, + "learning_rate": 8.588315079291733e-06, + "loss": 0.5462, + "step": 22630 + }, + { + "epoch": 0.3008300338850299, + "grad_norm": 0.8395354747772217, + "learning_rate": 8.52505529325457e-06, + "loss": 0.5463, + "step": 22661 + }, + { + "epoch": 0.301241566079127, + "grad_norm": 0.9492244124412537, + "learning_rate": 8.461981424729216e-06, + "loss": 0.5515, + "step": 22692 + }, + { + "epoch": 0.3016530982732242, + "grad_norm": 0.8974335789680481, + "learning_rate": 8.399094185495725e-06, + "loss": 0.5512, + "step": 22723 + }, + { + "epoch": 0.30206463046732135, + "grad_norm": 0.8450007438659668, + "learning_rate": 8.336394285228017e-06, + "loss": 0.549, + "step": 22754 + }, + { + "epoch": 0.3024761626614185, + "grad_norm": 0.8842496275901794, + "learning_rate": 8.273882431485952e-06, + "loss": 0.5426, + "step": 22785 + }, + { + "epoch": 0.3028876948555157, + "grad_norm": 0.875068724155426, + "learning_rate": 8.211559329707316e-06, + "loss": 0.5478, + "step": 22816 + }, + { + "epoch": 0.3032992270496129, + "grad_norm": 0.8835846185684204, + "learning_rate": 8.149425683199823e-06, + "loss": 0.5521, + "step": 22847 + }, + { + "epoch": 0.30371075924371005, + "grad_norm": 0.9190506935119629, + "learning_rate": 8.08748219313325e-06, + "loss": 0.55, + "step": 22878 + }, + { + "epoch": 0.3041222914378072, + "grad_norm": 0.9006677269935608, + "learning_rate": 8.025729558531453e-06, + "loss": 0.5405, + "step": 22909 + }, + { + "epoch": 0.30453382363190434, + "grad_norm": 0.9302480220794678, + "learning_rate": 7.964168476264508e-06, + "loss": 0.5591, + "step": 22940 + }, + { + "epoch": 0.3049453558260015, + "grad_norm": 0.9019137024879456, + "learning_rate": 7.902799641040884e-06, + "loss": 0.5521, + "step": 22971 + }, + { + "epoch": 0.3053568880200987, + "grad_norm": 0.9111758470535278, + "learning_rate": 7.841623745399523e-06, + "loss": 0.5494, + "step": 23002 + }, + { + "epoch": 0.30576842021419587, + "grad_norm": 0.8428525924682617, + "learning_rate": 7.780641479702114e-06, + "loss": 0.5395, + "step": 23033 + }, + { + "epoch": 0.30617995240829304, + "grad_norm": 0.8785557150840759, + "learning_rate": 7.719853532125227e-06, + "loss": 0.5401, + "step": 23064 + }, + { + "epoch": 0.3065914846023902, + "grad_norm": 0.893214225769043, + "learning_rate": 7.65926058865258e-06, + "loss": 0.5503, + "step": 23095 + }, + { + "epoch": 0.3070030167964874, + "grad_norm": 0.9266390800476074, + "learning_rate": 7.598863333067313e-06, + "loss": 0.5415, + "step": 23126 + }, + { + "epoch": 0.30741454899058457, + "grad_norm": 0.839297354221344, + "learning_rate": 7.538662446944253e-06, + "loss": 0.5501, + "step": 23157 + }, + { + "epoch": 0.3078260811846817, + "grad_norm": 0.8542027473449707, + "learning_rate": 7.478658609642211e-06, + "loss": 0.5522, + "step": 23188 + }, + { + "epoch": 0.30823761337877886, + "grad_norm": 0.9187499284744263, + "learning_rate": 7.418852498296327e-06, + "loss": 0.5499, + "step": 23219 + }, + { + "epoch": 0.30864914557287604, + "grad_norm": 0.8777310252189636, + "learning_rate": 7.359244787810457e-06, + "loss": 0.5475, + "step": 23250 + }, + { + "epoch": 0.3090606777669732, + "grad_norm": 0.859711229801178, + "learning_rate": 7.299836150849493e-06, + "loss": 0.5487, + "step": 23281 + }, + { + "epoch": 0.3094722099610704, + "grad_norm": 0.8032732009887695, + "learning_rate": 7.240627257831847e-06, + "loss": 0.545, + "step": 23312 + }, + { + "epoch": 0.30988374215516756, + "grad_norm": 0.7653436064720154, + "learning_rate": 7.1816187769218195e-06, + "loss": 0.5536, + "step": 23343 + }, + { + "epoch": 0.31029527434926474, + "grad_norm": 0.8717504739761353, + "learning_rate": 7.1228113740220895e-06, + "loss": 0.5411, + "step": 23374 + }, + { + "epoch": 0.3107068065433619, + "grad_norm": 0.8277837634086609, + "learning_rate": 7.064205712766226e-06, + "loss": 0.5503, + "step": 23405 + }, + { + "epoch": 0.31111833873745903, + "grad_norm": 0.781703770160675, + "learning_rate": 7.005802454511129e-06, + "loss": 0.5464, + "step": 23436 + }, + { + "epoch": 0.3115298709315562, + "grad_norm": 0.8515232801437378, + "learning_rate": 6.947602258329639e-06, + "loss": 0.5556, + "step": 23467 + }, + { + "epoch": 0.3119414031256534, + "grad_norm": 0.9343826770782471, + "learning_rate": 6.889605781003078e-06, + "loss": 0.5514, + "step": 23498 + }, + { + "epoch": 0.31235293531975056, + "grad_norm": 0.9239291548728943, + "learning_rate": 6.831813677013776e-06, + "loss": 0.5414, + "step": 23529 + }, + { + "epoch": 0.31276446751384773, + "grad_norm": 0.8628037571907043, + "learning_rate": 6.774226598537792e-06, + "loss": 0.5486, + "step": 23560 + }, + { + "epoch": 0.3131759997079449, + "grad_norm": 0.8857805728912354, + "learning_rate": 6.716845195437482e-06, + "loss": 0.5426, + "step": 23591 + }, + { + "epoch": 0.3135875319020421, + "grad_norm": 0.8357077836990356, + "learning_rate": 6.659670115254168e-06, + "loss": 0.5494, + "step": 23622 + }, + { + "epoch": 0.31399906409613926, + "grad_norm": 0.8405023813247681, + "learning_rate": 6.602702003200872e-06, + "loss": 0.547, + "step": 23653 + }, + { + "epoch": 0.3144105962902364, + "grad_norm": 0.9647945165634155, + "learning_rate": 6.545941502154992e-06, + "loss": 0.5479, + "step": 23684 + }, + { + "epoch": 0.31482212848433355, + "grad_norm": 0.9345009326934814, + "learning_rate": 6.489389252651057e-06, + "loss": 0.542, + "step": 23715 + }, + { + "epoch": 0.3152336606784307, + "grad_norm": 0.9495857954025269, + "learning_rate": 6.4330458928735325e-06, + "loss": 0.547, + "step": 23746 + }, + { + "epoch": 0.3156451928725279, + "grad_norm": 0.888819694519043, + "learning_rate": 6.376912058649559e-06, + "loss": 0.5417, + "step": 23777 + }, + { + "epoch": 0.3160567250666251, + "grad_norm": 0.8969824910163879, + "learning_rate": 6.320988383441845e-06, + "loss": 0.5467, + "step": 23808 + }, + { + "epoch": 0.31646825726072225, + "grad_norm": 0.8562204241752625, + "learning_rate": 6.265275498341452e-06, + "loss": 0.5507, + "step": 23839 + }, + { + "epoch": 0.3168797894548194, + "grad_norm": 0.9174118638038635, + "learning_rate": 6.209774032060714e-06, + "loss": 0.5508, + "step": 23870 + }, + { + "epoch": 0.3172913216489166, + "grad_norm": 0.872319221496582, + "learning_rate": 6.1544846109261365e-06, + "loss": 0.5468, + "step": 23901 + }, + { + "epoch": 0.3177028538430137, + "grad_norm": 0.8186289668083191, + "learning_rate": 6.099407858871342e-06, + "loss": 0.5433, + "step": 23932 + }, + { + "epoch": 0.3181143860371109, + "grad_norm": 0.8317052125930786, + "learning_rate": 6.044544397429958e-06, + "loss": 0.555, + "step": 23963 + }, + { + "epoch": 0.31852591823120807, + "grad_norm": 0.8226687908172607, + "learning_rate": 5.989894845728708e-06, + "loss": 0.5444, + "step": 23994 + }, + { + "epoch": 0.31893745042530525, + "grad_norm": 0.9374111890792847, + "learning_rate": 5.9354598204803605e-06, + "loss": 0.5504, + "step": 24025 + }, + { + "epoch": 0.3193489826194024, + "grad_norm": 0.9114209413528442, + "learning_rate": 5.881239935976762e-06, + "loss": 0.5436, + "step": 24056 + }, + { + "epoch": 0.3197605148134996, + "grad_norm": 0.8481084704399109, + "learning_rate": 5.827235804081954e-06, + "loss": 0.5483, + "step": 24087 + }, + { + "epoch": 0.32017204700759677, + "grad_norm": 0.8786484599113464, + "learning_rate": 5.773448034225221e-06, + "loss": 0.5528, + "step": 24118 + }, + { + "epoch": 0.32058357920169395, + "grad_norm": 0.8430096507072449, + "learning_rate": 5.719877233394228e-06, + "loss": 0.5422, + "step": 24149 + }, + { + "epoch": 0.32099511139579107, + "grad_norm": 0.870892345905304, + "learning_rate": 5.666524006128191e-06, + "loss": 0.5465, + "step": 24180 + }, + { + "epoch": 0.32140664358988824, + "grad_norm": 0.924968957901001, + "learning_rate": 5.613388954511015e-06, + "loss": 0.5462, + "step": 24211 + }, + { + "epoch": 0.3218181757839854, + "grad_norm": 0.8519983887672424, + "learning_rate": 5.560472678164552e-06, + "loss": 0.5453, + "step": 24242 + }, + { + "epoch": 0.3222297079780826, + "grad_norm": 0.8353081345558167, + "learning_rate": 5.507775774241775e-06, + "loss": 0.5429, + "step": 24273 + }, + { + "epoch": 0.32264124017217977, + "grad_norm": 0.9054728746414185, + "learning_rate": 5.4552988374200945e-06, + "loss": 0.5519, + "step": 24304 + }, + { + "epoch": 0.32305277236627694, + "grad_norm": 0.9061859250068665, + "learning_rate": 5.403042459894597e-06, + "loss": 0.5458, + "step": 24335 + }, + { + "epoch": 0.3234643045603741, + "grad_norm": 0.7874587774276733, + "learning_rate": 5.3510072313714135e-06, + "loss": 0.5478, + "step": 24366 + }, + { + "epoch": 0.3238758367544713, + "grad_norm": 0.8362119793891907, + "learning_rate": 5.2991937390610205e-06, + "loss": 0.5465, + "step": 24397 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8013248630772728e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-24416/training_args.bin b/checkpoint-24416/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f3d87fcba007ec2448c21323b9fdb9fa6105de5f --- /dev/null +++ b/checkpoint-24416/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf5225534ee746c36dba9f28281e0b858e6e9b4a308bdaf1de4f68e79e9ee83 +size 5304 diff --git a/checkpoint-27468/config.json b/checkpoint-27468/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-27468/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-27468/generation_config.json b/checkpoint-27468/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-27468/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-27468/model-00001-of-00007.safetensors b/checkpoint-27468/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c6315c29bbf42b7ad9894ce0b58cb9af2ff0a858 --- /dev/null +++ b/checkpoint-27468/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a595d2916a20a7dd8dccf64f72cf1b863eed594fc138ffb335f924df7448112 +size 4886466168 diff --git a/checkpoint-27468/model-00002-of-00007.safetensors b/checkpoint-27468/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-27468/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-27468/model-00003-of-00007.safetensors b/checkpoint-27468/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-27468/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-27468/model-00004-of-00007.safetensors b/checkpoint-27468/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-27468/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-27468/model-00005-of-00007.safetensors b/checkpoint-27468/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-27468/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-27468/model-00006-of-00007.safetensors b/checkpoint-27468/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3357141a9df36e901764c7cf212a024a014ad91c --- /dev/null +++ b/checkpoint-27468/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ceb2349ba626060ee1840f5504531f093564ad6e9911fa66e93694d3ff5af498 +size 4999813120 diff --git a/checkpoint-27468/model-00007-of-00007.safetensors b/checkpoint-27468/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4dd68ef902632b1f6e17c678b6aca1cdd3a18f6c --- /dev/null +++ b/checkpoint-27468/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:352da5c6836acaa137ab5b4ea048f1f92a954b8f7694b46ad6bac9d54adca1c0 +size 2571158184 diff --git a/checkpoint-27468/model.safetensors.index.json b/checkpoint-27468/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-27468/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-27468/optimizer.pt b/checkpoint-27468/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..08f81d6209a16b5d7aef34806c5083e7a23390f5 --- /dev/null +++ b/checkpoint-27468/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deb7f2153a6f3adc62a6d4e928171e0afef324e9d15ea4e2573a7ab950863522 +size 15385036334 diff --git a/checkpoint-27468/rng_state.pth b/checkpoint-27468/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-27468/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-27468/scheduler.pt b/checkpoint-27468/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec6a33a1e16fa727d72d8610d56b97fd04ba15e3 --- /dev/null +++ b/checkpoint-27468/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d47b007e64bffbb0dc51c02560ea2fea14f1ab5035228332be1bd00a38697eb +size 1064 diff --git a/checkpoint-27468/trainer_state.json b/checkpoint-27468/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..901e743e2f9ea0de0ebab2189862b8837903ef7e --- /dev/null +++ b/checkpoint-27468/trainer_state.json @@ -0,0 +1,6235 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3646440744342262, + "eval_steps": 500, + "global_step": 27468, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00041153219409716807, + "grad_norm": 4.147768020629883, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.7978, + "step": 31 + }, + { + "epoch": 0.0008230643881943361, + "grad_norm": 3.117840051651001, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.7723, + "step": 62 + }, + { + "epoch": 0.0012345965822915042, + "grad_norm": 2.83072829246521, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7493, + "step": 93 + }, + { + "epoch": 0.0016461287763886723, + "grad_norm": 2.9491968154907227, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7352, + "step": 124 + }, + { + "epoch": 0.0020576609704858403, + "grad_norm": 2.798553228378296, + "learning_rate": 5.078636959370905e-06, + "loss": 0.7229, + "step": 155 + }, + { + "epoch": 0.0024691931645830084, + "grad_norm": 2.663148880004883, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7253, + "step": 186 + }, + { + "epoch": 0.0028807253586801765, + "grad_norm": 2.564692974090576, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7287, + "step": 217 + }, + { + "epoch": 0.0032922575527773446, + "grad_norm": 3.2043237686157227, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7205, + "step": 248 + }, + { + "epoch": 0.0037037897468745126, + "grad_norm": 2.6705269813537598, + "learning_rate": 9.141546526867629e-06, + "loss": 0.714, + "step": 279 + }, + { + "epoch": 0.004115321940971681, + "grad_norm": 3.432569980621338, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7217, + "step": 310 + }, + { + "epoch": 0.004526854135068849, + "grad_norm": 2.515808343887329, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7173, + "step": 341 + }, + { + "epoch": 0.004938386329166017, + "grad_norm": 2.6708128452301025, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7123, + "step": 372 + }, + { + "epoch": 0.005349918523263185, + "grad_norm": 2.472797155380249, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.7079, + "step": 403 + }, + { + "epoch": 0.005761450717360353, + "grad_norm": 2.486147880554199, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7146, + "step": 434 + }, + { + "epoch": 0.006172982911457521, + "grad_norm": 2.310777187347412, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7, + "step": 465 + }, + { + "epoch": 0.006584515105554689, + "grad_norm": 3.1831858158111572, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7042, + "step": 496 + }, + { + "epoch": 0.006996047299651857, + "grad_norm": 2.505427360534668, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.6991, + "step": 527 + }, + { + "epoch": 0.007407579493749025, + "grad_norm": 2.3627443313598633, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7045, + "step": 558 + }, + { + "epoch": 0.007819111687846193, + "grad_norm": 2.6250648498535156, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.704, + "step": 589 + }, + { + "epoch": 0.008230643881943361, + "grad_norm": 2.3151464462280273, + "learning_rate": 2.031454783748362e-05, + "loss": 0.6959, + "step": 620 + }, + { + "epoch": 0.00864217607604053, + "grad_norm": 2.0312647819519043, + "learning_rate": 2.13302752293578e-05, + "loss": 0.689, + "step": 651 + }, + { + "epoch": 0.009053708270137698, + "grad_norm": 2.718970775604248, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7031, + "step": 682 + }, + { + "epoch": 0.009465240464234866, + "grad_norm": 1.9124070405960083, + "learning_rate": 2.336173001310616e-05, + "loss": 0.6908, + "step": 713 + }, + { + "epoch": 0.009876772658332034, + "grad_norm": 2.368354082107544, + "learning_rate": 2.437745740498034e-05, + "loss": 0.6947, + "step": 744 + }, + { + "epoch": 0.010288304852429202, + "grad_norm": 1.8856632709503174, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.6876, + "step": 775 + }, + { + "epoch": 0.01069983704652637, + "grad_norm": 2.3155925273895264, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7024, + "step": 806 + }, + { + "epoch": 0.011111369240623538, + "grad_norm": 2.2516837120056152, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.695, + "step": 837 + }, + { + "epoch": 0.011522901434720706, + "grad_norm": 2.4444823265075684, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.6892, + "step": 868 + }, + { + "epoch": 0.011934433628817874, + "grad_norm": 2.4233832359313965, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.6871, + "step": 899 + }, + { + "epoch": 0.012345965822915042, + "grad_norm": 2.0262961387634277, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.6888, + "step": 930 + }, + { + "epoch": 0.01275749801701221, + "grad_norm": 2.3908143043518066, + "learning_rate": 3.148754914809961e-05, + "loss": 0.6997, + "step": 961 + }, + { + "epoch": 0.013169030211109378, + "grad_norm": 1.839417576789856, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.6826, + "step": 992 + }, + { + "epoch": 0.013580562405206546, + "grad_norm": 1.8175997734069824, + "learning_rate": 3.351900393184797e-05, + "loss": 0.6857, + "step": 1023 + }, + { + "epoch": 0.013992094599303714, + "grad_norm": 2.3080506324768066, + "learning_rate": 3.453473132372215e-05, + "loss": 0.6896, + "step": 1054 + }, + { + "epoch": 0.014403626793400882, + "grad_norm": 2.0574960708618164, + "learning_rate": 3.555045871559633e-05, + "loss": 0.6862, + "step": 1085 + }, + { + "epoch": 0.01481515898749805, + "grad_norm": 1.957221269607544, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.6796, + "step": 1116 + }, + { + "epoch": 0.015226691181595219, + "grad_norm": 2.2740609645843506, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.7082, + "step": 1147 + }, + { + "epoch": 0.015638223375692387, + "grad_norm": 2.1291615962982178, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.6852, + "step": 1178 + }, + { + "epoch": 0.016049755569789555, + "grad_norm": 2.2918620109558105, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.6821, + "step": 1209 + }, + { + "epoch": 0.016461287763886723, + "grad_norm": 1.9689708948135376, + "learning_rate": 4.062909567496724e-05, + "loss": 0.676, + "step": 1240 + }, + { + "epoch": 0.01687281995798389, + "grad_norm": 2.4405298233032227, + "learning_rate": 4.164482306684142e-05, + "loss": 0.6927, + "step": 1271 + }, + { + "epoch": 0.01728435215208106, + "grad_norm": 2.0007450580596924, + "learning_rate": 4.26605504587156e-05, + "loss": 0.6864, + "step": 1302 + }, + { + "epoch": 0.017695884346178227, + "grad_norm": 1.7013226747512817, + "learning_rate": 4.367627785058978e-05, + "loss": 0.6827, + "step": 1333 + }, + { + "epoch": 0.018107416540275395, + "grad_norm": 2.2202930450439453, + "learning_rate": 4.469200524246396e-05, + "loss": 0.6722, + "step": 1364 + }, + { + "epoch": 0.018518948734372563, + "grad_norm": 1.690030813217163, + "learning_rate": 4.570773263433814e-05, + "loss": 0.6807, + "step": 1395 + }, + { + "epoch": 0.01893048092846973, + "grad_norm": 1.5886859893798828, + "learning_rate": 4.672346002621232e-05, + "loss": 0.6867, + "step": 1426 + }, + { + "epoch": 0.0193420131225669, + "grad_norm": 1.7371172904968262, + "learning_rate": 4.77391874180865e-05, + "loss": 0.6832, + "step": 1457 + }, + { + "epoch": 0.019753545316664067, + "grad_norm": 1.8376264572143555, + "learning_rate": 4.875491480996068e-05, + "loss": 0.681, + "step": 1488 + }, + { + "epoch": 0.020165077510761235, + "grad_norm": 1.6280311346054077, + "learning_rate": 4.977064220183487e-05, + "loss": 0.6795, + "step": 1519 + }, + { + "epoch": 0.020576609704858403, + "grad_norm": 1.8016823530197144, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.6789, + "step": 1550 + }, + { + "epoch": 0.02098814189895557, + "grad_norm": 1.7355237007141113, + "learning_rate": 4.999955597496219e-05, + "loss": 0.6753, + "step": 1581 + }, + { + "epoch": 0.02139967409305274, + "grad_norm": 1.5687408447265625, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.6777, + "step": 1612 + }, + { + "epoch": 0.021811206287149908, + "grad_norm": 1.790453314781189, + "learning_rate": 4.999799067923527e-05, + "loss": 0.684, + "step": 1643 + }, + { + "epoch": 0.022222738481247076, + "grad_norm": 1.9073406457901, + "learning_rate": 4.999678487776908e-05, + "loss": 0.6793, + "step": 1674 + }, + { + "epoch": 0.022634270675344244, + "grad_norm": 1.5575100183486938, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.6794, + "step": 1705 + }, + { + "epoch": 0.023045802869441412, + "grad_norm": 2.100715160369873, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6753, + "step": 1736 + }, + { + "epoch": 0.02345733506353858, + "grad_norm": 1.9425480365753174, + "learning_rate": 4.999147503179668e-05, + "loss": 0.6755, + "step": 1767 + }, + { + "epoch": 0.023868867257635748, + "grad_norm": 1.5791170597076416, + "learning_rate": 4.998914100252672e-05, + "loss": 0.6754, + "step": 1798 + }, + { + "epoch": 0.024280399451732916, + "grad_norm": 1.5692673921585083, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6682, + "step": 1829 + }, + { + "epoch": 0.024691931645830084, + "grad_norm": 1.5757725238800049, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.6723, + "step": 1860 + }, + { + "epoch": 0.025103463839927252, + "grad_norm": 1.8000696897506714, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6647, + "step": 1891 + }, + { + "epoch": 0.02551499603402442, + "grad_norm": 1.682919979095459, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.669, + "step": 1922 + }, + { + "epoch": 0.02592652822812159, + "grad_norm": 1.6138657331466675, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6698, + "step": 1953 + }, + { + "epoch": 0.026338060422218756, + "grad_norm": 1.5016391277313232, + "learning_rate": 4.99692159912661e-05, + "loss": 0.6645, + "step": 1984 + }, + { + "epoch": 0.026749592616315924, + "grad_norm": 1.6850241422653198, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6769, + "step": 2015 + }, + { + "epoch": 0.027161124810413093, + "grad_norm": 1.521161437034607, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6599, + "step": 2046 + }, + { + "epoch": 0.02757265700451026, + "grad_norm": 1.8039467334747314, + "learning_rate": 4.995544899210594e-05, + "loss": 0.6693, + "step": 2077 + }, + { + "epoch": 0.02798418919860743, + "grad_norm": 1.688811182975769, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.661, + "step": 2108 + }, + { + "epoch": 0.028395721392704597, + "grad_norm": 1.669871211051941, + "learning_rate": 4.994486281210429e-05, + "loss": 0.6611, + "step": 2139 + }, + { + "epoch": 0.028807253586801765, + "grad_norm": 1.3887263536453247, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6677, + "step": 2170 + }, + { + "epoch": 0.029218785780898933, + "grad_norm": 1.3242331743240356, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6537, + "step": 2201 + }, + { + "epoch": 0.0296303179749961, + "grad_norm": 1.6638622283935547, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6653, + "step": 2232 + }, + { + "epoch": 0.03004185016909327, + "grad_norm": 1.5110372304916382, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6615, + "step": 2263 + }, + { + "epoch": 0.030453382363190437, + "grad_norm": 1.5244065523147583, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.6552, + "step": 2294 + }, + { + "epoch": 0.030864914557287605, + "grad_norm": 1.416624903678894, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6569, + "step": 2325 + }, + { + "epoch": 0.03127644675138477, + "grad_norm": 1.4467304944992065, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6613, + "step": 2356 + }, + { + "epoch": 0.03168797894548194, + "grad_norm": 1.4649864435195923, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6604, + "step": 2387 + }, + { + "epoch": 0.03209951113957911, + "grad_norm": 1.6256624460220337, + "learning_rate": 4.988329891293693e-05, + "loss": 0.655, + "step": 2418 + }, + { + "epoch": 0.03251104333367628, + "grad_norm": 1.3986462354660034, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6514, + "step": 2449 + }, + { + "epoch": 0.032922575527773446, + "grad_norm": 1.4490609169006348, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.65, + "step": 2480 + }, + { + "epoch": 0.033334107721870614, + "grad_norm": 1.4979615211486816, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.6579, + "step": 2511 + }, + { + "epoch": 0.03374563991596778, + "grad_norm": 1.6314005851745605, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6604, + "step": 2542 + }, + { + "epoch": 0.03415717211006495, + "grad_norm": 1.310655951499939, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6518, + "step": 2573 + }, + { + "epoch": 0.03456870430416212, + "grad_norm": 1.4176305532455444, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6518, + "step": 2604 + }, + { + "epoch": 0.034980236498259286, + "grad_norm": 1.6330211162567139, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6474, + "step": 2635 + }, + { + "epoch": 0.035391768692356454, + "grad_norm": 1.5854465961456299, + "learning_rate": 4.980947995086024e-05, + "loss": 0.6462, + "step": 2666 + }, + { + "epoch": 0.03580330088645362, + "grad_norm": 1.2944835424423218, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6503, + "step": 2697 + }, + { + "epoch": 0.03621483308055079, + "grad_norm": 1.4056243896484375, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6414, + "step": 2728 + }, + { + "epoch": 0.03662636527464796, + "grad_norm": 1.5649598836898804, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6449, + "step": 2759 + }, + { + "epoch": 0.037037897468745126, + "grad_norm": 1.5205659866333008, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6334, + "step": 2790 + }, + { + "epoch": 0.037449429662842294, + "grad_norm": 1.3248411417007446, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6448, + "step": 2821 + }, + { + "epoch": 0.03786096185693946, + "grad_norm": 1.2402842044830322, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6476, + "step": 2852 + }, + { + "epoch": 0.03827249405103663, + "grad_norm": 1.7939503192901611, + "learning_rate": 4.973018858007122e-05, + "loss": 0.6472, + "step": 2883 + }, + { + "epoch": 0.0386840262451338, + "grad_norm": 1.2774665355682373, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6374, + "step": 2914 + }, + { + "epoch": 0.03909555843923097, + "grad_norm": 1.6384273767471313, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6623, + "step": 2945 + }, + { + "epoch": 0.039507090633328135, + "grad_norm": 1.559270977973938, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6391, + "step": 2976 + }, + { + "epoch": 0.0399186228274253, + "grad_norm": 1.6405760049819946, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.6437, + "step": 3007 + }, + { + "epoch": 0.04033015502152247, + "grad_norm": 1.3599917888641357, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6436, + "step": 3038 + }, + { + "epoch": 0.04074168721561964, + "grad_norm": 1.3080962896347046, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6331, + "step": 3069 + }, + { + "epoch": 0.04115321940971681, + "grad_norm": 1.435195803642273, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6417, + "step": 3100 + }, + { + "epoch": 0.041564751603813975, + "grad_norm": 1.342986822128296, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6391, + "step": 3131 + }, + { + "epoch": 0.04197628379791114, + "grad_norm": 1.524418592453003, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6511, + "step": 3162 + }, + { + "epoch": 0.04238781599200831, + "grad_norm": 1.4617900848388672, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6367, + "step": 3193 + }, + { + "epoch": 0.04279934818610548, + "grad_norm": 1.3968737125396729, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.6387, + "step": 3224 + }, + { + "epoch": 0.04321088038020265, + "grad_norm": 1.258406400680542, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6356, + "step": 3255 + }, + { + "epoch": 0.043622412574299815, + "grad_norm": 1.5494844913482666, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6433, + "step": 3286 + }, + { + "epoch": 0.04403394476839698, + "grad_norm": 1.2248284816741943, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6376, + "step": 3317 + }, + { + "epoch": 0.04444547696249415, + "grad_norm": 1.4243425130844116, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6341, + "step": 3348 + }, + { + "epoch": 0.04485700915659132, + "grad_norm": 1.379364252090454, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6378, + "step": 3379 + }, + { + "epoch": 0.04526854135068849, + "grad_norm": 1.3460108041763306, + "learning_rate": 4.948079823064559e-05, + "loss": 0.639, + "step": 3410 + }, + { + "epoch": 0.045680073544785656, + "grad_norm": 1.382348656654358, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6327, + "step": 3441 + }, + { + "epoch": 0.046091605738882824, + "grad_norm": 1.40754234790802, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6234, + "step": 3472 + }, + { + "epoch": 0.04650313793297999, + "grad_norm": 1.502922773361206, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6404, + "step": 3503 + }, + { + "epoch": 0.04691467012707716, + "grad_norm": 1.2896559238433838, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6378, + "step": 3534 + }, + { + "epoch": 0.04732620232117433, + "grad_norm": 1.3114830255508423, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6373, + "step": 3565 + }, + { + "epoch": 0.047737734515271496, + "grad_norm": 1.4122483730316162, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6305, + "step": 3596 + }, + { + "epoch": 0.048149266709368664, + "grad_norm": 1.2669744491577148, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6413, + "step": 3627 + }, + { + "epoch": 0.04856079890346583, + "grad_norm": 1.2965583801269531, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6364, + "step": 3658 + }, + { + "epoch": 0.048972331097563, + "grad_norm": 1.3328967094421387, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6466, + "step": 3689 + }, + { + "epoch": 0.04938386329166017, + "grad_norm": 1.6291115283966064, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6348, + "step": 3720 + }, + { + "epoch": 0.049795395485757336, + "grad_norm": 1.3224235773086548, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6256, + "step": 3751 + }, + { + "epoch": 0.050206927679854504, + "grad_norm": 1.3253369331359863, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6303, + "step": 3782 + }, + { + "epoch": 0.05061845987395167, + "grad_norm": 1.2806897163391113, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.6403, + "step": 3813 + }, + { + "epoch": 0.05102999206804884, + "grad_norm": 1.3389384746551514, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6288, + "step": 3844 + }, + { + "epoch": 0.05144152426214601, + "grad_norm": 1.251550555229187, + "learning_rate": 4.919443027766935e-05, + "loss": 0.637, + "step": 3875 + }, + { + "epoch": 0.05185305645624318, + "grad_norm": 1.258484959602356, + "learning_rate": 4.91731463569418e-05, + "loss": 0.629, + "step": 3906 + }, + { + "epoch": 0.052264588650340345, + "grad_norm": 1.3574656248092651, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6262, + "step": 3937 + }, + { + "epoch": 0.05267612084443751, + "grad_norm": 1.327051043510437, + "learning_rate": 4.912976038673786e-05, + "loss": 0.629, + "step": 3968 + }, + { + "epoch": 0.05308765303853468, + "grad_norm": 1.3813196420669556, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6308, + "step": 3999 + }, + { + "epoch": 0.05349918523263185, + "grad_norm": 1.2064168453216553, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6321, + "step": 4030 + }, + { + "epoch": 0.05391071742672902, + "grad_norm": 1.2771285772323608, + "learning_rate": 4.906263980464644e-05, + "loss": 0.622, + "step": 4061 + }, + { + "epoch": 0.054322249620826185, + "grad_norm": 1.1788181066513062, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6268, + "step": 4092 + }, + { + "epoch": 0.05473378181492335, + "grad_norm": 1.3975950479507446, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6251, + "step": 4123 + }, + { + "epoch": 0.05514531400902052, + "grad_norm": 1.2083688974380493, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6287, + "step": 4154 + }, + { + "epoch": 0.05555684620311769, + "grad_norm": 1.2795994281768799, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6302, + "step": 4185 + }, + { + "epoch": 0.05596837839721486, + "grad_norm": 1.1056169271469116, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6256, + "step": 4216 + }, + { + "epoch": 0.056379910591312025, + "grad_norm": 1.415405511856079, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6334, + "step": 4247 + }, + { + "epoch": 0.056791442785409194, + "grad_norm": 1.2858442068099976, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6248, + "step": 4278 + }, + { + "epoch": 0.05720297497950636, + "grad_norm": 1.3914793729782104, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6229, + "step": 4309 + }, + { + "epoch": 0.05761450717360353, + "grad_norm": 1.1236270666122437, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6272, + "step": 4340 + }, + { + "epoch": 0.0580260393677007, + "grad_norm": 1.393020510673523, + "learning_rate": 4.882129447892753e-05, + "loss": 0.624, + "step": 4371 + }, + { + "epoch": 0.058437571561797866, + "grad_norm": 1.3223553895950317, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6267, + "step": 4402 + }, + { + "epoch": 0.058849103755895034, + "grad_norm": 1.2308725118637085, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6255, + "step": 4433 + }, + { + "epoch": 0.0592606359499922, + "grad_norm": 1.3741412162780762, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6246, + "step": 4464 + }, + { + "epoch": 0.05967216814408937, + "grad_norm": 1.2487592697143555, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6221, + "step": 4495 + }, + { + "epoch": 0.06008370033818654, + "grad_norm": 1.3239238262176514, + "learning_rate": 4.869052379269719e-05, + "loss": 0.621, + "step": 4526 + }, + { + "epoch": 0.060495232532283706, + "grad_norm": 1.1411608457565308, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6211, + "step": 4557 + }, + { + "epoch": 0.060906764726380874, + "grad_norm": 1.1687365770339966, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6159, + "step": 4588 + }, + { + "epoch": 0.06131829692047804, + "grad_norm": 1.25679349899292, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6254, + "step": 4619 + }, + { + "epoch": 0.06172982911457521, + "grad_norm": 1.2380110025405884, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6305, + "step": 4650 + }, + { + "epoch": 0.06214136130867238, + "grad_norm": 1.3054466247558594, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6202, + "step": 4681 + }, + { + "epoch": 0.06255289350276955, + "grad_norm": 1.3628545999526978, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6192, + "step": 4712 + }, + { + "epoch": 0.06296442569686671, + "grad_norm": 1.3576844930648804, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6213, + "step": 4743 + }, + { + "epoch": 0.06337595789096388, + "grad_norm": 1.3126753568649292, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6198, + "step": 4774 + }, + { + "epoch": 0.06378749008506104, + "grad_norm": 1.2266637086868286, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6229, + "step": 4805 + }, + { + "epoch": 0.06419902227915822, + "grad_norm": 1.4964330196380615, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6264, + "step": 4836 + }, + { + "epoch": 0.06461055447325538, + "grad_norm": 1.1138426065444946, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6258, + "step": 4867 + }, + { + "epoch": 0.06502208666735255, + "grad_norm": 1.2055357694625854, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6263, + "step": 4898 + }, + { + "epoch": 0.06543361886144972, + "grad_norm": 1.2684381008148193, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6212, + "step": 4929 + }, + { + "epoch": 0.06584515105554689, + "grad_norm": 1.2323859930038452, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6253, + "step": 4960 + }, + { + "epoch": 0.06625668324964405, + "grad_norm": 1.091818928718567, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6127, + "step": 4991 + }, + { + "epoch": 0.06666821544374123, + "grad_norm": 1.2208534479141235, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6179, + "step": 5022 + }, + { + "epoch": 0.06707974763783839, + "grad_norm": 1.3426082134246826, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6246, + "step": 5053 + }, + { + "epoch": 0.06749127983193556, + "grad_norm": 1.2961252927780151, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.6149, + "step": 5084 + }, + { + "epoch": 0.06790281202603272, + "grad_norm": 1.2990977764129639, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6303, + "step": 5115 + }, + { + "epoch": 0.0683143442201299, + "grad_norm": 1.2724366188049316, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6234, + "step": 5146 + }, + { + "epoch": 0.06872587641422706, + "grad_norm": 1.2031605243682861, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6212, + "step": 5177 + }, + { + "epoch": 0.06913740860832424, + "grad_norm": 1.2580938339233398, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6144, + "step": 5208 + }, + { + "epoch": 0.0695489408024214, + "grad_norm": 1.203706979751587, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6259, + "step": 5239 + }, + { + "epoch": 0.06996047299651857, + "grad_norm": 1.421762466430664, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6221, + "step": 5270 + }, + { + "epoch": 0.07037200519061573, + "grad_norm": 1.326038122177124, + "learning_rate": 4.793722210363262e-05, + "loss": 0.6152, + "step": 5301 + }, + { + "epoch": 0.07078353738471291, + "grad_norm": 1.361939787864685, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6206, + "step": 5332 + }, + { + "epoch": 0.07119506957881007, + "grad_norm": 1.2153042554855347, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6209, + "step": 5363 + }, + { + "epoch": 0.07160660177290724, + "grad_norm": 1.3737229108810425, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6121, + "step": 5394 + }, + { + "epoch": 0.0720181339670044, + "grad_norm": 1.1781859397888184, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6183, + "step": 5425 + }, + { + "epoch": 0.07242966616110158, + "grad_norm": 1.069472312927246, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.6117, + "step": 5456 + }, + { + "epoch": 0.07284119835519874, + "grad_norm": 1.380937099456787, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6175, + "step": 5487 + }, + { + "epoch": 0.07325273054929592, + "grad_norm": 1.1551166772842407, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6164, + "step": 5518 + }, + { + "epoch": 0.07366426274339308, + "grad_norm": 1.2816351652145386, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6168, + "step": 5549 + }, + { + "epoch": 0.07407579493749025, + "grad_norm": 1.0833333730697632, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6159, + "step": 5580 + }, + { + "epoch": 0.07448732713158741, + "grad_norm": 1.2533819675445557, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6141, + "step": 5611 + }, + { + "epoch": 0.07489885932568459, + "grad_norm": 1.1354937553405762, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.612, + "step": 5642 + }, + { + "epoch": 0.07531039151978175, + "grad_norm": 1.2579649686813354, + "learning_rate": 4.751783684659e-05, + "loss": 0.6147, + "step": 5673 + }, + { + "epoch": 0.07572192371387892, + "grad_norm": 1.2830709218978882, + "learning_rate": 4.748122674890348e-05, + "loss": 0.62, + "step": 5704 + }, + { + "epoch": 0.07613345590797609, + "grad_norm": 1.2012654542922974, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6199, + "step": 5735 + }, + { + "epoch": 0.07654498810207326, + "grad_norm": 1.2926653623580933, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6104, + "step": 5766 + }, + { + "epoch": 0.07695652029617042, + "grad_norm": 1.2043049335479736, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6261, + "step": 5797 + }, + { + "epoch": 0.0773680524902676, + "grad_norm": 1.1188119649887085, + "learning_rate": 4.733225355658999e-05, + "loss": 0.6056, + "step": 5828 + }, + { + "epoch": 0.07777958468436476, + "grad_norm": 1.1217443943023682, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.6124, + "step": 5859 + }, + { + "epoch": 0.07819111687846193, + "grad_norm": 1.1623468399047852, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6191, + "step": 5890 + }, + { + "epoch": 0.0786026490725591, + "grad_norm": 1.1147449016571045, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6091, + "step": 5921 + }, + { + "epoch": 0.07901418126665627, + "grad_norm": 1.0523818731307983, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6151, + "step": 5952 + }, + { + "epoch": 0.07942571346075343, + "grad_norm": 1.3112642765045166, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6106, + "step": 5983 + }, + { + "epoch": 0.0798372456548506, + "grad_norm": 1.1637545824050903, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6131, + "step": 6014 + }, + { + "epoch": 0.08024877784894777, + "grad_norm": 1.258862018585205, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6119, + "step": 6045 + }, + { + "epoch": 0.08066031004304494, + "grad_norm": 1.0981336832046509, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6149, + "step": 6076 + }, + { + "epoch": 0.0810718422371421, + "grad_norm": 1.1576476097106934, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6076, + "step": 6107 + }, + { + "epoch": 0.08148337443123928, + "grad_norm": 1.1076856851577759, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6105, + "step": 6138 + }, + { + "epoch": 0.08189490662533644, + "grad_norm": 14.014957427978516, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6189, + "step": 6169 + }, + { + "epoch": 0.08230643881943361, + "grad_norm": 1.1413066387176514, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6038, + "step": 6200 + }, + { + "epoch": 0.08271797101353077, + "grad_norm": 1.2316724061965942, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6125, + "step": 6231 + }, + { + "epoch": 0.08312950320762795, + "grad_norm": 1.0919370651245117, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6081, + "step": 6262 + }, + { + "epoch": 0.08354103540172511, + "grad_norm": 1.139345407485962, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6078, + "step": 6293 + }, + { + "epoch": 0.08395256759582229, + "grad_norm": 1.1303651332855225, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6071, + "step": 6324 + }, + { + "epoch": 0.08436409978991945, + "grad_norm": 1.2546321153640747, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.6092, + "step": 6355 + }, + { + "epoch": 0.08477563198401662, + "grad_norm": 1.195173740386963, + "learning_rate": 4.661238099862658e-05, + "loss": 0.5998, + "step": 6386 + }, + { + "epoch": 0.08518716417811378, + "grad_norm": 1.2116323709487915, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6099, + "step": 6417 + }, + { + "epoch": 0.08559869637221096, + "grad_norm": 1.131951928138733, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.6105, + "step": 6448 + }, + { + "epoch": 0.08601022856630812, + "grad_norm": 1.2176560163497925, + "learning_rate": 4.648464661063478e-05, + "loss": 0.6045, + "step": 6479 + }, + { + "epoch": 0.0864217607604053, + "grad_norm": 1.15361750125885, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.6067, + "step": 6510 + }, + { + "epoch": 0.08683329295450246, + "grad_norm": 1.1422772407531738, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.605, + "step": 6541 + }, + { + "epoch": 0.08724482514859963, + "grad_norm": 1.250319004058838, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6069, + "step": 6572 + }, + { + "epoch": 0.08765635734269679, + "grad_norm": 1.0947929620742798, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6099, + "step": 6603 + }, + { + "epoch": 0.08806788953679397, + "grad_norm": 1.2815848588943481, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6132, + "step": 6634 + }, + { + "epoch": 0.08847942173089113, + "grad_norm": 1.196083664894104, + "learning_rate": 4.622264489304762e-05, + "loss": 0.604, + "step": 6665 + }, + { + "epoch": 0.0888909539249883, + "grad_norm": 1.1473642587661743, + "learning_rate": 4.617813681048434e-05, + "loss": 0.6081, + "step": 6696 + }, + { + "epoch": 0.08930248611908546, + "grad_norm": 1.182445764541626, + "learning_rate": 4.61333897355256e-05, + "loss": 0.6108, + "step": 6727 + }, + { + "epoch": 0.08971401831318264, + "grad_norm": 1.2859700918197632, + "learning_rate": 4.608840417313604e-05, + "loss": 0.6074, + "step": 6758 + }, + { + "epoch": 0.0901255505072798, + "grad_norm": 1.1134952306747437, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6139, + "step": 6789 + }, + { + "epoch": 0.09053708270137698, + "grad_norm": 1.2447940111160278, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6049, + "step": 6820 + }, + { + "epoch": 0.09094861489547414, + "grad_norm": 1.042465329170227, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6113, + "step": 6851 + }, + { + "epoch": 0.09136014708957131, + "grad_norm": 1.2242035865783691, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6077, + "step": 6882 + }, + { + "epoch": 0.09177167928366847, + "grad_norm": 1.2531142234802246, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6081, + "step": 6913 + }, + { + "epoch": 0.09218321147776565, + "grad_norm": 1.194645881652832, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6158, + "step": 6944 + }, + { + "epoch": 0.09259474367186281, + "grad_norm": 1.1052149534225464, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6079, + "step": 6975 + }, + { + "epoch": 0.09300627586595998, + "grad_norm": 1.1275289058685303, + "learning_rate": 4.571999560773736e-05, + "loss": 0.6107, + "step": 7006 + }, + { + "epoch": 0.09341780806005714, + "grad_norm": 1.251237154006958, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6028, + "step": 7037 + }, + { + "epoch": 0.09382934025415432, + "grad_norm": 1.1206951141357422, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6035, + "step": 7068 + }, + { + "epoch": 0.09424087244825148, + "grad_norm": 1.2242387533187866, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.5959, + "step": 7099 + }, + { + "epoch": 0.09465240464234866, + "grad_norm": 1.0749527215957642, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6005, + "step": 7130 + }, + { + "epoch": 0.09506393683644582, + "grad_norm": 1.183052659034729, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6102, + "step": 7161 + }, + { + "epoch": 0.09547546903054299, + "grad_norm": 1.1045013666152954, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6056, + "step": 7192 + }, + { + "epoch": 0.09588700122464015, + "grad_norm": 1.2442799806594849, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6024, + "step": 7223 + }, + { + "epoch": 0.09629853341873733, + "grad_norm": 1.2083353996276855, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6062, + "step": 7254 + }, + { + "epoch": 0.09671006561283449, + "grad_norm": 1.1391205787658691, + "learning_rate": 4.528766329570536e-05, + "loss": 0.5981, + "step": 7285 + }, + { + "epoch": 0.09712159780693166, + "grad_norm": 1.1213765144348145, + "learning_rate": 4.523847434837447e-05, + "loss": 0.5964, + "step": 7316 + }, + { + "epoch": 0.09753313000102883, + "grad_norm": 1.1574114561080933, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6035, + "step": 7347 + }, + { + "epoch": 0.097944662195126, + "grad_norm": 1.0101516246795654, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6036, + "step": 7378 + }, + { + "epoch": 0.09835619438922316, + "grad_norm": 1.1439381837844849, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6049, + "step": 7409 + }, + { + "epoch": 0.09876772658332034, + "grad_norm": 1.0668189525604248, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6035, + "step": 7440 + }, + { + "epoch": 0.0991792587774175, + "grad_norm": 1.2038415670394897, + "learning_rate": 4.498911497712155e-05, + "loss": 0.6062, + "step": 7471 + }, + { + "epoch": 0.09959079097151467, + "grad_norm": 1.1965882778167725, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6041, + "step": 7502 + }, + { + "epoch": 0.10000232316561183, + "grad_norm": 1.5885149240493774, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6129, + "step": 7533 + }, + { + "epoch": 0.10041385535970901, + "grad_norm": 1.058237910270691, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.5957, + "step": 7564 + }, + { + "epoch": 0.10082538755380617, + "grad_norm": 1.2043278217315674, + "learning_rate": 4.478556380141218e-05, + "loss": 0.5943, + "step": 7595 + }, + { + "epoch": 0.10123691974790335, + "grad_norm": 1.1602888107299805, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.5971, + "step": 7626 + }, + { + "epoch": 0.1016484519420005, + "grad_norm": 1.1536785364151, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.604, + "step": 7657 + }, + { + "epoch": 0.10205998413609768, + "grad_norm": 1.1859934329986572, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6043, + "step": 7688 + }, + { + "epoch": 0.10247151633019484, + "grad_norm": 1.0551954507827759, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6008, + "step": 7719 + }, + { + "epoch": 0.10288304852429202, + "grad_norm": 1.015857458114624, + "learning_rate": 4.452610552959183e-05, + "loss": 0.5961, + "step": 7750 + }, + { + "epoch": 0.10329458071838918, + "grad_norm": 1.0759170055389404, + "learning_rate": 4.447355047201428e-05, + "loss": 0.6025, + "step": 7781 + }, + { + "epoch": 0.10370611291248635, + "grad_norm": 1.025030255317688, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.602, + "step": 7812 + }, + { + "epoch": 0.10411764510658351, + "grad_norm": 1.0884742736816406, + "learning_rate": 4.436778168330484e-05, + "loss": 0.5989, + "step": 7843 + }, + { + "epoch": 0.10452917730068069, + "grad_norm": 1.1207304000854492, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.6016, + "step": 7874 + }, + { + "epoch": 0.10494070949477785, + "grad_norm": 0.9768222570419312, + "learning_rate": 4.42611386459262e-05, + "loss": 0.605, + "step": 7905 + }, + { + "epoch": 0.10535224168887503, + "grad_norm": 1.0720235109329224, + "learning_rate": 4.420749078676133e-05, + "loss": 0.5971, + "step": 7936 + }, + { + "epoch": 0.10576377388297219, + "grad_norm": 1.1767033338546753, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.5877, + "step": 7967 + }, + { + "epoch": 0.10617530607706936, + "grad_norm": 1.3390216827392578, + "learning_rate": 4.409954541451762e-05, + "loss": 0.5973, + "step": 7998 + }, + { + "epoch": 0.10658683827116652, + "grad_norm": 1.7948216199874878, + "learning_rate": 4.404524911958764e-05, + "loss": 0.5922, + "step": 8029 + }, + { + "epoch": 0.1069983704652637, + "grad_norm": 1.177847981452942, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6002, + "step": 8060 + }, + { + "epoch": 0.10740990265936086, + "grad_norm": 1.1014128923416138, + "learning_rate": 4.393601237573607e-05, + "loss": 0.5945, + "step": 8091 + }, + { + "epoch": 0.10782143485345803, + "grad_norm": 1.067475438117981, + "learning_rate": 4.388107315953628e-05, + "loss": 0.5996, + "step": 8122 + }, + { + "epoch": 0.1082329670475552, + "grad_norm": 1.1241830587387085, + "learning_rate": 4.382592087299212e-05, + "loss": 0.5973, + "step": 8153 + }, + { + "epoch": 0.10864449924165237, + "grad_norm": 1.072426438331604, + "learning_rate": 4.377055613848964e-05, + "loss": 0.601, + "step": 8184 + }, + { + "epoch": 0.10905603143574953, + "grad_norm": 1.0138245820999146, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6034, + "step": 8215 + }, + { + "epoch": 0.1094675636298467, + "grad_norm": 0.9993209838867188, + "learning_rate": 4.365919182713416e-05, + "loss": 0.6024, + "step": 8246 + }, + { + "epoch": 0.10987909582394387, + "grad_norm": 1.157468318939209, + "learning_rate": 4.360319350701226e-05, + "loss": 0.5987, + "step": 8277 + }, + { + "epoch": 0.11029062801804104, + "grad_norm": 1.2073496580123901, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6023, + "step": 8308 + }, + { + "epoch": 0.1107021602121382, + "grad_norm": 1.0800622701644897, + "learning_rate": 4.349056769754021e-05, + "loss": 0.5993, + "step": 8339 + }, + { + "epoch": 0.11111369240623538, + "grad_norm": 1.1365702152252197, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.595, + "step": 8370 + }, + { + "epoch": 0.11152522460033254, + "grad_norm": 1.1367030143737793, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.5921, + "step": 8401 + }, + { + "epoch": 0.11193675679442971, + "grad_norm": 1.0323926210403442, + "learning_rate": 4.332006561018488e-05, + "loss": 0.5883, + "step": 8432 + }, + { + "epoch": 0.11234828898852688, + "grad_norm": 1.2352523803710938, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.5957, + "step": 8463 + }, + { + "epoch": 0.11275982118262405, + "grad_norm": 1.1763888597488403, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.5941, + "step": 8494 + }, + { + "epoch": 0.11317135337672121, + "grad_norm": 1.1331796646118164, + "learning_rate": 4.314770288177384e-05, + "loss": 0.5993, + "step": 8525 + }, + { + "epoch": 0.11358288557081839, + "grad_norm": 1.146270990371704, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6041, + "step": 8556 + }, + { + "epoch": 0.11399441776491555, + "grad_norm": 1.0433791875839233, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.5918, + "step": 8587 + }, + { + "epoch": 0.11440594995901272, + "grad_norm": 1.2013510465621948, + "learning_rate": 4.297349701798505e-05, + "loss": 0.6031, + "step": 8618 + }, + { + "epoch": 0.11481748215310988, + "grad_norm": 1.1263506412506104, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.597, + "step": 8649 + }, + { + "epoch": 0.11522901434720706, + "grad_norm": 1.1360443830490112, + "learning_rate": 4.285634454093198e-05, + "loss": 0.5909, + "step": 8680 + }, + { + "epoch": 0.11564054654130422, + "grad_norm": 1.0095175504684448, + "learning_rate": 4.279746571169086e-05, + "loss": 0.59, + "step": 8711 + }, + { + "epoch": 0.1160520787354014, + "grad_norm": 1.0894793272018433, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.5894, + "step": 8742 + }, + { + "epoch": 0.11646361092949856, + "grad_norm": 1.1603643894195557, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.5924, + "step": 8773 + }, + { + "epoch": 0.11687514312359573, + "grad_norm": 1.177464485168457, + "learning_rate": 4.261962684116106e-05, + "loss": 0.5946, + "step": 8804 + }, + { + "epoch": 0.11728667531769289, + "grad_norm": 0.9632905125617981, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.5944, + "step": 8835 + }, + { + "epoch": 0.11769820751179007, + "grad_norm": 1.0413568019866943, + "learning_rate": 4.250007230372134e-05, + "loss": 0.5844, + "step": 8866 + }, + { + "epoch": 0.11810973970588723, + "grad_norm": 1.0879931449890137, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.588, + "step": 8897 + }, + { + "epoch": 0.1185212718999844, + "grad_norm": 1.0681732892990112, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.5832, + "step": 8928 + }, + { + "epoch": 0.11893280409408157, + "grad_norm": 1.0609339475631714, + "learning_rate": 4.231926105340768e-05, + "loss": 0.5886, + "step": 8959 + }, + { + "epoch": 0.11934433628817874, + "grad_norm": 1.4936331510543823, + "learning_rate": 4.225859883654776e-05, + "loss": 0.5868, + "step": 8990 + }, + { + "epoch": 0.1197558684822759, + "grad_norm": 1.1860368251800537, + "learning_rate": 4.219774185874569e-05, + "loss": 0.5904, + "step": 9021 + }, + { + "epoch": 0.12016740067637308, + "grad_norm": 1.1657029390335083, + "learning_rate": 4.213669080676418e-05, + "loss": 0.5872, + "step": 9052 + }, + { + "epoch": 0.12057893287047024, + "grad_norm": 1.0721909999847412, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.5977, + "step": 9083 + }, + { + "epoch": 0.12099046506456741, + "grad_norm": 1.1832354068756104, + "learning_rate": 4.201400923825648e-05, + "loss": 0.5902, + "step": 9114 + }, + { + "epoch": 0.12140199725866457, + "grad_norm": 1.1306614875793457, + "learning_rate": 4.195238010617511e-05, + "loss": 0.5931, + "step": 9145 + }, + { + "epoch": 0.12181352945276175, + "grad_norm": 0.9744802713394165, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6, + "step": 9176 + }, + { + "epoch": 0.12222506164685891, + "grad_norm": 1.0286784172058105, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.5913, + "step": 9207 + }, + { + "epoch": 0.12263659384095608, + "grad_norm": 1.0559006929397583, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.5999, + "step": 9238 + }, + { + "epoch": 0.12304812603505325, + "grad_norm": 1.0805801153182983, + "learning_rate": 4.170395751189495e-05, + "loss": 0.5929, + "step": 9269 + }, + { + "epoch": 0.12345965822915042, + "grad_norm": 1.088597059249878, + "learning_rate": 4.164137885110921e-05, + "loss": 0.5873, + "step": 9300 + }, + { + "epoch": 0.12387119042324758, + "grad_norm": 1.028403639793396, + "learning_rate": 4.157861239462495e-05, + "loss": 0.5939, + "step": 9331 + }, + { + "epoch": 0.12428272261734476, + "grad_norm": 0.9916087985038757, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.5932, + "step": 9362 + }, + { + "epoch": 0.12469425481144192, + "grad_norm": 1.0268062353134155, + "learning_rate": 4.145251892991588e-05, + "loss": 0.5945, + "step": 9393 + }, + { + "epoch": 0.1251057870055391, + "grad_norm": 1.046953797340393, + "learning_rate": 4.138919334463868e-05, + "loss": 0.5923, + "step": 9424 + }, + { + "epoch": 0.12551731919963627, + "grad_norm": 1.1010463237762451, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.5898, + "step": 9455 + }, + { + "epoch": 0.12592885139373342, + "grad_norm": 1.1851146221160889, + "learning_rate": 4.126198804133398e-05, + "loss": 0.591, + "step": 9486 + }, + { + "epoch": 0.1263403835878306, + "grad_norm": 1.106885313987732, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.5893, + "step": 9517 + }, + { + "epoch": 0.12675191578192777, + "grad_norm": 1.0527287721633911, + "learning_rate": 4.113404868280107e-05, + "loss": 0.5937, + "step": 9548 + }, + { + "epoch": 0.12716344797602494, + "grad_norm": 0.9986059069633484, + "learning_rate": 4.106980553625457e-05, + "loss": 0.5961, + "step": 9579 + }, + { + "epoch": 0.1275749801701221, + "grad_norm": 1.025026798248291, + "learning_rate": 4.100538104413674e-05, + "loss": 0.5853, + "step": 9610 + }, + { + "epoch": 0.12798651236421926, + "grad_norm": 1.062345027923584, + "learning_rate": 4.09407759334692e-05, + "loss": 0.5916, + "step": 9641 + }, + { + "epoch": 0.12839804455831644, + "grad_norm": 1.0635534524917603, + "learning_rate": 4.087599093331186e-05, + "loss": 0.5919, + "step": 9672 + }, + { + "epoch": 0.1288095767524136, + "grad_norm": 1.1224849224090576, + "learning_rate": 4.081102677475462e-05, + "loss": 0.5948, + "step": 9703 + }, + { + "epoch": 0.12922110894651076, + "grad_norm": 1.113202452659607, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.5807, + "step": 9734 + }, + { + "epoch": 0.12963264114060793, + "grad_norm": 1.0213284492492676, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.5813, + "step": 9765 + }, + { + "epoch": 0.1300441733347051, + "grad_norm": 1.1083142757415771, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.5856, + "step": 9796 + }, + { + "epoch": 0.13045570552880228, + "grad_norm": 1.1317702531814575, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.586, + "step": 9827 + }, + { + "epoch": 0.13086723772289943, + "grad_norm": 0.9876610040664673, + "learning_rate": 4.048354433517794e-05, + "loss": 0.5921, + "step": 9858 + }, + { + "epoch": 0.1312787699169966, + "grad_norm": 1.085672378540039, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.5791, + "step": 9889 + }, + { + "epoch": 0.13169030211109378, + "grad_norm": 1.014452338218689, + "learning_rate": 4.035132306369438e-05, + "loss": 0.5921, + "step": 9920 + }, + { + "epoch": 0.13210183430519096, + "grad_norm": 1.059309720993042, + "learning_rate": 4.028495219804555e-05, + "loss": 0.5834, + "step": 9951 + }, + { + "epoch": 0.1325133664992881, + "grad_norm": 1.1058080196380615, + "learning_rate": 4.021840884378864e-05, + "loss": 0.5826, + "step": 9982 + }, + { + "epoch": 0.13292489869338528, + "grad_norm": 1.0567057132720947, + "learning_rate": 4.015169375185633e-05, + "loss": 0.5924, + "step": 10013 + }, + { + "epoch": 0.13333643088748245, + "grad_norm": 1.0371674299240112, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.589, + "step": 10044 + }, + { + "epoch": 0.13374796308157963, + "grad_norm": 0.909173309803009, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.5978, + "step": 10075 + }, + { + "epoch": 0.13415949527567678, + "grad_norm": 1.0078933238983154, + "learning_rate": 3.995052558835377e-05, + "loss": 0.5814, + "step": 10106 + }, + { + "epoch": 0.13457102746977395, + "grad_norm": 0.9631708264350891, + "learning_rate": 3.988313109368017e-05, + "loss": 0.587, + "step": 10137 + }, + { + "epoch": 0.13498255966387113, + "grad_norm": 1.1884409189224243, + "learning_rate": 3.981556864489504e-05, + "loss": 0.5906, + "step": 10168 + }, + { + "epoch": 0.1353940918579683, + "grad_norm": 0.9554224014282227, + "learning_rate": 3.974783900443142e-05, + "loss": 0.5888, + "step": 10199 + }, + { + "epoch": 0.13580562405206545, + "grad_norm": 1.0510096549987793, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.5828, + "step": 10230 + }, + { + "epoch": 0.13621715624616262, + "grad_norm": 0.9769448637962341, + "learning_rate": 3.961188120762596e-05, + "loss": 0.5878, + "step": 10261 + }, + { + "epoch": 0.1366286884402598, + "grad_norm": 0.9610607028007507, + "learning_rate": 3.954365458554938e-05, + "loss": 0.5863, + "step": 10292 + }, + { + "epoch": 0.13704022063435697, + "grad_norm": 1.0102349519729614, + "learning_rate": 3.947526384030751e-05, + "loss": 0.5812, + "step": 10323 + }, + { + "epoch": 0.13745175282845412, + "grad_norm": 1.138122320175171, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.5841, + "step": 10354 + }, + { + "epoch": 0.1378632850225513, + "grad_norm": 1.0107802152633667, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.5847, + "step": 10385 + }, + { + "epoch": 0.13827481721664847, + "grad_norm": 0.9923847317695618, + "learning_rate": 3.926911459260109e-05, + "loss": 0.5971, + "step": 10416 + }, + { + "epoch": 0.13868634941074565, + "grad_norm": 1.0700715780258179, + "learning_rate": 3.920007509089102e-05, + "loss": 0.5831, + "step": 10447 + }, + { + "epoch": 0.1390978816048428, + "grad_norm": 1.1638612747192383, + "learning_rate": 3.913087534326357e-05, + "loss": 0.5849, + "step": 10478 + }, + { + "epoch": 0.13950941379893997, + "grad_norm": 1.0560393333435059, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.5897, + "step": 10509 + }, + { + "epoch": 0.13992094599303714, + "grad_norm": 0.9604248404502869, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.5862, + "step": 10540 + }, + { + "epoch": 0.14033247818713432, + "grad_norm": 1.0003983974456787, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.5905, + "step": 10571 + }, + { + "epoch": 0.14074401038123147, + "grad_norm": 1.0378546714782715, + "learning_rate": 3.885248953871491e-05, + "loss": 0.5938, + "step": 10602 + }, + { + "epoch": 0.14115554257532864, + "grad_norm": 0.9967820644378662, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.5821, + "step": 10633 + }, + { + "epoch": 0.14156707476942582, + "grad_norm": 1.029188632965088, + "learning_rate": 3.871235554965218e-05, + "loss": 0.5882, + "step": 10664 + }, + { + "epoch": 0.141978606963523, + "grad_norm": 1.1272863149642944, + "learning_rate": 3.864205604623078e-05, + "loss": 0.5898, + "step": 10695 + }, + { + "epoch": 0.14239013915762014, + "grad_norm": 1.0054222345352173, + "learning_rate": 3.857160259406107e-05, + "loss": 0.5873, + "step": 10726 + }, + { + "epoch": 0.1428016713517173, + "grad_norm": 1.0327833890914917, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.5842, + "step": 10757 + }, + { + "epoch": 0.1432132035458145, + "grad_norm": 1.0088285207748413, + "learning_rate": 3.843023702543556e-05, + "loss": 0.5903, + "step": 10788 + }, + { + "epoch": 0.14362473573991166, + "grad_norm": 0.9472872018814087, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.5809, + "step": 10819 + }, + { + "epoch": 0.1440362679340088, + "grad_norm": 1.1008390188217163, + "learning_rate": 3.828826522492255e-05, + "loss": 0.5775, + "step": 10850 + }, + { + "epoch": 0.14444780012810599, + "grad_norm": 1.1251379251480103, + "learning_rate": 3.821705398930713e-05, + "loss": 0.5821, + "step": 10881 + }, + { + "epoch": 0.14485933232220316, + "grad_norm": 0.9825986623764038, + "learning_rate": 3.814569360103385e-05, + "loss": 0.5749, + "step": 10912 + }, + { + "epoch": 0.14527086451630034, + "grad_norm": 0.9303480982780457, + "learning_rate": 3.807418486539499e-05, + "loss": 0.5827, + "step": 10943 + }, + { + "epoch": 0.14568239671039748, + "grad_norm": 0.9948610663414001, + "learning_rate": 3.80025285893569e-05, + "loss": 0.598, + "step": 10974 + }, + { + "epoch": 0.14609392890449466, + "grad_norm": 1.096449851989746, + "learning_rate": 3.793072558155093e-05, + "loss": 0.5749, + "step": 11005 + }, + { + "epoch": 0.14650546109859183, + "grad_norm": 1.022290587425232, + "learning_rate": 3.785877665226426e-05, + "loss": 0.5824, + "step": 11036 + }, + { + "epoch": 0.146916993292689, + "grad_norm": 1.083853840827942, + "learning_rate": 3.778668261343079e-05, + "loss": 0.5923, + "step": 11067 + }, + { + "epoch": 0.14732852548678615, + "grad_norm": 0.919503390789032, + "learning_rate": 3.771444427862192e-05, + "loss": 0.5924, + "step": 11098 + }, + { + "epoch": 0.14774005768088333, + "grad_norm": 1.0682687759399414, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.5824, + "step": 11129 + }, + { + "epoch": 0.1481515898749805, + "grad_norm": 0.9599080085754395, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.5765, + "step": 11160 + }, + { + "epoch": 0.14856312206907768, + "grad_norm": 0.968349039554596, + "learning_rate": 3.749687165842753e-05, + "loss": 0.5932, + "step": 11191 + }, + { + "epoch": 0.14897465426317483, + "grad_norm": 0.9846999049186707, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.5791, + "step": 11222 + }, + { + "epoch": 0.149386186457272, + "grad_norm": 1.0673292875289917, + "learning_rate": 3.735111675341645e-05, + "loss": 0.5803, + "step": 11253 + }, + { + "epoch": 0.14979771865136918, + "grad_norm": 1.0111202001571655, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.5837, + "step": 11284 + }, + { + "epoch": 0.15020925084546635, + "grad_norm": 1.0955816507339478, + "learning_rate": 3.720480432728287e-05, + "loss": 0.5822, + "step": 11315 + }, + { + "epoch": 0.1506207830395635, + "grad_norm": 0.9147946238517761, + "learning_rate": 3.71314411067092e-05, + "loss": 0.5873, + "step": 11346 + }, + { + "epoch": 0.15103231523366067, + "grad_norm": 0.9630762934684753, + "learning_rate": 3.70579409844715e-05, + "loss": 0.5914, + "step": 11377 + }, + { + "epoch": 0.15144384742775785, + "grad_norm": 0.9123234748840332, + "learning_rate": 3.698430479000865e-05, + "loss": 0.5784, + "step": 11408 + }, + { + "epoch": 0.15185537962185502, + "grad_norm": 1.0654667615890503, + "learning_rate": 3.691053335429509e-05, + "loss": 0.5805, + "step": 11439 + }, + { + "epoch": 0.15226691181595217, + "grad_norm": 1.111958622932434, + "learning_rate": 3.683662750983147e-05, + "loss": 0.5868, + "step": 11470 + }, + { + "epoch": 0.15267844401004935, + "grad_norm": 1.024422526359558, + "learning_rate": 3.676258809063518e-05, + "loss": 0.5904, + "step": 11501 + }, + { + "epoch": 0.15308997620414652, + "grad_norm": 1.092668056488037, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.5885, + "step": 11532 + }, + { + "epoch": 0.1535015083982437, + "grad_norm": 0.9739312529563904, + "learning_rate": 3.661411187164166e-05, + "loss": 0.5806, + "step": 11563 + }, + { + "epoch": 0.15391304059234084, + "grad_norm": 1.098699688911438, + "learning_rate": 3.65396767473784e-05, + "loss": 0.5755, + "step": 11594 + }, + { + "epoch": 0.15432457278643802, + "grad_norm": 0.9998940825462341, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.5758, + "step": 11625 + }, + { + "epoch": 0.1547361049805352, + "grad_norm": 0.9077128767967224, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.5742, + "step": 11656 + }, + { + "epoch": 0.15514763717463237, + "grad_norm": 0.9274038076400757, + "learning_rate": 3.63155933997859e-05, + "loss": 0.589, + "step": 11687 + }, + { + "epoch": 0.15555916936872952, + "grad_norm": 0.9404906034469604, + "learning_rate": 3.624064243537758e-05, + "loss": 0.5848, + "step": 11718 + }, + { + "epoch": 0.1559707015628267, + "grad_norm": 0.9896044731140137, + "learning_rate": 3.616556462184716e-05, + "loss": 0.5917, + "step": 11749 + }, + { + "epoch": 0.15638223375692387, + "grad_norm": 1.0280394554138184, + "learning_rate": 3.609036080643755e-05, + "loss": 0.5811, + "step": 11780 + }, + { + "epoch": 0.15679376595102104, + "grad_norm": 0.9462539553642273, + "learning_rate": 3.60150318378136e-05, + "loss": 0.5782, + "step": 11811 + }, + { + "epoch": 0.1572052981451182, + "grad_norm": 1.047635555267334, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.5792, + "step": 11842 + }, + { + "epoch": 0.15761683033921536, + "grad_norm": 0.9613581299781799, + "learning_rate": 3.586400184263408e-05, + "loss": 0.5847, + "step": 11873 + }, + { + "epoch": 0.15802836253331254, + "grad_norm": 0.8944469094276428, + "learning_rate": 3.578830252043148e-05, + "loss": 0.5787, + "step": 11904 + }, + { + "epoch": 0.1584398947274097, + "grad_norm": 1.0031458139419556, + "learning_rate": 3.571248145370125e-05, + "loss": 0.5827, + "step": 11935 + }, + { + "epoch": 0.15885142692150686, + "grad_norm": 0.9425063729286194, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.5899, + "step": 11966 + }, + { + "epoch": 0.15926295911560404, + "grad_norm": 1.0098518133163452, + "learning_rate": 3.556047751054378e-05, + "loss": 0.5849, + "step": 11997 + }, + { + "epoch": 0.1596744913097012, + "grad_norm": 0.8891544342041016, + "learning_rate": 3.548429634946039e-05, + "loss": 0.5772, + "step": 12028 + }, + { + "epoch": 0.16008602350379839, + "grad_norm": 1.1580991744995117, + "learning_rate": 3.540799687451768e-05, + "loss": 0.5859, + "step": 12059 + }, + { + "epoch": 0.16049755569789553, + "grad_norm": 1.046398639678955, + "learning_rate": 3.533157994674485e-05, + "loss": 0.582, + "step": 12090 + }, + { + "epoch": 0.1609090878919927, + "grad_norm": 1.0526766777038574, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.5738, + "step": 12121 + }, + { + "epoch": 0.16132062008608988, + "grad_norm": 0.9915657043457031, + "learning_rate": 3.517839718344311e-05, + "loss": 0.5867, + "step": 12152 + }, + { + "epoch": 0.16173215228018706, + "grad_norm": 0.954210102558136, + "learning_rate": 3.510163307656086e-05, + "loss": 0.578, + "step": 12183 + }, + { + "epoch": 0.1621436844742842, + "grad_norm": 0.952920138835907, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.5843, + "step": 12214 + }, + { + "epoch": 0.16255521666838138, + "grad_norm": 0.9184344410896301, + "learning_rate": 3.494776374368643e-05, + "loss": 0.5748, + "step": 12245 + }, + { + "epoch": 0.16296674886247856, + "grad_norm": 0.9797667264938354, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.5788, + "step": 12276 + }, + { + "epoch": 0.16337828105657573, + "grad_norm": 0.968781054019928, + "learning_rate": 3.479344537543164e-05, + "loss": 0.5835, + "step": 12307 + }, + { + "epoch": 0.16378981325067288, + "grad_norm": 1.0632243156433105, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.5742, + "step": 12338 + }, + { + "epoch": 0.16420134544477005, + "grad_norm": 0.9538365006446838, + "learning_rate": 3.463868493762412e-05, + "loss": 0.5737, + "step": 12369 + }, + { + "epoch": 0.16461287763886723, + "grad_norm": 0.9336950778961182, + "learning_rate": 3.456114112492418e-05, + "loss": 0.5821, + "step": 12400 + }, + { + "epoch": 0.1650244098329644, + "grad_norm": 0.9611359238624573, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.5847, + "step": 12431 + }, + { + "epoch": 0.16543594202706155, + "grad_norm": 1.0813709497451782, + "learning_rate": 3.440573068727905e-05, + "loss": 0.5805, + "step": 12462 + }, + { + "epoch": 0.16584747422115872, + "grad_norm": 0.9780453443527222, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.5761, + "step": 12493 + }, + { + "epoch": 0.1662590064152559, + "grad_norm": 0.9629074335098267, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.58, + "step": 12524 + }, + { + "epoch": 0.16667053860935307, + "grad_norm": 0.9241612553596497, + "learning_rate": 3.417182116258899e-05, + "loss": 0.5732, + "step": 12555 + }, + { + "epoch": 0.16708207080345022, + "grad_norm": 1.0314536094665527, + "learning_rate": 3.409364314116074e-05, + "loss": 0.578, + "step": 12586 + }, + { + "epoch": 0.1674936029975474, + "grad_norm": 0.9907404184341431, + "learning_rate": 3.401536249920559e-05, + "loss": 0.591, + "step": 12617 + }, + { + "epoch": 0.16790513519164457, + "grad_norm": 0.9442338943481445, + "learning_rate": 3.393698012010998e-05, + "loss": 0.5754, + "step": 12648 + }, + { + "epoch": 0.16831666738574175, + "grad_norm": 0.9101460576057434, + "learning_rate": 3.385849688840839e-05, + "loss": 0.5815, + "step": 12679 + }, + { + "epoch": 0.1687281995798389, + "grad_norm": 1.029105544090271, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.5766, + "step": 12710 + }, + { + "epoch": 0.16913973177393607, + "grad_norm": 1.028542160987854, + "learning_rate": 3.370123141100578e-05, + "loss": 0.5769, + "step": 12741 + }, + { + "epoch": 0.16955126396803324, + "grad_norm": 0.9411474466323853, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.5808, + "step": 12772 + }, + { + "epoch": 0.16996279616213042, + "grad_norm": 0.9260859489440918, + "learning_rate": 3.35435731658559e-05, + "loss": 0.5683, + "step": 12803 + }, + { + "epoch": 0.17037432835622757, + "grad_norm": 0.9867289066314697, + "learning_rate": 3.346459897862552e-05, + "loss": 0.5781, + "step": 12834 + }, + { + "epoch": 0.17078586055032474, + "grad_norm": 0.9310885071754456, + "learning_rate": 3.338552926954613e-05, + "loss": 0.5876, + "step": 12865 + }, + { + "epoch": 0.17119739274442192, + "grad_norm": 1.1955841779708862, + "learning_rate": 3.330636493090868e-05, + "loss": 0.577, + "step": 12896 + }, + { + "epoch": 0.1716089249385191, + "grad_norm": 1.0715487003326416, + "learning_rate": 3.322710685607193e-05, + "loss": 0.5835, + "step": 12927 + }, + { + "epoch": 0.17202045713261624, + "grad_norm": 0.9181262850761414, + "learning_rate": 3.314775593945251e-05, + "loss": 0.5733, + "step": 12958 + }, + { + "epoch": 0.17243198932671341, + "grad_norm": 0.9595162868499756, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.5685, + "step": 12989 + }, + { + "epoch": 0.1728435215208106, + "grad_norm": 1.057614803314209, + "learning_rate": 3.298877916376047e-05, + "loss": 0.5736, + "step": 13020 + }, + { + "epoch": 0.17325505371490776, + "grad_norm": 0.9654355049133301, + "learning_rate": 3.290915509871915e-05, + "loss": 0.5782, + "step": 13051 + }, + { + "epoch": 0.1736665859090049, + "grad_norm": 1.0919837951660156, + "learning_rate": 3.282944177993753e-05, + "loss": 0.5771, + "step": 13082 + }, + { + "epoch": 0.1740781181031021, + "grad_norm": 1.064331293106079, + "learning_rate": 3.274964010696957e-05, + "loss": 0.5731, + "step": 13113 + }, + { + "epoch": 0.17448965029719926, + "grad_norm": 0.8971393704414368, + "learning_rate": 3.266975098036629e-05, + "loss": 0.5855, + "step": 13144 + }, + { + "epoch": 0.17490118249129644, + "grad_norm": 0.9515472054481506, + "learning_rate": 3.258977530166562e-05, + "loss": 0.5776, + "step": 13175 + }, + { + "epoch": 0.17531271468539358, + "grad_norm": 0.9811455607414246, + "learning_rate": 3.250971397338227e-05, + "loss": 0.5817, + "step": 13206 + }, + { + "epoch": 0.17572424687949076, + "grad_norm": 0.9187535643577576, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.5751, + "step": 13237 + }, + { + "epoch": 0.17613577907358793, + "grad_norm": 0.9396876096725464, + "learning_rate": 3.234933798294859e-05, + "loss": 0.5744, + "step": 13268 + }, + { + "epoch": 0.1765473112676851, + "grad_norm": 1.064921259880066, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.5731, + "step": 13299 + }, + { + "epoch": 0.17695884346178226, + "grad_norm": 0.946753978729248, + "learning_rate": 3.218863024832985e-05, + "loss": 0.5824, + "step": 13330 + }, + { + "epoch": 0.17737037565587943, + "grad_norm": 0.8623449206352234, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.5764, + "step": 13361 + }, + { + "epoch": 0.1777819078499766, + "grad_norm": 0.9526584148406982, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.5696, + "step": 13392 + }, + { + "epoch": 0.17819344004407378, + "grad_norm": 0.9413353800773621, + "learning_rate": 3.194696249871729e-05, + "loss": 0.5688, + "step": 13423 + }, + { + "epoch": 0.17860497223817093, + "grad_norm": 0.9612322449684143, + "learning_rate": 3.186624857814164e-05, + "loss": 0.5694, + "step": 13454 + }, + { + "epoch": 0.1790165044322681, + "grad_norm": 1.0289851427078247, + "learning_rate": 3.178545717288401e-05, + "loss": 0.5799, + "step": 13485 + }, + { + "epoch": 0.17942803662636528, + "grad_norm": 0.9124505519866943, + "learning_rate": 3.170458919466444e-05, + "loss": 0.5729, + "step": 13516 + }, + { + "epoch": 0.17983956882046245, + "grad_norm": 0.9874391555786133, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.5761, + "step": 13547 + }, + { + "epoch": 0.1802511010145596, + "grad_norm": 1.0058220624923706, + "learning_rate": 3.154262717052985e-05, + "loss": 0.58, + "step": 13578 + }, + { + "epoch": 0.18066263320865678, + "grad_norm": 0.9753491282463074, + "learning_rate": 3.146153495233426e-05, + "loss": 0.5749, + "step": 13609 + }, + { + "epoch": 0.18107416540275395, + "grad_norm": 1.0167535543441772, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.5848, + "step": 13640 + }, + { + "epoch": 0.18148569759685113, + "grad_norm": 0.9970638155937195, + "learning_rate": 3.129913267924946e-05, + "loss": 0.5743, + "step": 13671 + }, + { + "epoch": 0.18189722979094827, + "grad_norm": 1.0033198595046997, + "learning_rate": 3.121782445704782e-05, + "loss": 0.5747, + "step": 13702 + }, + { + "epoch": 0.18230876198504545, + "grad_norm": 1.0107535123825073, + "learning_rate": 3.11364460675423e-05, + "loss": 0.5725, + "step": 13733 + }, + { + "epoch": 0.18272029417914262, + "grad_norm": 0.9341425895690918, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.5726, + "step": 13764 + }, + { + "epoch": 0.1831318263732398, + "grad_norm": 1.0088342428207397, + "learning_rate": 3.097348246077728e-05, + "loss": 0.5737, + "step": 13795 + }, + { + "epoch": 0.18354335856733694, + "grad_norm": 0.8871366381645203, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.576, + "step": 13826 + }, + { + "epoch": 0.18395489076143412, + "grad_norm": 0.971552848815918, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.57, + "step": 13857 + }, + { + "epoch": 0.1843664229555313, + "grad_norm": 1.1167266368865967, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.5652, + "step": 13888 + }, + { + "epoch": 0.18477795514962847, + "grad_norm": 0.9768183827400208, + "learning_rate": 3.064675369851637e-05, + "loss": 0.5671, + "step": 13919 + }, + { + "epoch": 0.18518948734372562, + "grad_norm": 1.0097460746765137, + "learning_rate": 3.056490989455289e-05, + "loss": 0.5771, + "step": 13950 + }, + { + "epoch": 0.1856010195378228, + "grad_norm": 1.010986089706421, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.5723, + "step": 13981 + }, + { + "epoch": 0.18601255173191997, + "grad_norm": 0.8861789107322693, + "learning_rate": 3.040103481317539e-05, + "loss": 0.5798, + "step": 14012 + }, + { + "epoch": 0.18642408392601714, + "grad_norm": 0.9811322689056396, + "learning_rate": 3.03190053850694e-05, + "loss": 0.5703, + "step": 14043 + }, + { + "epoch": 0.1868356161201143, + "grad_norm": 0.9183774590492249, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.5655, + "step": 14074 + }, + { + "epoch": 0.18724714831421146, + "grad_norm": 0.9655874371528625, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.57, + "step": 14105 + }, + { + "epoch": 0.18765868050830864, + "grad_norm": 0.9735666513442993, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.5748, + "step": 14136 + }, + { + "epoch": 0.18807021270240581, + "grad_norm": 0.9084784388542175, + "learning_rate": 2.999029669712431e-05, + "loss": 0.568, + "step": 14167 + }, + { + "epoch": 0.18848174489650296, + "grad_norm": 0.8866302371025085, + "learning_rate": 2.990797641805408e-05, + "loss": 0.5768, + "step": 14198 + }, + { + "epoch": 0.18889327709060014, + "grad_norm": 0.9936773180961609, + "learning_rate": 2.982560075313704e-05, + "loss": 0.5701, + "step": 14229 + }, + { + "epoch": 0.1893048092846973, + "grad_norm": 0.9721146821975708, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.5638, + "step": 14260 + }, + { + "epoch": 0.1897163414787945, + "grad_norm": 0.921503484249115, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.5645, + "step": 14291 + }, + { + "epoch": 0.19012787367289163, + "grad_norm": 0.9194900989532471, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.5724, + "step": 14322 + }, + { + "epoch": 0.1905394058669888, + "grad_norm": 0.9940860271453857, + "learning_rate": 2.949556283611942e-05, + "loss": 0.5731, + "step": 14353 + }, + { + "epoch": 0.19095093806108598, + "grad_norm": 0.9163900017738342, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.5761, + "step": 14384 + }, + { + "epoch": 0.19136247025518316, + "grad_norm": 0.960118293762207, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.572, + "step": 14415 + }, + { + "epoch": 0.1917740024492803, + "grad_norm": 0.9942934513092041, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.5606, + "step": 14446 + }, + { + "epoch": 0.19218553464337748, + "grad_norm": 0.9903921484947205, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.5704, + "step": 14477 + }, + { + "epoch": 0.19259706683747466, + "grad_norm": 0.9734652638435364, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.5768, + "step": 14508 + }, + { + "epoch": 0.19300859903157183, + "grad_norm": 0.9251719117164612, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.565, + "step": 14539 + }, + { + "epoch": 0.19342013122566898, + "grad_norm": 0.9734142422676086, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.5702, + "step": 14570 + }, + { + "epoch": 0.19383166341976615, + "grad_norm": 0.9692136645317078, + "learning_rate": 2.883311164593017e-05, + "loss": 0.5581, + "step": 14601 + }, + { + "epoch": 0.19424319561386333, + "grad_norm": 0.9654501676559448, + "learning_rate": 2.875010077160754e-05, + "loss": 0.5667, + "step": 14632 + }, + { + "epoch": 0.1946547278079605, + "grad_norm": 1.0234603881835938, + "learning_rate": 2.866704757790741e-05, + "loss": 0.5688, + "step": 14663 + }, + { + "epoch": 0.19506626000205765, + "grad_norm": 0.9619661569595337, + "learning_rate": 2.858395300207376e-05, + "loss": 0.5713, + "step": 14694 + }, + { + "epoch": 0.19547779219615483, + "grad_norm": 1.0468534231185913, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.5702, + "step": 14725 + }, + { + "epoch": 0.195889324390252, + "grad_norm": 0.9849717617034912, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.5695, + "step": 14756 + }, + { + "epoch": 0.19630085658434918, + "grad_norm": 0.8887643218040466, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.5688, + "step": 14787 + }, + { + "epoch": 0.19671238877844632, + "grad_norm": 0.9583494067192078, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.5775, + "step": 14818 + }, + { + "epoch": 0.1971239209725435, + "grad_norm": 0.9766852259635925, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.5688, + "step": 14849 + }, + { + "epoch": 0.19753545316664067, + "grad_norm": 0.9450570940971375, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.5773, + "step": 14880 + }, + { + "epoch": 0.19794698536073785, + "grad_norm": 0.9476996064186096, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.5688, + "step": 14911 + }, + { + "epoch": 0.198358517554835, + "grad_norm": 1.0239835977554321, + "learning_rate": 2.791781925709473e-05, + "loss": 0.5717, + "step": 14942 + }, + { + "epoch": 0.19877004974893217, + "grad_norm": 0.9848045110702515, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.5691, + "step": 14973 + }, + { + "epoch": 0.19918158194302935, + "grad_norm": 1.0350494384765625, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.5808, + "step": 15004 + }, + { + "epoch": 0.19959311413712652, + "grad_norm": 0.8743448853492737, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.5694, + "step": 15035 + }, + { + "epoch": 0.20000464633122367, + "grad_norm": 0.9410389065742493, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.5739, + "step": 15066 + }, + { + "epoch": 0.20041617852532084, + "grad_norm": 1.0113860368728638, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.5689, + "step": 15097 + }, + { + "epoch": 0.20082771071941802, + "grad_norm": 0.998852550983429, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.5619, + "step": 15128 + }, + { + "epoch": 0.2012392429135152, + "grad_norm": 0.9299794435501099, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.5672, + "step": 15159 + }, + { + "epoch": 0.20165077510761234, + "grad_norm": 1.0109282732009888, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.5691, + "step": 15190 + }, + { + "epoch": 0.20206230730170952, + "grad_norm": 0.8805022835731506, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.5601, + "step": 15221 + }, + { + "epoch": 0.2024738394958067, + "grad_norm": 0.8976924419403076, + "learning_rate": 2.708224532974953e-05, + "loss": 0.5669, + "step": 15252 + }, + { + "epoch": 0.20288537168990387, + "grad_norm": 0.8639585375785828, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.5713, + "step": 15283 + }, + { + "epoch": 0.203296903884001, + "grad_norm": 0.9253800511360168, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.5634, + "step": 15314 + }, + { + "epoch": 0.2037084360780982, + "grad_norm": 0.8547073006629944, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.5705, + "step": 15345 + }, + { + "epoch": 0.20411996827219536, + "grad_norm": 0.8723642230033875, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.573, + "step": 15376 + }, + { + "epoch": 0.20453150046629254, + "grad_norm": 0.9164481163024902, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.5645, + "step": 15407 + }, + { + "epoch": 0.20494303266038968, + "grad_norm": 0.9538819193840027, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.5677, + "step": 15438 + }, + { + "epoch": 0.20535456485448686, + "grad_norm": 0.8995161652565002, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.5683, + "step": 15469 + }, + { + "epoch": 0.20576609704858403, + "grad_norm": 0.9026926755905151, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.5694, + "step": 15500 + }, + { + "epoch": 0.2061776292426812, + "grad_norm": 0.9095093011856079, + "learning_rate": 2.632819298478939e-05, + "loss": 0.5722, + "step": 15531 + }, + { + "epoch": 0.20658916143677836, + "grad_norm": 0.874626636505127, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.5652, + "step": 15562 + }, + { + "epoch": 0.20700069363087553, + "grad_norm": 1.0359785556793213, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.57, + "step": 15593 + }, + { + "epoch": 0.2074122258249727, + "grad_norm": 0.9145928621292114, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.5637, + "step": 15624 + }, + { + "epoch": 0.20782375801906988, + "grad_norm": 1.020246982574463, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.5619, + "step": 15655 + }, + { + "epoch": 0.20823529021316703, + "grad_norm": 0.8766633868217468, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.5631, + "step": 15686 + }, + { + "epoch": 0.2086468224072642, + "grad_norm": 0.9841639399528503, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.5598, + "step": 15717 + }, + { + "epoch": 0.20905835460136138, + "grad_norm": 0.8983998894691467, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.5723, + "step": 15748 + }, + { + "epoch": 0.20946988679545855, + "grad_norm": 0.8868324756622314, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.5801, + "step": 15779 + }, + { + "epoch": 0.2098814189895557, + "grad_norm": 0.9000539183616638, + "learning_rate": 2.557292666450159e-05, + "loss": 0.5672, + "step": 15810 + }, + { + "epoch": 0.21029295118365288, + "grad_norm": 0.9193928837776184, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.5658, + "step": 15841 + }, + { + "epoch": 0.21070448337775005, + "grad_norm": 0.9424473643302917, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.5685, + "step": 15872 + }, + { + "epoch": 0.21111601557184723, + "grad_norm": 0.9552715420722961, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.5725, + "step": 15903 + }, + { + "epoch": 0.21152754776594437, + "grad_norm": 0.8888420462608337, + "learning_rate": 2.52370435981567e-05, + "loss": 0.5649, + "step": 15934 + }, + { + "epoch": 0.21193907996004155, + "grad_norm": 0.906830370426178, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.5692, + "step": 15965 + }, + { + "epoch": 0.21235061215413872, + "grad_norm": 0.8939186334609985, + "learning_rate": 2.506908222441045e-05, + "loss": 0.5665, + "step": 15996 + }, + { + "epoch": 0.2127621443482359, + "grad_norm": 1.0149410963058472, + "learning_rate": 2.498509989430187e-05, + "loss": 0.5632, + "step": 16027 + }, + { + "epoch": 0.21317367654233305, + "grad_norm": 0.963056206703186, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.5656, + "step": 16058 + }, + { + "epoch": 0.21358520873643022, + "grad_norm": 0.8071532249450684, + "learning_rate": 2.481713668624899e-05, + "loss": 0.5667, + "step": 16089 + }, + { + "epoch": 0.2139967409305274, + "grad_norm": 0.9192640781402588, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.5709, + "step": 16120 + }, + { + "epoch": 0.21440827312462457, + "grad_norm": 0.84633868932724, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.5658, + "step": 16151 + }, + { + "epoch": 0.21481980531872172, + "grad_norm": 0.8883370757102966, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.5683, + "step": 16182 + }, + { + "epoch": 0.2152313375128189, + "grad_norm": 0.8919095396995544, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.568, + "step": 16213 + }, + { + "epoch": 0.21564286970691607, + "grad_norm": 0.9360633492469788, + "learning_rate": 2.439728136286796e-05, + "loss": 0.565, + "step": 16244 + }, + { + "epoch": 0.21605440190101324, + "grad_norm": 0.9496976733207703, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.5627, + "step": 16275 + }, + { + "epoch": 0.2164659340951104, + "grad_norm": 0.9771477580070496, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.5614, + "step": 16306 + }, + { + "epoch": 0.21687746628920757, + "grad_norm": 0.931249737739563, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.5649, + "step": 16337 + }, + { + "epoch": 0.21728899848330474, + "grad_norm": 0.9592285752296448, + "learning_rate": 2.406151384602059e-05, + "loss": 0.5646, + "step": 16368 + }, + { + "epoch": 0.21770053067740192, + "grad_norm": 0.9159988164901733, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.5672, + "step": 16399 + }, + { + "epoch": 0.21811206287149906, + "grad_norm": 0.97376549243927, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.5607, + "step": 16430 + }, + { + "epoch": 0.21852359506559624, + "grad_norm": 0.8469638824462891, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.5597, + "step": 16461 + }, + { + "epoch": 0.2189351272596934, + "grad_norm": 1.030610203742981, + "learning_rate": 2.372591577780202e-05, + "loss": 0.5625, + "step": 16492 + }, + { + "epoch": 0.2193466594537906, + "grad_norm": 0.9524822235107422, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.5621, + "step": 16523 + }, + { + "epoch": 0.21975819164788774, + "grad_norm": 0.9608604311943054, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.5672, + "step": 16554 + }, + { + "epoch": 0.2201697238419849, + "grad_norm": 0.9253712296485901, + "learning_rate": 2.347436487983929e-05, + "loss": 0.5609, + "step": 16585 + }, + { + "epoch": 0.22058125603608209, + "grad_norm": 0.923402726650238, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.5654, + "step": 16616 + }, + { + "epoch": 0.22099278823017926, + "grad_norm": 1.0030287504196167, + "learning_rate": 2.330674878704035e-05, + "loss": 0.5576, + "step": 16647 + }, + { + "epoch": 0.2214043204242764, + "grad_norm": 0.9106613993644714, + "learning_rate": 2.322296892997561e-05, + "loss": 0.5653, + "step": 16678 + }, + { + "epoch": 0.22181585261837358, + "grad_norm": 1.0058101415634155, + "learning_rate": 2.313920912646497e-05, + "loss": 0.5749, + "step": 16709 + }, + { + "epoch": 0.22222738481247076, + "grad_norm": 0.931086540222168, + "learning_rate": 2.305547032172643e-05, + "loss": 0.5637, + "step": 16740 + }, + { + "epoch": 0.22263891700656793, + "grad_norm": 0.9743716716766357, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.5718, + "step": 16771 + }, + { + "epoch": 0.22305044920066508, + "grad_norm": 0.8751611709594727, + "learning_rate": 2.288805948824212e-05, + "loss": 0.5581, + "step": 16802 + }, + { + "epoch": 0.22346198139476225, + "grad_norm": 0.867038905620575, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.5664, + "step": 16833 + }, + { + "epoch": 0.22387351358885943, + "grad_norm": 0.8663344383239746, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.559, + "step": 16864 + }, + { + "epoch": 0.2242850457829566, + "grad_norm": 0.984854519367218, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.5626, + "step": 16895 + }, + { + "epoch": 0.22469657797705375, + "grad_norm": 0.9031103849411011, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.5666, + "step": 16926 + }, + { + "epoch": 0.22510811017115093, + "grad_norm": 0.8782587647438049, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.5501, + "step": 16957 + }, + { + "epoch": 0.2255196423652481, + "grad_norm": 1.0644887685775757, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.5604, + "step": 16988 + }, + { + "epoch": 0.22593117455934528, + "grad_norm": 0.8691216111183167, + "learning_rate": 2.230292185905114e-05, + "loss": 0.5649, + "step": 17019 + }, + { + "epoch": 0.22634270675344242, + "grad_norm": 0.9518167972564697, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.5598, + "step": 17050 + }, + { + "epoch": 0.2267542389475396, + "grad_norm": 0.889673113822937, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.5626, + "step": 17081 + }, + { + "epoch": 0.22716577114163677, + "grad_norm": 0.9073772430419922, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.5607, + "step": 17112 + }, + { + "epoch": 0.22757730333573395, + "grad_norm": 0.9674621820449829, + "learning_rate": 2.196920634473666e-05, + "loss": 0.5651, + "step": 17143 + }, + { + "epoch": 0.2279888355298311, + "grad_norm": 0.8547524809837341, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.5636, + "step": 17174 + }, + { + "epoch": 0.22840036772392827, + "grad_norm": 1.00649893283844, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.5597, + "step": 17205 + }, + { + "epoch": 0.22881189991802545, + "grad_norm": 0.9329107999801636, + "learning_rate": 2.171927553439363e-05, + "loss": 0.5705, + "step": 17236 + }, + { + "epoch": 0.22922343211212262, + "grad_norm": 1.0364869832992554, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.56, + "step": 17267 + }, + { + "epoch": 0.22963496430621977, + "grad_norm": 0.898383617401123, + "learning_rate": 2.155283853988844e-05, + "loss": 0.5634, + "step": 17298 + }, + { + "epoch": 0.23004649650031694, + "grad_norm": 0.903266429901123, + "learning_rate": 2.146967792431106e-05, + "loss": 0.5692, + "step": 17329 + }, + { + "epoch": 0.23045802869441412, + "grad_norm": 0.835216224193573, + "learning_rate": 2.138655714793849e-05, + "loss": 0.5619, + "step": 17360 + }, + { + "epoch": 0.2308695608885113, + "grad_norm": 0.9033771753311157, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.5661, + "step": 17391 + }, + { + "epoch": 0.23128109308260844, + "grad_norm": 0.8425393104553223, + "learning_rate": 2.122043886437421e-05, + "loss": 0.5607, + "step": 17422 + }, + { + "epoch": 0.23169262527670562, + "grad_norm": 0.8765662908554077, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.5654, + "step": 17453 + }, + { + "epoch": 0.2321041574708028, + "grad_norm": 0.8663944602012634, + "learning_rate": 2.105449118766347e-05, + "loss": 0.5644, + "step": 17484 + }, + { + "epoch": 0.23251568966489997, + "grad_norm": 0.9837983846664429, + "learning_rate": 2.097158366805287e-05, + "loss": 0.5665, + "step": 17515 + }, + { + "epoch": 0.2329272218589971, + "grad_norm": 0.9082325100898743, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.5671, + "step": 17546 + }, + { + "epoch": 0.2333387540530943, + "grad_norm": 0.9680993556976318, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.5602, + "step": 17577 + }, + { + "epoch": 0.23375028624719146, + "grad_norm": 0.9881089925765991, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.5682, + "step": 17608 + }, + { + "epoch": 0.23416181844128864, + "grad_norm": 0.8630657196044922, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.5646, + "step": 17639 + }, + { + "epoch": 0.23457335063538579, + "grad_norm": 0.8421202301979065, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.564, + "step": 17670 + }, + { + "epoch": 0.23498488282948296, + "grad_norm": 0.8951789736747742, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.5594, + "step": 17701 + }, + { + "epoch": 0.23539641502358014, + "grad_norm": 1.0024628639221191, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.5541, + "step": 17732 + }, + { + "epoch": 0.2358079472176773, + "grad_norm": 0.8807896971702576, + "learning_rate": 2.031003855589343e-05, + "loss": 0.5605, + "step": 17763 + }, + { + "epoch": 0.23621947941177446, + "grad_norm": 0.9723889827728271, + "learning_rate": 2.022757379528727e-05, + "loss": 0.559, + "step": 17794 + }, + { + "epoch": 0.23663101160587163, + "grad_norm": 0.9422227740287781, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.5594, + "step": 17825 + }, + { + "epoch": 0.2370425437999688, + "grad_norm": 0.9309141039848328, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.5621, + "step": 17856 + }, + { + "epoch": 0.23745407599406598, + "grad_norm": 0.8761610388755798, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.5665, + "step": 17887 + }, + { + "epoch": 0.23786560818816313, + "grad_norm": 0.8991973400115967, + "learning_rate": 1.989826261153015e-05, + "loss": 0.5726, + "step": 17918 + }, + { + "epoch": 0.2382771403822603, + "grad_norm": 0.8879802227020264, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.5598, + "step": 17949 + }, + { + "epoch": 0.23868867257635748, + "grad_norm": 0.9235663414001465, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.5637, + "step": 17980 + }, + { + "epoch": 0.23910020477045466, + "grad_norm": 0.9140569567680359, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.5571, + "step": 18011 + }, + { + "epoch": 0.2395117369645518, + "grad_norm": 0.933430016040802, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.561, + "step": 18042 + }, + { + "epoch": 0.23992326915864898, + "grad_norm": 0.838374674320221, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.5634, + "step": 18073 + }, + { + "epoch": 0.24033480135274615, + "grad_norm": 0.9295237064361572, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.5604, + "step": 18104 + }, + { + "epoch": 0.24074633354684333, + "grad_norm": 0.880237340927124, + "learning_rate": 1.932422022132275e-05, + "loss": 0.5609, + "step": 18135 + }, + { + "epoch": 0.24115786574094047, + "grad_norm": 0.9782423973083496, + "learning_rate": 1.924246297341414e-05, + "loss": 0.5479, + "step": 18166 + }, + { + "epoch": 0.24156939793503765, + "grad_norm": 0.97150719165802, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.5624, + "step": 18197 + }, + { + "epoch": 0.24198093012913482, + "grad_norm": 0.9634605050086975, + "learning_rate": 1.907914431855156e-05, + "loss": 0.5661, + "step": 18228 + }, + { + "epoch": 0.242392462323232, + "grad_norm": 0.8706396222114563, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.5582, + "step": 18259 + }, + { + "epoch": 0.24280399451732915, + "grad_norm": 0.9348079562187195, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.5661, + "step": 18290 + }, + { + "epoch": 0.24321552671142632, + "grad_norm": 0.8249440789222717, + "learning_rate": 1.883466975572098e-05, + "loss": 0.5589, + "step": 18321 + }, + { + "epoch": 0.2436270589055235, + "grad_norm": 0.9206597208976746, + "learning_rate": 1.8753316159214312e-05, + "loss": 0.5583, + "step": 18352 + }, + { + "epoch": 0.24403859109962067, + "grad_norm": 0.8377333879470825, + "learning_rate": 1.8672033055685766e-05, + "loss": 0.5571, + "step": 18383 + }, + { + "epoch": 0.24445012329371782, + "grad_norm": 0.9113277792930603, + "learning_rate": 1.8590821362404116e-05, + "loss": 0.5633, + "step": 18414 + }, + { + "epoch": 0.244861655487815, + "grad_norm": 0.9409834742546082, + "learning_rate": 1.8509681995832294e-05, + "loss": 0.5588, + "step": 18445 + }, + { + "epoch": 0.24527318768191217, + "grad_norm": 0.9693152904510498, + "learning_rate": 1.8428615871617004e-05, + "loss": 0.5621, + "step": 18476 + }, + { + "epoch": 0.24568471987600934, + "grad_norm": 0.9358701705932617, + "learning_rate": 1.8347623904578448e-05, + "loss": 0.5654, + "step": 18507 + }, + { + "epoch": 0.2460962520701065, + "grad_norm": 0.9669011831283569, + "learning_rate": 1.8266707008699975e-05, + "loss": 0.5605, + "step": 18538 + }, + { + "epoch": 0.24650778426420367, + "grad_norm": 0.9862536191940308, + "learning_rate": 1.818586609711774e-05, + "loss": 0.5625, + "step": 18569 + }, + { + "epoch": 0.24691931645830084, + "grad_norm": 1.069492220878601, + "learning_rate": 1.8105102082110462e-05, + "loss": 0.5632, + "step": 18600 + }, + { + "epoch": 0.24733084865239802, + "grad_norm": 0.9141196608543396, + "learning_rate": 1.8024415875089058e-05, + "loss": 0.5599, + "step": 18631 + }, + { + "epoch": 0.24774238084649516, + "grad_norm": 0.8525174856185913, + "learning_rate": 1.7943808386586407e-05, + "loss": 0.5598, + "step": 18662 + }, + { + "epoch": 0.24815391304059234, + "grad_norm": 0.9469859600067139, + "learning_rate": 1.7863280526247073e-05, + "loss": 0.5554, + "step": 18693 + }, + { + "epoch": 0.2485654452346895, + "grad_norm": 0.9280170202255249, + "learning_rate": 1.7782833202817003e-05, + "loss": 0.5626, + "step": 18724 + }, + { + "epoch": 0.2489769774287867, + "grad_norm": 0.868431806564331, + "learning_rate": 1.7702467324133327e-05, + "loss": 0.5636, + "step": 18755 + }, + { + "epoch": 0.24938850962288384, + "grad_norm": 0.9638091921806335, + "learning_rate": 1.7622183797114042e-05, + "loss": 0.5561, + "step": 18786 + }, + { + "epoch": 0.249800041816981, + "grad_norm": 0.9236721396446228, + "learning_rate": 1.7541983527747838e-05, + "loss": 0.5645, + "step": 18817 + }, + { + "epoch": 0.2502115740110782, + "grad_norm": 0.8757562041282654, + "learning_rate": 1.746186742108387e-05, + "loss": 0.5549, + "step": 18848 + }, + { + "epoch": 0.25062310620517536, + "grad_norm": 0.9709885120391846, + "learning_rate": 1.73818363812215e-05, + "loss": 0.5613, + "step": 18879 + }, + { + "epoch": 0.25103463839927254, + "grad_norm": 0.9142551422119141, + "learning_rate": 1.7301891311300153e-05, + "loss": 0.5452, + "step": 18910 + }, + { + "epoch": 0.2514461705933697, + "grad_norm": 1.4749113321304321, + "learning_rate": 1.7222033113489055e-05, + "loss": 0.556, + "step": 18941 + }, + { + "epoch": 0.25185770278746683, + "grad_norm": 0.8948887586593628, + "learning_rate": 1.7142262688977127e-05, + "loss": 0.5551, + "step": 18972 + }, + { + "epoch": 0.252269234981564, + "grad_norm": 0.8812825679779053, + "learning_rate": 1.7062580937962764e-05, + "loss": 0.5549, + "step": 19003 + }, + { + "epoch": 0.2526807671756612, + "grad_norm": 0.8759215474128723, + "learning_rate": 1.698298875964369e-05, + "loss": 0.5517, + "step": 19034 + }, + { + "epoch": 0.25309229936975836, + "grad_norm": 0.8355596661567688, + "learning_rate": 1.690348705220684e-05, + "loss": 0.5651, + "step": 19065 + }, + { + "epoch": 0.25350383156385553, + "grad_norm": 0.9597409963607788, + "learning_rate": 1.6824076712818156e-05, + "loss": 0.5636, + "step": 19096 + }, + { + "epoch": 0.2539153637579527, + "grad_norm": 0.9418185949325562, + "learning_rate": 1.6744758637612533e-05, + "loss": 0.5551, + "step": 19127 + }, + { + "epoch": 0.2543268959520499, + "grad_norm": 0.9069491028785706, + "learning_rate": 1.6665533721683664e-05, + "loss": 0.5628, + "step": 19158 + }, + { + "epoch": 0.25473842814614706, + "grad_norm": 0.8908203840255737, + "learning_rate": 1.6586402859073974e-05, + "loss": 0.561, + "step": 19189 + }, + { + "epoch": 0.2551499603402442, + "grad_norm": 0.8831518888473511, + "learning_rate": 1.6507366942764463e-05, + "loss": 0.5595, + "step": 19220 + }, + { + "epoch": 0.25556149253434135, + "grad_norm": 1.0363459587097168, + "learning_rate": 1.6428426864664732e-05, + "loss": 0.5555, + "step": 19251 + }, + { + "epoch": 0.2559730247284385, + "grad_norm": 0.8746747970581055, + "learning_rate": 1.6349583515602816e-05, + "loss": 0.5581, + "step": 19282 + }, + { + "epoch": 0.2563845569225357, + "grad_norm": 0.7980934381484985, + "learning_rate": 1.6270837785315208e-05, + "loss": 0.5537, + "step": 19313 + }, + { + "epoch": 0.2567960891166329, + "grad_norm": 0.851966142654419, + "learning_rate": 1.619219056243676e-05, + "loss": 0.5513, + "step": 19344 + }, + { + "epoch": 0.25720762131073005, + "grad_norm": 0.9124501347541809, + "learning_rate": 1.6113642734490698e-05, + "loss": 0.5549, + "step": 19375 + }, + { + "epoch": 0.2576191535048272, + "grad_norm": 1.0416783094406128, + "learning_rate": 1.6035195187878577e-05, + "loss": 0.5596, + "step": 19406 + }, + { + "epoch": 0.2580306856989244, + "grad_norm": 0.9024292826652527, + "learning_rate": 1.5956848807870305e-05, + "loss": 0.5506, + "step": 19437 + }, + { + "epoch": 0.2584422178930215, + "grad_norm": 0.9234741926193237, + "learning_rate": 1.587860447859413e-05, + "loss": 0.5549, + "step": 19468 + }, + { + "epoch": 0.2588537500871187, + "grad_norm": 0.8676049113273621, + "learning_rate": 1.5800463083026686e-05, + "loss": 0.5563, + "step": 19499 + }, + { + "epoch": 0.25926528228121587, + "grad_norm": 0.9481212496757507, + "learning_rate": 1.572242550298298e-05, + "loss": 0.5578, + "step": 19530 + }, + { + "epoch": 0.25967681447531304, + "grad_norm": 0.8709908723831177, + "learning_rate": 1.56444926191065e-05, + "loss": 0.5568, + "step": 19561 + }, + { + "epoch": 0.2600883466694102, + "grad_norm": 0.938412606716156, + "learning_rate": 1.5566665310859257e-05, + "loss": 0.5568, + "step": 19592 + }, + { + "epoch": 0.2604998788635074, + "grad_norm": 0.8912078142166138, + "learning_rate": 1.5488944456511846e-05, + "loss": 0.552, + "step": 19623 + }, + { + "epoch": 0.26091141105760457, + "grad_norm": 1.1832647323608398, + "learning_rate": 1.5411330933133546e-05, + "loss": 0.5581, + "step": 19654 + }, + { + "epoch": 0.26132294325170174, + "grad_norm": 0.9237463474273682, + "learning_rate": 1.533382561658241e-05, + "loss": 0.5592, + "step": 19685 + }, + { + "epoch": 0.26173447544579886, + "grad_norm": 0.878738522529602, + "learning_rate": 1.525642938149541e-05, + "loss": 0.5557, + "step": 19716 + }, + { + "epoch": 0.26214600763989604, + "grad_norm": 0.9652629494667053, + "learning_rate": 1.5179143101278536e-05, + "loss": 0.5542, + "step": 19747 + }, + { + "epoch": 0.2625575398339932, + "grad_norm": 0.9157405495643616, + "learning_rate": 1.5101967648096955e-05, + "loss": 0.5609, + "step": 19778 + }, + { + "epoch": 0.2629690720280904, + "grad_norm": 0.840957760810852, + "learning_rate": 1.5024903892865172e-05, + "loss": 0.5521, + "step": 19809 + }, + { + "epoch": 0.26338060422218756, + "grad_norm": 0.8824605941772461, + "learning_rate": 1.4947952705237184e-05, + "loss": 0.5539, + "step": 19840 + }, + { + "epoch": 0.26379213641628474, + "grad_norm": 0.9319818615913391, + "learning_rate": 1.4871114953596682e-05, + "loss": 0.559, + "step": 19871 + }, + { + "epoch": 0.2642036686103819, + "grad_norm": 0.8822436332702637, + "learning_rate": 1.4794391505047256e-05, + "loss": 0.5565, + "step": 19902 + }, + { + "epoch": 0.2646152008044791, + "grad_norm": 0.8802869915962219, + "learning_rate": 1.4717783225402596e-05, + "loss": 0.5603, + "step": 19933 + }, + { + "epoch": 0.2650267329985762, + "grad_norm": 0.913989245891571, + "learning_rate": 1.4641290979176735e-05, + "loss": 0.5519, + "step": 19964 + }, + { + "epoch": 0.2654382651926734, + "grad_norm": 0.8885793089866638, + "learning_rate": 1.4564915629574246e-05, + "loss": 0.5567, + "step": 19995 + }, + { + "epoch": 0.26584979738677056, + "grad_norm": 0.8809658885002136, + "learning_rate": 1.4488658038480601e-05, + "loss": 0.5516, + "step": 20026 + }, + { + "epoch": 0.26626132958086773, + "grad_norm": 0.9053296446800232, + "learning_rate": 1.4412519066452323e-05, + "loss": 0.5573, + "step": 20057 + }, + { + "epoch": 0.2666728617749649, + "grad_norm": 0.8977755904197693, + "learning_rate": 1.4336499572707373e-05, + "loss": 0.5496, + "step": 20088 + }, + { + "epoch": 0.2670843939690621, + "grad_norm": 0.935563325881958, + "learning_rate": 1.4260600415115433e-05, + "loss": 0.558, + "step": 20119 + }, + { + "epoch": 0.26749592616315926, + "grad_norm": 1.0321307182312012, + "learning_rate": 1.4184822450188137e-05, + "loss": 0.557, + "step": 20150 + }, + { + "epoch": 0.26790745835725643, + "grad_norm": 0.8926151990890503, + "learning_rate": 1.410916653306954e-05, + "loss": 0.556, + "step": 20181 + }, + { + "epoch": 0.26831899055135355, + "grad_norm": 0.9870996475219727, + "learning_rate": 1.403363351752639e-05, + "loss": 0.5522, + "step": 20212 + }, + { + "epoch": 0.26873052274545073, + "grad_norm": 0.8782408237457275, + "learning_rate": 1.3958224255938485e-05, + "loss": 0.548, + "step": 20243 + }, + { + "epoch": 0.2691420549395479, + "grad_norm": 0.887537956237793, + "learning_rate": 1.388293959928911e-05, + "loss": 0.5548, + "step": 20274 + }, + { + "epoch": 0.2695535871336451, + "grad_norm": 0.9209414720535278, + "learning_rate": 1.3807780397155379e-05, + "loss": 0.5531, + "step": 20305 + }, + { + "epoch": 0.26996511932774225, + "grad_norm": 0.8398643732070923, + "learning_rate": 1.3732747497698655e-05, + "loss": 0.5582, + "step": 20336 + }, + { + "epoch": 0.27037665152183943, + "grad_norm": 0.9261983036994934, + "learning_rate": 1.3657841747655038e-05, + "loss": 0.5534, + "step": 20367 + }, + { + "epoch": 0.2707881837159366, + "grad_norm": 0.9387017488479614, + "learning_rate": 1.3583063992325706e-05, + "loss": 0.5536, + "step": 20398 + }, + { + "epoch": 0.2711997159100338, + "grad_norm": 0.9599831700325012, + "learning_rate": 1.3508415075567496e-05, + "loss": 0.552, + "step": 20429 + }, + { + "epoch": 0.2716112481041309, + "grad_norm": 0.8976027965545654, + "learning_rate": 1.343389583978327e-05, + "loss": 0.5582, + "step": 20460 + }, + { + "epoch": 0.2720227802982281, + "grad_norm": 0.890676736831665, + "learning_rate": 1.3359507125912468e-05, + "loss": 0.5597, + "step": 20491 + }, + { + "epoch": 0.27243431249232525, + "grad_norm": 0.8950179219245911, + "learning_rate": 1.3285249773421627e-05, + "loss": 0.5526, + "step": 20522 + }, + { + "epoch": 0.2728458446864224, + "grad_norm": 0.9863470792770386, + "learning_rate": 1.3211124620294884e-05, + "loss": 0.5497, + "step": 20553 + }, + { + "epoch": 0.2732573768805196, + "grad_norm": 0.9474931359291077, + "learning_rate": 1.313713250302451e-05, + "loss": 0.5506, + "step": 20584 + }, + { + "epoch": 0.2736689090746168, + "grad_norm": 0.9262164831161499, + "learning_rate": 1.3063274256601479e-05, + "loss": 0.5517, + "step": 20615 + }, + { + "epoch": 0.27408044126871395, + "grad_norm": 0.8490736484527588, + "learning_rate": 1.2989550714506086e-05, + "loss": 0.546, + "step": 20646 + }, + { + "epoch": 0.2744919734628111, + "grad_norm": 0.8594829440116882, + "learning_rate": 1.291596270869846e-05, + "loss": 0.554, + "step": 20677 + }, + { + "epoch": 0.27490350565690824, + "grad_norm": 0.8383352756500244, + "learning_rate": 1.284251106960927e-05, + "loss": 0.5513, + "step": 20708 + }, + { + "epoch": 0.2753150378510054, + "grad_norm": 0.8765247464179993, + "learning_rate": 1.2769196626130263e-05, + "loss": 0.5462, + "step": 20739 + }, + { + "epoch": 0.2757265700451026, + "grad_norm": 0.856604814529419, + "learning_rate": 1.2696020205604969e-05, + "loss": 0.562, + "step": 20770 + }, + { + "epoch": 0.27613810223919977, + "grad_norm": 0.8549590706825256, + "learning_rate": 1.2622982633819359e-05, + "loss": 0.5518, + "step": 20801 + }, + { + "epoch": 0.27654963443329694, + "grad_norm": 0.9898308515548706, + "learning_rate": 1.2550084734992484e-05, + "loss": 0.5499, + "step": 20832 + }, + { + "epoch": 0.2769611666273941, + "grad_norm": 0.8919757604598999, + "learning_rate": 1.247732733176724e-05, + "loss": 0.55, + "step": 20863 + }, + { + "epoch": 0.2773726988214913, + "grad_norm": 0.8670758008956909, + "learning_rate": 1.2404711245201044e-05, + "loss": 0.5549, + "step": 20894 + }, + { + "epoch": 0.27778423101558847, + "grad_norm": 0.822809636592865, + "learning_rate": 1.2332237294756535e-05, + "loss": 0.5445, + "step": 20925 + }, + { + "epoch": 0.2781957632096856, + "grad_norm": 0.8837505578994751, + "learning_rate": 1.225990629829241e-05, + "loss": 0.5543, + "step": 20956 + }, + { + "epoch": 0.27860729540378276, + "grad_norm": 0.8370216488838196, + "learning_rate": 1.2187719072054136e-05, + "loss": 0.5463, + "step": 20987 + }, + { + "epoch": 0.27901882759787994, + "grad_norm": 0.8596381545066833, + "learning_rate": 1.2115676430664735e-05, + "loss": 0.556, + "step": 21018 + }, + { + "epoch": 0.2794303597919771, + "grad_norm": 0.9435930848121643, + "learning_rate": 1.2043779187115647e-05, + "loss": 0.5539, + "step": 21049 + }, + { + "epoch": 0.2798418919860743, + "grad_norm": 0.8696517944335938, + "learning_rate": 1.1972028152757476e-05, + "loss": 0.5573, + "step": 21080 + }, + { + "epoch": 0.28025342418017146, + "grad_norm": 0.9277540445327759, + "learning_rate": 1.1900424137290889e-05, + "loss": 0.5557, + "step": 21111 + }, + { + "epoch": 0.28066495637426864, + "grad_norm": 0.8744814395904541, + "learning_rate": 1.1828967948757482e-05, + "loss": 0.5568, + "step": 21142 + }, + { + "epoch": 0.2810764885683658, + "grad_norm": 1.0164190530776978, + "learning_rate": 1.175766039353062e-05, + "loss": 0.5433, + "step": 21173 + }, + { + "epoch": 0.28148802076246293, + "grad_norm": 0.8906095623970032, + "learning_rate": 1.1686502276306382e-05, + "loss": 0.5493, + "step": 21204 + }, + { + "epoch": 0.2818995529565601, + "grad_norm": 0.8932943940162659, + "learning_rate": 1.1615494400094445e-05, + "loss": 0.551, + "step": 21235 + }, + { + "epoch": 0.2823110851506573, + "grad_norm": 0.9328072667121887, + "learning_rate": 1.1544637566209029e-05, + "loss": 0.5543, + "step": 21266 + }, + { + "epoch": 0.28272261734475446, + "grad_norm": 0.8685097694396973, + "learning_rate": 1.1473932574259886e-05, + "loss": 0.5618, + "step": 21297 + }, + { + "epoch": 0.28313414953885163, + "grad_norm": 0.8566640615463257, + "learning_rate": 1.1403380222143247e-05, + "loss": 0.5512, + "step": 21328 + }, + { + "epoch": 0.2835456817329488, + "grad_norm": 0.8968601226806641, + "learning_rate": 1.1332981306032808e-05, + "loss": 0.5429, + "step": 21359 + }, + { + "epoch": 0.283957213927046, + "grad_norm": 0.8937885761260986, + "learning_rate": 1.1262736620370762e-05, + "loss": 0.5522, + "step": 21390 + }, + { + "epoch": 0.28436874612114316, + "grad_norm": 0.9389865398406982, + "learning_rate": 1.1192646957858854e-05, + "loss": 0.5566, + "step": 21421 + }, + { + "epoch": 0.2847802783152403, + "grad_norm": 0.9788251519203186, + "learning_rate": 1.1122713109449381e-05, + "loss": 0.5532, + "step": 21452 + }, + { + "epoch": 0.28519181050933745, + "grad_norm": 0.8652181029319763, + "learning_rate": 1.105293586433634e-05, + "loss": 0.5524, + "step": 21483 + }, + { + "epoch": 0.2856033427034346, + "grad_norm": 0.9210936427116394, + "learning_rate": 1.0983316009946446e-05, + "loss": 0.5529, + "step": 21514 + }, + { + "epoch": 0.2860148748975318, + "grad_norm": 0.9828045964241028, + "learning_rate": 1.0913854331930282e-05, + "loss": 0.5513, + "step": 21545 + }, + { + "epoch": 0.286426407091629, + "grad_norm": 0.868962287902832, + "learning_rate": 1.0844551614153456e-05, + "loss": 0.5467, + "step": 21576 + }, + { + "epoch": 0.28683793928572615, + "grad_norm": 0.8329687714576721, + "learning_rate": 1.0775408638687725e-05, + "loss": 0.5542, + "step": 21607 + }, + { + "epoch": 0.2872494714798233, + "grad_norm": 0.7887142300605774, + "learning_rate": 1.0706426185802165e-05, + "loss": 0.5417, + "step": 21638 + }, + { + "epoch": 0.2876610036739205, + "grad_norm": 0.8512480854988098, + "learning_rate": 1.0637605033954371e-05, + "loss": 0.5538, + "step": 21669 + }, + { + "epoch": 0.2880725358680176, + "grad_norm": 0.9043695330619812, + "learning_rate": 1.05689459597817e-05, + "loss": 0.5494, + "step": 21700 + }, + { + "epoch": 0.2884840680621148, + "grad_norm": 0.9565821886062622, + "learning_rate": 1.050044973809246e-05, + "loss": 0.5489, + "step": 21731 + }, + { + "epoch": 0.28889560025621197, + "grad_norm": 0.8471581935882568, + "learning_rate": 1.043211714185722e-05, + "loss": 0.5512, + "step": 21762 + }, + { + "epoch": 0.28930713245030915, + "grad_norm": 0.9377114176750183, + "learning_rate": 1.036394894220003e-05, + "loss": 0.5526, + "step": 21793 + }, + { + "epoch": 0.2897186646444063, + "grad_norm": 0.9441999793052673, + "learning_rate": 1.0295945908389751e-05, + "loss": 0.5503, + "step": 21824 + }, + { + "epoch": 0.2901301968385035, + "grad_norm": 0.9086009860038757, + "learning_rate": 1.0228108807831393e-05, + "loss": 0.545, + "step": 21855 + }, + { + "epoch": 0.29054172903260067, + "grad_norm": 0.889674961566925, + "learning_rate": 1.01604384060574e-05, + "loss": 0.5569, + "step": 21886 + }, + { + "epoch": 0.29095326122669785, + "grad_norm": 0.8675930500030518, + "learning_rate": 1.009293546671907e-05, + "loss": 0.5547, + "step": 21917 + }, + { + "epoch": 0.29136479342079497, + "grad_norm": 0.8342081904411316, + "learning_rate": 1.002560075157791e-05, + "loss": 0.5461, + "step": 21948 + }, + { + "epoch": 0.29177632561489214, + "grad_norm": 0.9048583507537842, + "learning_rate": 9.958435020496995e-06, + "loss": 0.5457, + "step": 21979 + }, + { + "epoch": 0.2921878578089893, + "grad_norm": 0.9375602602958679, + "learning_rate": 9.89143903143249e-06, + "loss": 0.5568, + "step": 22010 + }, + { + "epoch": 0.2925993900030865, + "grad_norm": 0.8803778886795044, + "learning_rate": 9.824613540425038e-06, + "loss": 0.5503, + "step": 22041 + }, + { + "epoch": 0.29301092219718367, + "grad_norm": 0.8693305850028992, + "learning_rate": 9.757959301591197e-06, + "loss": 0.5485, + "step": 22072 + }, + { + "epoch": 0.29342245439128084, + "grad_norm": 0.8868476748466492, + "learning_rate": 9.691477067115017e-06, + "loss": 0.5588, + "step": 22103 + }, + { + "epoch": 0.293833986585378, + "grad_norm": 0.8572340607643127, + "learning_rate": 9.625167587239467e-06, + "loss": 0.5507, + "step": 22134 + }, + { + "epoch": 0.2942455187794752, + "grad_norm": 0.922905445098877, + "learning_rate": 9.559031610258007e-06, + "loss": 0.5478, + "step": 22165 + }, + { + "epoch": 0.2946570509735723, + "grad_norm": 0.8140031695365906, + "learning_rate": 9.493069882506164e-06, + "loss": 0.5467, + "step": 22196 + }, + { + "epoch": 0.2950685831676695, + "grad_norm": 0.8945645093917847, + "learning_rate": 9.427283148353056e-06, + "loss": 0.5478, + "step": 22227 + }, + { + "epoch": 0.29548011536176666, + "grad_norm": 0.8615440726280212, + "learning_rate": 9.361672150193052e-06, + "loss": 0.5522, + "step": 22258 + }, + { + "epoch": 0.29589164755586383, + "grad_norm": 0.8636476993560791, + "learning_rate": 9.29623762843734e-06, + "loss": 0.548, + "step": 22289 + }, + { + "epoch": 0.296303179749961, + "grad_norm": 0.842241108417511, + "learning_rate": 9.230980321505594e-06, + "loss": 0.5467, + "step": 22320 + }, + { + "epoch": 0.2967147119440582, + "grad_norm": 0.8380717039108276, + "learning_rate": 9.165900965817668e-06, + "loss": 0.5438, + "step": 22351 + }, + { + "epoch": 0.29712624413815536, + "grad_norm": 0.8693488240242004, + "learning_rate": 9.101000295785245e-06, + "loss": 0.5523, + "step": 22382 + }, + { + "epoch": 0.29753777633225253, + "grad_norm": 0.9079211354255676, + "learning_rate": 9.036279043803565e-06, + "loss": 0.5499, + "step": 22413 + }, + { + "epoch": 0.29794930852634965, + "grad_norm": 0.7630789875984192, + "learning_rate": 8.971737940243147e-06, + "loss": 0.5529, + "step": 22444 + }, + { + "epoch": 0.29836084072044683, + "grad_norm": 0.886512279510498, + "learning_rate": 8.907377713441592e-06, + "loss": 0.5463, + "step": 22475 + }, + { + "epoch": 0.298772372914544, + "grad_norm": 0.7641420364379883, + "learning_rate": 8.843199089695293e-06, + "loss": 0.5389, + "step": 22506 + }, + { + "epoch": 0.2991839051086412, + "grad_norm": 0.8912153244018555, + "learning_rate": 8.779202793251311e-06, + "loss": 0.5457, + "step": 22537 + }, + { + "epoch": 0.29959543730273835, + "grad_norm": 0.9104102849960327, + "learning_rate": 8.715389546299149e-06, + "loss": 0.5482, + "step": 22568 + }, + { + "epoch": 0.30000696949683553, + "grad_norm": 0.9206966161727905, + "learning_rate": 8.651760068962617e-06, + "loss": 0.5481, + "step": 22599 + }, + { + "epoch": 0.3004185016909327, + "grad_norm": 0.8885296583175659, + "learning_rate": 8.588315079291733e-06, + "loss": 0.5462, + "step": 22630 + }, + { + "epoch": 0.3008300338850299, + "grad_norm": 0.8395354747772217, + "learning_rate": 8.52505529325457e-06, + "loss": 0.5463, + "step": 22661 + }, + { + "epoch": 0.301241566079127, + "grad_norm": 0.9492244124412537, + "learning_rate": 8.461981424729216e-06, + "loss": 0.5515, + "step": 22692 + }, + { + "epoch": 0.3016530982732242, + "grad_norm": 0.8974335789680481, + "learning_rate": 8.399094185495725e-06, + "loss": 0.5512, + "step": 22723 + }, + { + "epoch": 0.30206463046732135, + "grad_norm": 0.8450007438659668, + "learning_rate": 8.336394285228017e-06, + "loss": 0.549, + "step": 22754 + }, + { + "epoch": 0.3024761626614185, + "grad_norm": 0.8842496275901794, + "learning_rate": 8.273882431485952e-06, + "loss": 0.5426, + "step": 22785 + }, + { + "epoch": 0.3028876948555157, + "grad_norm": 0.875068724155426, + "learning_rate": 8.211559329707316e-06, + "loss": 0.5478, + "step": 22816 + }, + { + "epoch": 0.3032992270496129, + "grad_norm": 0.8835846185684204, + "learning_rate": 8.149425683199823e-06, + "loss": 0.5521, + "step": 22847 + }, + { + "epoch": 0.30371075924371005, + "grad_norm": 0.9190506935119629, + "learning_rate": 8.08748219313325e-06, + "loss": 0.55, + "step": 22878 + }, + { + "epoch": 0.3041222914378072, + "grad_norm": 0.9006677269935608, + "learning_rate": 8.025729558531453e-06, + "loss": 0.5405, + "step": 22909 + }, + { + "epoch": 0.30453382363190434, + "grad_norm": 0.9302480220794678, + "learning_rate": 7.964168476264508e-06, + "loss": 0.5591, + "step": 22940 + }, + { + "epoch": 0.3049453558260015, + "grad_norm": 0.9019137024879456, + "learning_rate": 7.902799641040884e-06, + "loss": 0.5521, + "step": 22971 + }, + { + "epoch": 0.3053568880200987, + "grad_norm": 0.9111758470535278, + "learning_rate": 7.841623745399523e-06, + "loss": 0.5494, + "step": 23002 + }, + { + "epoch": 0.30576842021419587, + "grad_norm": 0.8428525924682617, + "learning_rate": 7.780641479702114e-06, + "loss": 0.5395, + "step": 23033 + }, + { + "epoch": 0.30617995240829304, + "grad_norm": 0.8785557150840759, + "learning_rate": 7.719853532125227e-06, + "loss": 0.5401, + "step": 23064 + }, + { + "epoch": 0.3065914846023902, + "grad_norm": 0.893214225769043, + "learning_rate": 7.65926058865258e-06, + "loss": 0.5503, + "step": 23095 + }, + { + "epoch": 0.3070030167964874, + "grad_norm": 0.9266390800476074, + "learning_rate": 7.598863333067313e-06, + "loss": 0.5415, + "step": 23126 + }, + { + "epoch": 0.30741454899058457, + "grad_norm": 0.839297354221344, + "learning_rate": 7.538662446944253e-06, + "loss": 0.5501, + "step": 23157 + }, + { + "epoch": 0.3078260811846817, + "grad_norm": 0.8542027473449707, + "learning_rate": 7.478658609642211e-06, + "loss": 0.5522, + "step": 23188 + }, + { + "epoch": 0.30823761337877886, + "grad_norm": 0.9187499284744263, + "learning_rate": 7.418852498296327e-06, + "loss": 0.5499, + "step": 23219 + }, + { + "epoch": 0.30864914557287604, + "grad_norm": 0.8777310252189636, + "learning_rate": 7.359244787810457e-06, + "loss": 0.5475, + "step": 23250 + }, + { + "epoch": 0.3090606777669732, + "grad_norm": 0.859711229801178, + "learning_rate": 7.299836150849493e-06, + "loss": 0.5487, + "step": 23281 + }, + { + "epoch": 0.3094722099610704, + "grad_norm": 0.8032732009887695, + "learning_rate": 7.240627257831847e-06, + "loss": 0.545, + "step": 23312 + }, + { + "epoch": 0.30988374215516756, + "grad_norm": 0.7653436064720154, + "learning_rate": 7.1816187769218195e-06, + "loss": 0.5536, + "step": 23343 + }, + { + "epoch": 0.31029527434926474, + "grad_norm": 0.8717504739761353, + "learning_rate": 7.1228113740220895e-06, + "loss": 0.5411, + "step": 23374 + }, + { + "epoch": 0.3107068065433619, + "grad_norm": 0.8277837634086609, + "learning_rate": 7.064205712766226e-06, + "loss": 0.5503, + "step": 23405 + }, + { + "epoch": 0.31111833873745903, + "grad_norm": 0.781703770160675, + "learning_rate": 7.005802454511129e-06, + "loss": 0.5464, + "step": 23436 + }, + { + "epoch": 0.3115298709315562, + "grad_norm": 0.8515232801437378, + "learning_rate": 6.947602258329639e-06, + "loss": 0.5556, + "step": 23467 + }, + { + "epoch": 0.3119414031256534, + "grad_norm": 0.9343826770782471, + "learning_rate": 6.889605781003078e-06, + "loss": 0.5514, + "step": 23498 + }, + { + "epoch": 0.31235293531975056, + "grad_norm": 0.9239291548728943, + "learning_rate": 6.831813677013776e-06, + "loss": 0.5414, + "step": 23529 + }, + { + "epoch": 0.31276446751384773, + "grad_norm": 0.8628037571907043, + "learning_rate": 6.774226598537792e-06, + "loss": 0.5486, + "step": 23560 + }, + { + "epoch": 0.3131759997079449, + "grad_norm": 0.8857805728912354, + "learning_rate": 6.716845195437482e-06, + "loss": 0.5426, + "step": 23591 + }, + { + "epoch": 0.3135875319020421, + "grad_norm": 0.8357077836990356, + "learning_rate": 6.659670115254168e-06, + "loss": 0.5494, + "step": 23622 + }, + { + "epoch": 0.31399906409613926, + "grad_norm": 0.8405023813247681, + "learning_rate": 6.602702003200872e-06, + "loss": 0.547, + "step": 23653 + }, + { + "epoch": 0.3144105962902364, + "grad_norm": 0.9647945165634155, + "learning_rate": 6.545941502154992e-06, + "loss": 0.5479, + "step": 23684 + }, + { + "epoch": 0.31482212848433355, + "grad_norm": 0.9345009326934814, + "learning_rate": 6.489389252651057e-06, + "loss": 0.542, + "step": 23715 + }, + { + "epoch": 0.3152336606784307, + "grad_norm": 0.9495857954025269, + "learning_rate": 6.4330458928735325e-06, + "loss": 0.547, + "step": 23746 + }, + { + "epoch": 0.3156451928725279, + "grad_norm": 0.888819694519043, + "learning_rate": 6.376912058649559e-06, + "loss": 0.5417, + "step": 23777 + }, + { + "epoch": 0.3160567250666251, + "grad_norm": 0.8969824910163879, + "learning_rate": 6.320988383441845e-06, + "loss": 0.5467, + "step": 23808 + }, + { + "epoch": 0.31646825726072225, + "grad_norm": 0.8562204241752625, + "learning_rate": 6.265275498341452e-06, + "loss": 0.5507, + "step": 23839 + }, + { + "epoch": 0.3168797894548194, + "grad_norm": 0.9174118638038635, + "learning_rate": 6.209774032060714e-06, + "loss": 0.5508, + "step": 23870 + }, + { + "epoch": 0.3172913216489166, + "grad_norm": 0.872319221496582, + "learning_rate": 6.1544846109261365e-06, + "loss": 0.5468, + "step": 23901 + }, + { + "epoch": 0.3177028538430137, + "grad_norm": 0.8186289668083191, + "learning_rate": 6.099407858871342e-06, + "loss": 0.5433, + "step": 23932 + }, + { + "epoch": 0.3181143860371109, + "grad_norm": 0.8317052125930786, + "learning_rate": 6.044544397429958e-06, + "loss": 0.555, + "step": 23963 + }, + { + "epoch": 0.31852591823120807, + "grad_norm": 0.8226687908172607, + "learning_rate": 5.989894845728708e-06, + "loss": 0.5444, + "step": 23994 + }, + { + "epoch": 0.31893745042530525, + "grad_norm": 0.9374111890792847, + "learning_rate": 5.9354598204803605e-06, + "loss": 0.5504, + "step": 24025 + }, + { + "epoch": 0.3193489826194024, + "grad_norm": 0.9114209413528442, + "learning_rate": 5.881239935976762e-06, + "loss": 0.5436, + "step": 24056 + }, + { + "epoch": 0.3197605148134996, + "grad_norm": 0.8481084704399109, + "learning_rate": 5.827235804081954e-06, + "loss": 0.5483, + "step": 24087 + }, + { + "epoch": 0.32017204700759677, + "grad_norm": 0.8786484599113464, + "learning_rate": 5.773448034225221e-06, + "loss": 0.5528, + "step": 24118 + }, + { + "epoch": 0.32058357920169395, + "grad_norm": 0.8430096507072449, + "learning_rate": 5.719877233394228e-06, + "loss": 0.5422, + "step": 24149 + }, + { + "epoch": 0.32099511139579107, + "grad_norm": 0.870892345905304, + "learning_rate": 5.666524006128191e-06, + "loss": 0.5465, + "step": 24180 + }, + { + "epoch": 0.32140664358988824, + "grad_norm": 0.924968957901001, + "learning_rate": 5.613388954511015e-06, + "loss": 0.5462, + "step": 24211 + }, + { + "epoch": 0.3218181757839854, + "grad_norm": 0.8519983887672424, + "learning_rate": 5.560472678164552e-06, + "loss": 0.5453, + "step": 24242 + }, + { + "epoch": 0.3222297079780826, + "grad_norm": 0.8353081345558167, + "learning_rate": 5.507775774241775e-06, + "loss": 0.5429, + "step": 24273 + }, + { + "epoch": 0.32264124017217977, + "grad_norm": 0.9054728746414185, + "learning_rate": 5.4552988374200945e-06, + "loss": 0.5519, + "step": 24304 + }, + { + "epoch": 0.32305277236627694, + "grad_norm": 0.9061859250068665, + "learning_rate": 5.403042459894597e-06, + "loss": 0.5458, + "step": 24335 + }, + { + "epoch": 0.3234643045603741, + "grad_norm": 0.7874587774276733, + "learning_rate": 5.3510072313714135e-06, + "loss": 0.5478, + "step": 24366 + }, + { + "epoch": 0.3238758367544713, + "grad_norm": 0.8362119793891907, + "learning_rate": 5.2991937390610205e-06, + "loss": 0.5465, + "step": 24397 + }, + { + "epoch": 0.3242873689485684, + "grad_norm": 0.8235510587692261, + "learning_rate": 5.247602567671625e-06, + "loss": 0.5473, + "step": 24428 + }, + { + "epoch": 0.3246989011426656, + "grad_norm": 0.8052466511726379, + "learning_rate": 5.196234299402603e-06, + "loss": 0.5463, + "step": 24459 + }, + { + "epoch": 0.32511043333676276, + "grad_norm": 0.8947206139564514, + "learning_rate": 5.145089513937865e-06, + "loss": 0.5472, + "step": 24490 + }, + { + "epoch": 0.32552196553085994, + "grad_norm": 0.8857723474502563, + "learning_rate": 5.094168788439369e-06, + "loss": 0.5536, + "step": 24521 + }, + { + "epoch": 0.3259334977249571, + "grad_norm": 0.8428781032562256, + "learning_rate": 5.043472697540594e-06, + "loss": 0.5385, + "step": 24552 + }, + { + "epoch": 0.3263450299190543, + "grad_norm": 0.8772375583648682, + "learning_rate": 4.993001813340012e-06, + "loss": 0.5472, + "step": 24583 + }, + { + "epoch": 0.32675656211315146, + "grad_norm": 0.8726216554641724, + "learning_rate": 4.942756705394702e-06, + "loss": 0.5444, + "step": 24614 + }, + { + "epoch": 0.32716809430724864, + "grad_norm": 0.8458380699157715, + "learning_rate": 4.892737940713884e-06, + "loss": 0.5467, + "step": 24645 + }, + { + "epoch": 0.32757962650134576, + "grad_norm": 0.8258609175682068, + "learning_rate": 4.842946083752511e-06, + "loss": 0.5415, + "step": 24676 + }, + { + "epoch": 0.32799115869544293, + "grad_norm": 0.8285905122756958, + "learning_rate": 4.79338169640493e-06, + "loss": 0.5359, + "step": 24707 + }, + { + "epoch": 0.3284026908895401, + "grad_norm": 0.8557586073875427, + "learning_rate": 4.74404533799851e-06, + "loss": 0.5479, + "step": 24738 + }, + { + "epoch": 0.3288142230836373, + "grad_norm": 0.9028350710868835, + "learning_rate": 4.694937565287344e-06, + "loss": 0.5525, + "step": 24769 + }, + { + "epoch": 0.32922575527773446, + "grad_norm": 0.8268290758132935, + "learning_rate": 4.646058932445985e-06, + "loss": 0.5437, + "step": 24800 + }, + { + "epoch": 0.32963728747183163, + "grad_norm": 0.8609195947647095, + "learning_rate": 4.597409991063148e-06, + "loss": 0.5467, + "step": 24831 + }, + { + "epoch": 0.3300488196659288, + "grad_norm": 0.8118563890457153, + "learning_rate": 4.5489912901355375e-06, + "loss": 0.5356, + "step": 24862 + }, + { + "epoch": 0.330460351860026, + "grad_norm": 0.8927356004714966, + "learning_rate": 4.500803376061608e-06, + "loss": 0.5473, + "step": 24893 + }, + { + "epoch": 0.3308718840541231, + "grad_norm": 0.8378262519836426, + "learning_rate": 4.45284679263541e-06, + "loss": 0.5441, + "step": 24924 + }, + { + "epoch": 0.3312834162482203, + "grad_norm": 0.8822687268257141, + "learning_rate": 4.4051220810404775e-06, + "loss": 0.5383, + "step": 24955 + }, + { + "epoch": 0.33169494844231745, + "grad_norm": 0.8355059623718262, + "learning_rate": 4.3576297798437025e-06, + "loss": 0.5503, + "step": 24986 + }, + { + "epoch": 0.3321064806364146, + "grad_norm": 0.8296234011650085, + "learning_rate": 4.3103704249892436e-06, + "loss": 0.5453, + "step": 25017 + }, + { + "epoch": 0.3325180128305118, + "grad_norm": 0.9190348982810974, + "learning_rate": 4.263344549792487e-06, + "loss": 0.5459, + "step": 25048 + }, + { + "epoch": 0.332929545024609, + "grad_norm": 0.8785108923912048, + "learning_rate": 4.216552684934056e-06, + "loss": 0.5501, + "step": 25079 + }, + { + "epoch": 0.33334107721870615, + "grad_norm": 0.7996092438697815, + "learning_rate": 4.169995358453777e-06, + "loss": 0.5502, + "step": 25110 + }, + { + "epoch": 0.3337526094128033, + "grad_norm": 0.8523440957069397, + "learning_rate": 4.123673095744757e-06, + "loss": 0.5525, + "step": 25141 + }, + { + "epoch": 0.33416414160690044, + "grad_norm": 0.8970717191696167, + "learning_rate": 4.077586419547435e-06, + "loss": 0.5426, + "step": 25172 + }, + { + "epoch": 0.3345756738009976, + "grad_norm": 0.8940107822418213, + "learning_rate": 4.03173584994368e-06, + "loss": 0.5452, + "step": 25203 + }, + { + "epoch": 0.3349872059950948, + "grad_norm": 0.9721015691757202, + "learning_rate": 3.986121904350948e-06, + "loss": 0.543, + "step": 25234 + }, + { + "epoch": 0.33539873818919197, + "grad_norm": 0.8175463676452637, + "learning_rate": 3.940745097516407e-06, + "loss": 0.5425, + "step": 25265 + }, + { + "epoch": 0.33581027038328914, + "grad_norm": 0.9015626311302185, + "learning_rate": 3.89560594151116e-06, + "loss": 0.548, + "step": 25296 + }, + { + "epoch": 0.3362218025773863, + "grad_norm": 0.8499545454978943, + "learning_rate": 3.850704945724456e-06, + "loss": 0.5455, + "step": 25327 + }, + { + "epoch": 0.3366333347714835, + "grad_norm": 0.8446874618530273, + "learning_rate": 3.8060426168579077e-06, + "loss": 0.548, + "step": 25358 + }, + { + "epoch": 0.33704486696558067, + "grad_norm": 0.905511736869812, + "learning_rate": 3.7616194589198407e-06, + "loss": 0.5489, + "step": 25389 + }, + { + "epoch": 0.3374563991596778, + "grad_norm": 0.8190325498580933, + "learning_rate": 3.7174359732195574e-06, + "loss": 0.5405, + "step": 25420 + }, + { + "epoch": 0.33786793135377496, + "grad_norm": 0.9458757042884827, + "learning_rate": 3.673492658361677e-06, + "loss": 0.5457, + "step": 25451 + }, + { + "epoch": 0.33827946354787214, + "grad_norm": 0.8517038226127625, + "learning_rate": 3.6297900102405467e-06, + "loss": 0.5434, + "step": 25482 + }, + { + "epoch": 0.3386909957419693, + "grad_norm": 0.8938561677932739, + "learning_rate": 3.586328522034607e-06, + "loss": 0.5418, + "step": 25513 + }, + { + "epoch": 0.3391025279360665, + "grad_norm": 0.9029353857040405, + "learning_rate": 3.543108684200838e-06, + "loss": 0.5453, + "step": 25544 + }, + { + "epoch": 0.33951406013016366, + "grad_norm": 0.8130074143409729, + "learning_rate": 3.5001309844692464e-06, + "loss": 0.5481, + "step": 25575 + }, + { + "epoch": 0.33992559232426084, + "grad_norm": 0.7943762540817261, + "learning_rate": 3.4573959078373215e-06, + "loss": 0.533, + "step": 25606 + }, + { + "epoch": 0.340337124518358, + "grad_norm": 0.8528238534927368, + "learning_rate": 3.4149039365646063e-06, + "loss": 0.5482, + "step": 25637 + }, + { + "epoch": 0.34074865671245513, + "grad_norm": 0.8990415930747986, + "learning_rate": 3.3726555501672143e-06, + "loss": 0.5424, + "step": 25668 + }, + { + "epoch": 0.3411601889065523, + "grad_norm": 0.8352187275886536, + "learning_rate": 3.33065122541244e-06, + "loss": 0.5415, + "step": 25699 + }, + { + "epoch": 0.3415717211006495, + "grad_norm": 0.8917340636253357, + "learning_rate": 3.288891436313385e-06, + "loss": 0.5449, + "step": 25730 + }, + { + "epoch": 0.34198325329474666, + "grad_norm": 0.9237558245658875, + "learning_rate": 3.2473766541235963e-06, + "loss": 0.5465, + "step": 25761 + }, + { + "epoch": 0.34239478548884383, + "grad_norm": 0.8852059245109558, + "learning_rate": 3.2061073473317466e-06, + "loss": 0.5388, + "step": 25792 + }, + { + "epoch": 0.342806317682941, + "grad_norm": 0.8427268862724304, + "learning_rate": 3.1650839816563444e-06, + "loss": 0.5467, + "step": 25823 + }, + { + "epoch": 0.3432178498770382, + "grad_norm": 0.9008413553237915, + "learning_rate": 3.1243070200405093e-06, + "loss": 0.5529, + "step": 25854 + }, + { + "epoch": 0.34362938207113536, + "grad_norm": 0.8473567962646484, + "learning_rate": 3.0837769226467e-06, + "loss": 0.5459, + "step": 25885 + }, + { + "epoch": 0.3440409142652325, + "grad_norm": 0.8102667927742004, + "learning_rate": 3.0434941468515666e-06, + "loss": 0.5348, + "step": 25916 + }, + { + "epoch": 0.34445244645932965, + "grad_norm": 0.8141647577285767, + "learning_rate": 3.003459147240753e-06, + "loss": 0.5457, + "step": 25947 + }, + { + "epoch": 0.34486397865342683, + "grad_norm": 0.9256607890129089, + "learning_rate": 2.9636723756037875e-06, + "loss": 0.5431, + "step": 25978 + }, + { + "epoch": 0.345275510847524, + "grad_norm": 0.8449942469596863, + "learning_rate": 2.9241342809289833e-06, + "loss": 0.5435, + "step": 26009 + }, + { + "epoch": 0.3456870430416212, + "grad_norm": 1.1406134366989136, + "learning_rate": 2.8848453093983594e-06, + "loss": 0.5516, + "step": 26040 + }, + { + "epoch": 0.34609857523571835, + "grad_norm": 0.8967415690422058, + "learning_rate": 2.8458059043826257e-06, + "loss": 0.5461, + "step": 26071 + }, + { + "epoch": 0.34651010742981553, + "grad_norm": 0.8100745677947998, + "learning_rate": 2.807016506436172e-06, + "loss": 0.5374, + "step": 26102 + }, + { + "epoch": 0.3469216396239127, + "grad_norm": 0.8833833336830139, + "learning_rate": 2.7684775532920566e-06, + "loss": 0.5464, + "step": 26133 + }, + { + "epoch": 0.3473331718180098, + "grad_norm": 0.8851163983345032, + "learning_rate": 2.7301894798571425e-06, + "loss": 0.5421, + "step": 26164 + }, + { + "epoch": 0.347744704012107, + "grad_norm": 0.8747218251228333, + "learning_rate": 2.6921527182071386e-06, + "loss": 0.5388, + "step": 26195 + }, + { + "epoch": 0.3481562362062042, + "grad_norm": 0.779757022857666, + "learning_rate": 2.654367697581725e-06, + "loss": 0.5466, + "step": 26226 + }, + { + "epoch": 0.34856776840030135, + "grad_norm": 0.8362371325492859, + "learning_rate": 2.6168348443797175e-06, + "loss": 0.5452, + "step": 26257 + }, + { + "epoch": 0.3489793005943985, + "grad_norm": 0.8213446736335754, + "learning_rate": 2.5795545821542757e-06, + "loss": 0.5454, + "step": 26288 + }, + { + "epoch": 0.3493908327884957, + "grad_norm": 0.9033399820327759, + "learning_rate": 2.54252733160808e-06, + "loss": 0.5475, + "step": 26319 + }, + { + "epoch": 0.3498023649825929, + "grad_norm": 0.9243888258934021, + "learning_rate": 2.5057535105886294e-06, + "loss": 0.5516, + "step": 26350 + }, + { + "epoch": 0.35021389717669005, + "grad_norm": 0.8325033783912659, + "learning_rate": 2.4692335340834953e-06, + "loss": 0.5421, + "step": 26381 + }, + { + "epoch": 0.35062542937078717, + "grad_norm": 0.8969632387161255, + "learning_rate": 2.432967814215639e-06, + "loss": 0.5396, + "step": 26412 + }, + { + "epoch": 0.35103696156488434, + "grad_norm": 0.8884950876235962, + "learning_rate": 2.396956760238794e-06, + "loss": 0.5538, + "step": 26443 + }, + { + "epoch": 0.3514484937589815, + "grad_norm": 0.8323497176170349, + "learning_rate": 2.361200778532796e-06, + "loss": 0.5413, + "step": 26474 + }, + { + "epoch": 0.3518600259530787, + "grad_norm": 0.9132872223854065, + "learning_rate": 2.325700272599049e-06, + "loss": 0.5412, + "step": 26505 + }, + { + "epoch": 0.35227155814717587, + "grad_norm": 0.899863064289093, + "learning_rate": 2.2904556430559415e-06, + "loss": 0.5451, + "step": 26536 + }, + { + "epoch": 0.35268309034127304, + "grad_norm": 0.7889094352722168, + "learning_rate": 2.2554672876343106e-06, + "loss": 0.5476, + "step": 26567 + }, + { + "epoch": 0.3530946225353702, + "grad_norm": 0.8099322319030762, + "learning_rate": 2.220735601173002e-06, + "loss": 0.5474, + "step": 26598 + }, + { + "epoch": 0.35350615472946734, + "grad_norm": 0.8513230681419373, + "learning_rate": 2.186260975614382e-06, + "loss": 0.5403, + "step": 26629 + }, + { + "epoch": 0.3539176869235645, + "grad_norm": 0.8617785573005676, + "learning_rate": 2.1520437999999034e-06, + "loss": 0.5407, + "step": 26660 + }, + { + "epoch": 0.3543292191176617, + "grad_norm": 0.8258427381515503, + "learning_rate": 2.1180844604657526e-06, + "loss": 0.547, + "step": 26691 + }, + { + "epoch": 0.35474075131175886, + "grad_norm": 0.7984808087348938, + "learning_rate": 2.084383340238455e-06, + "loss": 0.5457, + "step": 26722 + }, + { + "epoch": 0.35515228350585604, + "grad_norm": 0.9225831627845764, + "learning_rate": 2.0509408196305704e-06, + "loss": 0.5476, + "step": 26753 + }, + { + "epoch": 0.3555638156999532, + "grad_norm": 0.841090202331543, + "learning_rate": 2.017757276036403e-06, + "loss": 0.5404, + "step": 26784 + }, + { + "epoch": 0.3559753478940504, + "grad_norm": 0.8598360419273376, + "learning_rate": 1.984833083927726e-06, + "loss": 0.5551, + "step": 26815 + }, + { + "epoch": 0.35638688008814756, + "grad_norm": 0.9421056509017944, + "learning_rate": 1.952168614849581e-06, + "loss": 0.5493, + "step": 26846 + }, + { + "epoch": 0.3567984122822447, + "grad_norm": 0.807736873626709, + "learning_rate": 1.919764237416058e-06, + "loss": 0.5445, + "step": 26877 + }, + { + "epoch": 0.35720994447634186, + "grad_norm": 0.8544048070907593, + "learning_rate": 1.8876203173061463e-06, + "loss": 0.5416, + "step": 26908 + }, + { + "epoch": 0.35762147667043903, + "grad_norm": 0.897087037563324, + "learning_rate": 1.8557372172596206e-06, + "loss": 0.5405, + "step": 26939 + }, + { + "epoch": 0.3580330088645362, + "grad_norm": 0.8813446760177612, + "learning_rate": 1.8241152970729341e-06, + "loss": 0.5546, + "step": 26970 + }, + { + "epoch": 0.3584445410586334, + "grad_norm": 0.8071566820144653, + "learning_rate": 1.7927549135951572e-06, + "loss": 0.548, + "step": 27001 + }, + { + "epoch": 0.35885607325273056, + "grad_norm": 0.8715914487838745, + "learning_rate": 1.7616564207239477e-06, + "loss": 0.5376, + "step": 27032 + }, + { + "epoch": 0.35926760544682773, + "grad_norm": 0.83490389585495, + "learning_rate": 1.730820169401584e-06, + "loss": 0.5474, + "step": 27063 + }, + { + "epoch": 0.3596791376409249, + "grad_norm": 0.9507847428321838, + "learning_rate": 1.7002465076109558e-06, + "loss": 0.5471, + "step": 27094 + }, + { + "epoch": 0.360090669835022, + "grad_norm": 0.8561064004898071, + "learning_rate": 1.6699357803716898e-06, + "loss": 0.5448, + "step": 27125 + }, + { + "epoch": 0.3605022020291192, + "grad_norm": 0.8557907342910767, + "learning_rate": 1.6398883297362305e-06, + "loss": 0.5503, + "step": 27156 + }, + { + "epoch": 0.3609137342232164, + "grad_norm": 0.8815693259239197, + "learning_rate": 1.6101044947859606e-06, + "loss": 0.5505, + "step": 27187 + }, + { + "epoch": 0.36132526641731355, + "grad_norm": 0.8523679375648499, + "learning_rate": 1.5805846116274114e-06, + "loss": 0.5437, + "step": 27218 + }, + { + "epoch": 0.3617367986114107, + "grad_norm": 0.8898177742958069, + "learning_rate": 1.5513290133884611e-06, + "loss": 0.5584, + "step": 27249 + }, + { + "epoch": 0.3621483308055079, + "grad_norm": 0.8666384220123291, + "learning_rate": 1.5223380302145512e-06, + "loss": 0.5355, + "step": 27280 + }, + { + "epoch": 0.3625598629996051, + "grad_norm": 0.967224657535553, + "learning_rate": 1.4936119892649925e-06, + "loss": 0.5491, + "step": 27311 + }, + { + "epoch": 0.36297139519370225, + "grad_norm": 0.9119516015052795, + "learning_rate": 1.4651512147092482e-06, + "loss": 0.5348, + "step": 27342 + }, + { + "epoch": 0.36338292738779937, + "grad_norm": 0.9404922127723694, + "learning_rate": 1.4369560277232908e-06, + "loss": 0.539, + "step": 27373 + }, + { + "epoch": 0.36379445958189655, + "grad_norm": 0.9016281366348267, + "learning_rate": 1.409026746485978e-06, + "loss": 0.5348, + "step": 27404 + }, + { + "epoch": 0.3642059917759937, + "grad_norm": 0.8831793069839478, + "learning_rate": 1.3813636861754464e-06, + "loss": 0.5424, + "step": 27435 + }, + { + "epoch": 0.3646175239700909, + "grad_norm": 0.8272929191589355, + "learning_rate": 1.3539671589655773e-06, + "loss": 0.5426, + "step": 27466 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.026490470961932e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-27468/training_args.bin b/checkpoint-27468/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f3d87fcba007ec2448c21323b9fdb9fa6105de5f --- /dev/null +++ b/checkpoint-27468/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf5225534ee746c36dba9f28281e0b858e6e9b4a308bdaf1de4f68e79e9ee83 +size 5304 diff --git a/checkpoint-30517/config.json b/checkpoint-30517/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-30517/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-30517/generation_config.json b/checkpoint-30517/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-30517/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-30517/model-00001-of-00007.safetensors b/checkpoint-30517/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..33d4f7e14e7a8a2afbe7b7c06a024ee7f26f0cca --- /dev/null +++ b/checkpoint-30517/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf30dec03b2688f7bed8ee5bff34fb048ce268ccc24d6c1ccf12106f6bbdd988 +size 4886466168 diff --git a/checkpoint-30517/model-00002-of-00007.safetensors b/checkpoint-30517/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-30517/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-30517/model-00003-of-00007.safetensors b/checkpoint-30517/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-30517/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-30517/model-00004-of-00007.safetensors b/checkpoint-30517/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-30517/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-30517/model-00005-of-00007.safetensors b/checkpoint-30517/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-30517/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-30517/model-00006-of-00007.safetensors b/checkpoint-30517/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c8b9d6763015190745a246c62d5b2cd6f92bbe8f --- /dev/null +++ b/checkpoint-30517/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95eb8eb3fc2bb6c30768b266b4ef96f92214652eb07eb836fe234961e211ff85 +size 4999813120 diff --git a/checkpoint-30517/model-00007-of-00007.safetensors b/checkpoint-30517/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b5e21baf096d1a4fe82e5bdcf0b51239463e734f --- /dev/null +++ b/checkpoint-30517/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec98d73751d8916ceeed0ae8aa58fbd1b2a78a65b277251427c8cdd6368bd1c8 +size 2571158184 diff --git a/checkpoint-30517/model.safetensors.index.json b/checkpoint-30517/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-30517/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-30517/optimizer.pt b/checkpoint-30517/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..393217fe588bac7763028ee0487bdf33aeab8175 --- /dev/null +++ b/checkpoint-30517/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aba3dc3690e85e4ddee2fc6be3013676781b84e04b4d83f8ae44cad03f0eb63 +size 15385036334 diff --git a/checkpoint-30517/rng_state.pth b/checkpoint-30517/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-30517/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-30517/scheduler.pt b/checkpoint-30517/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d585b62f4a9bc8119a4f1d03cf2bb269e99411b --- /dev/null +++ b/checkpoint-30517/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d91c597306f26392e8da0d33a09fdcca77cfdeb5ad42248045521772fa3d64e +size 1064 diff --git a/checkpoint-30517/trainer_state.json b/checkpoint-30517/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f0f05c0c8a56187aea010a325ae8c293563e2e9d --- /dev/null +++ b/checkpoint-30517/trainer_state.json @@ -0,0 +1,6921 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.40512025700849286, + "eval_steps": 500, + "global_step": 30517, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00041153219409716807, + "grad_norm": 4.147768020629883, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.7978, + "step": 31 + }, + { + "epoch": 0.0008230643881943361, + "grad_norm": 3.117840051651001, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.7723, + "step": 62 + }, + { + "epoch": 0.0012345965822915042, + "grad_norm": 2.83072829246521, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7493, + "step": 93 + }, + { + "epoch": 0.0016461287763886723, + "grad_norm": 2.9491968154907227, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7352, + "step": 124 + }, + { + "epoch": 0.0020576609704858403, + "grad_norm": 2.798553228378296, + "learning_rate": 5.078636959370905e-06, + "loss": 0.7229, + "step": 155 + }, + { + "epoch": 0.0024691931645830084, + "grad_norm": 2.663148880004883, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7253, + "step": 186 + }, + { + "epoch": 0.0028807253586801765, + "grad_norm": 2.564692974090576, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7287, + "step": 217 + }, + { + "epoch": 0.0032922575527773446, + "grad_norm": 3.2043237686157227, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7205, + "step": 248 + }, + { + "epoch": 0.0037037897468745126, + "grad_norm": 2.6705269813537598, + "learning_rate": 9.141546526867629e-06, + "loss": 0.714, + "step": 279 + }, + { + "epoch": 0.004115321940971681, + "grad_norm": 3.432569980621338, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7217, + "step": 310 + }, + { + "epoch": 0.004526854135068849, + "grad_norm": 2.515808343887329, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7173, + "step": 341 + }, + { + "epoch": 0.004938386329166017, + "grad_norm": 2.6708128452301025, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7123, + "step": 372 + }, + { + "epoch": 0.005349918523263185, + "grad_norm": 2.472797155380249, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.7079, + "step": 403 + }, + { + "epoch": 0.005761450717360353, + "grad_norm": 2.486147880554199, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7146, + "step": 434 + }, + { + "epoch": 0.006172982911457521, + "grad_norm": 2.310777187347412, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7, + "step": 465 + }, + { + "epoch": 0.006584515105554689, + "grad_norm": 3.1831858158111572, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7042, + "step": 496 + }, + { + "epoch": 0.006996047299651857, + "grad_norm": 2.505427360534668, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.6991, + "step": 527 + }, + { + "epoch": 0.007407579493749025, + "grad_norm": 2.3627443313598633, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7045, + "step": 558 + }, + { + "epoch": 0.007819111687846193, + "grad_norm": 2.6250648498535156, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.704, + "step": 589 + }, + { + "epoch": 0.008230643881943361, + "grad_norm": 2.3151464462280273, + "learning_rate": 2.031454783748362e-05, + "loss": 0.6959, + "step": 620 + }, + { + "epoch": 0.00864217607604053, + "grad_norm": 2.0312647819519043, + "learning_rate": 2.13302752293578e-05, + "loss": 0.689, + "step": 651 + }, + { + "epoch": 0.009053708270137698, + "grad_norm": 2.718970775604248, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7031, + "step": 682 + }, + { + "epoch": 0.009465240464234866, + "grad_norm": 1.9124070405960083, + "learning_rate": 2.336173001310616e-05, + "loss": 0.6908, + "step": 713 + }, + { + "epoch": 0.009876772658332034, + "grad_norm": 2.368354082107544, + "learning_rate": 2.437745740498034e-05, + "loss": 0.6947, + "step": 744 + }, + { + "epoch": 0.010288304852429202, + "grad_norm": 1.8856632709503174, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.6876, + "step": 775 + }, + { + "epoch": 0.01069983704652637, + "grad_norm": 2.3155925273895264, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7024, + "step": 806 + }, + { + "epoch": 0.011111369240623538, + "grad_norm": 2.2516837120056152, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.695, + "step": 837 + }, + { + "epoch": 0.011522901434720706, + "grad_norm": 2.4444823265075684, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.6892, + "step": 868 + }, + { + "epoch": 0.011934433628817874, + "grad_norm": 2.4233832359313965, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.6871, + "step": 899 + }, + { + "epoch": 0.012345965822915042, + "grad_norm": 2.0262961387634277, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.6888, + "step": 930 + }, + { + "epoch": 0.01275749801701221, + "grad_norm": 2.3908143043518066, + "learning_rate": 3.148754914809961e-05, + "loss": 0.6997, + "step": 961 + }, + { + "epoch": 0.013169030211109378, + "grad_norm": 1.839417576789856, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.6826, + "step": 992 + }, + { + "epoch": 0.013580562405206546, + "grad_norm": 1.8175997734069824, + "learning_rate": 3.351900393184797e-05, + "loss": 0.6857, + "step": 1023 + }, + { + "epoch": 0.013992094599303714, + "grad_norm": 2.3080506324768066, + "learning_rate": 3.453473132372215e-05, + "loss": 0.6896, + "step": 1054 + }, + { + "epoch": 0.014403626793400882, + "grad_norm": 2.0574960708618164, + "learning_rate": 3.555045871559633e-05, + "loss": 0.6862, + "step": 1085 + }, + { + "epoch": 0.01481515898749805, + "grad_norm": 1.957221269607544, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.6796, + "step": 1116 + }, + { + "epoch": 0.015226691181595219, + "grad_norm": 2.2740609645843506, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.7082, + "step": 1147 + }, + { + "epoch": 0.015638223375692387, + "grad_norm": 2.1291615962982178, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.6852, + "step": 1178 + }, + { + "epoch": 0.016049755569789555, + "grad_norm": 2.2918620109558105, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.6821, + "step": 1209 + }, + { + "epoch": 0.016461287763886723, + "grad_norm": 1.9689708948135376, + "learning_rate": 4.062909567496724e-05, + "loss": 0.676, + "step": 1240 + }, + { + "epoch": 0.01687281995798389, + "grad_norm": 2.4405298233032227, + "learning_rate": 4.164482306684142e-05, + "loss": 0.6927, + "step": 1271 + }, + { + "epoch": 0.01728435215208106, + "grad_norm": 2.0007450580596924, + "learning_rate": 4.26605504587156e-05, + "loss": 0.6864, + "step": 1302 + }, + { + "epoch": 0.017695884346178227, + "grad_norm": 1.7013226747512817, + "learning_rate": 4.367627785058978e-05, + "loss": 0.6827, + "step": 1333 + }, + { + "epoch": 0.018107416540275395, + "grad_norm": 2.2202930450439453, + "learning_rate": 4.469200524246396e-05, + "loss": 0.6722, + "step": 1364 + }, + { + "epoch": 0.018518948734372563, + "grad_norm": 1.690030813217163, + "learning_rate": 4.570773263433814e-05, + "loss": 0.6807, + "step": 1395 + }, + { + "epoch": 0.01893048092846973, + "grad_norm": 1.5886859893798828, + "learning_rate": 4.672346002621232e-05, + "loss": 0.6867, + "step": 1426 + }, + { + "epoch": 0.0193420131225669, + "grad_norm": 1.7371172904968262, + "learning_rate": 4.77391874180865e-05, + "loss": 0.6832, + "step": 1457 + }, + { + "epoch": 0.019753545316664067, + "grad_norm": 1.8376264572143555, + "learning_rate": 4.875491480996068e-05, + "loss": 0.681, + "step": 1488 + }, + { + "epoch": 0.020165077510761235, + "grad_norm": 1.6280311346054077, + "learning_rate": 4.977064220183487e-05, + "loss": 0.6795, + "step": 1519 + }, + { + "epoch": 0.020576609704858403, + "grad_norm": 1.8016823530197144, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.6789, + "step": 1550 + }, + { + "epoch": 0.02098814189895557, + "grad_norm": 1.7355237007141113, + "learning_rate": 4.999955597496219e-05, + "loss": 0.6753, + "step": 1581 + }, + { + "epoch": 0.02139967409305274, + "grad_norm": 1.5687408447265625, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.6777, + "step": 1612 + }, + { + "epoch": 0.021811206287149908, + "grad_norm": 1.790453314781189, + "learning_rate": 4.999799067923527e-05, + "loss": 0.684, + "step": 1643 + }, + { + "epoch": 0.022222738481247076, + "grad_norm": 1.9073406457901, + "learning_rate": 4.999678487776908e-05, + "loss": 0.6793, + "step": 1674 + }, + { + "epoch": 0.022634270675344244, + "grad_norm": 1.5575100183486938, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.6794, + "step": 1705 + }, + { + "epoch": 0.023045802869441412, + "grad_norm": 2.100715160369873, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6753, + "step": 1736 + }, + { + "epoch": 0.02345733506353858, + "grad_norm": 1.9425480365753174, + "learning_rate": 4.999147503179668e-05, + "loss": 0.6755, + "step": 1767 + }, + { + "epoch": 0.023868867257635748, + "grad_norm": 1.5791170597076416, + "learning_rate": 4.998914100252672e-05, + "loss": 0.6754, + "step": 1798 + }, + { + "epoch": 0.024280399451732916, + "grad_norm": 1.5692673921585083, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6682, + "step": 1829 + }, + { + "epoch": 0.024691931645830084, + "grad_norm": 1.5757725238800049, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.6723, + "step": 1860 + }, + { + "epoch": 0.025103463839927252, + "grad_norm": 1.8000696897506714, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6647, + "step": 1891 + }, + { + "epoch": 0.02551499603402442, + "grad_norm": 1.682919979095459, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.669, + "step": 1922 + }, + { + "epoch": 0.02592652822812159, + "grad_norm": 1.6138657331466675, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6698, + "step": 1953 + }, + { + "epoch": 0.026338060422218756, + "grad_norm": 1.5016391277313232, + "learning_rate": 4.99692159912661e-05, + "loss": 0.6645, + "step": 1984 + }, + { + "epoch": 0.026749592616315924, + "grad_norm": 1.6850241422653198, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6769, + "step": 2015 + }, + { + "epoch": 0.027161124810413093, + "grad_norm": 1.521161437034607, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6599, + "step": 2046 + }, + { + "epoch": 0.02757265700451026, + "grad_norm": 1.8039467334747314, + "learning_rate": 4.995544899210594e-05, + "loss": 0.6693, + "step": 2077 + }, + { + "epoch": 0.02798418919860743, + "grad_norm": 1.688811182975769, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.661, + "step": 2108 + }, + { + "epoch": 0.028395721392704597, + "grad_norm": 1.669871211051941, + "learning_rate": 4.994486281210429e-05, + "loss": 0.6611, + "step": 2139 + }, + { + "epoch": 0.028807253586801765, + "grad_norm": 1.3887263536453247, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6677, + "step": 2170 + }, + { + "epoch": 0.029218785780898933, + "grad_norm": 1.3242331743240356, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6537, + "step": 2201 + }, + { + "epoch": 0.0296303179749961, + "grad_norm": 1.6638622283935547, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6653, + "step": 2232 + }, + { + "epoch": 0.03004185016909327, + "grad_norm": 1.5110372304916382, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6615, + "step": 2263 + }, + { + "epoch": 0.030453382363190437, + "grad_norm": 1.5244065523147583, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.6552, + "step": 2294 + }, + { + "epoch": 0.030864914557287605, + "grad_norm": 1.416624903678894, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6569, + "step": 2325 + }, + { + "epoch": 0.03127644675138477, + "grad_norm": 1.4467304944992065, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6613, + "step": 2356 + }, + { + "epoch": 0.03168797894548194, + "grad_norm": 1.4649864435195923, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6604, + "step": 2387 + }, + { + "epoch": 0.03209951113957911, + "grad_norm": 1.6256624460220337, + "learning_rate": 4.988329891293693e-05, + "loss": 0.655, + "step": 2418 + }, + { + "epoch": 0.03251104333367628, + "grad_norm": 1.3986462354660034, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6514, + "step": 2449 + }, + { + "epoch": 0.032922575527773446, + "grad_norm": 1.4490609169006348, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.65, + "step": 2480 + }, + { + "epoch": 0.033334107721870614, + "grad_norm": 1.4979615211486816, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.6579, + "step": 2511 + }, + { + "epoch": 0.03374563991596778, + "grad_norm": 1.6314005851745605, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6604, + "step": 2542 + }, + { + "epoch": 0.03415717211006495, + "grad_norm": 1.310655951499939, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6518, + "step": 2573 + }, + { + "epoch": 0.03456870430416212, + "grad_norm": 1.4176305532455444, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6518, + "step": 2604 + }, + { + "epoch": 0.034980236498259286, + "grad_norm": 1.6330211162567139, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6474, + "step": 2635 + }, + { + "epoch": 0.035391768692356454, + "grad_norm": 1.5854465961456299, + "learning_rate": 4.980947995086024e-05, + "loss": 0.6462, + "step": 2666 + }, + { + "epoch": 0.03580330088645362, + "grad_norm": 1.2944835424423218, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6503, + "step": 2697 + }, + { + "epoch": 0.03621483308055079, + "grad_norm": 1.4056243896484375, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6414, + "step": 2728 + }, + { + "epoch": 0.03662636527464796, + "grad_norm": 1.5649598836898804, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6449, + "step": 2759 + }, + { + "epoch": 0.037037897468745126, + "grad_norm": 1.5205659866333008, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6334, + "step": 2790 + }, + { + "epoch": 0.037449429662842294, + "grad_norm": 1.3248411417007446, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6448, + "step": 2821 + }, + { + "epoch": 0.03786096185693946, + "grad_norm": 1.2402842044830322, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6476, + "step": 2852 + }, + { + "epoch": 0.03827249405103663, + "grad_norm": 1.7939503192901611, + "learning_rate": 4.973018858007122e-05, + "loss": 0.6472, + "step": 2883 + }, + { + "epoch": 0.0386840262451338, + "grad_norm": 1.2774665355682373, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6374, + "step": 2914 + }, + { + "epoch": 0.03909555843923097, + "grad_norm": 1.6384273767471313, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6623, + "step": 2945 + }, + { + "epoch": 0.039507090633328135, + "grad_norm": 1.559270977973938, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6391, + "step": 2976 + }, + { + "epoch": 0.0399186228274253, + "grad_norm": 1.6405760049819946, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.6437, + "step": 3007 + }, + { + "epoch": 0.04033015502152247, + "grad_norm": 1.3599917888641357, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6436, + "step": 3038 + }, + { + "epoch": 0.04074168721561964, + "grad_norm": 1.3080962896347046, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6331, + "step": 3069 + }, + { + "epoch": 0.04115321940971681, + "grad_norm": 1.435195803642273, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6417, + "step": 3100 + }, + { + "epoch": 0.041564751603813975, + "grad_norm": 1.342986822128296, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6391, + "step": 3131 + }, + { + "epoch": 0.04197628379791114, + "grad_norm": 1.524418592453003, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6511, + "step": 3162 + }, + { + "epoch": 0.04238781599200831, + "grad_norm": 1.4617900848388672, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6367, + "step": 3193 + }, + { + "epoch": 0.04279934818610548, + "grad_norm": 1.3968737125396729, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.6387, + "step": 3224 + }, + { + "epoch": 0.04321088038020265, + "grad_norm": 1.258406400680542, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6356, + "step": 3255 + }, + { + "epoch": 0.043622412574299815, + "grad_norm": 1.5494844913482666, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6433, + "step": 3286 + }, + { + "epoch": 0.04403394476839698, + "grad_norm": 1.2248284816741943, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6376, + "step": 3317 + }, + { + "epoch": 0.04444547696249415, + "grad_norm": 1.4243425130844116, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6341, + "step": 3348 + }, + { + "epoch": 0.04485700915659132, + "grad_norm": 1.379364252090454, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6378, + "step": 3379 + }, + { + "epoch": 0.04526854135068849, + "grad_norm": 1.3460108041763306, + "learning_rate": 4.948079823064559e-05, + "loss": 0.639, + "step": 3410 + }, + { + "epoch": 0.045680073544785656, + "grad_norm": 1.382348656654358, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6327, + "step": 3441 + }, + { + "epoch": 0.046091605738882824, + "grad_norm": 1.40754234790802, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6234, + "step": 3472 + }, + { + "epoch": 0.04650313793297999, + "grad_norm": 1.502922773361206, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6404, + "step": 3503 + }, + { + "epoch": 0.04691467012707716, + "grad_norm": 1.2896559238433838, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6378, + "step": 3534 + }, + { + "epoch": 0.04732620232117433, + "grad_norm": 1.3114830255508423, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6373, + "step": 3565 + }, + { + "epoch": 0.047737734515271496, + "grad_norm": 1.4122483730316162, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6305, + "step": 3596 + }, + { + "epoch": 0.048149266709368664, + "grad_norm": 1.2669744491577148, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6413, + "step": 3627 + }, + { + "epoch": 0.04856079890346583, + "grad_norm": 1.2965583801269531, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6364, + "step": 3658 + }, + { + "epoch": 0.048972331097563, + "grad_norm": 1.3328967094421387, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6466, + "step": 3689 + }, + { + "epoch": 0.04938386329166017, + "grad_norm": 1.6291115283966064, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6348, + "step": 3720 + }, + { + "epoch": 0.049795395485757336, + "grad_norm": 1.3224235773086548, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6256, + "step": 3751 + }, + { + "epoch": 0.050206927679854504, + "grad_norm": 1.3253369331359863, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6303, + "step": 3782 + }, + { + "epoch": 0.05061845987395167, + "grad_norm": 1.2806897163391113, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.6403, + "step": 3813 + }, + { + "epoch": 0.05102999206804884, + "grad_norm": 1.3389384746551514, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6288, + "step": 3844 + }, + { + "epoch": 0.05144152426214601, + "grad_norm": 1.251550555229187, + "learning_rate": 4.919443027766935e-05, + "loss": 0.637, + "step": 3875 + }, + { + "epoch": 0.05185305645624318, + "grad_norm": 1.258484959602356, + "learning_rate": 4.91731463569418e-05, + "loss": 0.629, + "step": 3906 + }, + { + "epoch": 0.052264588650340345, + "grad_norm": 1.3574656248092651, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6262, + "step": 3937 + }, + { + "epoch": 0.05267612084443751, + "grad_norm": 1.327051043510437, + "learning_rate": 4.912976038673786e-05, + "loss": 0.629, + "step": 3968 + }, + { + "epoch": 0.05308765303853468, + "grad_norm": 1.3813196420669556, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6308, + "step": 3999 + }, + { + "epoch": 0.05349918523263185, + "grad_norm": 1.2064168453216553, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6321, + "step": 4030 + }, + { + "epoch": 0.05391071742672902, + "grad_norm": 1.2771285772323608, + "learning_rate": 4.906263980464644e-05, + "loss": 0.622, + "step": 4061 + }, + { + "epoch": 0.054322249620826185, + "grad_norm": 1.1788181066513062, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6268, + "step": 4092 + }, + { + "epoch": 0.05473378181492335, + "grad_norm": 1.3975950479507446, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6251, + "step": 4123 + }, + { + "epoch": 0.05514531400902052, + "grad_norm": 1.2083688974380493, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6287, + "step": 4154 + }, + { + "epoch": 0.05555684620311769, + "grad_norm": 1.2795994281768799, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6302, + "step": 4185 + }, + { + "epoch": 0.05596837839721486, + "grad_norm": 1.1056169271469116, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6256, + "step": 4216 + }, + { + "epoch": 0.056379910591312025, + "grad_norm": 1.415405511856079, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6334, + "step": 4247 + }, + { + "epoch": 0.056791442785409194, + "grad_norm": 1.2858442068099976, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6248, + "step": 4278 + }, + { + "epoch": 0.05720297497950636, + "grad_norm": 1.3914793729782104, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6229, + "step": 4309 + }, + { + "epoch": 0.05761450717360353, + "grad_norm": 1.1236270666122437, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6272, + "step": 4340 + }, + { + "epoch": 0.0580260393677007, + "grad_norm": 1.393020510673523, + "learning_rate": 4.882129447892753e-05, + "loss": 0.624, + "step": 4371 + }, + { + "epoch": 0.058437571561797866, + "grad_norm": 1.3223553895950317, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6267, + "step": 4402 + }, + { + "epoch": 0.058849103755895034, + "grad_norm": 1.2308725118637085, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6255, + "step": 4433 + }, + { + "epoch": 0.0592606359499922, + "grad_norm": 1.3741412162780762, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6246, + "step": 4464 + }, + { + "epoch": 0.05967216814408937, + "grad_norm": 1.2487592697143555, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6221, + "step": 4495 + }, + { + "epoch": 0.06008370033818654, + "grad_norm": 1.3239238262176514, + "learning_rate": 4.869052379269719e-05, + "loss": 0.621, + "step": 4526 + }, + { + "epoch": 0.060495232532283706, + "grad_norm": 1.1411608457565308, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6211, + "step": 4557 + }, + { + "epoch": 0.060906764726380874, + "grad_norm": 1.1687365770339966, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6159, + "step": 4588 + }, + { + "epoch": 0.06131829692047804, + "grad_norm": 1.25679349899292, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6254, + "step": 4619 + }, + { + "epoch": 0.06172982911457521, + "grad_norm": 1.2380110025405884, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6305, + "step": 4650 + }, + { + "epoch": 0.06214136130867238, + "grad_norm": 1.3054466247558594, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6202, + "step": 4681 + }, + { + "epoch": 0.06255289350276955, + "grad_norm": 1.3628545999526978, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6192, + "step": 4712 + }, + { + "epoch": 0.06296442569686671, + "grad_norm": 1.3576844930648804, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6213, + "step": 4743 + }, + { + "epoch": 0.06337595789096388, + "grad_norm": 1.3126753568649292, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6198, + "step": 4774 + }, + { + "epoch": 0.06378749008506104, + "grad_norm": 1.2266637086868286, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6229, + "step": 4805 + }, + { + "epoch": 0.06419902227915822, + "grad_norm": 1.4964330196380615, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6264, + "step": 4836 + }, + { + "epoch": 0.06461055447325538, + "grad_norm": 1.1138426065444946, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6258, + "step": 4867 + }, + { + "epoch": 0.06502208666735255, + "grad_norm": 1.2055357694625854, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6263, + "step": 4898 + }, + { + "epoch": 0.06543361886144972, + "grad_norm": 1.2684381008148193, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6212, + "step": 4929 + }, + { + "epoch": 0.06584515105554689, + "grad_norm": 1.2323859930038452, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6253, + "step": 4960 + }, + { + "epoch": 0.06625668324964405, + "grad_norm": 1.091818928718567, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6127, + "step": 4991 + }, + { + "epoch": 0.06666821544374123, + "grad_norm": 1.2208534479141235, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6179, + "step": 5022 + }, + { + "epoch": 0.06707974763783839, + "grad_norm": 1.3426082134246826, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6246, + "step": 5053 + }, + { + "epoch": 0.06749127983193556, + "grad_norm": 1.2961252927780151, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.6149, + "step": 5084 + }, + { + "epoch": 0.06790281202603272, + "grad_norm": 1.2990977764129639, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6303, + "step": 5115 + }, + { + "epoch": 0.0683143442201299, + "grad_norm": 1.2724366188049316, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6234, + "step": 5146 + }, + { + "epoch": 0.06872587641422706, + "grad_norm": 1.2031605243682861, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6212, + "step": 5177 + }, + { + "epoch": 0.06913740860832424, + "grad_norm": 1.2580938339233398, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6144, + "step": 5208 + }, + { + "epoch": 0.0695489408024214, + "grad_norm": 1.203706979751587, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6259, + "step": 5239 + }, + { + "epoch": 0.06996047299651857, + "grad_norm": 1.421762466430664, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6221, + "step": 5270 + }, + { + "epoch": 0.07037200519061573, + "grad_norm": 1.326038122177124, + "learning_rate": 4.793722210363262e-05, + "loss": 0.6152, + "step": 5301 + }, + { + "epoch": 0.07078353738471291, + "grad_norm": 1.361939787864685, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6206, + "step": 5332 + }, + { + "epoch": 0.07119506957881007, + "grad_norm": 1.2153042554855347, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6209, + "step": 5363 + }, + { + "epoch": 0.07160660177290724, + "grad_norm": 1.3737229108810425, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6121, + "step": 5394 + }, + { + "epoch": 0.0720181339670044, + "grad_norm": 1.1781859397888184, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6183, + "step": 5425 + }, + { + "epoch": 0.07242966616110158, + "grad_norm": 1.069472312927246, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.6117, + "step": 5456 + }, + { + "epoch": 0.07284119835519874, + "grad_norm": 1.380937099456787, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6175, + "step": 5487 + }, + { + "epoch": 0.07325273054929592, + "grad_norm": 1.1551166772842407, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6164, + "step": 5518 + }, + { + "epoch": 0.07366426274339308, + "grad_norm": 1.2816351652145386, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6168, + "step": 5549 + }, + { + "epoch": 0.07407579493749025, + "grad_norm": 1.0833333730697632, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6159, + "step": 5580 + }, + { + "epoch": 0.07448732713158741, + "grad_norm": 1.2533819675445557, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6141, + "step": 5611 + }, + { + "epoch": 0.07489885932568459, + "grad_norm": 1.1354937553405762, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.612, + "step": 5642 + }, + { + "epoch": 0.07531039151978175, + "grad_norm": 1.2579649686813354, + "learning_rate": 4.751783684659e-05, + "loss": 0.6147, + "step": 5673 + }, + { + "epoch": 0.07572192371387892, + "grad_norm": 1.2830709218978882, + "learning_rate": 4.748122674890348e-05, + "loss": 0.62, + "step": 5704 + }, + { + "epoch": 0.07613345590797609, + "grad_norm": 1.2012654542922974, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6199, + "step": 5735 + }, + { + "epoch": 0.07654498810207326, + "grad_norm": 1.2926653623580933, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6104, + "step": 5766 + }, + { + "epoch": 0.07695652029617042, + "grad_norm": 1.2043049335479736, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6261, + "step": 5797 + }, + { + "epoch": 0.0773680524902676, + "grad_norm": 1.1188119649887085, + "learning_rate": 4.733225355658999e-05, + "loss": 0.6056, + "step": 5828 + }, + { + "epoch": 0.07777958468436476, + "grad_norm": 1.1217443943023682, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.6124, + "step": 5859 + }, + { + "epoch": 0.07819111687846193, + "grad_norm": 1.1623468399047852, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6191, + "step": 5890 + }, + { + "epoch": 0.0786026490725591, + "grad_norm": 1.1147449016571045, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6091, + "step": 5921 + }, + { + "epoch": 0.07901418126665627, + "grad_norm": 1.0523818731307983, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6151, + "step": 5952 + }, + { + "epoch": 0.07942571346075343, + "grad_norm": 1.3112642765045166, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6106, + "step": 5983 + }, + { + "epoch": 0.0798372456548506, + "grad_norm": 1.1637545824050903, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6131, + "step": 6014 + }, + { + "epoch": 0.08024877784894777, + "grad_norm": 1.258862018585205, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6119, + "step": 6045 + }, + { + "epoch": 0.08066031004304494, + "grad_norm": 1.0981336832046509, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6149, + "step": 6076 + }, + { + "epoch": 0.0810718422371421, + "grad_norm": 1.1576476097106934, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6076, + "step": 6107 + }, + { + "epoch": 0.08148337443123928, + "grad_norm": 1.1076856851577759, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6105, + "step": 6138 + }, + { + "epoch": 0.08189490662533644, + "grad_norm": 14.014957427978516, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6189, + "step": 6169 + }, + { + "epoch": 0.08230643881943361, + "grad_norm": 1.1413066387176514, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6038, + "step": 6200 + }, + { + "epoch": 0.08271797101353077, + "grad_norm": 1.2316724061965942, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6125, + "step": 6231 + }, + { + "epoch": 0.08312950320762795, + "grad_norm": 1.0919370651245117, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6081, + "step": 6262 + }, + { + "epoch": 0.08354103540172511, + "grad_norm": 1.139345407485962, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6078, + "step": 6293 + }, + { + "epoch": 0.08395256759582229, + "grad_norm": 1.1303651332855225, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6071, + "step": 6324 + }, + { + "epoch": 0.08436409978991945, + "grad_norm": 1.2546321153640747, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.6092, + "step": 6355 + }, + { + "epoch": 0.08477563198401662, + "grad_norm": 1.195173740386963, + "learning_rate": 4.661238099862658e-05, + "loss": 0.5998, + "step": 6386 + }, + { + "epoch": 0.08518716417811378, + "grad_norm": 1.2116323709487915, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6099, + "step": 6417 + }, + { + "epoch": 0.08559869637221096, + "grad_norm": 1.131951928138733, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.6105, + "step": 6448 + }, + { + "epoch": 0.08601022856630812, + "grad_norm": 1.2176560163497925, + "learning_rate": 4.648464661063478e-05, + "loss": 0.6045, + "step": 6479 + }, + { + "epoch": 0.0864217607604053, + "grad_norm": 1.15361750125885, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.6067, + "step": 6510 + }, + { + "epoch": 0.08683329295450246, + "grad_norm": 1.1422772407531738, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.605, + "step": 6541 + }, + { + "epoch": 0.08724482514859963, + "grad_norm": 1.250319004058838, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6069, + "step": 6572 + }, + { + "epoch": 0.08765635734269679, + "grad_norm": 1.0947929620742798, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6099, + "step": 6603 + }, + { + "epoch": 0.08806788953679397, + "grad_norm": 1.2815848588943481, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6132, + "step": 6634 + }, + { + "epoch": 0.08847942173089113, + "grad_norm": 1.196083664894104, + "learning_rate": 4.622264489304762e-05, + "loss": 0.604, + "step": 6665 + }, + { + "epoch": 0.0888909539249883, + "grad_norm": 1.1473642587661743, + "learning_rate": 4.617813681048434e-05, + "loss": 0.6081, + "step": 6696 + }, + { + "epoch": 0.08930248611908546, + "grad_norm": 1.182445764541626, + "learning_rate": 4.61333897355256e-05, + "loss": 0.6108, + "step": 6727 + }, + { + "epoch": 0.08971401831318264, + "grad_norm": 1.2859700918197632, + "learning_rate": 4.608840417313604e-05, + "loss": 0.6074, + "step": 6758 + }, + { + "epoch": 0.0901255505072798, + "grad_norm": 1.1134952306747437, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6139, + "step": 6789 + }, + { + "epoch": 0.09053708270137698, + "grad_norm": 1.2447940111160278, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6049, + "step": 6820 + }, + { + "epoch": 0.09094861489547414, + "grad_norm": 1.042465329170227, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6113, + "step": 6851 + }, + { + "epoch": 0.09136014708957131, + "grad_norm": 1.2242035865783691, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6077, + "step": 6882 + }, + { + "epoch": 0.09177167928366847, + "grad_norm": 1.2531142234802246, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6081, + "step": 6913 + }, + { + "epoch": 0.09218321147776565, + "grad_norm": 1.194645881652832, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6158, + "step": 6944 + }, + { + "epoch": 0.09259474367186281, + "grad_norm": 1.1052149534225464, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6079, + "step": 6975 + }, + { + "epoch": 0.09300627586595998, + "grad_norm": 1.1275289058685303, + "learning_rate": 4.571999560773736e-05, + "loss": 0.6107, + "step": 7006 + }, + { + "epoch": 0.09341780806005714, + "grad_norm": 1.251237154006958, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6028, + "step": 7037 + }, + { + "epoch": 0.09382934025415432, + "grad_norm": 1.1206951141357422, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6035, + "step": 7068 + }, + { + "epoch": 0.09424087244825148, + "grad_norm": 1.2242387533187866, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.5959, + "step": 7099 + }, + { + "epoch": 0.09465240464234866, + "grad_norm": 1.0749527215957642, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6005, + "step": 7130 + }, + { + "epoch": 0.09506393683644582, + "grad_norm": 1.183052659034729, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6102, + "step": 7161 + }, + { + "epoch": 0.09547546903054299, + "grad_norm": 1.1045013666152954, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6056, + "step": 7192 + }, + { + "epoch": 0.09588700122464015, + "grad_norm": 1.2442799806594849, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6024, + "step": 7223 + }, + { + "epoch": 0.09629853341873733, + "grad_norm": 1.2083353996276855, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6062, + "step": 7254 + }, + { + "epoch": 0.09671006561283449, + "grad_norm": 1.1391205787658691, + "learning_rate": 4.528766329570536e-05, + "loss": 0.5981, + "step": 7285 + }, + { + "epoch": 0.09712159780693166, + "grad_norm": 1.1213765144348145, + "learning_rate": 4.523847434837447e-05, + "loss": 0.5964, + "step": 7316 + }, + { + "epoch": 0.09753313000102883, + "grad_norm": 1.1574114561080933, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6035, + "step": 7347 + }, + { + "epoch": 0.097944662195126, + "grad_norm": 1.0101516246795654, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6036, + "step": 7378 + }, + { + "epoch": 0.09835619438922316, + "grad_norm": 1.1439381837844849, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6049, + "step": 7409 + }, + { + "epoch": 0.09876772658332034, + "grad_norm": 1.0668189525604248, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6035, + "step": 7440 + }, + { + "epoch": 0.0991792587774175, + "grad_norm": 1.2038415670394897, + "learning_rate": 4.498911497712155e-05, + "loss": 0.6062, + "step": 7471 + }, + { + "epoch": 0.09959079097151467, + "grad_norm": 1.1965882778167725, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6041, + "step": 7502 + }, + { + "epoch": 0.10000232316561183, + "grad_norm": 1.5885149240493774, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6129, + "step": 7533 + }, + { + "epoch": 0.10041385535970901, + "grad_norm": 1.058237910270691, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.5957, + "step": 7564 + }, + { + "epoch": 0.10082538755380617, + "grad_norm": 1.2043278217315674, + "learning_rate": 4.478556380141218e-05, + "loss": 0.5943, + "step": 7595 + }, + { + "epoch": 0.10123691974790335, + "grad_norm": 1.1602888107299805, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.5971, + "step": 7626 + }, + { + "epoch": 0.1016484519420005, + "grad_norm": 1.1536785364151, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.604, + "step": 7657 + }, + { + "epoch": 0.10205998413609768, + "grad_norm": 1.1859934329986572, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6043, + "step": 7688 + }, + { + "epoch": 0.10247151633019484, + "grad_norm": 1.0551954507827759, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6008, + "step": 7719 + }, + { + "epoch": 0.10288304852429202, + "grad_norm": 1.015857458114624, + "learning_rate": 4.452610552959183e-05, + "loss": 0.5961, + "step": 7750 + }, + { + "epoch": 0.10329458071838918, + "grad_norm": 1.0759170055389404, + "learning_rate": 4.447355047201428e-05, + "loss": 0.6025, + "step": 7781 + }, + { + "epoch": 0.10370611291248635, + "grad_norm": 1.025030255317688, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.602, + "step": 7812 + }, + { + "epoch": 0.10411764510658351, + "grad_norm": 1.0884742736816406, + "learning_rate": 4.436778168330484e-05, + "loss": 0.5989, + "step": 7843 + }, + { + "epoch": 0.10452917730068069, + "grad_norm": 1.1207304000854492, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.6016, + "step": 7874 + }, + { + "epoch": 0.10494070949477785, + "grad_norm": 0.9768222570419312, + "learning_rate": 4.42611386459262e-05, + "loss": 0.605, + "step": 7905 + }, + { + "epoch": 0.10535224168887503, + "grad_norm": 1.0720235109329224, + "learning_rate": 4.420749078676133e-05, + "loss": 0.5971, + "step": 7936 + }, + { + "epoch": 0.10576377388297219, + "grad_norm": 1.1767033338546753, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.5877, + "step": 7967 + }, + { + "epoch": 0.10617530607706936, + "grad_norm": 1.3390216827392578, + "learning_rate": 4.409954541451762e-05, + "loss": 0.5973, + "step": 7998 + }, + { + "epoch": 0.10658683827116652, + "grad_norm": 1.7948216199874878, + "learning_rate": 4.404524911958764e-05, + "loss": 0.5922, + "step": 8029 + }, + { + "epoch": 0.1069983704652637, + "grad_norm": 1.177847981452942, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6002, + "step": 8060 + }, + { + "epoch": 0.10740990265936086, + "grad_norm": 1.1014128923416138, + "learning_rate": 4.393601237573607e-05, + "loss": 0.5945, + "step": 8091 + }, + { + "epoch": 0.10782143485345803, + "grad_norm": 1.067475438117981, + "learning_rate": 4.388107315953628e-05, + "loss": 0.5996, + "step": 8122 + }, + { + "epoch": 0.1082329670475552, + "grad_norm": 1.1241830587387085, + "learning_rate": 4.382592087299212e-05, + "loss": 0.5973, + "step": 8153 + }, + { + "epoch": 0.10864449924165237, + "grad_norm": 1.072426438331604, + "learning_rate": 4.377055613848964e-05, + "loss": 0.601, + "step": 8184 + }, + { + "epoch": 0.10905603143574953, + "grad_norm": 1.0138245820999146, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6034, + "step": 8215 + }, + { + "epoch": 0.1094675636298467, + "grad_norm": 0.9993209838867188, + "learning_rate": 4.365919182713416e-05, + "loss": 0.6024, + "step": 8246 + }, + { + "epoch": 0.10987909582394387, + "grad_norm": 1.157468318939209, + "learning_rate": 4.360319350701226e-05, + "loss": 0.5987, + "step": 8277 + }, + { + "epoch": 0.11029062801804104, + "grad_norm": 1.2073496580123901, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6023, + "step": 8308 + }, + { + "epoch": 0.1107021602121382, + "grad_norm": 1.0800622701644897, + "learning_rate": 4.349056769754021e-05, + "loss": 0.5993, + "step": 8339 + }, + { + "epoch": 0.11111369240623538, + "grad_norm": 1.1365702152252197, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.595, + "step": 8370 + }, + { + "epoch": 0.11152522460033254, + "grad_norm": 1.1367030143737793, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.5921, + "step": 8401 + }, + { + "epoch": 0.11193675679442971, + "grad_norm": 1.0323926210403442, + "learning_rate": 4.332006561018488e-05, + "loss": 0.5883, + "step": 8432 + }, + { + "epoch": 0.11234828898852688, + "grad_norm": 1.2352523803710938, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.5957, + "step": 8463 + }, + { + "epoch": 0.11275982118262405, + "grad_norm": 1.1763888597488403, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.5941, + "step": 8494 + }, + { + "epoch": 0.11317135337672121, + "grad_norm": 1.1331796646118164, + "learning_rate": 4.314770288177384e-05, + "loss": 0.5993, + "step": 8525 + }, + { + "epoch": 0.11358288557081839, + "grad_norm": 1.146270990371704, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6041, + "step": 8556 + }, + { + "epoch": 0.11399441776491555, + "grad_norm": 1.0433791875839233, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.5918, + "step": 8587 + }, + { + "epoch": 0.11440594995901272, + "grad_norm": 1.2013510465621948, + "learning_rate": 4.297349701798505e-05, + "loss": 0.6031, + "step": 8618 + }, + { + "epoch": 0.11481748215310988, + "grad_norm": 1.1263506412506104, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.597, + "step": 8649 + }, + { + "epoch": 0.11522901434720706, + "grad_norm": 1.1360443830490112, + "learning_rate": 4.285634454093198e-05, + "loss": 0.5909, + "step": 8680 + }, + { + "epoch": 0.11564054654130422, + "grad_norm": 1.0095175504684448, + "learning_rate": 4.279746571169086e-05, + "loss": 0.59, + "step": 8711 + }, + { + "epoch": 0.1160520787354014, + "grad_norm": 1.0894793272018433, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.5894, + "step": 8742 + }, + { + "epoch": 0.11646361092949856, + "grad_norm": 1.1603643894195557, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.5924, + "step": 8773 + }, + { + "epoch": 0.11687514312359573, + "grad_norm": 1.177464485168457, + "learning_rate": 4.261962684116106e-05, + "loss": 0.5946, + "step": 8804 + }, + { + "epoch": 0.11728667531769289, + "grad_norm": 0.9632905125617981, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.5944, + "step": 8835 + }, + { + "epoch": 0.11769820751179007, + "grad_norm": 1.0413568019866943, + "learning_rate": 4.250007230372134e-05, + "loss": 0.5844, + "step": 8866 + }, + { + "epoch": 0.11810973970588723, + "grad_norm": 1.0879931449890137, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.588, + "step": 8897 + }, + { + "epoch": 0.1185212718999844, + "grad_norm": 1.0681732892990112, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.5832, + "step": 8928 + }, + { + "epoch": 0.11893280409408157, + "grad_norm": 1.0609339475631714, + "learning_rate": 4.231926105340768e-05, + "loss": 0.5886, + "step": 8959 + }, + { + "epoch": 0.11934433628817874, + "grad_norm": 1.4936331510543823, + "learning_rate": 4.225859883654776e-05, + "loss": 0.5868, + "step": 8990 + }, + { + "epoch": 0.1197558684822759, + "grad_norm": 1.1860368251800537, + "learning_rate": 4.219774185874569e-05, + "loss": 0.5904, + "step": 9021 + }, + { + "epoch": 0.12016740067637308, + "grad_norm": 1.1657029390335083, + "learning_rate": 4.213669080676418e-05, + "loss": 0.5872, + "step": 9052 + }, + { + "epoch": 0.12057893287047024, + "grad_norm": 1.0721909999847412, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.5977, + "step": 9083 + }, + { + "epoch": 0.12099046506456741, + "grad_norm": 1.1832354068756104, + "learning_rate": 4.201400923825648e-05, + "loss": 0.5902, + "step": 9114 + }, + { + "epoch": 0.12140199725866457, + "grad_norm": 1.1306614875793457, + "learning_rate": 4.195238010617511e-05, + "loss": 0.5931, + "step": 9145 + }, + { + "epoch": 0.12181352945276175, + "grad_norm": 0.9744802713394165, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6, + "step": 9176 + }, + { + "epoch": 0.12222506164685891, + "grad_norm": 1.0286784172058105, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.5913, + "step": 9207 + }, + { + "epoch": 0.12263659384095608, + "grad_norm": 1.0559006929397583, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.5999, + "step": 9238 + }, + { + "epoch": 0.12304812603505325, + "grad_norm": 1.0805801153182983, + "learning_rate": 4.170395751189495e-05, + "loss": 0.5929, + "step": 9269 + }, + { + "epoch": 0.12345965822915042, + "grad_norm": 1.088597059249878, + "learning_rate": 4.164137885110921e-05, + "loss": 0.5873, + "step": 9300 + }, + { + "epoch": 0.12387119042324758, + "grad_norm": 1.028403639793396, + "learning_rate": 4.157861239462495e-05, + "loss": 0.5939, + "step": 9331 + }, + { + "epoch": 0.12428272261734476, + "grad_norm": 0.9916087985038757, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.5932, + "step": 9362 + }, + { + "epoch": 0.12469425481144192, + "grad_norm": 1.0268062353134155, + "learning_rate": 4.145251892991588e-05, + "loss": 0.5945, + "step": 9393 + }, + { + "epoch": 0.1251057870055391, + "grad_norm": 1.046953797340393, + "learning_rate": 4.138919334463868e-05, + "loss": 0.5923, + "step": 9424 + }, + { + "epoch": 0.12551731919963627, + "grad_norm": 1.1010463237762451, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.5898, + "step": 9455 + }, + { + "epoch": 0.12592885139373342, + "grad_norm": 1.1851146221160889, + "learning_rate": 4.126198804133398e-05, + "loss": 0.591, + "step": 9486 + }, + { + "epoch": 0.1263403835878306, + "grad_norm": 1.106885313987732, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.5893, + "step": 9517 + }, + { + "epoch": 0.12675191578192777, + "grad_norm": 1.0527287721633911, + "learning_rate": 4.113404868280107e-05, + "loss": 0.5937, + "step": 9548 + }, + { + "epoch": 0.12716344797602494, + "grad_norm": 0.9986059069633484, + "learning_rate": 4.106980553625457e-05, + "loss": 0.5961, + "step": 9579 + }, + { + "epoch": 0.1275749801701221, + "grad_norm": 1.025026798248291, + "learning_rate": 4.100538104413674e-05, + "loss": 0.5853, + "step": 9610 + }, + { + "epoch": 0.12798651236421926, + "grad_norm": 1.062345027923584, + "learning_rate": 4.09407759334692e-05, + "loss": 0.5916, + "step": 9641 + }, + { + "epoch": 0.12839804455831644, + "grad_norm": 1.0635534524917603, + "learning_rate": 4.087599093331186e-05, + "loss": 0.5919, + "step": 9672 + }, + { + "epoch": 0.1288095767524136, + "grad_norm": 1.1224849224090576, + "learning_rate": 4.081102677475462e-05, + "loss": 0.5948, + "step": 9703 + }, + { + "epoch": 0.12922110894651076, + "grad_norm": 1.113202452659607, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.5807, + "step": 9734 + }, + { + "epoch": 0.12963264114060793, + "grad_norm": 1.0213284492492676, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.5813, + "step": 9765 + }, + { + "epoch": 0.1300441733347051, + "grad_norm": 1.1083142757415771, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.5856, + "step": 9796 + }, + { + "epoch": 0.13045570552880228, + "grad_norm": 1.1317702531814575, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.586, + "step": 9827 + }, + { + "epoch": 0.13086723772289943, + "grad_norm": 0.9876610040664673, + "learning_rate": 4.048354433517794e-05, + "loss": 0.5921, + "step": 9858 + }, + { + "epoch": 0.1312787699169966, + "grad_norm": 1.085672378540039, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.5791, + "step": 9889 + }, + { + "epoch": 0.13169030211109378, + "grad_norm": 1.014452338218689, + "learning_rate": 4.035132306369438e-05, + "loss": 0.5921, + "step": 9920 + }, + { + "epoch": 0.13210183430519096, + "grad_norm": 1.059309720993042, + "learning_rate": 4.028495219804555e-05, + "loss": 0.5834, + "step": 9951 + }, + { + "epoch": 0.1325133664992881, + "grad_norm": 1.1058080196380615, + "learning_rate": 4.021840884378864e-05, + "loss": 0.5826, + "step": 9982 + }, + { + "epoch": 0.13292489869338528, + "grad_norm": 1.0567057132720947, + "learning_rate": 4.015169375185633e-05, + "loss": 0.5924, + "step": 10013 + }, + { + "epoch": 0.13333643088748245, + "grad_norm": 1.0371674299240112, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.589, + "step": 10044 + }, + { + "epoch": 0.13374796308157963, + "grad_norm": 0.909173309803009, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.5978, + "step": 10075 + }, + { + "epoch": 0.13415949527567678, + "grad_norm": 1.0078933238983154, + "learning_rate": 3.995052558835377e-05, + "loss": 0.5814, + "step": 10106 + }, + { + "epoch": 0.13457102746977395, + "grad_norm": 0.9631708264350891, + "learning_rate": 3.988313109368017e-05, + "loss": 0.587, + "step": 10137 + }, + { + "epoch": 0.13498255966387113, + "grad_norm": 1.1884409189224243, + "learning_rate": 3.981556864489504e-05, + "loss": 0.5906, + "step": 10168 + }, + { + "epoch": 0.1353940918579683, + "grad_norm": 0.9554224014282227, + "learning_rate": 3.974783900443142e-05, + "loss": 0.5888, + "step": 10199 + }, + { + "epoch": 0.13580562405206545, + "grad_norm": 1.0510096549987793, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.5828, + "step": 10230 + }, + { + "epoch": 0.13621715624616262, + "grad_norm": 0.9769448637962341, + "learning_rate": 3.961188120762596e-05, + "loss": 0.5878, + "step": 10261 + }, + { + "epoch": 0.1366286884402598, + "grad_norm": 0.9610607028007507, + "learning_rate": 3.954365458554938e-05, + "loss": 0.5863, + "step": 10292 + }, + { + "epoch": 0.13704022063435697, + "grad_norm": 1.0102349519729614, + "learning_rate": 3.947526384030751e-05, + "loss": 0.5812, + "step": 10323 + }, + { + "epoch": 0.13745175282845412, + "grad_norm": 1.138122320175171, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.5841, + "step": 10354 + }, + { + "epoch": 0.1378632850225513, + "grad_norm": 1.0107802152633667, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.5847, + "step": 10385 + }, + { + "epoch": 0.13827481721664847, + "grad_norm": 0.9923847317695618, + "learning_rate": 3.926911459260109e-05, + "loss": 0.5971, + "step": 10416 + }, + { + "epoch": 0.13868634941074565, + "grad_norm": 1.0700715780258179, + "learning_rate": 3.920007509089102e-05, + "loss": 0.5831, + "step": 10447 + }, + { + "epoch": 0.1390978816048428, + "grad_norm": 1.1638612747192383, + "learning_rate": 3.913087534326357e-05, + "loss": 0.5849, + "step": 10478 + }, + { + "epoch": 0.13950941379893997, + "grad_norm": 1.0560393333435059, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.5897, + "step": 10509 + }, + { + "epoch": 0.13992094599303714, + "grad_norm": 0.9604248404502869, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.5862, + "step": 10540 + }, + { + "epoch": 0.14033247818713432, + "grad_norm": 1.0003983974456787, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.5905, + "step": 10571 + }, + { + "epoch": 0.14074401038123147, + "grad_norm": 1.0378546714782715, + "learning_rate": 3.885248953871491e-05, + "loss": 0.5938, + "step": 10602 + }, + { + "epoch": 0.14115554257532864, + "grad_norm": 0.9967820644378662, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.5821, + "step": 10633 + }, + { + "epoch": 0.14156707476942582, + "grad_norm": 1.029188632965088, + "learning_rate": 3.871235554965218e-05, + "loss": 0.5882, + "step": 10664 + }, + { + "epoch": 0.141978606963523, + "grad_norm": 1.1272863149642944, + "learning_rate": 3.864205604623078e-05, + "loss": 0.5898, + "step": 10695 + }, + { + "epoch": 0.14239013915762014, + "grad_norm": 1.0054222345352173, + "learning_rate": 3.857160259406107e-05, + "loss": 0.5873, + "step": 10726 + }, + { + "epoch": 0.1428016713517173, + "grad_norm": 1.0327833890914917, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.5842, + "step": 10757 + }, + { + "epoch": 0.1432132035458145, + "grad_norm": 1.0088285207748413, + "learning_rate": 3.843023702543556e-05, + "loss": 0.5903, + "step": 10788 + }, + { + "epoch": 0.14362473573991166, + "grad_norm": 0.9472872018814087, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.5809, + "step": 10819 + }, + { + "epoch": 0.1440362679340088, + "grad_norm": 1.1008390188217163, + "learning_rate": 3.828826522492255e-05, + "loss": 0.5775, + "step": 10850 + }, + { + "epoch": 0.14444780012810599, + "grad_norm": 1.1251379251480103, + "learning_rate": 3.821705398930713e-05, + "loss": 0.5821, + "step": 10881 + }, + { + "epoch": 0.14485933232220316, + "grad_norm": 0.9825986623764038, + "learning_rate": 3.814569360103385e-05, + "loss": 0.5749, + "step": 10912 + }, + { + "epoch": 0.14527086451630034, + "grad_norm": 0.9303480982780457, + "learning_rate": 3.807418486539499e-05, + "loss": 0.5827, + "step": 10943 + }, + { + "epoch": 0.14568239671039748, + "grad_norm": 0.9948610663414001, + "learning_rate": 3.80025285893569e-05, + "loss": 0.598, + "step": 10974 + }, + { + "epoch": 0.14609392890449466, + "grad_norm": 1.096449851989746, + "learning_rate": 3.793072558155093e-05, + "loss": 0.5749, + "step": 11005 + }, + { + "epoch": 0.14650546109859183, + "grad_norm": 1.022290587425232, + "learning_rate": 3.785877665226426e-05, + "loss": 0.5824, + "step": 11036 + }, + { + "epoch": 0.146916993292689, + "grad_norm": 1.083853840827942, + "learning_rate": 3.778668261343079e-05, + "loss": 0.5923, + "step": 11067 + }, + { + "epoch": 0.14732852548678615, + "grad_norm": 0.919503390789032, + "learning_rate": 3.771444427862192e-05, + "loss": 0.5924, + "step": 11098 + }, + { + "epoch": 0.14774005768088333, + "grad_norm": 1.0682687759399414, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.5824, + "step": 11129 + }, + { + "epoch": 0.1481515898749805, + "grad_norm": 0.9599080085754395, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.5765, + "step": 11160 + }, + { + "epoch": 0.14856312206907768, + "grad_norm": 0.968349039554596, + "learning_rate": 3.749687165842753e-05, + "loss": 0.5932, + "step": 11191 + }, + { + "epoch": 0.14897465426317483, + "grad_norm": 0.9846999049186707, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.5791, + "step": 11222 + }, + { + "epoch": 0.149386186457272, + "grad_norm": 1.0673292875289917, + "learning_rate": 3.735111675341645e-05, + "loss": 0.5803, + "step": 11253 + }, + { + "epoch": 0.14979771865136918, + "grad_norm": 1.0111202001571655, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.5837, + "step": 11284 + }, + { + "epoch": 0.15020925084546635, + "grad_norm": 1.0955816507339478, + "learning_rate": 3.720480432728287e-05, + "loss": 0.5822, + "step": 11315 + }, + { + "epoch": 0.1506207830395635, + "grad_norm": 0.9147946238517761, + "learning_rate": 3.71314411067092e-05, + "loss": 0.5873, + "step": 11346 + }, + { + "epoch": 0.15103231523366067, + "grad_norm": 0.9630762934684753, + "learning_rate": 3.70579409844715e-05, + "loss": 0.5914, + "step": 11377 + }, + { + "epoch": 0.15144384742775785, + "grad_norm": 0.9123234748840332, + "learning_rate": 3.698430479000865e-05, + "loss": 0.5784, + "step": 11408 + }, + { + "epoch": 0.15185537962185502, + "grad_norm": 1.0654667615890503, + "learning_rate": 3.691053335429509e-05, + "loss": 0.5805, + "step": 11439 + }, + { + "epoch": 0.15226691181595217, + "grad_norm": 1.111958622932434, + "learning_rate": 3.683662750983147e-05, + "loss": 0.5868, + "step": 11470 + }, + { + "epoch": 0.15267844401004935, + "grad_norm": 1.024422526359558, + "learning_rate": 3.676258809063518e-05, + "loss": 0.5904, + "step": 11501 + }, + { + "epoch": 0.15308997620414652, + "grad_norm": 1.092668056488037, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.5885, + "step": 11532 + }, + { + "epoch": 0.1535015083982437, + "grad_norm": 0.9739312529563904, + "learning_rate": 3.661411187164166e-05, + "loss": 0.5806, + "step": 11563 + }, + { + "epoch": 0.15391304059234084, + "grad_norm": 1.098699688911438, + "learning_rate": 3.65396767473784e-05, + "loss": 0.5755, + "step": 11594 + }, + { + "epoch": 0.15432457278643802, + "grad_norm": 0.9998940825462341, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.5758, + "step": 11625 + }, + { + "epoch": 0.1547361049805352, + "grad_norm": 0.9077128767967224, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.5742, + "step": 11656 + }, + { + "epoch": 0.15514763717463237, + "grad_norm": 0.9274038076400757, + "learning_rate": 3.63155933997859e-05, + "loss": 0.589, + "step": 11687 + }, + { + "epoch": 0.15555916936872952, + "grad_norm": 0.9404906034469604, + "learning_rate": 3.624064243537758e-05, + "loss": 0.5848, + "step": 11718 + }, + { + "epoch": 0.1559707015628267, + "grad_norm": 0.9896044731140137, + "learning_rate": 3.616556462184716e-05, + "loss": 0.5917, + "step": 11749 + }, + { + "epoch": 0.15638223375692387, + "grad_norm": 1.0280394554138184, + "learning_rate": 3.609036080643755e-05, + "loss": 0.5811, + "step": 11780 + }, + { + "epoch": 0.15679376595102104, + "grad_norm": 0.9462539553642273, + "learning_rate": 3.60150318378136e-05, + "loss": 0.5782, + "step": 11811 + }, + { + "epoch": 0.1572052981451182, + "grad_norm": 1.047635555267334, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.5792, + "step": 11842 + }, + { + "epoch": 0.15761683033921536, + "grad_norm": 0.9613581299781799, + "learning_rate": 3.586400184263408e-05, + "loss": 0.5847, + "step": 11873 + }, + { + "epoch": 0.15802836253331254, + "grad_norm": 0.8944469094276428, + "learning_rate": 3.578830252043148e-05, + "loss": 0.5787, + "step": 11904 + }, + { + "epoch": 0.1584398947274097, + "grad_norm": 1.0031458139419556, + "learning_rate": 3.571248145370125e-05, + "loss": 0.5827, + "step": 11935 + }, + { + "epoch": 0.15885142692150686, + "grad_norm": 0.9425063729286194, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.5899, + "step": 11966 + }, + { + "epoch": 0.15926295911560404, + "grad_norm": 1.0098518133163452, + "learning_rate": 3.556047751054378e-05, + "loss": 0.5849, + "step": 11997 + }, + { + "epoch": 0.1596744913097012, + "grad_norm": 0.8891544342041016, + "learning_rate": 3.548429634946039e-05, + "loss": 0.5772, + "step": 12028 + }, + { + "epoch": 0.16008602350379839, + "grad_norm": 1.1580991744995117, + "learning_rate": 3.540799687451768e-05, + "loss": 0.5859, + "step": 12059 + }, + { + "epoch": 0.16049755569789553, + "grad_norm": 1.046398639678955, + "learning_rate": 3.533157994674485e-05, + "loss": 0.582, + "step": 12090 + }, + { + "epoch": 0.1609090878919927, + "grad_norm": 1.0526766777038574, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.5738, + "step": 12121 + }, + { + "epoch": 0.16132062008608988, + "grad_norm": 0.9915657043457031, + "learning_rate": 3.517839718344311e-05, + "loss": 0.5867, + "step": 12152 + }, + { + "epoch": 0.16173215228018706, + "grad_norm": 0.954210102558136, + "learning_rate": 3.510163307656086e-05, + "loss": 0.578, + "step": 12183 + }, + { + "epoch": 0.1621436844742842, + "grad_norm": 0.952920138835907, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.5843, + "step": 12214 + }, + { + "epoch": 0.16255521666838138, + "grad_norm": 0.9184344410896301, + "learning_rate": 3.494776374368643e-05, + "loss": 0.5748, + "step": 12245 + }, + { + "epoch": 0.16296674886247856, + "grad_norm": 0.9797667264938354, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.5788, + "step": 12276 + }, + { + "epoch": 0.16337828105657573, + "grad_norm": 0.968781054019928, + "learning_rate": 3.479344537543164e-05, + "loss": 0.5835, + "step": 12307 + }, + { + "epoch": 0.16378981325067288, + "grad_norm": 1.0632243156433105, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.5742, + "step": 12338 + }, + { + "epoch": 0.16420134544477005, + "grad_norm": 0.9538365006446838, + "learning_rate": 3.463868493762412e-05, + "loss": 0.5737, + "step": 12369 + }, + { + "epoch": 0.16461287763886723, + "grad_norm": 0.9336950778961182, + "learning_rate": 3.456114112492418e-05, + "loss": 0.5821, + "step": 12400 + }, + { + "epoch": 0.1650244098329644, + "grad_norm": 0.9611359238624573, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.5847, + "step": 12431 + }, + { + "epoch": 0.16543594202706155, + "grad_norm": 1.0813709497451782, + "learning_rate": 3.440573068727905e-05, + "loss": 0.5805, + "step": 12462 + }, + { + "epoch": 0.16584747422115872, + "grad_norm": 0.9780453443527222, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.5761, + "step": 12493 + }, + { + "epoch": 0.1662590064152559, + "grad_norm": 0.9629074335098267, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.58, + "step": 12524 + }, + { + "epoch": 0.16667053860935307, + "grad_norm": 0.9241612553596497, + "learning_rate": 3.417182116258899e-05, + "loss": 0.5732, + "step": 12555 + }, + { + "epoch": 0.16708207080345022, + "grad_norm": 1.0314536094665527, + "learning_rate": 3.409364314116074e-05, + "loss": 0.578, + "step": 12586 + }, + { + "epoch": 0.1674936029975474, + "grad_norm": 0.9907404184341431, + "learning_rate": 3.401536249920559e-05, + "loss": 0.591, + "step": 12617 + }, + { + "epoch": 0.16790513519164457, + "grad_norm": 0.9442338943481445, + "learning_rate": 3.393698012010998e-05, + "loss": 0.5754, + "step": 12648 + }, + { + "epoch": 0.16831666738574175, + "grad_norm": 0.9101460576057434, + "learning_rate": 3.385849688840839e-05, + "loss": 0.5815, + "step": 12679 + }, + { + "epoch": 0.1687281995798389, + "grad_norm": 1.029105544090271, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.5766, + "step": 12710 + }, + { + "epoch": 0.16913973177393607, + "grad_norm": 1.028542160987854, + "learning_rate": 3.370123141100578e-05, + "loss": 0.5769, + "step": 12741 + }, + { + "epoch": 0.16955126396803324, + "grad_norm": 0.9411474466323853, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.5808, + "step": 12772 + }, + { + "epoch": 0.16996279616213042, + "grad_norm": 0.9260859489440918, + "learning_rate": 3.35435731658559e-05, + "loss": 0.5683, + "step": 12803 + }, + { + "epoch": 0.17037432835622757, + "grad_norm": 0.9867289066314697, + "learning_rate": 3.346459897862552e-05, + "loss": 0.5781, + "step": 12834 + }, + { + "epoch": 0.17078586055032474, + "grad_norm": 0.9310885071754456, + "learning_rate": 3.338552926954613e-05, + "loss": 0.5876, + "step": 12865 + }, + { + "epoch": 0.17119739274442192, + "grad_norm": 1.1955841779708862, + "learning_rate": 3.330636493090868e-05, + "loss": 0.577, + "step": 12896 + }, + { + "epoch": 0.1716089249385191, + "grad_norm": 1.0715487003326416, + "learning_rate": 3.322710685607193e-05, + "loss": 0.5835, + "step": 12927 + }, + { + "epoch": 0.17202045713261624, + "grad_norm": 0.9181262850761414, + "learning_rate": 3.314775593945251e-05, + "loss": 0.5733, + "step": 12958 + }, + { + "epoch": 0.17243198932671341, + "grad_norm": 0.9595162868499756, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.5685, + "step": 12989 + }, + { + "epoch": 0.1728435215208106, + "grad_norm": 1.057614803314209, + "learning_rate": 3.298877916376047e-05, + "loss": 0.5736, + "step": 13020 + }, + { + "epoch": 0.17325505371490776, + "grad_norm": 0.9654355049133301, + "learning_rate": 3.290915509871915e-05, + "loss": 0.5782, + "step": 13051 + }, + { + "epoch": 0.1736665859090049, + "grad_norm": 1.0919837951660156, + "learning_rate": 3.282944177993753e-05, + "loss": 0.5771, + "step": 13082 + }, + { + "epoch": 0.1740781181031021, + "grad_norm": 1.064331293106079, + "learning_rate": 3.274964010696957e-05, + "loss": 0.5731, + "step": 13113 + }, + { + "epoch": 0.17448965029719926, + "grad_norm": 0.8971393704414368, + "learning_rate": 3.266975098036629e-05, + "loss": 0.5855, + "step": 13144 + }, + { + "epoch": 0.17490118249129644, + "grad_norm": 0.9515472054481506, + "learning_rate": 3.258977530166562e-05, + "loss": 0.5776, + "step": 13175 + }, + { + "epoch": 0.17531271468539358, + "grad_norm": 0.9811455607414246, + "learning_rate": 3.250971397338227e-05, + "loss": 0.5817, + "step": 13206 + }, + { + "epoch": 0.17572424687949076, + "grad_norm": 0.9187535643577576, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.5751, + "step": 13237 + }, + { + "epoch": 0.17613577907358793, + "grad_norm": 0.9396876096725464, + "learning_rate": 3.234933798294859e-05, + "loss": 0.5744, + "step": 13268 + }, + { + "epoch": 0.1765473112676851, + "grad_norm": 1.064921259880066, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.5731, + "step": 13299 + }, + { + "epoch": 0.17695884346178226, + "grad_norm": 0.946753978729248, + "learning_rate": 3.218863024832985e-05, + "loss": 0.5824, + "step": 13330 + }, + { + "epoch": 0.17737037565587943, + "grad_norm": 0.8623449206352234, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.5764, + "step": 13361 + }, + { + "epoch": 0.1777819078499766, + "grad_norm": 0.9526584148406982, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.5696, + "step": 13392 + }, + { + "epoch": 0.17819344004407378, + "grad_norm": 0.9413353800773621, + "learning_rate": 3.194696249871729e-05, + "loss": 0.5688, + "step": 13423 + }, + { + "epoch": 0.17860497223817093, + "grad_norm": 0.9612322449684143, + "learning_rate": 3.186624857814164e-05, + "loss": 0.5694, + "step": 13454 + }, + { + "epoch": 0.1790165044322681, + "grad_norm": 1.0289851427078247, + "learning_rate": 3.178545717288401e-05, + "loss": 0.5799, + "step": 13485 + }, + { + "epoch": 0.17942803662636528, + "grad_norm": 0.9124505519866943, + "learning_rate": 3.170458919466444e-05, + "loss": 0.5729, + "step": 13516 + }, + { + "epoch": 0.17983956882046245, + "grad_norm": 0.9874391555786133, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.5761, + "step": 13547 + }, + { + "epoch": 0.1802511010145596, + "grad_norm": 1.0058220624923706, + "learning_rate": 3.154262717052985e-05, + "loss": 0.58, + "step": 13578 + }, + { + "epoch": 0.18066263320865678, + "grad_norm": 0.9753491282463074, + "learning_rate": 3.146153495233426e-05, + "loss": 0.5749, + "step": 13609 + }, + { + "epoch": 0.18107416540275395, + "grad_norm": 1.0167535543441772, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.5848, + "step": 13640 + }, + { + "epoch": 0.18148569759685113, + "grad_norm": 0.9970638155937195, + "learning_rate": 3.129913267924946e-05, + "loss": 0.5743, + "step": 13671 + }, + { + "epoch": 0.18189722979094827, + "grad_norm": 1.0033198595046997, + "learning_rate": 3.121782445704782e-05, + "loss": 0.5747, + "step": 13702 + }, + { + "epoch": 0.18230876198504545, + "grad_norm": 1.0107535123825073, + "learning_rate": 3.11364460675423e-05, + "loss": 0.5725, + "step": 13733 + }, + { + "epoch": 0.18272029417914262, + "grad_norm": 0.9341425895690918, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.5726, + "step": 13764 + }, + { + "epoch": 0.1831318263732398, + "grad_norm": 1.0088342428207397, + "learning_rate": 3.097348246077728e-05, + "loss": 0.5737, + "step": 13795 + }, + { + "epoch": 0.18354335856733694, + "grad_norm": 0.8871366381645203, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.576, + "step": 13826 + }, + { + "epoch": 0.18395489076143412, + "grad_norm": 0.971552848815918, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.57, + "step": 13857 + }, + { + "epoch": 0.1843664229555313, + "grad_norm": 1.1167266368865967, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.5652, + "step": 13888 + }, + { + "epoch": 0.18477795514962847, + "grad_norm": 0.9768183827400208, + "learning_rate": 3.064675369851637e-05, + "loss": 0.5671, + "step": 13919 + }, + { + "epoch": 0.18518948734372562, + "grad_norm": 1.0097460746765137, + "learning_rate": 3.056490989455289e-05, + "loss": 0.5771, + "step": 13950 + }, + { + "epoch": 0.1856010195378228, + "grad_norm": 1.010986089706421, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.5723, + "step": 13981 + }, + { + "epoch": 0.18601255173191997, + "grad_norm": 0.8861789107322693, + "learning_rate": 3.040103481317539e-05, + "loss": 0.5798, + "step": 14012 + }, + { + "epoch": 0.18642408392601714, + "grad_norm": 0.9811322689056396, + "learning_rate": 3.03190053850694e-05, + "loss": 0.5703, + "step": 14043 + }, + { + "epoch": 0.1868356161201143, + "grad_norm": 0.9183774590492249, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.5655, + "step": 14074 + }, + { + "epoch": 0.18724714831421146, + "grad_norm": 0.9655874371528625, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.57, + "step": 14105 + }, + { + "epoch": 0.18765868050830864, + "grad_norm": 0.9735666513442993, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.5748, + "step": 14136 + }, + { + "epoch": 0.18807021270240581, + "grad_norm": 0.9084784388542175, + "learning_rate": 2.999029669712431e-05, + "loss": 0.568, + "step": 14167 + }, + { + "epoch": 0.18848174489650296, + "grad_norm": 0.8866302371025085, + "learning_rate": 2.990797641805408e-05, + "loss": 0.5768, + "step": 14198 + }, + { + "epoch": 0.18889327709060014, + "grad_norm": 0.9936773180961609, + "learning_rate": 2.982560075313704e-05, + "loss": 0.5701, + "step": 14229 + }, + { + "epoch": 0.1893048092846973, + "grad_norm": 0.9721146821975708, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.5638, + "step": 14260 + }, + { + "epoch": 0.1897163414787945, + "grad_norm": 0.921503484249115, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.5645, + "step": 14291 + }, + { + "epoch": 0.19012787367289163, + "grad_norm": 0.9194900989532471, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.5724, + "step": 14322 + }, + { + "epoch": 0.1905394058669888, + "grad_norm": 0.9940860271453857, + "learning_rate": 2.949556283611942e-05, + "loss": 0.5731, + "step": 14353 + }, + { + "epoch": 0.19095093806108598, + "grad_norm": 0.9163900017738342, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.5761, + "step": 14384 + }, + { + "epoch": 0.19136247025518316, + "grad_norm": 0.960118293762207, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.572, + "step": 14415 + }, + { + "epoch": 0.1917740024492803, + "grad_norm": 0.9942934513092041, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.5606, + "step": 14446 + }, + { + "epoch": 0.19218553464337748, + "grad_norm": 0.9903921484947205, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.5704, + "step": 14477 + }, + { + "epoch": 0.19259706683747466, + "grad_norm": 0.9734652638435364, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.5768, + "step": 14508 + }, + { + "epoch": 0.19300859903157183, + "grad_norm": 0.9251719117164612, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.565, + "step": 14539 + }, + { + "epoch": 0.19342013122566898, + "grad_norm": 0.9734142422676086, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.5702, + "step": 14570 + }, + { + "epoch": 0.19383166341976615, + "grad_norm": 0.9692136645317078, + "learning_rate": 2.883311164593017e-05, + "loss": 0.5581, + "step": 14601 + }, + { + "epoch": 0.19424319561386333, + "grad_norm": 0.9654501676559448, + "learning_rate": 2.875010077160754e-05, + "loss": 0.5667, + "step": 14632 + }, + { + "epoch": 0.1946547278079605, + "grad_norm": 1.0234603881835938, + "learning_rate": 2.866704757790741e-05, + "loss": 0.5688, + "step": 14663 + }, + { + "epoch": 0.19506626000205765, + "grad_norm": 0.9619661569595337, + "learning_rate": 2.858395300207376e-05, + "loss": 0.5713, + "step": 14694 + }, + { + "epoch": 0.19547779219615483, + "grad_norm": 1.0468534231185913, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.5702, + "step": 14725 + }, + { + "epoch": 0.195889324390252, + "grad_norm": 0.9849717617034912, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.5695, + "step": 14756 + }, + { + "epoch": 0.19630085658434918, + "grad_norm": 0.8887643218040466, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.5688, + "step": 14787 + }, + { + "epoch": 0.19671238877844632, + "grad_norm": 0.9583494067192078, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.5775, + "step": 14818 + }, + { + "epoch": 0.1971239209725435, + "grad_norm": 0.9766852259635925, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.5688, + "step": 14849 + }, + { + "epoch": 0.19753545316664067, + "grad_norm": 0.9450570940971375, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.5773, + "step": 14880 + }, + { + "epoch": 0.19794698536073785, + "grad_norm": 0.9476996064186096, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.5688, + "step": 14911 + }, + { + "epoch": 0.198358517554835, + "grad_norm": 1.0239835977554321, + "learning_rate": 2.791781925709473e-05, + "loss": 0.5717, + "step": 14942 + }, + { + "epoch": 0.19877004974893217, + "grad_norm": 0.9848045110702515, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.5691, + "step": 14973 + }, + { + "epoch": 0.19918158194302935, + "grad_norm": 1.0350494384765625, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.5808, + "step": 15004 + }, + { + "epoch": 0.19959311413712652, + "grad_norm": 0.8743448853492737, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.5694, + "step": 15035 + }, + { + "epoch": 0.20000464633122367, + "grad_norm": 0.9410389065742493, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.5739, + "step": 15066 + }, + { + "epoch": 0.20041617852532084, + "grad_norm": 1.0113860368728638, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.5689, + "step": 15097 + }, + { + "epoch": 0.20082771071941802, + "grad_norm": 0.998852550983429, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.5619, + "step": 15128 + }, + { + "epoch": 0.2012392429135152, + "grad_norm": 0.9299794435501099, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.5672, + "step": 15159 + }, + { + "epoch": 0.20165077510761234, + "grad_norm": 1.0109282732009888, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.5691, + "step": 15190 + }, + { + "epoch": 0.20206230730170952, + "grad_norm": 0.8805022835731506, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.5601, + "step": 15221 + }, + { + "epoch": 0.2024738394958067, + "grad_norm": 0.8976924419403076, + "learning_rate": 2.708224532974953e-05, + "loss": 0.5669, + "step": 15252 + }, + { + "epoch": 0.20288537168990387, + "grad_norm": 0.8639585375785828, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.5713, + "step": 15283 + }, + { + "epoch": 0.203296903884001, + "grad_norm": 0.9253800511360168, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.5634, + "step": 15314 + }, + { + "epoch": 0.2037084360780982, + "grad_norm": 0.8547073006629944, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.5705, + "step": 15345 + }, + { + "epoch": 0.20411996827219536, + "grad_norm": 0.8723642230033875, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.573, + "step": 15376 + }, + { + "epoch": 0.20453150046629254, + "grad_norm": 0.9164481163024902, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.5645, + "step": 15407 + }, + { + "epoch": 0.20494303266038968, + "grad_norm": 0.9538819193840027, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.5677, + "step": 15438 + }, + { + "epoch": 0.20535456485448686, + "grad_norm": 0.8995161652565002, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.5683, + "step": 15469 + }, + { + "epoch": 0.20576609704858403, + "grad_norm": 0.9026926755905151, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.5694, + "step": 15500 + }, + { + "epoch": 0.2061776292426812, + "grad_norm": 0.9095093011856079, + "learning_rate": 2.632819298478939e-05, + "loss": 0.5722, + "step": 15531 + }, + { + "epoch": 0.20658916143677836, + "grad_norm": 0.874626636505127, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.5652, + "step": 15562 + }, + { + "epoch": 0.20700069363087553, + "grad_norm": 1.0359785556793213, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.57, + "step": 15593 + }, + { + "epoch": 0.2074122258249727, + "grad_norm": 0.9145928621292114, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.5637, + "step": 15624 + }, + { + "epoch": 0.20782375801906988, + "grad_norm": 1.020246982574463, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.5619, + "step": 15655 + }, + { + "epoch": 0.20823529021316703, + "grad_norm": 0.8766633868217468, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.5631, + "step": 15686 + }, + { + "epoch": 0.2086468224072642, + "grad_norm": 0.9841639399528503, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.5598, + "step": 15717 + }, + { + "epoch": 0.20905835460136138, + "grad_norm": 0.8983998894691467, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.5723, + "step": 15748 + }, + { + "epoch": 0.20946988679545855, + "grad_norm": 0.8868324756622314, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.5801, + "step": 15779 + }, + { + "epoch": 0.2098814189895557, + "grad_norm": 0.9000539183616638, + "learning_rate": 2.557292666450159e-05, + "loss": 0.5672, + "step": 15810 + }, + { + "epoch": 0.21029295118365288, + "grad_norm": 0.9193928837776184, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.5658, + "step": 15841 + }, + { + "epoch": 0.21070448337775005, + "grad_norm": 0.9424473643302917, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.5685, + "step": 15872 + }, + { + "epoch": 0.21111601557184723, + "grad_norm": 0.9552715420722961, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.5725, + "step": 15903 + }, + { + "epoch": 0.21152754776594437, + "grad_norm": 0.8888420462608337, + "learning_rate": 2.52370435981567e-05, + "loss": 0.5649, + "step": 15934 + }, + { + "epoch": 0.21193907996004155, + "grad_norm": 0.906830370426178, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.5692, + "step": 15965 + }, + { + "epoch": 0.21235061215413872, + "grad_norm": 0.8939186334609985, + "learning_rate": 2.506908222441045e-05, + "loss": 0.5665, + "step": 15996 + }, + { + "epoch": 0.2127621443482359, + "grad_norm": 1.0149410963058472, + "learning_rate": 2.498509989430187e-05, + "loss": 0.5632, + "step": 16027 + }, + { + "epoch": 0.21317367654233305, + "grad_norm": 0.963056206703186, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.5656, + "step": 16058 + }, + { + "epoch": 0.21358520873643022, + "grad_norm": 0.8071532249450684, + "learning_rate": 2.481713668624899e-05, + "loss": 0.5667, + "step": 16089 + }, + { + "epoch": 0.2139967409305274, + "grad_norm": 0.9192640781402588, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.5709, + "step": 16120 + }, + { + "epoch": 0.21440827312462457, + "grad_norm": 0.84633868932724, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.5658, + "step": 16151 + }, + { + "epoch": 0.21481980531872172, + "grad_norm": 0.8883370757102966, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.5683, + "step": 16182 + }, + { + "epoch": 0.2152313375128189, + "grad_norm": 0.8919095396995544, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.568, + "step": 16213 + }, + { + "epoch": 0.21564286970691607, + "grad_norm": 0.9360633492469788, + "learning_rate": 2.439728136286796e-05, + "loss": 0.565, + "step": 16244 + }, + { + "epoch": 0.21605440190101324, + "grad_norm": 0.9496976733207703, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.5627, + "step": 16275 + }, + { + "epoch": 0.2164659340951104, + "grad_norm": 0.9771477580070496, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.5614, + "step": 16306 + }, + { + "epoch": 0.21687746628920757, + "grad_norm": 0.931249737739563, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.5649, + "step": 16337 + }, + { + "epoch": 0.21728899848330474, + "grad_norm": 0.9592285752296448, + "learning_rate": 2.406151384602059e-05, + "loss": 0.5646, + "step": 16368 + }, + { + "epoch": 0.21770053067740192, + "grad_norm": 0.9159988164901733, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.5672, + "step": 16399 + }, + { + "epoch": 0.21811206287149906, + "grad_norm": 0.97376549243927, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.5607, + "step": 16430 + }, + { + "epoch": 0.21852359506559624, + "grad_norm": 0.8469638824462891, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.5597, + "step": 16461 + }, + { + "epoch": 0.2189351272596934, + "grad_norm": 1.030610203742981, + "learning_rate": 2.372591577780202e-05, + "loss": 0.5625, + "step": 16492 + }, + { + "epoch": 0.2193466594537906, + "grad_norm": 0.9524822235107422, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.5621, + "step": 16523 + }, + { + "epoch": 0.21975819164788774, + "grad_norm": 0.9608604311943054, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.5672, + "step": 16554 + }, + { + "epoch": 0.2201697238419849, + "grad_norm": 0.9253712296485901, + "learning_rate": 2.347436487983929e-05, + "loss": 0.5609, + "step": 16585 + }, + { + "epoch": 0.22058125603608209, + "grad_norm": 0.923402726650238, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.5654, + "step": 16616 + }, + { + "epoch": 0.22099278823017926, + "grad_norm": 1.0030287504196167, + "learning_rate": 2.330674878704035e-05, + "loss": 0.5576, + "step": 16647 + }, + { + "epoch": 0.2214043204242764, + "grad_norm": 0.9106613993644714, + "learning_rate": 2.322296892997561e-05, + "loss": 0.5653, + "step": 16678 + }, + { + "epoch": 0.22181585261837358, + "grad_norm": 1.0058101415634155, + "learning_rate": 2.313920912646497e-05, + "loss": 0.5749, + "step": 16709 + }, + { + "epoch": 0.22222738481247076, + "grad_norm": 0.931086540222168, + "learning_rate": 2.305547032172643e-05, + "loss": 0.5637, + "step": 16740 + }, + { + "epoch": 0.22263891700656793, + "grad_norm": 0.9743716716766357, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.5718, + "step": 16771 + }, + { + "epoch": 0.22305044920066508, + "grad_norm": 0.8751611709594727, + "learning_rate": 2.288805948824212e-05, + "loss": 0.5581, + "step": 16802 + }, + { + "epoch": 0.22346198139476225, + "grad_norm": 0.867038905620575, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.5664, + "step": 16833 + }, + { + "epoch": 0.22387351358885943, + "grad_norm": 0.8663344383239746, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.559, + "step": 16864 + }, + { + "epoch": 0.2242850457829566, + "grad_norm": 0.984854519367218, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.5626, + "step": 16895 + }, + { + "epoch": 0.22469657797705375, + "grad_norm": 0.9031103849411011, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.5666, + "step": 16926 + }, + { + "epoch": 0.22510811017115093, + "grad_norm": 0.8782587647438049, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.5501, + "step": 16957 + }, + { + "epoch": 0.2255196423652481, + "grad_norm": 1.0644887685775757, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.5604, + "step": 16988 + }, + { + "epoch": 0.22593117455934528, + "grad_norm": 0.8691216111183167, + "learning_rate": 2.230292185905114e-05, + "loss": 0.5649, + "step": 17019 + }, + { + "epoch": 0.22634270675344242, + "grad_norm": 0.9518167972564697, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.5598, + "step": 17050 + }, + { + "epoch": 0.2267542389475396, + "grad_norm": 0.889673113822937, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.5626, + "step": 17081 + }, + { + "epoch": 0.22716577114163677, + "grad_norm": 0.9073772430419922, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.5607, + "step": 17112 + }, + { + "epoch": 0.22757730333573395, + "grad_norm": 0.9674621820449829, + "learning_rate": 2.196920634473666e-05, + "loss": 0.5651, + "step": 17143 + }, + { + "epoch": 0.2279888355298311, + "grad_norm": 0.8547524809837341, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.5636, + "step": 17174 + }, + { + "epoch": 0.22840036772392827, + "grad_norm": 1.00649893283844, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.5597, + "step": 17205 + }, + { + "epoch": 0.22881189991802545, + "grad_norm": 0.9329107999801636, + "learning_rate": 2.171927553439363e-05, + "loss": 0.5705, + "step": 17236 + }, + { + "epoch": 0.22922343211212262, + "grad_norm": 1.0364869832992554, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.56, + "step": 17267 + }, + { + "epoch": 0.22963496430621977, + "grad_norm": 0.898383617401123, + "learning_rate": 2.155283853988844e-05, + "loss": 0.5634, + "step": 17298 + }, + { + "epoch": 0.23004649650031694, + "grad_norm": 0.903266429901123, + "learning_rate": 2.146967792431106e-05, + "loss": 0.5692, + "step": 17329 + }, + { + "epoch": 0.23045802869441412, + "grad_norm": 0.835216224193573, + "learning_rate": 2.138655714793849e-05, + "loss": 0.5619, + "step": 17360 + }, + { + "epoch": 0.2308695608885113, + "grad_norm": 0.9033771753311157, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.5661, + "step": 17391 + }, + { + "epoch": 0.23128109308260844, + "grad_norm": 0.8425393104553223, + "learning_rate": 2.122043886437421e-05, + "loss": 0.5607, + "step": 17422 + }, + { + "epoch": 0.23169262527670562, + "grad_norm": 0.8765662908554077, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.5654, + "step": 17453 + }, + { + "epoch": 0.2321041574708028, + "grad_norm": 0.8663944602012634, + "learning_rate": 2.105449118766347e-05, + "loss": 0.5644, + "step": 17484 + }, + { + "epoch": 0.23251568966489997, + "grad_norm": 0.9837983846664429, + "learning_rate": 2.097158366805287e-05, + "loss": 0.5665, + "step": 17515 + }, + { + "epoch": 0.2329272218589971, + "grad_norm": 0.9082325100898743, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.5671, + "step": 17546 + }, + { + "epoch": 0.2333387540530943, + "grad_norm": 0.9680993556976318, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.5602, + "step": 17577 + }, + { + "epoch": 0.23375028624719146, + "grad_norm": 0.9881089925765991, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.5682, + "step": 17608 + }, + { + "epoch": 0.23416181844128864, + "grad_norm": 0.8630657196044922, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.5646, + "step": 17639 + }, + { + "epoch": 0.23457335063538579, + "grad_norm": 0.8421202301979065, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.564, + "step": 17670 + }, + { + "epoch": 0.23498488282948296, + "grad_norm": 0.8951789736747742, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.5594, + "step": 17701 + }, + { + "epoch": 0.23539641502358014, + "grad_norm": 1.0024628639221191, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.5541, + "step": 17732 + }, + { + "epoch": 0.2358079472176773, + "grad_norm": 0.8807896971702576, + "learning_rate": 2.031003855589343e-05, + "loss": 0.5605, + "step": 17763 + }, + { + "epoch": 0.23621947941177446, + "grad_norm": 0.9723889827728271, + "learning_rate": 2.022757379528727e-05, + "loss": 0.559, + "step": 17794 + }, + { + "epoch": 0.23663101160587163, + "grad_norm": 0.9422227740287781, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.5594, + "step": 17825 + }, + { + "epoch": 0.2370425437999688, + "grad_norm": 0.9309141039848328, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.5621, + "step": 17856 + }, + { + "epoch": 0.23745407599406598, + "grad_norm": 0.8761610388755798, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.5665, + "step": 17887 + }, + { + "epoch": 0.23786560818816313, + "grad_norm": 0.8991973400115967, + "learning_rate": 1.989826261153015e-05, + "loss": 0.5726, + "step": 17918 + }, + { + "epoch": 0.2382771403822603, + "grad_norm": 0.8879802227020264, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.5598, + "step": 17949 + }, + { + "epoch": 0.23868867257635748, + "grad_norm": 0.9235663414001465, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.5637, + "step": 17980 + }, + { + "epoch": 0.23910020477045466, + "grad_norm": 0.9140569567680359, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.5571, + "step": 18011 + }, + { + "epoch": 0.2395117369645518, + "grad_norm": 0.933430016040802, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.561, + "step": 18042 + }, + { + "epoch": 0.23992326915864898, + "grad_norm": 0.838374674320221, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.5634, + "step": 18073 + }, + { + "epoch": 0.24033480135274615, + "grad_norm": 0.9295237064361572, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.5604, + "step": 18104 + }, + { + "epoch": 0.24074633354684333, + "grad_norm": 0.880237340927124, + "learning_rate": 1.932422022132275e-05, + "loss": 0.5609, + "step": 18135 + }, + { + "epoch": 0.24115786574094047, + "grad_norm": 0.9782423973083496, + "learning_rate": 1.924246297341414e-05, + "loss": 0.5479, + "step": 18166 + }, + { + "epoch": 0.24156939793503765, + "grad_norm": 0.97150719165802, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.5624, + "step": 18197 + }, + { + "epoch": 0.24198093012913482, + "grad_norm": 0.9634605050086975, + "learning_rate": 1.907914431855156e-05, + "loss": 0.5661, + "step": 18228 + }, + { + "epoch": 0.242392462323232, + "grad_norm": 0.8706396222114563, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.5582, + "step": 18259 + }, + { + "epoch": 0.24280399451732915, + "grad_norm": 0.9348079562187195, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.5661, + "step": 18290 + }, + { + "epoch": 0.24321552671142632, + "grad_norm": 0.8249440789222717, + "learning_rate": 1.883466975572098e-05, + "loss": 0.5589, + "step": 18321 + }, + { + "epoch": 0.2436270589055235, + "grad_norm": 0.9206597208976746, + "learning_rate": 1.8753316159214312e-05, + "loss": 0.5583, + "step": 18352 + }, + { + "epoch": 0.24403859109962067, + "grad_norm": 0.8377333879470825, + "learning_rate": 1.8672033055685766e-05, + "loss": 0.5571, + "step": 18383 + }, + { + "epoch": 0.24445012329371782, + "grad_norm": 0.9113277792930603, + "learning_rate": 1.8590821362404116e-05, + "loss": 0.5633, + "step": 18414 + }, + { + "epoch": 0.244861655487815, + "grad_norm": 0.9409834742546082, + "learning_rate": 1.8509681995832294e-05, + "loss": 0.5588, + "step": 18445 + }, + { + "epoch": 0.24527318768191217, + "grad_norm": 0.9693152904510498, + "learning_rate": 1.8428615871617004e-05, + "loss": 0.5621, + "step": 18476 + }, + { + "epoch": 0.24568471987600934, + "grad_norm": 0.9358701705932617, + "learning_rate": 1.8347623904578448e-05, + "loss": 0.5654, + "step": 18507 + }, + { + "epoch": 0.2460962520701065, + "grad_norm": 0.9669011831283569, + "learning_rate": 1.8266707008699975e-05, + "loss": 0.5605, + "step": 18538 + }, + { + "epoch": 0.24650778426420367, + "grad_norm": 0.9862536191940308, + "learning_rate": 1.818586609711774e-05, + "loss": 0.5625, + "step": 18569 + }, + { + "epoch": 0.24691931645830084, + "grad_norm": 1.069492220878601, + "learning_rate": 1.8105102082110462e-05, + "loss": 0.5632, + "step": 18600 + }, + { + "epoch": 0.24733084865239802, + "grad_norm": 0.9141196608543396, + "learning_rate": 1.8024415875089058e-05, + "loss": 0.5599, + "step": 18631 + }, + { + "epoch": 0.24774238084649516, + "grad_norm": 0.8525174856185913, + "learning_rate": 1.7943808386586407e-05, + "loss": 0.5598, + "step": 18662 + }, + { + "epoch": 0.24815391304059234, + "grad_norm": 0.9469859600067139, + "learning_rate": 1.7863280526247073e-05, + "loss": 0.5554, + "step": 18693 + }, + { + "epoch": 0.2485654452346895, + "grad_norm": 0.9280170202255249, + "learning_rate": 1.7782833202817003e-05, + "loss": 0.5626, + "step": 18724 + }, + { + "epoch": 0.2489769774287867, + "grad_norm": 0.868431806564331, + "learning_rate": 1.7702467324133327e-05, + "loss": 0.5636, + "step": 18755 + }, + { + "epoch": 0.24938850962288384, + "grad_norm": 0.9638091921806335, + "learning_rate": 1.7622183797114042e-05, + "loss": 0.5561, + "step": 18786 + }, + { + "epoch": 0.249800041816981, + "grad_norm": 0.9236721396446228, + "learning_rate": 1.7541983527747838e-05, + "loss": 0.5645, + "step": 18817 + }, + { + "epoch": 0.2502115740110782, + "grad_norm": 0.8757562041282654, + "learning_rate": 1.746186742108387e-05, + "loss": 0.5549, + "step": 18848 + }, + { + "epoch": 0.25062310620517536, + "grad_norm": 0.9709885120391846, + "learning_rate": 1.73818363812215e-05, + "loss": 0.5613, + "step": 18879 + }, + { + "epoch": 0.25103463839927254, + "grad_norm": 0.9142551422119141, + "learning_rate": 1.7301891311300153e-05, + "loss": 0.5452, + "step": 18910 + }, + { + "epoch": 0.2514461705933697, + "grad_norm": 1.4749113321304321, + "learning_rate": 1.7222033113489055e-05, + "loss": 0.556, + "step": 18941 + }, + { + "epoch": 0.25185770278746683, + "grad_norm": 0.8948887586593628, + "learning_rate": 1.7142262688977127e-05, + "loss": 0.5551, + "step": 18972 + }, + { + "epoch": 0.252269234981564, + "grad_norm": 0.8812825679779053, + "learning_rate": 1.7062580937962764e-05, + "loss": 0.5549, + "step": 19003 + }, + { + "epoch": 0.2526807671756612, + "grad_norm": 0.8759215474128723, + "learning_rate": 1.698298875964369e-05, + "loss": 0.5517, + "step": 19034 + }, + { + "epoch": 0.25309229936975836, + "grad_norm": 0.8355596661567688, + "learning_rate": 1.690348705220684e-05, + "loss": 0.5651, + "step": 19065 + }, + { + "epoch": 0.25350383156385553, + "grad_norm": 0.9597409963607788, + "learning_rate": 1.6824076712818156e-05, + "loss": 0.5636, + "step": 19096 + }, + { + "epoch": 0.2539153637579527, + "grad_norm": 0.9418185949325562, + "learning_rate": 1.6744758637612533e-05, + "loss": 0.5551, + "step": 19127 + }, + { + "epoch": 0.2543268959520499, + "grad_norm": 0.9069491028785706, + "learning_rate": 1.6665533721683664e-05, + "loss": 0.5628, + "step": 19158 + }, + { + "epoch": 0.25473842814614706, + "grad_norm": 0.8908203840255737, + "learning_rate": 1.6586402859073974e-05, + "loss": 0.561, + "step": 19189 + }, + { + "epoch": 0.2551499603402442, + "grad_norm": 0.8831518888473511, + "learning_rate": 1.6507366942764463e-05, + "loss": 0.5595, + "step": 19220 + }, + { + "epoch": 0.25556149253434135, + "grad_norm": 1.0363459587097168, + "learning_rate": 1.6428426864664732e-05, + "loss": 0.5555, + "step": 19251 + }, + { + "epoch": 0.2559730247284385, + "grad_norm": 0.8746747970581055, + "learning_rate": 1.6349583515602816e-05, + "loss": 0.5581, + "step": 19282 + }, + { + "epoch": 0.2563845569225357, + "grad_norm": 0.7980934381484985, + "learning_rate": 1.6270837785315208e-05, + "loss": 0.5537, + "step": 19313 + }, + { + "epoch": 0.2567960891166329, + "grad_norm": 0.851966142654419, + "learning_rate": 1.619219056243676e-05, + "loss": 0.5513, + "step": 19344 + }, + { + "epoch": 0.25720762131073005, + "grad_norm": 0.9124501347541809, + "learning_rate": 1.6113642734490698e-05, + "loss": 0.5549, + "step": 19375 + }, + { + "epoch": 0.2576191535048272, + "grad_norm": 1.0416783094406128, + "learning_rate": 1.6035195187878577e-05, + "loss": 0.5596, + "step": 19406 + }, + { + "epoch": 0.2580306856989244, + "grad_norm": 0.9024292826652527, + "learning_rate": 1.5956848807870305e-05, + "loss": 0.5506, + "step": 19437 + }, + { + "epoch": 0.2584422178930215, + "grad_norm": 0.9234741926193237, + "learning_rate": 1.587860447859413e-05, + "loss": 0.5549, + "step": 19468 + }, + { + "epoch": 0.2588537500871187, + "grad_norm": 0.8676049113273621, + "learning_rate": 1.5800463083026686e-05, + "loss": 0.5563, + "step": 19499 + }, + { + "epoch": 0.25926528228121587, + "grad_norm": 0.9481212496757507, + "learning_rate": 1.572242550298298e-05, + "loss": 0.5578, + "step": 19530 + }, + { + "epoch": 0.25967681447531304, + "grad_norm": 0.8709908723831177, + "learning_rate": 1.56444926191065e-05, + "loss": 0.5568, + "step": 19561 + }, + { + "epoch": 0.2600883466694102, + "grad_norm": 0.938412606716156, + "learning_rate": 1.5566665310859257e-05, + "loss": 0.5568, + "step": 19592 + }, + { + "epoch": 0.2604998788635074, + "grad_norm": 0.8912078142166138, + "learning_rate": 1.5488944456511846e-05, + "loss": 0.552, + "step": 19623 + }, + { + "epoch": 0.26091141105760457, + "grad_norm": 1.1832647323608398, + "learning_rate": 1.5411330933133546e-05, + "loss": 0.5581, + "step": 19654 + }, + { + "epoch": 0.26132294325170174, + "grad_norm": 0.9237463474273682, + "learning_rate": 1.533382561658241e-05, + "loss": 0.5592, + "step": 19685 + }, + { + "epoch": 0.26173447544579886, + "grad_norm": 0.878738522529602, + "learning_rate": 1.525642938149541e-05, + "loss": 0.5557, + "step": 19716 + }, + { + "epoch": 0.26214600763989604, + "grad_norm": 0.9652629494667053, + "learning_rate": 1.5179143101278536e-05, + "loss": 0.5542, + "step": 19747 + }, + { + "epoch": 0.2625575398339932, + "grad_norm": 0.9157405495643616, + "learning_rate": 1.5101967648096955e-05, + "loss": 0.5609, + "step": 19778 + }, + { + "epoch": 0.2629690720280904, + "grad_norm": 0.840957760810852, + "learning_rate": 1.5024903892865172e-05, + "loss": 0.5521, + "step": 19809 + }, + { + "epoch": 0.26338060422218756, + "grad_norm": 0.8824605941772461, + "learning_rate": 1.4947952705237184e-05, + "loss": 0.5539, + "step": 19840 + }, + { + "epoch": 0.26379213641628474, + "grad_norm": 0.9319818615913391, + "learning_rate": 1.4871114953596682e-05, + "loss": 0.559, + "step": 19871 + }, + { + "epoch": 0.2642036686103819, + "grad_norm": 0.8822436332702637, + "learning_rate": 1.4794391505047256e-05, + "loss": 0.5565, + "step": 19902 + }, + { + "epoch": 0.2646152008044791, + "grad_norm": 0.8802869915962219, + "learning_rate": 1.4717783225402596e-05, + "loss": 0.5603, + "step": 19933 + }, + { + "epoch": 0.2650267329985762, + "grad_norm": 0.913989245891571, + "learning_rate": 1.4641290979176735e-05, + "loss": 0.5519, + "step": 19964 + }, + { + "epoch": 0.2654382651926734, + "grad_norm": 0.8885793089866638, + "learning_rate": 1.4564915629574246e-05, + "loss": 0.5567, + "step": 19995 + }, + { + "epoch": 0.26584979738677056, + "grad_norm": 0.8809658885002136, + "learning_rate": 1.4488658038480601e-05, + "loss": 0.5516, + "step": 20026 + }, + { + "epoch": 0.26626132958086773, + "grad_norm": 0.9053296446800232, + "learning_rate": 1.4412519066452323e-05, + "loss": 0.5573, + "step": 20057 + }, + { + "epoch": 0.2666728617749649, + "grad_norm": 0.8977755904197693, + "learning_rate": 1.4336499572707373e-05, + "loss": 0.5496, + "step": 20088 + }, + { + "epoch": 0.2670843939690621, + "grad_norm": 0.935563325881958, + "learning_rate": 1.4260600415115433e-05, + "loss": 0.558, + "step": 20119 + }, + { + "epoch": 0.26749592616315926, + "grad_norm": 1.0321307182312012, + "learning_rate": 1.4184822450188137e-05, + "loss": 0.557, + "step": 20150 + }, + { + "epoch": 0.26790745835725643, + "grad_norm": 0.8926151990890503, + "learning_rate": 1.410916653306954e-05, + "loss": 0.556, + "step": 20181 + }, + { + "epoch": 0.26831899055135355, + "grad_norm": 0.9870996475219727, + "learning_rate": 1.403363351752639e-05, + "loss": 0.5522, + "step": 20212 + }, + { + "epoch": 0.26873052274545073, + "grad_norm": 0.8782408237457275, + "learning_rate": 1.3958224255938485e-05, + "loss": 0.548, + "step": 20243 + }, + { + "epoch": 0.2691420549395479, + "grad_norm": 0.887537956237793, + "learning_rate": 1.388293959928911e-05, + "loss": 0.5548, + "step": 20274 + }, + { + "epoch": 0.2695535871336451, + "grad_norm": 0.9209414720535278, + "learning_rate": 1.3807780397155379e-05, + "loss": 0.5531, + "step": 20305 + }, + { + "epoch": 0.26996511932774225, + "grad_norm": 0.8398643732070923, + "learning_rate": 1.3732747497698655e-05, + "loss": 0.5582, + "step": 20336 + }, + { + "epoch": 0.27037665152183943, + "grad_norm": 0.9261983036994934, + "learning_rate": 1.3657841747655038e-05, + "loss": 0.5534, + "step": 20367 + }, + { + "epoch": 0.2707881837159366, + "grad_norm": 0.9387017488479614, + "learning_rate": 1.3583063992325706e-05, + "loss": 0.5536, + "step": 20398 + }, + { + "epoch": 0.2711997159100338, + "grad_norm": 0.9599831700325012, + "learning_rate": 1.3508415075567496e-05, + "loss": 0.552, + "step": 20429 + }, + { + "epoch": 0.2716112481041309, + "grad_norm": 0.8976027965545654, + "learning_rate": 1.343389583978327e-05, + "loss": 0.5582, + "step": 20460 + }, + { + "epoch": 0.2720227802982281, + "grad_norm": 0.890676736831665, + "learning_rate": 1.3359507125912468e-05, + "loss": 0.5597, + "step": 20491 + }, + { + "epoch": 0.27243431249232525, + "grad_norm": 0.8950179219245911, + "learning_rate": 1.3285249773421627e-05, + "loss": 0.5526, + "step": 20522 + }, + { + "epoch": 0.2728458446864224, + "grad_norm": 0.9863470792770386, + "learning_rate": 1.3211124620294884e-05, + "loss": 0.5497, + "step": 20553 + }, + { + "epoch": 0.2732573768805196, + "grad_norm": 0.9474931359291077, + "learning_rate": 1.313713250302451e-05, + "loss": 0.5506, + "step": 20584 + }, + { + "epoch": 0.2736689090746168, + "grad_norm": 0.9262164831161499, + "learning_rate": 1.3063274256601479e-05, + "loss": 0.5517, + "step": 20615 + }, + { + "epoch": 0.27408044126871395, + "grad_norm": 0.8490736484527588, + "learning_rate": 1.2989550714506086e-05, + "loss": 0.546, + "step": 20646 + }, + { + "epoch": 0.2744919734628111, + "grad_norm": 0.8594829440116882, + "learning_rate": 1.291596270869846e-05, + "loss": 0.554, + "step": 20677 + }, + { + "epoch": 0.27490350565690824, + "grad_norm": 0.8383352756500244, + "learning_rate": 1.284251106960927e-05, + "loss": 0.5513, + "step": 20708 + }, + { + "epoch": 0.2753150378510054, + "grad_norm": 0.8765247464179993, + "learning_rate": 1.2769196626130263e-05, + "loss": 0.5462, + "step": 20739 + }, + { + "epoch": 0.2757265700451026, + "grad_norm": 0.856604814529419, + "learning_rate": 1.2696020205604969e-05, + "loss": 0.562, + "step": 20770 + }, + { + "epoch": 0.27613810223919977, + "grad_norm": 0.8549590706825256, + "learning_rate": 1.2622982633819359e-05, + "loss": 0.5518, + "step": 20801 + }, + { + "epoch": 0.27654963443329694, + "grad_norm": 0.9898308515548706, + "learning_rate": 1.2550084734992484e-05, + "loss": 0.5499, + "step": 20832 + }, + { + "epoch": 0.2769611666273941, + "grad_norm": 0.8919757604598999, + "learning_rate": 1.247732733176724e-05, + "loss": 0.55, + "step": 20863 + }, + { + "epoch": 0.2773726988214913, + "grad_norm": 0.8670758008956909, + "learning_rate": 1.2404711245201044e-05, + "loss": 0.5549, + "step": 20894 + }, + { + "epoch": 0.27778423101558847, + "grad_norm": 0.822809636592865, + "learning_rate": 1.2332237294756535e-05, + "loss": 0.5445, + "step": 20925 + }, + { + "epoch": 0.2781957632096856, + "grad_norm": 0.8837505578994751, + "learning_rate": 1.225990629829241e-05, + "loss": 0.5543, + "step": 20956 + }, + { + "epoch": 0.27860729540378276, + "grad_norm": 0.8370216488838196, + "learning_rate": 1.2187719072054136e-05, + "loss": 0.5463, + "step": 20987 + }, + { + "epoch": 0.27901882759787994, + "grad_norm": 0.8596381545066833, + "learning_rate": 1.2115676430664735e-05, + "loss": 0.556, + "step": 21018 + }, + { + "epoch": 0.2794303597919771, + "grad_norm": 0.9435930848121643, + "learning_rate": 1.2043779187115647e-05, + "loss": 0.5539, + "step": 21049 + }, + { + "epoch": 0.2798418919860743, + "grad_norm": 0.8696517944335938, + "learning_rate": 1.1972028152757476e-05, + "loss": 0.5573, + "step": 21080 + }, + { + "epoch": 0.28025342418017146, + "grad_norm": 0.9277540445327759, + "learning_rate": 1.1900424137290889e-05, + "loss": 0.5557, + "step": 21111 + }, + { + "epoch": 0.28066495637426864, + "grad_norm": 0.8744814395904541, + "learning_rate": 1.1828967948757482e-05, + "loss": 0.5568, + "step": 21142 + }, + { + "epoch": 0.2810764885683658, + "grad_norm": 1.0164190530776978, + "learning_rate": 1.175766039353062e-05, + "loss": 0.5433, + "step": 21173 + }, + { + "epoch": 0.28148802076246293, + "grad_norm": 0.8906095623970032, + "learning_rate": 1.1686502276306382e-05, + "loss": 0.5493, + "step": 21204 + }, + { + "epoch": 0.2818995529565601, + "grad_norm": 0.8932943940162659, + "learning_rate": 1.1615494400094445e-05, + "loss": 0.551, + "step": 21235 + }, + { + "epoch": 0.2823110851506573, + "grad_norm": 0.9328072667121887, + "learning_rate": 1.1544637566209029e-05, + "loss": 0.5543, + "step": 21266 + }, + { + "epoch": 0.28272261734475446, + "grad_norm": 0.8685097694396973, + "learning_rate": 1.1473932574259886e-05, + "loss": 0.5618, + "step": 21297 + }, + { + "epoch": 0.28313414953885163, + "grad_norm": 0.8566640615463257, + "learning_rate": 1.1403380222143247e-05, + "loss": 0.5512, + "step": 21328 + }, + { + "epoch": 0.2835456817329488, + "grad_norm": 0.8968601226806641, + "learning_rate": 1.1332981306032808e-05, + "loss": 0.5429, + "step": 21359 + }, + { + "epoch": 0.283957213927046, + "grad_norm": 0.8937885761260986, + "learning_rate": 1.1262736620370762e-05, + "loss": 0.5522, + "step": 21390 + }, + { + "epoch": 0.28436874612114316, + "grad_norm": 0.9389865398406982, + "learning_rate": 1.1192646957858854e-05, + "loss": 0.5566, + "step": 21421 + }, + { + "epoch": 0.2847802783152403, + "grad_norm": 0.9788251519203186, + "learning_rate": 1.1122713109449381e-05, + "loss": 0.5532, + "step": 21452 + }, + { + "epoch": 0.28519181050933745, + "grad_norm": 0.8652181029319763, + "learning_rate": 1.105293586433634e-05, + "loss": 0.5524, + "step": 21483 + }, + { + "epoch": 0.2856033427034346, + "grad_norm": 0.9210936427116394, + "learning_rate": 1.0983316009946446e-05, + "loss": 0.5529, + "step": 21514 + }, + { + "epoch": 0.2860148748975318, + "grad_norm": 0.9828045964241028, + "learning_rate": 1.0913854331930282e-05, + "loss": 0.5513, + "step": 21545 + }, + { + "epoch": 0.286426407091629, + "grad_norm": 0.868962287902832, + "learning_rate": 1.0844551614153456e-05, + "loss": 0.5467, + "step": 21576 + }, + { + "epoch": 0.28683793928572615, + "grad_norm": 0.8329687714576721, + "learning_rate": 1.0775408638687725e-05, + "loss": 0.5542, + "step": 21607 + }, + { + "epoch": 0.2872494714798233, + "grad_norm": 0.7887142300605774, + "learning_rate": 1.0706426185802165e-05, + "loss": 0.5417, + "step": 21638 + }, + { + "epoch": 0.2876610036739205, + "grad_norm": 0.8512480854988098, + "learning_rate": 1.0637605033954371e-05, + "loss": 0.5538, + "step": 21669 + }, + { + "epoch": 0.2880725358680176, + "grad_norm": 0.9043695330619812, + "learning_rate": 1.05689459597817e-05, + "loss": 0.5494, + "step": 21700 + }, + { + "epoch": 0.2884840680621148, + "grad_norm": 0.9565821886062622, + "learning_rate": 1.050044973809246e-05, + "loss": 0.5489, + "step": 21731 + }, + { + "epoch": 0.28889560025621197, + "grad_norm": 0.8471581935882568, + "learning_rate": 1.043211714185722e-05, + "loss": 0.5512, + "step": 21762 + }, + { + "epoch": 0.28930713245030915, + "grad_norm": 0.9377114176750183, + "learning_rate": 1.036394894220003e-05, + "loss": 0.5526, + "step": 21793 + }, + { + "epoch": 0.2897186646444063, + "grad_norm": 0.9441999793052673, + "learning_rate": 1.0295945908389751e-05, + "loss": 0.5503, + "step": 21824 + }, + { + "epoch": 0.2901301968385035, + "grad_norm": 0.9086009860038757, + "learning_rate": 1.0228108807831393e-05, + "loss": 0.545, + "step": 21855 + }, + { + "epoch": 0.29054172903260067, + "grad_norm": 0.889674961566925, + "learning_rate": 1.01604384060574e-05, + "loss": 0.5569, + "step": 21886 + }, + { + "epoch": 0.29095326122669785, + "grad_norm": 0.8675930500030518, + "learning_rate": 1.009293546671907e-05, + "loss": 0.5547, + "step": 21917 + }, + { + "epoch": 0.29136479342079497, + "grad_norm": 0.8342081904411316, + "learning_rate": 1.002560075157791e-05, + "loss": 0.5461, + "step": 21948 + }, + { + "epoch": 0.29177632561489214, + "grad_norm": 0.9048583507537842, + "learning_rate": 9.958435020496995e-06, + "loss": 0.5457, + "step": 21979 + }, + { + "epoch": 0.2921878578089893, + "grad_norm": 0.9375602602958679, + "learning_rate": 9.89143903143249e-06, + "loss": 0.5568, + "step": 22010 + }, + { + "epoch": 0.2925993900030865, + "grad_norm": 0.8803778886795044, + "learning_rate": 9.824613540425038e-06, + "loss": 0.5503, + "step": 22041 + }, + { + "epoch": 0.29301092219718367, + "grad_norm": 0.8693305850028992, + "learning_rate": 9.757959301591197e-06, + "loss": 0.5485, + "step": 22072 + }, + { + "epoch": 0.29342245439128084, + "grad_norm": 0.8868476748466492, + "learning_rate": 9.691477067115017e-06, + "loss": 0.5588, + "step": 22103 + }, + { + "epoch": 0.293833986585378, + "grad_norm": 0.8572340607643127, + "learning_rate": 9.625167587239467e-06, + "loss": 0.5507, + "step": 22134 + }, + { + "epoch": 0.2942455187794752, + "grad_norm": 0.922905445098877, + "learning_rate": 9.559031610258007e-06, + "loss": 0.5478, + "step": 22165 + }, + { + "epoch": 0.2946570509735723, + "grad_norm": 0.8140031695365906, + "learning_rate": 9.493069882506164e-06, + "loss": 0.5467, + "step": 22196 + }, + { + "epoch": 0.2950685831676695, + "grad_norm": 0.8945645093917847, + "learning_rate": 9.427283148353056e-06, + "loss": 0.5478, + "step": 22227 + }, + { + "epoch": 0.29548011536176666, + "grad_norm": 0.8615440726280212, + "learning_rate": 9.361672150193052e-06, + "loss": 0.5522, + "step": 22258 + }, + { + "epoch": 0.29589164755586383, + "grad_norm": 0.8636476993560791, + "learning_rate": 9.29623762843734e-06, + "loss": 0.548, + "step": 22289 + }, + { + "epoch": 0.296303179749961, + "grad_norm": 0.842241108417511, + "learning_rate": 9.230980321505594e-06, + "loss": 0.5467, + "step": 22320 + }, + { + "epoch": 0.2967147119440582, + "grad_norm": 0.8380717039108276, + "learning_rate": 9.165900965817668e-06, + "loss": 0.5438, + "step": 22351 + }, + { + "epoch": 0.29712624413815536, + "grad_norm": 0.8693488240242004, + "learning_rate": 9.101000295785245e-06, + "loss": 0.5523, + "step": 22382 + }, + { + "epoch": 0.29753777633225253, + "grad_norm": 0.9079211354255676, + "learning_rate": 9.036279043803565e-06, + "loss": 0.5499, + "step": 22413 + }, + { + "epoch": 0.29794930852634965, + "grad_norm": 0.7630789875984192, + "learning_rate": 8.971737940243147e-06, + "loss": 0.5529, + "step": 22444 + }, + { + "epoch": 0.29836084072044683, + "grad_norm": 0.886512279510498, + "learning_rate": 8.907377713441592e-06, + "loss": 0.5463, + "step": 22475 + }, + { + "epoch": 0.298772372914544, + "grad_norm": 0.7641420364379883, + "learning_rate": 8.843199089695293e-06, + "loss": 0.5389, + "step": 22506 + }, + { + "epoch": 0.2991839051086412, + "grad_norm": 0.8912153244018555, + "learning_rate": 8.779202793251311e-06, + "loss": 0.5457, + "step": 22537 + }, + { + "epoch": 0.29959543730273835, + "grad_norm": 0.9104102849960327, + "learning_rate": 8.715389546299149e-06, + "loss": 0.5482, + "step": 22568 + }, + { + "epoch": 0.30000696949683553, + "grad_norm": 0.9206966161727905, + "learning_rate": 8.651760068962617e-06, + "loss": 0.5481, + "step": 22599 + }, + { + "epoch": 0.3004185016909327, + "grad_norm": 0.8885296583175659, + "learning_rate": 8.588315079291733e-06, + "loss": 0.5462, + "step": 22630 + }, + { + "epoch": 0.3008300338850299, + "grad_norm": 0.8395354747772217, + "learning_rate": 8.52505529325457e-06, + "loss": 0.5463, + "step": 22661 + }, + { + "epoch": 0.301241566079127, + "grad_norm": 0.9492244124412537, + "learning_rate": 8.461981424729216e-06, + "loss": 0.5515, + "step": 22692 + }, + { + "epoch": 0.3016530982732242, + "grad_norm": 0.8974335789680481, + "learning_rate": 8.399094185495725e-06, + "loss": 0.5512, + "step": 22723 + }, + { + "epoch": 0.30206463046732135, + "grad_norm": 0.8450007438659668, + "learning_rate": 8.336394285228017e-06, + "loss": 0.549, + "step": 22754 + }, + { + "epoch": 0.3024761626614185, + "grad_norm": 0.8842496275901794, + "learning_rate": 8.273882431485952e-06, + "loss": 0.5426, + "step": 22785 + }, + { + "epoch": 0.3028876948555157, + "grad_norm": 0.875068724155426, + "learning_rate": 8.211559329707316e-06, + "loss": 0.5478, + "step": 22816 + }, + { + "epoch": 0.3032992270496129, + "grad_norm": 0.8835846185684204, + "learning_rate": 8.149425683199823e-06, + "loss": 0.5521, + "step": 22847 + }, + { + "epoch": 0.30371075924371005, + "grad_norm": 0.9190506935119629, + "learning_rate": 8.08748219313325e-06, + "loss": 0.55, + "step": 22878 + }, + { + "epoch": 0.3041222914378072, + "grad_norm": 0.9006677269935608, + "learning_rate": 8.025729558531453e-06, + "loss": 0.5405, + "step": 22909 + }, + { + "epoch": 0.30453382363190434, + "grad_norm": 0.9302480220794678, + "learning_rate": 7.964168476264508e-06, + "loss": 0.5591, + "step": 22940 + }, + { + "epoch": 0.3049453558260015, + "grad_norm": 0.9019137024879456, + "learning_rate": 7.902799641040884e-06, + "loss": 0.5521, + "step": 22971 + }, + { + "epoch": 0.3053568880200987, + "grad_norm": 0.9111758470535278, + "learning_rate": 7.841623745399523e-06, + "loss": 0.5494, + "step": 23002 + }, + { + "epoch": 0.30576842021419587, + "grad_norm": 0.8428525924682617, + "learning_rate": 7.780641479702114e-06, + "loss": 0.5395, + "step": 23033 + }, + { + "epoch": 0.30617995240829304, + "grad_norm": 0.8785557150840759, + "learning_rate": 7.719853532125227e-06, + "loss": 0.5401, + "step": 23064 + }, + { + "epoch": 0.3065914846023902, + "grad_norm": 0.893214225769043, + "learning_rate": 7.65926058865258e-06, + "loss": 0.5503, + "step": 23095 + }, + { + "epoch": 0.3070030167964874, + "grad_norm": 0.9266390800476074, + "learning_rate": 7.598863333067313e-06, + "loss": 0.5415, + "step": 23126 + }, + { + "epoch": 0.30741454899058457, + "grad_norm": 0.839297354221344, + "learning_rate": 7.538662446944253e-06, + "loss": 0.5501, + "step": 23157 + }, + { + "epoch": 0.3078260811846817, + "grad_norm": 0.8542027473449707, + "learning_rate": 7.478658609642211e-06, + "loss": 0.5522, + "step": 23188 + }, + { + "epoch": 0.30823761337877886, + "grad_norm": 0.9187499284744263, + "learning_rate": 7.418852498296327e-06, + "loss": 0.5499, + "step": 23219 + }, + { + "epoch": 0.30864914557287604, + "grad_norm": 0.8777310252189636, + "learning_rate": 7.359244787810457e-06, + "loss": 0.5475, + "step": 23250 + }, + { + "epoch": 0.3090606777669732, + "grad_norm": 0.859711229801178, + "learning_rate": 7.299836150849493e-06, + "loss": 0.5487, + "step": 23281 + }, + { + "epoch": 0.3094722099610704, + "grad_norm": 0.8032732009887695, + "learning_rate": 7.240627257831847e-06, + "loss": 0.545, + "step": 23312 + }, + { + "epoch": 0.30988374215516756, + "grad_norm": 0.7653436064720154, + "learning_rate": 7.1816187769218195e-06, + "loss": 0.5536, + "step": 23343 + }, + { + "epoch": 0.31029527434926474, + "grad_norm": 0.8717504739761353, + "learning_rate": 7.1228113740220895e-06, + "loss": 0.5411, + "step": 23374 + }, + { + "epoch": 0.3107068065433619, + "grad_norm": 0.8277837634086609, + "learning_rate": 7.064205712766226e-06, + "loss": 0.5503, + "step": 23405 + }, + { + "epoch": 0.31111833873745903, + "grad_norm": 0.781703770160675, + "learning_rate": 7.005802454511129e-06, + "loss": 0.5464, + "step": 23436 + }, + { + "epoch": 0.3115298709315562, + "grad_norm": 0.8515232801437378, + "learning_rate": 6.947602258329639e-06, + "loss": 0.5556, + "step": 23467 + }, + { + "epoch": 0.3119414031256534, + "grad_norm": 0.9343826770782471, + "learning_rate": 6.889605781003078e-06, + "loss": 0.5514, + "step": 23498 + }, + { + "epoch": 0.31235293531975056, + "grad_norm": 0.9239291548728943, + "learning_rate": 6.831813677013776e-06, + "loss": 0.5414, + "step": 23529 + }, + { + "epoch": 0.31276446751384773, + "grad_norm": 0.8628037571907043, + "learning_rate": 6.774226598537792e-06, + "loss": 0.5486, + "step": 23560 + }, + { + "epoch": 0.3131759997079449, + "grad_norm": 0.8857805728912354, + "learning_rate": 6.716845195437482e-06, + "loss": 0.5426, + "step": 23591 + }, + { + "epoch": 0.3135875319020421, + "grad_norm": 0.8357077836990356, + "learning_rate": 6.659670115254168e-06, + "loss": 0.5494, + "step": 23622 + }, + { + "epoch": 0.31399906409613926, + "grad_norm": 0.8405023813247681, + "learning_rate": 6.602702003200872e-06, + "loss": 0.547, + "step": 23653 + }, + { + "epoch": 0.3144105962902364, + "grad_norm": 0.9647945165634155, + "learning_rate": 6.545941502154992e-06, + "loss": 0.5479, + "step": 23684 + }, + { + "epoch": 0.31482212848433355, + "grad_norm": 0.9345009326934814, + "learning_rate": 6.489389252651057e-06, + "loss": 0.542, + "step": 23715 + }, + { + "epoch": 0.3152336606784307, + "grad_norm": 0.9495857954025269, + "learning_rate": 6.4330458928735325e-06, + "loss": 0.547, + "step": 23746 + }, + { + "epoch": 0.3156451928725279, + "grad_norm": 0.888819694519043, + "learning_rate": 6.376912058649559e-06, + "loss": 0.5417, + "step": 23777 + }, + { + "epoch": 0.3160567250666251, + "grad_norm": 0.8969824910163879, + "learning_rate": 6.320988383441845e-06, + "loss": 0.5467, + "step": 23808 + }, + { + "epoch": 0.31646825726072225, + "grad_norm": 0.8562204241752625, + "learning_rate": 6.265275498341452e-06, + "loss": 0.5507, + "step": 23839 + }, + { + "epoch": 0.3168797894548194, + "grad_norm": 0.9174118638038635, + "learning_rate": 6.209774032060714e-06, + "loss": 0.5508, + "step": 23870 + }, + { + "epoch": 0.3172913216489166, + "grad_norm": 0.872319221496582, + "learning_rate": 6.1544846109261365e-06, + "loss": 0.5468, + "step": 23901 + }, + { + "epoch": 0.3177028538430137, + "grad_norm": 0.8186289668083191, + "learning_rate": 6.099407858871342e-06, + "loss": 0.5433, + "step": 23932 + }, + { + "epoch": 0.3181143860371109, + "grad_norm": 0.8317052125930786, + "learning_rate": 6.044544397429958e-06, + "loss": 0.555, + "step": 23963 + }, + { + "epoch": 0.31852591823120807, + "grad_norm": 0.8226687908172607, + "learning_rate": 5.989894845728708e-06, + "loss": 0.5444, + "step": 23994 + }, + { + "epoch": 0.31893745042530525, + "grad_norm": 0.9374111890792847, + "learning_rate": 5.9354598204803605e-06, + "loss": 0.5504, + "step": 24025 + }, + { + "epoch": 0.3193489826194024, + "grad_norm": 0.9114209413528442, + "learning_rate": 5.881239935976762e-06, + "loss": 0.5436, + "step": 24056 + }, + { + "epoch": 0.3197605148134996, + "grad_norm": 0.8481084704399109, + "learning_rate": 5.827235804081954e-06, + "loss": 0.5483, + "step": 24087 + }, + { + "epoch": 0.32017204700759677, + "grad_norm": 0.8786484599113464, + "learning_rate": 5.773448034225221e-06, + "loss": 0.5528, + "step": 24118 + }, + { + "epoch": 0.32058357920169395, + "grad_norm": 0.8430096507072449, + "learning_rate": 5.719877233394228e-06, + "loss": 0.5422, + "step": 24149 + }, + { + "epoch": 0.32099511139579107, + "grad_norm": 0.870892345905304, + "learning_rate": 5.666524006128191e-06, + "loss": 0.5465, + "step": 24180 + }, + { + "epoch": 0.32140664358988824, + "grad_norm": 0.924968957901001, + "learning_rate": 5.613388954511015e-06, + "loss": 0.5462, + "step": 24211 + }, + { + "epoch": 0.3218181757839854, + "grad_norm": 0.8519983887672424, + "learning_rate": 5.560472678164552e-06, + "loss": 0.5453, + "step": 24242 + }, + { + "epoch": 0.3222297079780826, + "grad_norm": 0.8353081345558167, + "learning_rate": 5.507775774241775e-06, + "loss": 0.5429, + "step": 24273 + }, + { + "epoch": 0.32264124017217977, + "grad_norm": 0.9054728746414185, + "learning_rate": 5.4552988374200945e-06, + "loss": 0.5519, + "step": 24304 + }, + { + "epoch": 0.32305277236627694, + "grad_norm": 0.9061859250068665, + "learning_rate": 5.403042459894597e-06, + "loss": 0.5458, + "step": 24335 + }, + { + "epoch": 0.3234643045603741, + "grad_norm": 0.7874587774276733, + "learning_rate": 5.3510072313714135e-06, + "loss": 0.5478, + "step": 24366 + }, + { + "epoch": 0.3238758367544713, + "grad_norm": 0.8362119793891907, + "learning_rate": 5.2991937390610205e-06, + "loss": 0.5465, + "step": 24397 + }, + { + "epoch": 0.3242873689485684, + "grad_norm": 0.8235510587692261, + "learning_rate": 5.247602567671625e-06, + "loss": 0.5473, + "step": 24428 + }, + { + "epoch": 0.3246989011426656, + "grad_norm": 0.8052466511726379, + "learning_rate": 5.196234299402603e-06, + "loss": 0.5463, + "step": 24459 + }, + { + "epoch": 0.32511043333676276, + "grad_norm": 0.8947206139564514, + "learning_rate": 5.145089513937865e-06, + "loss": 0.5472, + "step": 24490 + }, + { + "epoch": 0.32552196553085994, + "grad_norm": 0.8857723474502563, + "learning_rate": 5.094168788439369e-06, + "loss": 0.5536, + "step": 24521 + }, + { + "epoch": 0.3259334977249571, + "grad_norm": 0.8428781032562256, + "learning_rate": 5.043472697540594e-06, + "loss": 0.5385, + "step": 24552 + }, + { + "epoch": 0.3263450299190543, + "grad_norm": 0.8772375583648682, + "learning_rate": 4.993001813340012e-06, + "loss": 0.5472, + "step": 24583 + }, + { + "epoch": 0.32675656211315146, + "grad_norm": 0.8726216554641724, + "learning_rate": 4.942756705394702e-06, + "loss": 0.5444, + "step": 24614 + }, + { + "epoch": 0.32716809430724864, + "grad_norm": 0.8458380699157715, + "learning_rate": 4.892737940713884e-06, + "loss": 0.5467, + "step": 24645 + }, + { + "epoch": 0.32757962650134576, + "grad_norm": 0.8258609175682068, + "learning_rate": 4.842946083752511e-06, + "loss": 0.5415, + "step": 24676 + }, + { + "epoch": 0.32799115869544293, + "grad_norm": 0.8285905122756958, + "learning_rate": 4.79338169640493e-06, + "loss": 0.5359, + "step": 24707 + }, + { + "epoch": 0.3284026908895401, + "grad_norm": 0.8557586073875427, + "learning_rate": 4.74404533799851e-06, + "loss": 0.5479, + "step": 24738 + }, + { + "epoch": 0.3288142230836373, + "grad_norm": 0.9028350710868835, + "learning_rate": 4.694937565287344e-06, + "loss": 0.5525, + "step": 24769 + }, + { + "epoch": 0.32922575527773446, + "grad_norm": 0.8268290758132935, + "learning_rate": 4.646058932445985e-06, + "loss": 0.5437, + "step": 24800 + }, + { + "epoch": 0.32963728747183163, + "grad_norm": 0.8609195947647095, + "learning_rate": 4.597409991063148e-06, + "loss": 0.5467, + "step": 24831 + }, + { + "epoch": 0.3300488196659288, + "grad_norm": 0.8118563890457153, + "learning_rate": 4.5489912901355375e-06, + "loss": 0.5356, + "step": 24862 + }, + { + "epoch": 0.330460351860026, + "grad_norm": 0.8927356004714966, + "learning_rate": 4.500803376061608e-06, + "loss": 0.5473, + "step": 24893 + }, + { + "epoch": 0.3308718840541231, + "grad_norm": 0.8378262519836426, + "learning_rate": 4.45284679263541e-06, + "loss": 0.5441, + "step": 24924 + }, + { + "epoch": 0.3312834162482203, + "grad_norm": 0.8822687268257141, + "learning_rate": 4.4051220810404775e-06, + "loss": 0.5383, + "step": 24955 + }, + { + "epoch": 0.33169494844231745, + "grad_norm": 0.8355059623718262, + "learning_rate": 4.3576297798437025e-06, + "loss": 0.5503, + "step": 24986 + }, + { + "epoch": 0.3321064806364146, + "grad_norm": 0.8296234011650085, + "learning_rate": 4.3103704249892436e-06, + "loss": 0.5453, + "step": 25017 + }, + { + "epoch": 0.3325180128305118, + "grad_norm": 0.9190348982810974, + "learning_rate": 4.263344549792487e-06, + "loss": 0.5459, + "step": 25048 + }, + { + "epoch": 0.332929545024609, + "grad_norm": 0.8785108923912048, + "learning_rate": 4.216552684934056e-06, + "loss": 0.5501, + "step": 25079 + }, + { + "epoch": 0.33334107721870615, + "grad_norm": 0.7996092438697815, + "learning_rate": 4.169995358453777e-06, + "loss": 0.5502, + "step": 25110 + }, + { + "epoch": 0.3337526094128033, + "grad_norm": 0.8523440957069397, + "learning_rate": 4.123673095744757e-06, + "loss": 0.5525, + "step": 25141 + }, + { + "epoch": 0.33416414160690044, + "grad_norm": 0.8970717191696167, + "learning_rate": 4.077586419547435e-06, + "loss": 0.5426, + "step": 25172 + }, + { + "epoch": 0.3345756738009976, + "grad_norm": 0.8940107822418213, + "learning_rate": 4.03173584994368e-06, + "loss": 0.5452, + "step": 25203 + }, + { + "epoch": 0.3349872059950948, + "grad_norm": 0.9721015691757202, + "learning_rate": 3.986121904350948e-06, + "loss": 0.543, + "step": 25234 + }, + { + "epoch": 0.33539873818919197, + "grad_norm": 0.8175463676452637, + "learning_rate": 3.940745097516407e-06, + "loss": 0.5425, + "step": 25265 + }, + { + "epoch": 0.33581027038328914, + "grad_norm": 0.9015626311302185, + "learning_rate": 3.89560594151116e-06, + "loss": 0.548, + "step": 25296 + }, + { + "epoch": 0.3362218025773863, + "grad_norm": 0.8499545454978943, + "learning_rate": 3.850704945724456e-06, + "loss": 0.5455, + "step": 25327 + }, + { + "epoch": 0.3366333347714835, + "grad_norm": 0.8446874618530273, + "learning_rate": 3.8060426168579077e-06, + "loss": 0.548, + "step": 25358 + }, + { + "epoch": 0.33704486696558067, + "grad_norm": 0.905511736869812, + "learning_rate": 3.7616194589198407e-06, + "loss": 0.5489, + "step": 25389 + }, + { + "epoch": 0.3374563991596778, + "grad_norm": 0.8190325498580933, + "learning_rate": 3.7174359732195574e-06, + "loss": 0.5405, + "step": 25420 + }, + { + "epoch": 0.33786793135377496, + "grad_norm": 0.9458757042884827, + "learning_rate": 3.673492658361677e-06, + "loss": 0.5457, + "step": 25451 + }, + { + "epoch": 0.33827946354787214, + "grad_norm": 0.8517038226127625, + "learning_rate": 3.6297900102405467e-06, + "loss": 0.5434, + "step": 25482 + }, + { + "epoch": 0.3386909957419693, + "grad_norm": 0.8938561677932739, + "learning_rate": 3.586328522034607e-06, + "loss": 0.5418, + "step": 25513 + }, + { + "epoch": 0.3391025279360665, + "grad_norm": 0.9029353857040405, + "learning_rate": 3.543108684200838e-06, + "loss": 0.5453, + "step": 25544 + }, + { + "epoch": 0.33951406013016366, + "grad_norm": 0.8130074143409729, + "learning_rate": 3.5001309844692464e-06, + "loss": 0.5481, + "step": 25575 + }, + { + "epoch": 0.33992559232426084, + "grad_norm": 0.7943762540817261, + "learning_rate": 3.4573959078373215e-06, + "loss": 0.533, + "step": 25606 + }, + { + "epoch": 0.340337124518358, + "grad_norm": 0.8528238534927368, + "learning_rate": 3.4149039365646063e-06, + "loss": 0.5482, + "step": 25637 + }, + { + "epoch": 0.34074865671245513, + "grad_norm": 0.8990415930747986, + "learning_rate": 3.3726555501672143e-06, + "loss": 0.5424, + "step": 25668 + }, + { + "epoch": 0.3411601889065523, + "grad_norm": 0.8352187275886536, + "learning_rate": 3.33065122541244e-06, + "loss": 0.5415, + "step": 25699 + }, + { + "epoch": 0.3415717211006495, + "grad_norm": 0.8917340636253357, + "learning_rate": 3.288891436313385e-06, + "loss": 0.5449, + "step": 25730 + }, + { + "epoch": 0.34198325329474666, + "grad_norm": 0.9237558245658875, + "learning_rate": 3.2473766541235963e-06, + "loss": 0.5465, + "step": 25761 + }, + { + "epoch": 0.34239478548884383, + "grad_norm": 0.8852059245109558, + "learning_rate": 3.2061073473317466e-06, + "loss": 0.5388, + "step": 25792 + }, + { + "epoch": 0.342806317682941, + "grad_norm": 0.8427268862724304, + "learning_rate": 3.1650839816563444e-06, + "loss": 0.5467, + "step": 25823 + }, + { + "epoch": 0.3432178498770382, + "grad_norm": 0.9008413553237915, + "learning_rate": 3.1243070200405093e-06, + "loss": 0.5529, + "step": 25854 + }, + { + "epoch": 0.34362938207113536, + "grad_norm": 0.8473567962646484, + "learning_rate": 3.0837769226467e-06, + "loss": 0.5459, + "step": 25885 + }, + { + "epoch": 0.3440409142652325, + "grad_norm": 0.8102667927742004, + "learning_rate": 3.0434941468515666e-06, + "loss": 0.5348, + "step": 25916 + }, + { + "epoch": 0.34445244645932965, + "grad_norm": 0.8141647577285767, + "learning_rate": 3.003459147240753e-06, + "loss": 0.5457, + "step": 25947 + }, + { + "epoch": 0.34486397865342683, + "grad_norm": 0.9256607890129089, + "learning_rate": 2.9636723756037875e-06, + "loss": 0.5431, + "step": 25978 + }, + { + "epoch": 0.345275510847524, + "grad_norm": 0.8449942469596863, + "learning_rate": 2.9241342809289833e-06, + "loss": 0.5435, + "step": 26009 + }, + { + "epoch": 0.3456870430416212, + "grad_norm": 1.1406134366989136, + "learning_rate": 2.8848453093983594e-06, + "loss": 0.5516, + "step": 26040 + }, + { + "epoch": 0.34609857523571835, + "grad_norm": 0.8967415690422058, + "learning_rate": 2.8458059043826257e-06, + "loss": 0.5461, + "step": 26071 + }, + { + "epoch": 0.34651010742981553, + "grad_norm": 0.8100745677947998, + "learning_rate": 2.807016506436172e-06, + "loss": 0.5374, + "step": 26102 + }, + { + "epoch": 0.3469216396239127, + "grad_norm": 0.8833833336830139, + "learning_rate": 2.7684775532920566e-06, + "loss": 0.5464, + "step": 26133 + }, + { + "epoch": 0.3473331718180098, + "grad_norm": 0.8851163983345032, + "learning_rate": 2.7301894798571425e-06, + "loss": 0.5421, + "step": 26164 + }, + { + "epoch": 0.347744704012107, + "grad_norm": 0.8747218251228333, + "learning_rate": 2.6921527182071386e-06, + "loss": 0.5388, + "step": 26195 + }, + { + "epoch": 0.3481562362062042, + "grad_norm": 0.779757022857666, + "learning_rate": 2.654367697581725e-06, + "loss": 0.5466, + "step": 26226 + }, + { + "epoch": 0.34856776840030135, + "grad_norm": 0.8362371325492859, + "learning_rate": 2.6168348443797175e-06, + "loss": 0.5452, + "step": 26257 + }, + { + "epoch": 0.3489793005943985, + "grad_norm": 0.8213446736335754, + "learning_rate": 2.5795545821542757e-06, + "loss": 0.5454, + "step": 26288 + }, + { + "epoch": 0.3493908327884957, + "grad_norm": 0.9033399820327759, + "learning_rate": 2.54252733160808e-06, + "loss": 0.5475, + "step": 26319 + }, + { + "epoch": 0.3498023649825929, + "grad_norm": 0.9243888258934021, + "learning_rate": 2.5057535105886294e-06, + "loss": 0.5516, + "step": 26350 + }, + { + "epoch": 0.35021389717669005, + "grad_norm": 0.8325033783912659, + "learning_rate": 2.4692335340834953e-06, + "loss": 0.5421, + "step": 26381 + }, + { + "epoch": 0.35062542937078717, + "grad_norm": 0.8969632387161255, + "learning_rate": 2.432967814215639e-06, + "loss": 0.5396, + "step": 26412 + }, + { + "epoch": 0.35103696156488434, + "grad_norm": 0.8884950876235962, + "learning_rate": 2.396956760238794e-06, + "loss": 0.5538, + "step": 26443 + }, + { + "epoch": 0.3514484937589815, + "grad_norm": 0.8323497176170349, + "learning_rate": 2.361200778532796e-06, + "loss": 0.5413, + "step": 26474 + }, + { + "epoch": 0.3518600259530787, + "grad_norm": 0.9132872223854065, + "learning_rate": 2.325700272599049e-06, + "loss": 0.5412, + "step": 26505 + }, + { + "epoch": 0.35227155814717587, + "grad_norm": 0.899863064289093, + "learning_rate": 2.2904556430559415e-06, + "loss": 0.5451, + "step": 26536 + }, + { + "epoch": 0.35268309034127304, + "grad_norm": 0.7889094352722168, + "learning_rate": 2.2554672876343106e-06, + "loss": 0.5476, + "step": 26567 + }, + { + "epoch": 0.3530946225353702, + "grad_norm": 0.8099322319030762, + "learning_rate": 2.220735601173002e-06, + "loss": 0.5474, + "step": 26598 + }, + { + "epoch": 0.35350615472946734, + "grad_norm": 0.8513230681419373, + "learning_rate": 2.186260975614382e-06, + "loss": 0.5403, + "step": 26629 + }, + { + "epoch": 0.3539176869235645, + "grad_norm": 0.8617785573005676, + "learning_rate": 2.1520437999999034e-06, + "loss": 0.5407, + "step": 26660 + }, + { + "epoch": 0.3543292191176617, + "grad_norm": 0.8258427381515503, + "learning_rate": 2.1180844604657526e-06, + "loss": 0.547, + "step": 26691 + }, + { + "epoch": 0.35474075131175886, + "grad_norm": 0.7984808087348938, + "learning_rate": 2.084383340238455e-06, + "loss": 0.5457, + "step": 26722 + }, + { + "epoch": 0.35515228350585604, + "grad_norm": 0.9225831627845764, + "learning_rate": 2.0509408196305704e-06, + "loss": 0.5476, + "step": 26753 + }, + { + "epoch": 0.3555638156999532, + "grad_norm": 0.841090202331543, + "learning_rate": 2.017757276036403e-06, + "loss": 0.5404, + "step": 26784 + }, + { + "epoch": 0.3559753478940504, + "grad_norm": 0.8598360419273376, + "learning_rate": 1.984833083927726e-06, + "loss": 0.5551, + "step": 26815 + }, + { + "epoch": 0.35638688008814756, + "grad_norm": 0.9421056509017944, + "learning_rate": 1.952168614849581e-06, + "loss": 0.5493, + "step": 26846 + }, + { + "epoch": 0.3567984122822447, + "grad_norm": 0.807736873626709, + "learning_rate": 1.919764237416058e-06, + "loss": 0.5445, + "step": 26877 + }, + { + "epoch": 0.35720994447634186, + "grad_norm": 0.8544048070907593, + "learning_rate": 1.8876203173061463e-06, + "loss": 0.5416, + "step": 26908 + }, + { + "epoch": 0.35762147667043903, + "grad_norm": 0.897087037563324, + "learning_rate": 1.8557372172596206e-06, + "loss": 0.5405, + "step": 26939 + }, + { + "epoch": 0.3580330088645362, + "grad_norm": 0.8813446760177612, + "learning_rate": 1.8241152970729341e-06, + "loss": 0.5546, + "step": 26970 + }, + { + "epoch": 0.3584445410586334, + "grad_norm": 0.8071566820144653, + "learning_rate": 1.7927549135951572e-06, + "loss": 0.548, + "step": 27001 + }, + { + "epoch": 0.35885607325273056, + "grad_norm": 0.8715914487838745, + "learning_rate": 1.7616564207239477e-06, + "loss": 0.5376, + "step": 27032 + }, + { + "epoch": 0.35926760544682773, + "grad_norm": 0.83490389585495, + "learning_rate": 1.730820169401584e-06, + "loss": 0.5474, + "step": 27063 + }, + { + "epoch": 0.3596791376409249, + "grad_norm": 0.9507847428321838, + "learning_rate": 1.7002465076109558e-06, + "loss": 0.5471, + "step": 27094 + }, + { + "epoch": 0.360090669835022, + "grad_norm": 0.8561064004898071, + "learning_rate": 1.6699357803716898e-06, + "loss": 0.5448, + "step": 27125 + }, + { + "epoch": 0.3605022020291192, + "grad_norm": 0.8557907342910767, + "learning_rate": 1.6398883297362305e-06, + "loss": 0.5503, + "step": 27156 + }, + { + "epoch": 0.3609137342232164, + "grad_norm": 0.8815693259239197, + "learning_rate": 1.6101044947859606e-06, + "loss": 0.5505, + "step": 27187 + }, + { + "epoch": 0.36132526641731355, + "grad_norm": 0.8523679375648499, + "learning_rate": 1.5805846116274114e-06, + "loss": 0.5437, + "step": 27218 + }, + { + "epoch": 0.3617367986114107, + "grad_norm": 0.8898177742958069, + "learning_rate": 1.5513290133884611e-06, + "loss": 0.5584, + "step": 27249 + }, + { + "epoch": 0.3621483308055079, + "grad_norm": 0.8666384220123291, + "learning_rate": 1.5223380302145512e-06, + "loss": 0.5355, + "step": 27280 + }, + { + "epoch": 0.3625598629996051, + "grad_norm": 0.967224657535553, + "learning_rate": 1.4936119892649925e-06, + "loss": 0.5491, + "step": 27311 + }, + { + "epoch": 0.36297139519370225, + "grad_norm": 0.9119516015052795, + "learning_rate": 1.4651512147092482e-06, + "loss": 0.5348, + "step": 27342 + }, + { + "epoch": 0.36338292738779937, + "grad_norm": 0.9404922127723694, + "learning_rate": 1.4369560277232908e-06, + "loss": 0.539, + "step": 27373 + }, + { + "epoch": 0.36379445958189655, + "grad_norm": 0.9016281366348267, + "learning_rate": 1.409026746485978e-06, + "loss": 0.5348, + "step": 27404 + }, + { + "epoch": 0.3642059917759937, + "grad_norm": 0.8831793069839478, + "learning_rate": 1.3813636861754464e-06, + "loss": 0.5424, + "step": 27435 + }, + { + "epoch": 0.3646175239700909, + "grad_norm": 0.8272929191589355, + "learning_rate": 1.3539671589655773e-06, + "loss": 0.5426, + "step": 27466 + }, + { + "epoch": 0.36502905616418807, + "grad_norm": 0.858782947063446, + "learning_rate": 1.3268374740224548e-06, + "loss": 0.543, + "step": 27497 + }, + { + "epoch": 0.36544058835828525, + "grad_norm": 0.8879000544548035, + "learning_rate": 1.2999749375008807e-06, + "loss": 0.5442, + "step": 27528 + }, + { + "epoch": 0.3658521205523824, + "grad_norm": 0.8690371513366699, + "learning_rate": 1.2733798525409346e-06, + "loss": 0.5372, + "step": 27559 + }, + { + "epoch": 0.3662636527464796, + "grad_norm": 0.8139171600341797, + "learning_rate": 1.2470525192645383e-06, + "loss": 0.5386, + "step": 27590 + }, + { + "epoch": 0.3666751849405767, + "grad_norm": 0.8036403059959412, + "learning_rate": 1.2209932347720666e-06, + "loss": 0.5442, + "step": 27621 + }, + { + "epoch": 0.3670867171346739, + "grad_norm": 0.8359389901161194, + "learning_rate": 1.1952022931389972e-06, + "loss": 0.5439, + "step": 27652 + }, + { + "epoch": 0.36749824932877106, + "grad_norm": 0.9095014333724976, + "learning_rate": 1.1696799854126083e-06, + "loss": 0.5441, + "step": 27683 + }, + { + "epoch": 0.36790978152286824, + "grad_norm": 0.8990742564201355, + "learning_rate": 1.1444265996086694e-06, + "loss": 0.5435, + "step": 27714 + }, + { + "epoch": 0.3683213137169654, + "grad_norm": 0.8429328799247742, + "learning_rate": 1.119442420708211e-06, + "loss": 0.5342, + "step": 27745 + }, + { + "epoch": 0.3687328459110626, + "grad_norm": 0.8839077949523926, + "learning_rate": 1.0947277306542964e-06, + "loss": 0.5472, + "step": 27776 + }, + { + "epoch": 0.36914437810515977, + "grad_norm": 0.8121281266212463, + "learning_rate": 1.0702828083488353e-06, + "loss": 0.5379, + "step": 27807 + }, + { + "epoch": 0.36955591029925694, + "grad_norm": 0.85428386926651, + "learning_rate": 1.0461079296494647e-06, + "loss": 0.5473, + "step": 27838 + }, + { + "epoch": 0.36996744249335406, + "grad_norm": 0.9345980286598206, + "learning_rate": 1.0222033673663978e-06, + "loss": 0.5443, + "step": 27869 + }, + { + "epoch": 0.37037897468745123, + "grad_norm": 0.8089588284492493, + "learning_rate": 9.985693912593713e-07, + "loss": 0.5423, + "step": 27900 + }, + { + "epoch": 0.3707905068815484, + "grad_norm": 0.8149998188018799, + "learning_rate": 9.752062680346035e-07, + "loss": 0.5383, + "step": 27931 + }, + { + "epoch": 0.3712020390756456, + "grad_norm": 0.8702203035354614, + "learning_rate": 9.521142613417494e-07, + "loss": 0.5476, + "step": 27962 + }, + { + "epoch": 0.37161357126974276, + "grad_norm": 0.848631739616394, + "learning_rate": 9.292936317709722e-07, + "loss": 0.548, + "step": 27993 + }, + { + "epoch": 0.37202510346383993, + "grad_norm": 0.8195529580116272, + "learning_rate": 9.067446368499793e-07, + "loss": 0.5314, + "step": 28024 + }, + { + "epoch": 0.3724366356579371, + "grad_norm": 0.8010855317115784, + "learning_rate": 8.844675310411055e-07, + "loss": 0.5447, + "step": 28055 + }, + { + "epoch": 0.3728481678520343, + "grad_norm": 0.8277648091316223, + "learning_rate": 8.6246256573847e-07, + "loss": 0.5424, + "step": 28086 + }, + { + "epoch": 0.3732597000461314, + "grad_norm": 0.8340747356414795, + "learning_rate": 8.407299892651127e-07, + "loss": 0.5505, + "step": 28117 + }, + { + "epoch": 0.3736712322402286, + "grad_norm": 0.8436799049377441, + "learning_rate": 8.19270046870202e-07, + "loss": 0.5407, + "step": 28148 + }, + { + "epoch": 0.37408276443432575, + "grad_norm": 0.8872787356376648, + "learning_rate": 7.980829807262752e-07, + "loss": 0.5479, + "step": 28179 + }, + { + "epoch": 0.37449429662842293, + "grad_norm": 0.7998986840248108, + "learning_rate": 7.771690299264889e-07, + "loss": 0.536, + "step": 28210 + }, + { + "epoch": 0.3749058288225201, + "grad_norm": 0.8621476292610168, + "learning_rate": 7.565284304819426e-07, + "loss": 0.5516, + "step": 28241 + }, + { + "epoch": 0.3753173610166173, + "grad_norm": 0.9056204557418823, + "learning_rate": 7.361614153189922e-07, + "loss": 0.5442, + "step": 28272 + }, + { + "epoch": 0.37572889321071445, + "grad_norm": 0.8797835111618042, + "learning_rate": 7.160682142766328e-07, + "loss": 0.54, + "step": 28303 + }, + { + "epoch": 0.37614042540481163, + "grad_norm": 0.8731541037559509, + "learning_rate": 6.962490541039091e-07, + "loss": 0.5402, + "step": 28334 + }, + { + "epoch": 0.37655195759890875, + "grad_norm": 0.802355945110321, + "learning_rate": 6.767041584573531e-07, + "loss": 0.5466, + "step": 28365 + }, + { + "epoch": 0.3769634897930059, + "grad_norm": 0.8426799178123474, + "learning_rate": 6.574337478984532e-07, + "loss": 0.5484, + "step": 28396 + }, + { + "epoch": 0.3773750219871031, + "grad_norm": 0.8554813861846924, + "learning_rate": 6.384380398911732e-07, + "loss": 0.5387, + "step": 28427 + }, + { + "epoch": 0.3777865541812003, + "grad_norm": 0.8815711140632629, + "learning_rate": 6.197172487994951e-07, + "loss": 0.5392, + "step": 28458 + }, + { + "epoch": 0.37819808637529745, + "grad_norm": 0.8735175728797913, + "learning_rate": 6.012715858850021e-07, + "loss": 0.5405, + "step": 28489 + }, + { + "epoch": 0.3786096185693946, + "grad_norm": 0.8632252216339111, + "learning_rate": 5.831012593044971e-07, + "loss": 0.5472, + "step": 28520 + }, + { + "epoch": 0.3790211507634918, + "grad_norm": 0.8865644931793213, + "learning_rate": 5.652064741076435e-07, + "loss": 0.5449, + "step": 28551 + }, + { + "epoch": 0.379432682957589, + "grad_norm": 0.8747300505638123, + "learning_rate": 5.475874322346558e-07, + "loss": 0.5376, + "step": 28582 + }, + { + "epoch": 0.3798442151516861, + "grad_norm": 0.8061609864234924, + "learning_rate": 5.30244332514035e-07, + "loss": 0.5452, + "step": 28613 + }, + { + "epoch": 0.38025574734578327, + "grad_norm": 0.8865931630134583, + "learning_rate": 5.131773706602977e-07, + "loss": 0.5417, + "step": 28644 + }, + { + "epoch": 0.38066727953988044, + "grad_norm": 0.8096855282783508, + "learning_rate": 4.963867392717897e-07, + "loss": 0.5449, + "step": 28675 + }, + { + "epoch": 0.3810788117339776, + "grad_norm": 0.7933251261711121, + "learning_rate": 4.798726278285093e-07, + "loss": 0.5358, + "step": 28706 + }, + { + "epoch": 0.3814903439280748, + "grad_norm": 0.9222235083580017, + "learning_rate": 4.6363522268995097e-07, + "loss": 0.5488, + "step": 28737 + }, + { + "epoch": 0.38190187612217197, + "grad_norm": 0.8724942207336426, + "learning_rate": 4.4767470709302927e-07, + "loss": 0.5411, + "step": 28768 + }, + { + "epoch": 0.38231340831626914, + "grad_norm": 1.0145565271377563, + "learning_rate": 4.319912611499971e-07, + "loss": 0.5453, + "step": 28799 + }, + { + "epoch": 0.3827249405103663, + "grad_norm": 0.8948010802268982, + "learning_rate": 4.1658506184640564e-07, + "loss": 0.5492, + "step": 28830 + }, + { + "epoch": 0.38313647270446344, + "grad_norm": 0.8219355344772339, + "learning_rate": 4.0145628303911996e-07, + "loss": 0.551, + "step": 28861 + }, + { + "epoch": 0.3835480048985606, + "grad_norm": 0.8625403642654419, + "learning_rate": 3.866050954543565e-07, + "loss": 0.5431, + "step": 28892 + }, + { + "epoch": 0.3839595370926578, + "grad_norm": 0.9696141481399536, + "learning_rate": 3.720316666857432e-07, + "loss": 0.5434, + "step": 28923 + }, + { + "epoch": 0.38437106928675496, + "grad_norm": 0.8571056723594666, + "learning_rate": 3.5773616119244845e-07, + "loss": 0.5483, + "step": 28954 + }, + { + "epoch": 0.38478260148085214, + "grad_norm": 0.8465270400047302, + "learning_rate": 3.437187402973052e-07, + "loss": 0.5442, + "step": 28985 + }, + { + "epoch": 0.3851941336749493, + "grad_norm": 0.896527886390686, + "learning_rate": 3.2997956218500104e-07, + "loss": 0.543, + "step": 29016 + }, + { + "epoch": 0.3856056658690465, + "grad_norm": 0.9935593008995056, + "learning_rate": 3.165187819003018e-07, + "loss": 0.5394, + "step": 29047 + }, + { + "epoch": 0.38601719806314366, + "grad_norm": 0.9212044477462769, + "learning_rate": 3.033365513462755e-07, + "loss": 0.5481, + "step": 29078 + }, + { + "epoch": 0.3864287302572408, + "grad_norm": 0.8404513001441956, + "learning_rate": 2.9043301928260437e-07, + "loss": 0.547, + "step": 29109 + }, + { + "epoch": 0.38684026245133796, + "grad_norm": 0.7884857058525085, + "learning_rate": 2.7780833132389773e-07, + "loss": 0.5416, + "step": 29140 + }, + { + "epoch": 0.38725179464543513, + "grad_norm": 0.8256231546401978, + "learning_rate": 2.6546262993803473e-07, + "loss": 0.5418, + "step": 29171 + }, + { + "epoch": 0.3876633268395323, + "grad_norm": 0.9019240140914917, + "learning_rate": 2.533960544445879e-07, + "loss": 0.5425, + "step": 29202 + }, + { + "epoch": 0.3880748590336295, + "grad_norm": 0.8593041896820068, + "learning_rate": 2.416087410132134e-07, + "loss": 0.5418, + "step": 29233 + }, + { + "epoch": 0.38848639122772666, + "grad_norm": 0.8224918842315674, + "learning_rate": 2.301008226621465e-07, + "loss": 0.5445, + "step": 29264 + }, + { + "epoch": 0.38889792342182383, + "grad_norm": 0.8012319803237915, + "learning_rate": 2.1887242925668073e-07, + "loss": 0.5442, + "step": 29295 + }, + { + "epoch": 0.389309455615921, + "grad_norm": 0.8798550963401794, + "learning_rate": 2.0792368750770785e-07, + "loss": 0.5421, + "step": 29326 + }, + { + "epoch": 0.3897209878100181, + "grad_norm": 0.8771138191223145, + "learning_rate": 1.9725472097028851e-07, + "loss": 0.5363, + "step": 29357 + }, + { + "epoch": 0.3901325200041153, + "grad_norm": 0.8535872101783752, + "learning_rate": 1.8686565004226718e-07, + "loss": 0.5494, + "step": 29388 + }, + { + "epoch": 0.3905440521982125, + "grad_norm": 0.9166486859321594, + "learning_rate": 1.7675659196288995e-07, + "loss": 0.5428, + "step": 29419 + }, + { + "epoch": 0.39095558439230965, + "grad_norm": 0.8581733107566833, + "learning_rate": 1.6692766081150556e-07, + "loss": 0.5496, + "step": 29450 + }, + { + "epoch": 0.3913671165864068, + "grad_norm": 0.9080056548118591, + "learning_rate": 1.5737896750626647e-07, + "loss": 0.5373, + "step": 29481 + }, + { + "epoch": 0.391778648780504, + "grad_norm": 0.872186005115509, + "learning_rate": 1.4811061980287976e-07, + "loss": 0.5419, + "step": 29512 + }, + { + "epoch": 0.3921901809746012, + "grad_norm": 0.8389427065849304, + "learning_rate": 1.3912272229338886e-07, + "loss": 0.5464, + "step": 29543 + }, + { + "epoch": 0.39260171316869835, + "grad_norm": 0.8263513445854187, + "learning_rate": 1.3041537640499645e-07, + "loss": 0.5467, + "step": 29574 + }, + { + "epoch": 0.39301324536279547, + "grad_norm": 0.862415075302124, + "learning_rate": 1.2198868039891564e-07, + "loss": 0.5476, + "step": 29605 + }, + { + "epoch": 0.39342477755689265, + "grad_norm": 0.828647255897522, + "learning_rate": 1.138427293692651e-07, + "loss": 0.5426, + "step": 29636 + }, + { + "epoch": 0.3938363097509898, + "grad_norm": 0.8856072425842285, + "learning_rate": 1.0597761524199778e-07, + "loss": 0.5408, + "step": 29667 + }, + { + "epoch": 0.394247841945087, + "grad_norm": 0.8782724142074585, + "learning_rate": 9.839342677385455e-08, + "loss": 0.5438, + "step": 29698 + }, + { + "epoch": 0.39465937413918417, + "grad_norm": 0.8528609871864319, + "learning_rate": 9.109024955137325e-08, + "loss": 0.5476, + "step": 29729 + }, + { + "epoch": 0.39507090633328135, + "grad_norm": 0.8713068962097168, + "learning_rate": 8.406816598991729e-08, + "loss": 0.5404, + "step": 29760 + }, + { + "epoch": 0.3954824385273785, + "grad_norm": 0.844412088394165, + "learning_rate": 7.73272553327431e-08, + "loss": 0.5358, + "step": 29791 + }, + { + "epoch": 0.3958939707214757, + "grad_norm": 0.8629502654075623, + "learning_rate": 7.086759365011186e-08, + "loss": 0.5434, + "step": 29822 + }, + { + "epoch": 0.3963055029155728, + "grad_norm": 0.9143503308296204, + "learning_rate": 6.468925383842639e-08, + "loss": 0.5463, + "step": 29853 + }, + { + "epoch": 0.39671703510967, + "grad_norm": 0.8429513573646545, + "learning_rate": 5.8792305619415067e-08, + "loss": 0.5379, + "step": 29884 + }, + { + "epoch": 0.39712856730376717, + "grad_norm": 0.8939641118049622, + "learning_rate": 5.317681553933529e-08, + "loss": 0.5432, + "step": 29915 + }, + { + "epoch": 0.39754009949786434, + "grad_norm": 0.8502975702285767, + "learning_rate": 4.78428469682296e-08, + "loss": 0.5505, + "step": 29946 + }, + { + "epoch": 0.3979516316919615, + "grad_norm": 0.8307493925094604, + "learning_rate": 4.2790460099206844e-08, + "loss": 0.5373, + "step": 29977 + }, + { + "epoch": 0.3983631638860587, + "grad_norm": 0.8117762207984924, + "learning_rate": 3.801971194777043e-08, + "loss": 0.544, + "step": 30008 + }, + { + "epoch": 0.39877469608015587, + "grad_norm": 0.8374053835868835, + "learning_rate": 3.353065635115782e-08, + "loss": 0.5374, + "step": 30039 + }, + { + "epoch": 0.39918622827425304, + "grad_norm": 0.8347547650337219, + "learning_rate": 2.93233439677576e-08, + "loss": 0.5391, + "step": 30070 + }, + { + "epoch": 0.39959776046835016, + "grad_norm": 0.8406876921653748, + "learning_rate": 2.539782227651555e-08, + "loss": 0.5435, + "step": 30101 + }, + { + "epoch": 0.40000929266244734, + "grad_norm": 0.8617453575134277, + "learning_rate": 2.175413557641004e-08, + "loss": 0.5476, + "step": 30132 + }, + { + "epoch": 0.4004208248565445, + "grad_norm": 0.8191389441490173, + "learning_rate": 1.839232498594967e-08, + "loss": 0.5399, + "step": 30163 + }, + { + "epoch": 0.4008323570506417, + "grad_norm": 0.8725799322128296, + "learning_rate": 1.5312428442712522e-08, + "loss": 0.5421, + "step": 30194 + }, + { + "epoch": 0.40124388924473886, + "grad_norm": 0.8779147267341614, + "learning_rate": 1.2514480702913168e-08, + "loss": 0.5447, + "step": 30225 + }, + { + "epoch": 0.40165542143883604, + "grad_norm": 0.8182644844055176, + "learning_rate": 9.998513341005766e-09, + "loss": 0.5393, + "step": 30256 + }, + { + "epoch": 0.4020669536329332, + "grad_norm": 0.8462246656417847, + "learning_rate": 7.764554749345454e-09, + "loss": 0.5368, + "step": 30287 + }, + { + "epoch": 0.4024784858270304, + "grad_norm": 0.8804129958152771, + "learning_rate": 5.812630137849717e-09, + "loss": 0.5394, + "step": 30318 + }, + { + "epoch": 0.4028900180211275, + "grad_norm": 0.8647791147232056, + "learning_rate": 4.142761533723616e-09, + "loss": 0.5422, + "step": 30349 + }, + { + "epoch": 0.4033015502152247, + "grad_norm": 0.8603995442390442, + "learning_rate": 2.7549677812044317e-09, + "loss": 0.5395, + "step": 30380 + }, + { + "epoch": 0.40371308240932186, + "grad_norm": 0.877536416053772, + "learning_rate": 1.6492645413590525e-09, + "loss": 0.5436, + "step": 30411 + }, + { + "epoch": 0.40412461460341903, + "grad_norm": 0.83576500415802, + "learning_rate": 8.256642918980096e-10, + "loss": 0.5403, + "step": 30442 + }, + { + "epoch": 0.4045361467975162, + "grad_norm": 0.7631540894508362, + "learning_rate": 2.841763270367004e-10, + "loss": 0.5382, + "step": 30473 + }, + { + "epoch": 0.4049476789916134, + "grad_norm": 0.837101399898529, + "learning_rate": 2.480675739269245e-11, + "loss": 0.544, + "step": 30504 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.251434749612104e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-30517/training_args.bin b/checkpoint-30517/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f3d87fcba007ec2448c21323b9fdb9fa6105de5f --- /dev/null +++ b/checkpoint-30517/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf5225534ee746c36dba9f28281e0b858e6e9b4a308bdaf1de4f68e79e9ee83 +size 5304 diff --git a/checkpoint-3052/config.json b/checkpoint-3052/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-3052/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-3052/generation_config.json b/checkpoint-3052/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-3052/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-3052/model-00001-of-00007.safetensors b/checkpoint-3052/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..30c03170431453f2c3d5c381c2590f27ac7a1043 --- /dev/null +++ b/checkpoint-3052/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7d93b57e43befc30fe0bc36abac5bcbf277f88ba0fe99d3e356b575e283d548 +size 4886466168 diff --git a/checkpoint-3052/model-00002-of-00007.safetensors b/checkpoint-3052/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-3052/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-3052/model-00003-of-00007.safetensors b/checkpoint-3052/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-3052/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-3052/model-00004-of-00007.safetensors b/checkpoint-3052/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-3052/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-3052/model-00005-of-00007.safetensors b/checkpoint-3052/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-3052/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-3052/model-00006-of-00007.safetensors b/checkpoint-3052/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cac33b2e89191f6204518e417f3d7bfdc72d270c --- /dev/null +++ b/checkpoint-3052/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf6d8ddb339e7a5f978a8b95c379059d6f80840001cf6163d3bfb2cf0a753b97 +size 4999813120 diff --git a/checkpoint-3052/model-00007-of-00007.safetensors b/checkpoint-3052/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..db70df63e2f9e66d6c507e13f5031a941a11f577 --- /dev/null +++ b/checkpoint-3052/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd09ca03e691288f160576056759bf67abd4fe3dbd59d1ed07eba205b415d789 +size 2571158184 diff --git a/checkpoint-3052/model.safetensors.index.json b/checkpoint-3052/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-3052/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-3052/optimizer.pt b/checkpoint-3052/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a164db94812e1a3f6d990361c53ace41c6e377d5 --- /dev/null +++ b/checkpoint-3052/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea831d012c9593a7492ae965d9cffe34a4898b8d8a68c47405e2fc57fee55c82 +size 15385036334 diff --git a/checkpoint-3052/rng_state.pth b/checkpoint-3052/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-3052/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-3052/scheduler.pt b/checkpoint-3052/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b2454d919340cd4d989697a74a27016c58dc3aa --- /dev/null +++ b/checkpoint-3052/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ed9d7fea0b9f468b8c97fd491e0f5a211b8ff197e5f8111c42fc974ecafed4c +size 1064 diff --git a/checkpoint-3052/trainer_state.json b/checkpoint-3052/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7dd2c77eafa3c156145199d3f954ed0f835b5651 --- /dev/null +++ b/checkpoint-3052/trainer_state.json @@ -0,0 +1,719 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.040516008270469576, + "eval_steps": 500, + "global_step": 3052, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00041153219409716807, + "grad_norm": 4.147768020629883, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.7978, + "step": 31 + }, + { + "epoch": 0.0008230643881943361, + "grad_norm": 3.117840051651001, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.7723, + "step": 62 + }, + { + "epoch": 0.0012345965822915042, + "grad_norm": 2.83072829246521, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7493, + "step": 93 + }, + { + "epoch": 0.0016461287763886723, + "grad_norm": 2.9491968154907227, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7352, + "step": 124 + }, + { + "epoch": 0.0020576609704858403, + "grad_norm": 2.798553228378296, + "learning_rate": 5.078636959370905e-06, + "loss": 0.7229, + "step": 155 + }, + { + "epoch": 0.0024691931645830084, + "grad_norm": 2.663148880004883, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7253, + "step": 186 + }, + { + "epoch": 0.0028807253586801765, + "grad_norm": 2.564692974090576, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7287, + "step": 217 + }, + { + "epoch": 0.0032922575527773446, + "grad_norm": 3.2043237686157227, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7205, + "step": 248 + }, + { + "epoch": 0.0037037897468745126, + "grad_norm": 2.6705269813537598, + "learning_rate": 9.141546526867629e-06, + "loss": 0.714, + "step": 279 + }, + { + "epoch": 0.004115321940971681, + "grad_norm": 3.432569980621338, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7217, + "step": 310 + }, + { + "epoch": 0.004526854135068849, + "grad_norm": 2.515808343887329, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7173, + "step": 341 + }, + { + "epoch": 0.004938386329166017, + "grad_norm": 2.6708128452301025, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7123, + "step": 372 + }, + { + "epoch": 0.005349918523263185, + "grad_norm": 2.472797155380249, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.7079, + "step": 403 + }, + { + "epoch": 0.005761450717360353, + "grad_norm": 2.486147880554199, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7146, + "step": 434 + }, + { + "epoch": 0.006172982911457521, + "grad_norm": 2.310777187347412, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7, + "step": 465 + }, + { + "epoch": 0.006584515105554689, + "grad_norm": 3.1831858158111572, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7042, + "step": 496 + }, + { + "epoch": 0.006996047299651857, + "grad_norm": 2.505427360534668, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.6991, + "step": 527 + }, + { + "epoch": 0.007407579493749025, + "grad_norm": 2.3627443313598633, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7045, + "step": 558 + }, + { + "epoch": 0.007819111687846193, + "grad_norm": 2.6250648498535156, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.704, + "step": 589 + }, + { + "epoch": 0.008230643881943361, + "grad_norm": 2.3151464462280273, + "learning_rate": 2.031454783748362e-05, + "loss": 0.6959, + "step": 620 + }, + { + "epoch": 0.00864217607604053, + "grad_norm": 2.0312647819519043, + "learning_rate": 2.13302752293578e-05, + "loss": 0.689, + "step": 651 + }, + { + "epoch": 0.009053708270137698, + "grad_norm": 2.718970775604248, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7031, + "step": 682 + }, + { + "epoch": 0.009465240464234866, + "grad_norm": 1.9124070405960083, + "learning_rate": 2.336173001310616e-05, + "loss": 0.6908, + "step": 713 + }, + { + "epoch": 0.009876772658332034, + "grad_norm": 2.368354082107544, + "learning_rate": 2.437745740498034e-05, + "loss": 0.6947, + "step": 744 + }, + { + "epoch": 0.010288304852429202, + "grad_norm": 1.8856632709503174, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.6876, + "step": 775 + }, + { + "epoch": 0.01069983704652637, + "grad_norm": 2.3155925273895264, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7024, + "step": 806 + }, + { + "epoch": 0.011111369240623538, + "grad_norm": 2.2516837120056152, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.695, + "step": 837 + }, + { + "epoch": 0.011522901434720706, + "grad_norm": 2.4444823265075684, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.6892, + "step": 868 + }, + { + "epoch": 0.011934433628817874, + "grad_norm": 2.4233832359313965, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.6871, + "step": 899 + }, + { + "epoch": 0.012345965822915042, + "grad_norm": 2.0262961387634277, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.6888, + "step": 930 + }, + { + "epoch": 0.01275749801701221, + "grad_norm": 2.3908143043518066, + "learning_rate": 3.148754914809961e-05, + "loss": 0.6997, + "step": 961 + }, + { + "epoch": 0.013169030211109378, + "grad_norm": 1.839417576789856, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.6826, + "step": 992 + }, + { + "epoch": 0.013580562405206546, + "grad_norm": 1.8175997734069824, + "learning_rate": 3.351900393184797e-05, + "loss": 0.6857, + "step": 1023 + }, + { + "epoch": 0.013992094599303714, + "grad_norm": 2.3080506324768066, + "learning_rate": 3.453473132372215e-05, + "loss": 0.6896, + "step": 1054 + }, + { + "epoch": 0.014403626793400882, + "grad_norm": 2.0574960708618164, + "learning_rate": 3.555045871559633e-05, + "loss": 0.6862, + "step": 1085 + }, + { + "epoch": 0.01481515898749805, + "grad_norm": 1.957221269607544, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.6796, + "step": 1116 + }, + { + "epoch": 0.015226691181595219, + "grad_norm": 2.2740609645843506, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.7082, + "step": 1147 + }, + { + "epoch": 0.015638223375692387, + "grad_norm": 2.1291615962982178, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.6852, + "step": 1178 + }, + { + "epoch": 0.016049755569789555, + "grad_norm": 2.2918620109558105, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.6821, + "step": 1209 + }, + { + "epoch": 0.016461287763886723, + "grad_norm": 1.9689708948135376, + "learning_rate": 4.062909567496724e-05, + "loss": 0.676, + "step": 1240 + }, + { + "epoch": 0.01687281995798389, + "grad_norm": 2.4405298233032227, + "learning_rate": 4.164482306684142e-05, + "loss": 0.6927, + "step": 1271 + }, + { + "epoch": 0.01728435215208106, + "grad_norm": 2.0007450580596924, + "learning_rate": 4.26605504587156e-05, + "loss": 0.6864, + "step": 1302 + }, + { + "epoch": 0.017695884346178227, + "grad_norm": 1.7013226747512817, + "learning_rate": 4.367627785058978e-05, + "loss": 0.6827, + "step": 1333 + }, + { + "epoch": 0.018107416540275395, + "grad_norm": 2.2202930450439453, + "learning_rate": 4.469200524246396e-05, + "loss": 0.6722, + "step": 1364 + }, + { + "epoch": 0.018518948734372563, + "grad_norm": 1.690030813217163, + "learning_rate": 4.570773263433814e-05, + "loss": 0.6807, + "step": 1395 + }, + { + "epoch": 0.01893048092846973, + "grad_norm": 1.5886859893798828, + "learning_rate": 4.672346002621232e-05, + "loss": 0.6867, + "step": 1426 + }, + { + "epoch": 0.0193420131225669, + "grad_norm": 1.7371172904968262, + "learning_rate": 4.77391874180865e-05, + "loss": 0.6832, + "step": 1457 + }, + { + "epoch": 0.019753545316664067, + "grad_norm": 1.8376264572143555, + "learning_rate": 4.875491480996068e-05, + "loss": 0.681, + "step": 1488 + }, + { + "epoch": 0.020165077510761235, + "grad_norm": 1.6280311346054077, + "learning_rate": 4.977064220183487e-05, + "loss": 0.6795, + "step": 1519 + }, + { + "epoch": 0.020576609704858403, + "grad_norm": 1.8016823530197144, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.6789, + "step": 1550 + }, + { + "epoch": 0.02098814189895557, + "grad_norm": 1.7355237007141113, + "learning_rate": 4.999955597496219e-05, + "loss": 0.6753, + "step": 1581 + }, + { + "epoch": 0.02139967409305274, + "grad_norm": 1.5687408447265625, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.6777, + "step": 1612 + }, + { + "epoch": 0.021811206287149908, + "grad_norm": 1.790453314781189, + "learning_rate": 4.999799067923527e-05, + "loss": 0.684, + "step": 1643 + }, + { + "epoch": 0.022222738481247076, + "grad_norm": 1.9073406457901, + "learning_rate": 4.999678487776908e-05, + "loss": 0.6793, + "step": 1674 + }, + { + "epoch": 0.022634270675344244, + "grad_norm": 1.5575100183486938, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.6794, + "step": 1705 + }, + { + "epoch": 0.023045802869441412, + "grad_norm": 2.100715160369873, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6753, + "step": 1736 + }, + { + "epoch": 0.02345733506353858, + "grad_norm": 1.9425480365753174, + "learning_rate": 4.999147503179668e-05, + "loss": 0.6755, + "step": 1767 + }, + { + "epoch": 0.023868867257635748, + "grad_norm": 1.5791170597076416, + "learning_rate": 4.998914100252672e-05, + "loss": 0.6754, + "step": 1798 + }, + { + "epoch": 0.024280399451732916, + "grad_norm": 1.5692673921585083, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6682, + "step": 1829 + }, + { + "epoch": 0.024691931645830084, + "grad_norm": 1.5757725238800049, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.6723, + "step": 1860 + }, + { + "epoch": 0.025103463839927252, + "grad_norm": 1.8000696897506714, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6647, + "step": 1891 + }, + { + "epoch": 0.02551499603402442, + "grad_norm": 1.682919979095459, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.669, + "step": 1922 + }, + { + "epoch": 0.02592652822812159, + "grad_norm": 1.6138657331466675, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6698, + "step": 1953 + }, + { + "epoch": 0.026338060422218756, + "grad_norm": 1.5016391277313232, + "learning_rate": 4.99692159912661e-05, + "loss": 0.6645, + "step": 1984 + }, + { + "epoch": 0.026749592616315924, + "grad_norm": 1.6850241422653198, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6769, + "step": 2015 + }, + { + "epoch": 0.027161124810413093, + "grad_norm": 1.521161437034607, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6599, + "step": 2046 + }, + { + "epoch": 0.02757265700451026, + "grad_norm": 1.8039467334747314, + "learning_rate": 4.995544899210594e-05, + "loss": 0.6693, + "step": 2077 + }, + { + "epoch": 0.02798418919860743, + "grad_norm": 1.688811182975769, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.661, + "step": 2108 + }, + { + "epoch": 0.028395721392704597, + "grad_norm": 1.669871211051941, + "learning_rate": 4.994486281210429e-05, + "loss": 0.6611, + "step": 2139 + }, + { + "epoch": 0.028807253586801765, + "grad_norm": 1.3887263536453247, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6677, + "step": 2170 + }, + { + "epoch": 0.029218785780898933, + "grad_norm": 1.3242331743240356, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6537, + "step": 2201 + }, + { + "epoch": 0.0296303179749961, + "grad_norm": 1.6638622283935547, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6653, + "step": 2232 + }, + { + "epoch": 0.03004185016909327, + "grad_norm": 1.5110372304916382, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6615, + "step": 2263 + }, + { + "epoch": 0.030453382363190437, + "grad_norm": 1.5244065523147583, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.6552, + "step": 2294 + }, + { + "epoch": 0.030864914557287605, + "grad_norm": 1.416624903678894, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6569, + "step": 2325 + }, + { + "epoch": 0.03127644675138477, + "grad_norm": 1.4467304944992065, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6613, + "step": 2356 + }, + { + "epoch": 0.03168797894548194, + "grad_norm": 1.4649864435195923, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6604, + "step": 2387 + }, + { + "epoch": 0.03209951113957911, + "grad_norm": 1.6256624460220337, + "learning_rate": 4.988329891293693e-05, + "loss": 0.655, + "step": 2418 + }, + { + "epoch": 0.03251104333367628, + "grad_norm": 1.3986462354660034, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6514, + "step": 2449 + }, + { + "epoch": 0.032922575527773446, + "grad_norm": 1.4490609169006348, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.65, + "step": 2480 + }, + { + "epoch": 0.033334107721870614, + "grad_norm": 1.4979615211486816, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.6579, + "step": 2511 + }, + { + "epoch": 0.03374563991596778, + "grad_norm": 1.6314005851745605, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6604, + "step": 2542 + }, + { + "epoch": 0.03415717211006495, + "grad_norm": 1.310655951499939, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6518, + "step": 2573 + }, + { + "epoch": 0.03456870430416212, + "grad_norm": 1.4176305532455444, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6518, + "step": 2604 + }, + { + "epoch": 0.034980236498259286, + "grad_norm": 1.6330211162567139, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6474, + "step": 2635 + }, + { + "epoch": 0.035391768692356454, + "grad_norm": 1.5854465961456299, + "learning_rate": 4.980947995086024e-05, + "loss": 0.6462, + "step": 2666 + }, + { + "epoch": 0.03580330088645362, + "grad_norm": 1.2944835424423218, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6503, + "step": 2697 + }, + { + "epoch": 0.03621483308055079, + "grad_norm": 1.4056243896484375, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6414, + "step": 2728 + }, + { + "epoch": 0.03662636527464796, + "grad_norm": 1.5649598836898804, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6449, + "step": 2759 + }, + { + "epoch": 0.037037897468745126, + "grad_norm": 1.5205659866333008, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6334, + "step": 2790 + }, + { + "epoch": 0.037449429662842294, + "grad_norm": 1.3248411417007446, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6448, + "step": 2821 + }, + { + "epoch": 0.03786096185693946, + "grad_norm": 1.2402842044830322, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6476, + "step": 2852 + }, + { + "epoch": 0.03827249405103663, + "grad_norm": 1.7939503192901611, + "learning_rate": 4.973018858007122e-05, + "loss": 0.6472, + "step": 2883 + }, + { + "epoch": 0.0386840262451338, + "grad_norm": 1.2774665355682373, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6374, + "step": 2914 + }, + { + "epoch": 0.03909555843923097, + "grad_norm": 1.6384273767471313, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6623, + "step": 2945 + }, + { + "epoch": 0.039507090633328135, + "grad_norm": 1.559270977973938, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6391, + "step": 2976 + }, + { + "epoch": 0.0399186228274253, + "grad_norm": 1.6405760049819946, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.6437, + "step": 3007 + }, + { + "epoch": 0.04033015502152247, + "grad_norm": 1.3599917888641357, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6436, + "step": 3038 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.251656078846591e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3052/training_args.bin b/checkpoint-3052/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f3d87fcba007ec2448c21323b9fdb9fa6105de5f --- /dev/null +++ b/checkpoint-3052/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf5225534ee746c36dba9f28281e0b858e6e9b4a308bdaf1de4f68e79e9ee83 +size 5304 diff --git a/checkpoint-6104/config.json b/checkpoint-6104/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-6104/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-6104/generation_config.json b/checkpoint-6104/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-6104/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-6104/model-00001-of-00007.safetensors b/checkpoint-6104/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a2b79ab61d2934866d9eab3c4136c499273f38a4 --- /dev/null +++ b/checkpoint-6104/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e00c83677472a5f6bc7a72b22f741bf6b118b01d6d9fd4ca1f3fe536c67e0d54 +size 4886466168 diff --git a/checkpoint-6104/model-00002-of-00007.safetensors b/checkpoint-6104/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-6104/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-6104/model-00003-of-00007.safetensors b/checkpoint-6104/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-6104/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-6104/model-00004-of-00007.safetensors b/checkpoint-6104/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-6104/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-6104/model-00005-of-00007.safetensors b/checkpoint-6104/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-6104/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-6104/model-00006-of-00007.safetensors b/checkpoint-6104/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5534c7cdaca6f3dc1cf661d9eea3508d3a941f13 --- /dev/null +++ b/checkpoint-6104/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c20a051a9835a8174e9e8d20e3893d2e6fc3304b0c7f3f4407ea7638932ee0e +size 4999813120 diff --git a/checkpoint-6104/model-00007-of-00007.safetensors b/checkpoint-6104/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0e4674538eb8f419751cb59f72e5337fc534774b --- /dev/null +++ b/checkpoint-6104/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b325f84f7613acd29e56e45940b2033d92b9d2e275ab139ae811a9ecdaaab5f1 +size 2571158184 diff --git a/checkpoint-6104/model.safetensors.index.json b/checkpoint-6104/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-6104/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-6104/optimizer.pt b/checkpoint-6104/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c37a4d67adf7fca67a3380950280a74d0fcd8fb --- /dev/null +++ b/checkpoint-6104/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17d2cfc069169042901035aa41b2bf166e70215751fa2e12dbe5f193032029ce +size 15385036334 diff --git a/checkpoint-6104/rng_state.pth b/checkpoint-6104/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-6104/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-6104/scheduler.pt b/checkpoint-6104/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f5e5514ba898102fcdb5731bba1ae1c2957e6e5 --- /dev/null +++ b/checkpoint-6104/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:107e0617754026d870a7da422dabb716a8dc7d3a550066ff507e37f8f0818429 +size 1064 diff --git a/checkpoint-6104/trainer_state.json b/checkpoint-6104/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f740aa299575a61ce756731a94ca9991e6b819d9 --- /dev/null +++ b/checkpoint-6104/trainer_state.json @@ -0,0 +1,1405 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.08103201654093915, + "eval_steps": 500, + "global_step": 6104, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00041153219409716807, + "grad_norm": 4.147768020629883, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.7978, + "step": 31 + }, + { + "epoch": 0.0008230643881943361, + "grad_norm": 3.117840051651001, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.7723, + "step": 62 + }, + { + "epoch": 0.0012345965822915042, + "grad_norm": 2.83072829246521, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7493, + "step": 93 + }, + { + "epoch": 0.0016461287763886723, + "grad_norm": 2.9491968154907227, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7352, + "step": 124 + }, + { + "epoch": 0.0020576609704858403, + "grad_norm": 2.798553228378296, + "learning_rate": 5.078636959370905e-06, + "loss": 0.7229, + "step": 155 + }, + { + "epoch": 0.0024691931645830084, + "grad_norm": 2.663148880004883, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7253, + "step": 186 + }, + { + "epoch": 0.0028807253586801765, + "grad_norm": 2.564692974090576, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7287, + "step": 217 + }, + { + "epoch": 0.0032922575527773446, + "grad_norm": 3.2043237686157227, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7205, + "step": 248 + }, + { + "epoch": 0.0037037897468745126, + "grad_norm": 2.6705269813537598, + "learning_rate": 9.141546526867629e-06, + "loss": 0.714, + "step": 279 + }, + { + "epoch": 0.004115321940971681, + "grad_norm": 3.432569980621338, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7217, + "step": 310 + }, + { + "epoch": 0.004526854135068849, + "grad_norm": 2.515808343887329, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7173, + "step": 341 + }, + { + "epoch": 0.004938386329166017, + "grad_norm": 2.6708128452301025, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7123, + "step": 372 + }, + { + "epoch": 0.005349918523263185, + "grad_norm": 2.472797155380249, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.7079, + "step": 403 + }, + { + "epoch": 0.005761450717360353, + "grad_norm": 2.486147880554199, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7146, + "step": 434 + }, + { + "epoch": 0.006172982911457521, + "grad_norm": 2.310777187347412, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7, + "step": 465 + }, + { + "epoch": 0.006584515105554689, + "grad_norm": 3.1831858158111572, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7042, + "step": 496 + }, + { + "epoch": 0.006996047299651857, + "grad_norm": 2.505427360534668, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.6991, + "step": 527 + }, + { + "epoch": 0.007407579493749025, + "grad_norm": 2.3627443313598633, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7045, + "step": 558 + }, + { + "epoch": 0.007819111687846193, + "grad_norm": 2.6250648498535156, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.704, + "step": 589 + }, + { + "epoch": 0.008230643881943361, + "grad_norm": 2.3151464462280273, + "learning_rate": 2.031454783748362e-05, + "loss": 0.6959, + "step": 620 + }, + { + "epoch": 0.00864217607604053, + "grad_norm": 2.0312647819519043, + "learning_rate": 2.13302752293578e-05, + "loss": 0.689, + "step": 651 + }, + { + "epoch": 0.009053708270137698, + "grad_norm": 2.718970775604248, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7031, + "step": 682 + }, + { + "epoch": 0.009465240464234866, + "grad_norm": 1.9124070405960083, + "learning_rate": 2.336173001310616e-05, + "loss": 0.6908, + "step": 713 + }, + { + "epoch": 0.009876772658332034, + "grad_norm": 2.368354082107544, + "learning_rate": 2.437745740498034e-05, + "loss": 0.6947, + "step": 744 + }, + { + "epoch": 0.010288304852429202, + "grad_norm": 1.8856632709503174, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.6876, + "step": 775 + }, + { + "epoch": 0.01069983704652637, + "grad_norm": 2.3155925273895264, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7024, + "step": 806 + }, + { + "epoch": 0.011111369240623538, + "grad_norm": 2.2516837120056152, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.695, + "step": 837 + }, + { + "epoch": 0.011522901434720706, + "grad_norm": 2.4444823265075684, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.6892, + "step": 868 + }, + { + "epoch": 0.011934433628817874, + "grad_norm": 2.4233832359313965, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.6871, + "step": 899 + }, + { + "epoch": 0.012345965822915042, + "grad_norm": 2.0262961387634277, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.6888, + "step": 930 + }, + { + "epoch": 0.01275749801701221, + "grad_norm": 2.3908143043518066, + "learning_rate": 3.148754914809961e-05, + "loss": 0.6997, + "step": 961 + }, + { + "epoch": 0.013169030211109378, + "grad_norm": 1.839417576789856, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.6826, + "step": 992 + }, + { + "epoch": 0.013580562405206546, + "grad_norm": 1.8175997734069824, + "learning_rate": 3.351900393184797e-05, + "loss": 0.6857, + "step": 1023 + }, + { + "epoch": 0.013992094599303714, + "grad_norm": 2.3080506324768066, + "learning_rate": 3.453473132372215e-05, + "loss": 0.6896, + "step": 1054 + }, + { + "epoch": 0.014403626793400882, + "grad_norm": 2.0574960708618164, + "learning_rate": 3.555045871559633e-05, + "loss": 0.6862, + "step": 1085 + }, + { + "epoch": 0.01481515898749805, + "grad_norm": 1.957221269607544, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.6796, + "step": 1116 + }, + { + "epoch": 0.015226691181595219, + "grad_norm": 2.2740609645843506, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.7082, + "step": 1147 + }, + { + "epoch": 0.015638223375692387, + "grad_norm": 2.1291615962982178, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.6852, + "step": 1178 + }, + { + "epoch": 0.016049755569789555, + "grad_norm": 2.2918620109558105, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.6821, + "step": 1209 + }, + { + "epoch": 0.016461287763886723, + "grad_norm": 1.9689708948135376, + "learning_rate": 4.062909567496724e-05, + "loss": 0.676, + "step": 1240 + }, + { + "epoch": 0.01687281995798389, + "grad_norm": 2.4405298233032227, + "learning_rate": 4.164482306684142e-05, + "loss": 0.6927, + "step": 1271 + }, + { + "epoch": 0.01728435215208106, + "grad_norm": 2.0007450580596924, + "learning_rate": 4.26605504587156e-05, + "loss": 0.6864, + "step": 1302 + }, + { + "epoch": 0.017695884346178227, + "grad_norm": 1.7013226747512817, + "learning_rate": 4.367627785058978e-05, + "loss": 0.6827, + "step": 1333 + }, + { + "epoch": 0.018107416540275395, + "grad_norm": 2.2202930450439453, + "learning_rate": 4.469200524246396e-05, + "loss": 0.6722, + "step": 1364 + }, + { + "epoch": 0.018518948734372563, + "grad_norm": 1.690030813217163, + "learning_rate": 4.570773263433814e-05, + "loss": 0.6807, + "step": 1395 + }, + { + "epoch": 0.01893048092846973, + "grad_norm": 1.5886859893798828, + "learning_rate": 4.672346002621232e-05, + "loss": 0.6867, + "step": 1426 + }, + { + "epoch": 0.0193420131225669, + "grad_norm": 1.7371172904968262, + "learning_rate": 4.77391874180865e-05, + "loss": 0.6832, + "step": 1457 + }, + { + "epoch": 0.019753545316664067, + "grad_norm": 1.8376264572143555, + "learning_rate": 4.875491480996068e-05, + "loss": 0.681, + "step": 1488 + }, + { + "epoch": 0.020165077510761235, + "grad_norm": 1.6280311346054077, + "learning_rate": 4.977064220183487e-05, + "loss": 0.6795, + "step": 1519 + }, + { + "epoch": 0.020576609704858403, + "grad_norm": 1.8016823530197144, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.6789, + "step": 1550 + }, + { + "epoch": 0.02098814189895557, + "grad_norm": 1.7355237007141113, + "learning_rate": 4.999955597496219e-05, + "loss": 0.6753, + "step": 1581 + }, + { + "epoch": 0.02139967409305274, + "grad_norm": 1.5687408447265625, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.6777, + "step": 1612 + }, + { + "epoch": 0.021811206287149908, + "grad_norm": 1.790453314781189, + "learning_rate": 4.999799067923527e-05, + "loss": 0.684, + "step": 1643 + }, + { + "epoch": 0.022222738481247076, + "grad_norm": 1.9073406457901, + "learning_rate": 4.999678487776908e-05, + "loss": 0.6793, + "step": 1674 + }, + { + "epoch": 0.022634270675344244, + "grad_norm": 1.5575100183486938, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.6794, + "step": 1705 + }, + { + "epoch": 0.023045802869441412, + "grad_norm": 2.100715160369873, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6753, + "step": 1736 + }, + { + "epoch": 0.02345733506353858, + "grad_norm": 1.9425480365753174, + "learning_rate": 4.999147503179668e-05, + "loss": 0.6755, + "step": 1767 + }, + { + "epoch": 0.023868867257635748, + "grad_norm": 1.5791170597076416, + "learning_rate": 4.998914100252672e-05, + "loss": 0.6754, + "step": 1798 + }, + { + "epoch": 0.024280399451732916, + "grad_norm": 1.5692673921585083, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6682, + "step": 1829 + }, + { + "epoch": 0.024691931645830084, + "grad_norm": 1.5757725238800049, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.6723, + "step": 1860 + }, + { + "epoch": 0.025103463839927252, + "grad_norm": 1.8000696897506714, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6647, + "step": 1891 + }, + { + "epoch": 0.02551499603402442, + "grad_norm": 1.682919979095459, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.669, + "step": 1922 + }, + { + "epoch": 0.02592652822812159, + "grad_norm": 1.6138657331466675, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6698, + "step": 1953 + }, + { + "epoch": 0.026338060422218756, + "grad_norm": 1.5016391277313232, + "learning_rate": 4.99692159912661e-05, + "loss": 0.6645, + "step": 1984 + }, + { + "epoch": 0.026749592616315924, + "grad_norm": 1.6850241422653198, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6769, + "step": 2015 + }, + { + "epoch": 0.027161124810413093, + "grad_norm": 1.521161437034607, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6599, + "step": 2046 + }, + { + "epoch": 0.02757265700451026, + "grad_norm": 1.8039467334747314, + "learning_rate": 4.995544899210594e-05, + "loss": 0.6693, + "step": 2077 + }, + { + "epoch": 0.02798418919860743, + "grad_norm": 1.688811182975769, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.661, + "step": 2108 + }, + { + "epoch": 0.028395721392704597, + "grad_norm": 1.669871211051941, + "learning_rate": 4.994486281210429e-05, + "loss": 0.6611, + "step": 2139 + }, + { + "epoch": 0.028807253586801765, + "grad_norm": 1.3887263536453247, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6677, + "step": 2170 + }, + { + "epoch": 0.029218785780898933, + "grad_norm": 1.3242331743240356, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6537, + "step": 2201 + }, + { + "epoch": 0.0296303179749961, + "grad_norm": 1.6638622283935547, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6653, + "step": 2232 + }, + { + "epoch": 0.03004185016909327, + "grad_norm": 1.5110372304916382, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6615, + "step": 2263 + }, + { + "epoch": 0.030453382363190437, + "grad_norm": 1.5244065523147583, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.6552, + "step": 2294 + }, + { + "epoch": 0.030864914557287605, + "grad_norm": 1.416624903678894, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6569, + "step": 2325 + }, + { + "epoch": 0.03127644675138477, + "grad_norm": 1.4467304944992065, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6613, + "step": 2356 + }, + { + "epoch": 0.03168797894548194, + "grad_norm": 1.4649864435195923, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6604, + "step": 2387 + }, + { + "epoch": 0.03209951113957911, + "grad_norm": 1.6256624460220337, + "learning_rate": 4.988329891293693e-05, + "loss": 0.655, + "step": 2418 + }, + { + "epoch": 0.03251104333367628, + "grad_norm": 1.3986462354660034, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6514, + "step": 2449 + }, + { + "epoch": 0.032922575527773446, + "grad_norm": 1.4490609169006348, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.65, + "step": 2480 + }, + { + "epoch": 0.033334107721870614, + "grad_norm": 1.4979615211486816, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.6579, + "step": 2511 + }, + { + "epoch": 0.03374563991596778, + "grad_norm": 1.6314005851745605, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6604, + "step": 2542 + }, + { + "epoch": 0.03415717211006495, + "grad_norm": 1.310655951499939, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6518, + "step": 2573 + }, + { + "epoch": 0.03456870430416212, + "grad_norm": 1.4176305532455444, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6518, + "step": 2604 + }, + { + "epoch": 0.034980236498259286, + "grad_norm": 1.6330211162567139, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6474, + "step": 2635 + }, + { + "epoch": 0.035391768692356454, + "grad_norm": 1.5854465961456299, + "learning_rate": 4.980947995086024e-05, + "loss": 0.6462, + "step": 2666 + }, + { + "epoch": 0.03580330088645362, + "grad_norm": 1.2944835424423218, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6503, + "step": 2697 + }, + { + "epoch": 0.03621483308055079, + "grad_norm": 1.4056243896484375, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6414, + "step": 2728 + }, + { + "epoch": 0.03662636527464796, + "grad_norm": 1.5649598836898804, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6449, + "step": 2759 + }, + { + "epoch": 0.037037897468745126, + "grad_norm": 1.5205659866333008, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6334, + "step": 2790 + }, + { + "epoch": 0.037449429662842294, + "grad_norm": 1.3248411417007446, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6448, + "step": 2821 + }, + { + "epoch": 0.03786096185693946, + "grad_norm": 1.2402842044830322, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6476, + "step": 2852 + }, + { + "epoch": 0.03827249405103663, + "grad_norm": 1.7939503192901611, + "learning_rate": 4.973018858007122e-05, + "loss": 0.6472, + "step": 2883 + }, + { + "epoch": 0.0386840262451338, + "grad_norm": 1.2774665355682373, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6374, + "step": 2914 + }, + { + "epoch": 0.03909555843923097, + "grad_norm": 1.6384273767471313, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6623, + "step": 2945 + }, + { + "epoch": 0.039507090633328135, + "grad_norm": 1.559270977973938, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6391, + "step": 2976 + }, + { + "epoch": 0.0399186228274253, + "grad_norm": 1.6405760049819946, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.6437, + "step": 3007 + }, + { + "epoch": 0.04033015502152247, + "grad_norm": 1.3599917888641357, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6436, + "step": 3038 + }, + { + "epoch": 0.04074168721561964, + "grad_norm": 1.3080962896347046, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6331, + "step": 3069 + }, + { + "epoch": 0.04115321940971681, + "grad_norm": 1.435195803642273, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6417, + "step": 3100 + }, + { + "epoch": 0.041564751603813975, + "grad_norm": 1.342986822128296, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6391, + "step": 3131 + }, + { + "epoch": 0.04197628379791114, + "grad_norm": 1.524418592453003, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6511, + "step": 3162 + }, + { + "epoch": 0.04238781599200831, + "grad_norm": 1.4617900848388672, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6367, + "step": 3193 + }, + { + "epoch": 0.04279934818610548, + "grad_norm": 1.3968737125396729, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.6387, + "step": 3224 + }, + { + "epoch": 0.04321088038020265, + "grad_norm": 1.258406400680542, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6356, + "step": 3255 + }, + { + "epoch": 0.043622412574299815, + "grad_norm": 1.5494844913482666, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6433, + "step": 3286 + }, + { + "epoch": 0.04403394476839698, + "grad_norm": 1.2248284816741943, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6376, + "step": 3317 + }, + { + "epoch": 0.04444547696249415, + "grad_norm": 1.4243425130844116, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6341, + "step": 3348 + }, + { + "epoch": 0.04485700915659132, + "grad_norm": 1.379364252090454, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6378, + "step": 3379 + }, + { + "epoch": 0.04526854135068849, + "grad_norm": 1.3460108041763306, + "learning_rate": 4.948079823064559e-05, + "loss": 0.639, + "step": 3410 + }, + { + "epoch": 0.045680073544785656, + "grad_norm": 1.382348656654358, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6327, + "step": 3441 + }, + { + "epoch": 0.046091605738882824, + "grad_norm": 1.40754234790802, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6234, + "step": 3472 + }, + { + "epoch": 0.04650313793297999, + "grad_norm": 1.502922773361206, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6404, + "step": 3503 + }, + { + "epoch": 0.04691467012707716, + "grad_norm": 1.2896559238433838, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6378, + "step": 3534 + }, + { + "epoch": 0.04732620232117433, + "grad_norm": 1.3114830255508423, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6373, + "step": 3565 + }, + { + "epoch": 0.047737734515271496, + "grad_norm": 1.4122483730316162, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6305, + "step": 3596 + }, + { + "epoch": 0.048149266709368664, + "grad_norm": 1.2669744491577148, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6413, + "step": 3627 + }, + { + "epoch": 0.04856079890346583, + "grad_norm": 1.2965583801269531, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6364, + "step": 3658 + }, + { + "epoch": 0.048972331097563, + "grad_norm": 1.3328967094421387, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6466, + "step": 3689 + }, + { + "epoch": 0.04938386329166017, + "grad_norm": 1.6291115283966064, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6348, + "step": 3720 + }, + { + "epoch": 0.049795395485757336, + "grad_norm": 1.3224235773086548, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6256, + "step": 3751 + }, + { + "epoch": 0.050206927679854504, + "grad_norm": 1.3253369331359863, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6303, + "step": 3782 + }, + { + "epoch": 0.05061845987395167, + "grad_norm": 1.2806897163391113, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.6403, + "step": 3813 + }, + { + "epoch": 0.05102999206804884, + "grad_norm": 1.3389384746551514, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6288, + "step": 3844 + }, + { + "epoch": 0.05144152426214601, + "grad_norm": 1.251550555229187, + "learning_rate": 4.919443027766935e-05, + "loss": 0.637, + "step": 3875 + }, + { + "epoch": 0.05185305645624318, + "grad_norm": 1.258484959602356, + "learning_rate": 4.91731463569418e-05, + "loss": 0.629, + "step": 3906 + }, + { + "epoch": 0.052264588650340345, + "grad_norm": 1.3574656248092651, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6262, + "step": 3937 + }, + { + "epoch": 0.05267612084443751, + "grad_norm": 1.327051043510437, + "learning_rate": 4.912976038673786e-05, + "loss": 0.629, + "step": 3968 + }, + { + "epoch": 0.05308765303853468, + "grad_norm": 1.3813196420669556, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6308, + "step": 3999 + }, + { + "epoch": 0.05349918523263185, + "grad_norm": 1.2064168453216553, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6321, + "step": 4030 + }, + { + "epoch": 0.05391071742672902, + "grad_norm": 1.2771285772323608, + "learning_rate": 4.906263980464644e-05, + "loss": 0.622, + "step": 4061 + }, + { + "epoch": 0.054322249620826185, + "grad_norm": 1.1788181066513062, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6268, + "step": 4092 + }, + { + "epoch": 0.05473378181492335, + "grad_norm": 1.3975950479507446, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6251, + "step": 4123 + }, + { + "epoch": 0.05514531400902052, + "grad_norm": 1.2083688974380493, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6287, + "step": 4154 + }, + { + "epoch": 0.05555684620311769, + "grad_norm": 1.2795994281768799, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6302, + "step": 4185 + }, + { + "epoch": 0.05596837839721486, + "grad_norm": 1.1056169271469116, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6256, + "step": 4216 + }, + { + "epoch": 0.056379910591312025, + "grad_norm": 1.415405511856079, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6334, + "step": 4247 + }, + { + "epoch": 0.056791442785409194, + "grad_norm": 1.2858442068099976, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6248, + "step": 4278 + }, + { + "epoch": 0.05720297497950636, + "grad_norm": 1.3914793729782104, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6229, + "step": 4309 + }, + { + "epoch": 0.05761450717360353, + "grad_norm": 1.1236270666122437, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6272, + "step": 4340 + }, + { + "epoch": 0.0580260393677007, + "grad_norm": 1.393020510673523, + "learning_rate": 4.882129447892753e-05, + "loss": 0.624, + "step": 4371 + }, + { + "epoch": 0.058437571561797866, + "grad_norm": 1.3223553895950317, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6267, + "step": 4402 + }, + { + "epoch": 0.058849103755895034, + "grad_norm": 1.2308725118637085, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6255, + "step": 4433 + }, + { + "epoch": 0.0592606359499922, + "grad_norm": 1.3741412162780762, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6246, + "step": 4464 + }, + { + "epoch": 0.05967216814408937, + "grad_norm": 1.2487592697143555, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6221, + "step": 4495 + }, + { + "epoch": 0.06008370033818654, + "grad_norm": 1.3239238262176514, + "learning_rate": 4.869052379269719e-05, + "loss": 0.621, + "step": 4526 + }, + { + "epoch": 0.060495232532283706, + "grad_norm": 1.1411608457565308, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6211, + "step": 4557 + }, + { + "epoch": 0.060906764726380874, + "grad_norm": 1.1687365770339966, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6159, + "step": 4588 + }, + { + "epoch": 0.06131829692047804, + "grad_norm": 1.25679349899292, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6254, + "step": 4619 + }, + { + "epoch": 0.06172982911457521, + "grad_norm": 1.2380110025405884, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6305, + "step": 4650 + }, + { + "epoch": 0.06214136130867238, + "grad_norm": 1.3054466247558594, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6202, + "step": 4681 + }, + { + "epoch": 0.06255289350276955, + "grad_norm": 1.3628545999526978, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6192, + "step": 4712 + }, + { + "epoch": 0.06296442569686671, + "grad_norm": 1.3576844930648804, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6213, + "step": 4743 + }, + { + "epoch": 0.06337595789096388, + "grad_norm": 1.3126753568649292, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6198, + "step": 4774 + }, + { + "epoch": 0.06378749008506104, + "grad_norm": 1.2266637086868286, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6229, + "step": 4805 + }, + { + "epoch": 0.06419902227915822, + "grad_norm": 1.4964330196380615, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6264, + "step": 4836 + }, + { + "epoch": 0.06461055447325538, + "grad_norm": 1.1138426065444946, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6258, + "step": 4867 + }, + { + "epoch": 0.06502208666735255, + "grad_norm": 1.2055357694625854, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6263, + "step": 4898 + }, + { + "epoch": 0.06543361886144972, + "grad_norm": 1.2684381008148193, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6212, + "step": 4929 + }, + { + "epoch": 0.06584515105554689, + "grad_norm": 1.2323859930038452, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6253, + "step": 4960 + }, + { + "epoch": 0.06625668324964405, + "grad_norm": 1.091818928718567, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6127, + "step": 4991 + }, + { + "epoch": 0.06666821544374123, + "grad_norm": 1.2208534479141235, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6179, + "step": 5022 + }, + { + "epoch": 0.06707974763783839, + "grad_norm": 1.3426082134246826, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6246, + "step": 5053 + }, + { + "epoch": 0.06749127983193556, + "grad_norm": 1.2961252927780151, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.6149, + "step": 5084 + }, + { + "epoch": 0.06790281202603272, + "grad_norm": 1.2990977764129639, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6303, + "step": 5115 + }, + { + "epoch": 0.0683143442201299, + "grad_norm": 1.2724366188049316, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6234, + "step": 5146 + }, + { + "epoch": 0.06872587641422706, + "grad_norm": 1.2031605243682861, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6212, + "step": 5177 + }, + { + "epoch": 0.06913740860832424, + "grad_norm": 1.2580938339233398, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6144, + "step": 5208 + }, + { + "epoch": 0.0695489408024214, + "grad_norm": 1.203706979751587, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6259, + "step": 5239 + }, + { + "epoch": 0.06996047299651857, + "grad_norm": 1.421762466430664, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6221, + "step": 5270 + }, + { + "epoch": 0.07037200519061573, + "grad_norm": 1.326038122177124, + "learning_rate": 4.793722210363262e-05, + "loss": 0.6152, + "step": 5301 + }, + { + "epoch": 0.07078353738471291, + "grad_norm": 1.361939787864685, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6206, + "step": 5332 + }, + { + "epoch": 0.07119506957881007, + "grad_norm": 1.2153042554855347, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6209, + "step": 5363 + }, + { + "epoch": 0.07160660177290724, + "grad_norm": 1.3737229108810425, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6121, + "step": 5394 + }, + { + "epoch": 0.0720181339670044, + "grad_norm": 1.1781859397888184, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6183, + "step": 5425 + }, + { + "epoch": 0.07242966616110158, + "grad_norm": 1.069472312927246, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.6117, + "step": 5456 + }, + { + "epoch": 0.07284119835519874, + "grad_norm": 1.380937099456787, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6175, + "step": 5487 + }, + { + "epoch": 0.07325273054929592, + "grad_norm": 1.1551166772842407, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6164, + "step": 5518 + }, + { + "epoch": 0.07366426274339308, + "grad_norm": 1.2816351652145386, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6168, + "step": 5549 + }, + { + "epoch": 0.07407579493749025, + "grad_norm": 1.0833333730697632, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6159, + "step": 5580 + }, + { + "epoch": 0.07448732713158741, + "grad_norm": 1.2533819675445557, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6141, + "step": 5611 + }, + { + "epoch": 0.07489885932568459, + "grad_norm": 1.1354937553405762, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.612, + "step": 5642 + }, + { + "epoch": 0.07531039151978175, + "grad_norm": 1.2579649686813354, + "learning_rate": 4.751783684659e-05, + "loss": 0.6147, + "step": 5673 + }, + { + "epoch": 0.07572192371387892, + "grad_norm": 1.2830709218978882, + "learning_rate": 4.748122674890348e-05, + "loss": 0.62, + "step": 5704 + }, + { + "epoch": 0.07613345590797609, + "grad_norm": 1.2012654542922974, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6199, + "step": 5735 + }, + { + "epoch": 0.07654498810207326, + "grad_norm": 1.2926653623580933, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6104, + "step": 5766 + }, + { + "epoch": 0.07695652029617042, + "grad_norm": 1.2043049335479736, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6261, + "step": 5797 + }, + { + "epoch": 0.0773680524902676, + "grad_norm": 1.1188119649887085, + "learning_rate": 4.733225355658999e-05, + "loss": 0.6056, + "step": 5828 + }, + { + "epoch": 0.07777958468436476, + "grad_norm": 1.1217443943023682, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.6124, + "step": 5859 + }, + { + "epoch": 0.07819111687846193, + "grad_norm": 1.1623468399047852, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6191, + "step": 5890 + }, + { + "epoch": 0.0786026490725591, + "grad_norm": 1.1147449016571045, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6091, + "step": 5921 + }, + { + "epoch": 0.07901418126665627, + "grad_norm": 1.0523818731307983, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6151, + "step": 5952 + }, + { + "epoch": 0.07942571346075343, + "grad_norm": 1.3112642765045166, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6106, + "step": 5983 + }, + { + "epoch": 0.0798372456548506, + "grad_norm": 1.1637545824050903, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6131, + "step": 6014 + }, + { + "epoch": 0.08024877784894777, + "grad_norm": 1.258862018585205, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6119, + "step": 6045 + }, + { + "epoch": 0.08066031004304494, + "grad_norm": 1.0981336832046509, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6149, + "step": 6076 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.503312157693182e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6104/training_args.bin b/checkpoint-6104/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f3d87fcba007ec2448c21323b9fdb9fa6105de5f --- /dev/null +++ b/checkpoint-6104/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf5225534ee746c36dba9f28281e0b858e6e9b4a308bdaf1de4f68e79e9ee83 +size 5304 diff --git a/checkpoint-9156/config.json b/checkpoint-9156/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-9156/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-9156/generation_config.json b/checkpoint-9156/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-9156/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-9156/model-00001-of-00007.safetensors b/checkpoint-9156/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fcdbd97bde91299ab66af5d53b8d4b3eacb6d0a3 --- /dev/null +++ b/checkpoint-9156/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b41b53552220958b5f298f0352e541b94c5af49aff142c395c39451ccf5c10ad +size 4886466168 diff --git a/checkpoint-9156/model-00002-of-00007.safetensors b/checkpoint-9156/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-9156/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-9156/model-00003-of-00007.safetensors b/checkpoint-9156/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-9156/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-9156/model-00004-of-00007.safetensors b/checkpoint-9156/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-9156/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-9156/model-00005-of-00007.safetensors b/checkpoint-9156/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-9156/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-9156/model-00006-of-00007.safetensors b/checkpoint-9156/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8f186a79d829fda6e0a60ddc14a0aa6212b64c9f --- /dev/null +++ b/checkpoint-9156/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2f202441b060142b1bbfa0dd9538a005a0d88dde2be863532cfdbc6b02c1cda +size 4999813120 diff --git a/checkpoint-9156/model-00007-of-00007.safetensors b/checkpoint-9156/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..044313ca50eeaa4697a4326a506efa2dd3a5275c --- /dev/null +++ b/checkpoint-9156/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca012a42c328043c0cf88515577f316de1ac520cb665e22b6b852de17ef75dd3 +size 2571158184 diff --git a/checkpoint-9156/model.safetensors.index.json b/checkpoint-9156/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-9156/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-9156/optimizer.pt b/checkpoint-9156/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..09780be455e3eb2eb588a307906a5584ed327da5 --- /dev/null +++ b/checkpoint-9156/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:debc3bbd5337f6e7509ef3e9f9076edf1d7fc79144160d5068c8bd952afd41e7 +size 15385036334 diff --git a/checkpoint-9156/rng_state.pth b/checkpoint-9156/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-9156/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-9156/scheduler.pt b/checkpoint-9156/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5456a295e7e9e24785bebf5e96ccb62dbbac4f62 --- /dev/null +++ b/checkpoint-9156/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c3f410c61b11096714461ebc2a4aa1b4573d0d0c3eb997bda14fafb34cdc922 +size 1064 diff --git a/checkpoint-9156/trainer_state.json b/checkpoint-9156/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8868000fe980d8389710b8bd0fa1ee53eeb28561 --- /dev/null +++ b/checkpoint-9156/trainer_state.json @@ -0,0 +1,2098 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.12154802481140874, + "eval_steps": 500, + "global_step": 9156, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00041153219409716807, + "grad_norm": 4.147768020629883, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.7978, + "step": 31 + }, + { + "epoch": 0.0008230643881943361, + "grad_norm": 3.117840051651001, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.7723, + "step": 62 + }, + { + "epoch": 0.0012345965822915042, + "grad_norm": 2.83072829246521, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7493, + "step": 93 + }, + { + "epoch": 0.0016461287763886723, + "grad_norm": 2.9491968154907227, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7352, + "step": 124 + }, + { + "epoch": 0.0020576609704858403, + "grad_norm": 2.798553228378296, + "learning_rate": 5.078636959370905e-06, + "loss": 0.7229, + "step": 155 + }, + { + "epoch": 0.0024691931645830084, + "grad_norm": 2.663148880004883, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7253, + "step": 186 + }, + { + "epoch": 0.0028807253586801765, + "grad_norm": 2.564692974090576, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7287, + "step": 217 + }, + { + "epoch": 0.0032922575527773446, + "grad_norm": 3.2043237686157227, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7205, + "step": 248 + }, + { + "epoch": 0.0037037897468745126, + "grad_norm": 2.6705269813537598, + "learning_rate": 9.141546526867629e-06, + "loss": 0.714, + "step": 279 + }, + { + "epoch": 0.004115321940971681, + "grad_norm": 3.432569980621338, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7217, + "step": 310 + }, + { + "epoch": 0.004526854135068849, + "grad_norm": 2.515808343887329, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7173, + "step": 341 + }, + { + "epoch": 0.004938386329166017, + "grad_norm": 2.6708128452301025, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7123, + "step": 372 + }, + { + "epoch": 0.005349918523263185, + "grad_norm": 2.472797155380249, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.7079, + "step": 403 + }, + { + "epoch": 0.005761450717360353, + "grad_norm": 2.486147880554199, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7146, + "step": 434 + }, + { + "epoch": 0.006172982911457521, + "grad_norm": 2.310777187347412, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7, + "step": 465 + }, + { + "epoch": 0.006584515105554689, + "grad_norm": 3.1831858158111572, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7042, + "step": 496 + }, + { + "epoch": 0.006996047299651857, + "grad_norm": 2.505427360534668, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.6991, + "step": 527 + }, + { + "epoch": 0.007407579493749025, + "grad_norm": 2.3627443313598633, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7045, + "step": 558 + }, + { + "epoch": 0.007819111687846193, + "grad_norm": 2.6250648498535156, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.704, + "step": 589 + }, + { + "epoch": 0.008230643881943361, + "grad_norm": 2.3151464462280273, + "learning_rate": 2.031454783748362e-05, + "loss": 0.6959, + "step": 620 + }, + { + "epoch": 0.00864217607604053, + "grad_norm": 2.0312647819519043, + "learning_rate": 2.13302752293578e-05, + "loss": 0.689, + "step": 651 + }, + { + "epoch": 0.009053708270137698, + "grad_norm": 2.718970775604248, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7031, + "step": 682 + }, + { + "epoch": 0.009465240464234866, + "grad_norm": 1.9124070405960083, + "learning_rate": 2.336173001310616e-05, + "loss": 0.6908, + "step": 713 + }, + { + "epoch": 0.009876772658332034, + "grad_norm": 2.368354082107544, + "learning_rate": 2.437745740498034e-05, + "loss": 0.6947, + "step": 744 + }, + { + "epoch": 0.010288304852429202, + "grad_norm": 1.8856632709503174, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.6876, + "step": 775 + }, + { + "epoch": 0.01069983704652637, + "grad_norm": 2.3155925273895264, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7024, + "step": 806 + }, + { + "epoch": 0.011111369240623538, + "grad_norm": 2.2516837120056152, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.695, + "step": 837 + }, + { + "epoch": 0.011522901434720706, + "grad_norm": 2.4444823265075684, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.6892, + "step": 868 + }, + { + "epoch": 0.011934433628817874, + "grad_norm": 2.4233832359313965, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.6871, + "step": 899 + }, + { + "epoch": 0.012345965822915042, + "grad_norm": 2.0262961387634277, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.6888, + "step": 930 + }, + { + "epoch": 0.01275749801701221, + "grad_norm": 2.3908143043518066, + "learning_rate": 3.148754914809961e-05, + "loss": 0.6997, + "step": 961 + }, + { + "epoch": 0.013169030211109378, + "grad_norm": 1.839417576789856, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.6826, + "step": 992 + }, + { + "epoch": 0.013580562405206546, + "grad_norm": 1.8175997734069824, + "learning_rate": 3.351900393184797e-05, + "loss": 0.6857, + "step": 1023 + }, + { + "epoch": 0.013992094599303714, + "grad_norm": 2.3080506324768066, + "learning_rate": 3.453473132372215e-05, + "loss": 0.6896, + "step": 1054 + }, + { + "epoch": 0.014403626793400882, + "grad_norm": 2.0574960708618164, + "learning_rate": 3.555045871559633e-05, + "loss": 0.6862, + "step": 1085 + }, + { + "epoch": 0.01481515898749805, + "grad_norm": 1.957221269607544, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.6796, + "step": 1116 + }, + { + "epoch": 0.015226691181595219, + "grad_norm": 2.2740609645843506, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.7082, + "step": 1147 + }, + { + "epoch": 0.015638223375692387, + "grad_norm": 2.1291615962982178, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.6852, + "step": 1178 + }, + { + "epoch": 0.016049755569789555, + "grad_norm": 2.2918620109558105, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.6821, + "step": 1209 + }, + { + "epoch": 0.016461287763886723, + "grad_norm": 1.9689708948135376, + "learning_rate": 4.062909567496724e-05, + "loss": 0.676, + "step": 1240 + }, + { + "epoch": 0.01687281995798389, + "grad_norm": 2.4405298233032227, + "learning_rate": 4.164482306684142e-05, + "loss": 0.6927, + "step": 1271 + }, + { + "epoch": 0.01728435215208106, + "grad_norm": 2.0007450580596924, + "learning_rate": 4.26605504587156e-05, + "loss": 0.6864, + "step": 1302 + }, + { + "epoch": 0.017695884346178227, + "grad_norm": 1.7013226747512817, + "learning_rate": 4.367627785058978e-05, + "loss": 0.6827, + "step": 1333 + }, + { + "epoch": 0.018107416540275395, + "grad_norm": 2.2202930450439453, + "learning_rate": 4.469200524246396e-05, + "loss": 0.6722, + "step": 1364 + }, + { + "epoch": 0.018518948734372563, + "grad_norm": 1.690030813217163, + "learning_rate": 4.570773263433814e-05, + "loss": 0.6807, + "step": 1395 + }, + { + "epoch": 0.01893048092846973, + "grad_norm": 1.5886859893798828, + "learning_rate": 4.672346002621232e-05, + "loss": 0.6867, + "step": 1426 + }, + { + "epoch": 0.0193420131225669, + "grad_norm": 1.7371172904968262, + "learning_rate": 4.77391874180865e-05, + "loss": 0.6832, + "step": 1457 + }, + { + "epoch": 0.019753545316664067, + "grad_norm": 1.8376264572143555, + "learning_rate": 4.875491480996068e-05, + "loss": 0.681, + "step": 1488 + }, + { + "epoch": 0.020165077510761235, + "grad_norm": 1.6280311346054077, + "learning_rate": 4.977064220183487e-05, + "loss": 0.6795, + "step": 1519 + }, + { + "epoch": 0.020576609704858403, + "grad_norm": 1.8016823530197144, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.6789, + "step": 1550 + }, + { + "epoch": 0.02098814189895557, + "grad_norm": 1.7355237007141113, + "learning_rate": 4.999955597496219e-05, + "loss": 0.6753, + "step": 1581 + }, + { + "epoch": 0.02139967409305274, + "grad_norm": 1.5687408447265625, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.6777, + "step": 1612 + }, + { + "epoch": 0.021811206287149908, + "grad_norm": 1.790453314781189, + "learning_rate": 4.999799067923527e-05, + "loss": 0.684, + "step": 1643 + }, + { + "epoch": 0.022222738481247076, + "grad_norm": 1.9073406457901, + "learning_rate": 4.999678487776908e-05, + "loss": 0.6793, + "step": 1674 + }, + { + "epoch": 0.022634270675344244, + "grad_norm": 1.5575100183486938, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.6794, + "step": 1705 + }, + { + "epoch": 0.023045802869441412, + "grad_norm": 2.100715160369873, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6753, + "step": 1736 + }, + { + "epoch": 0.02345733506353858, + "grad_norm": 1.9425480365753174, + "learning_rate": 4.999147503179668e-05, + "loss": 0.6755, + "step": 1767 + }, + { + "epoch": 0.023868867257635748, + "grad_norm": 1.5791170597076416, + "learning_rate": 4.998914100252672e-05, + "loss": 0.6754, + "step": 1798 + }, + { + "epoch": 0.024280399451732916, + "grad_norm": 1.5692673921585083, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6682, + "step": 1829 + }, + { + "epoch": 0.024691931645830084, + "grad_norm": 1.5757725238800049, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.6723, + "step": 1860 + }, + { + "epoch": 0.025103463839927252, + "grad_norm": 1.8000696897506714, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6647, + "step": 1891 + }, + { + "epoch": 0.02551499603402442, + "grad_norm": 1.682919979095459, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.669, + "step": 1922 + }, + { + "epoch": 0.02592652822812159, + "grad_norm": 1.6138657331466675, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6698, + "step": 1953 + }, + { + "epoch": 0.026338060422218756, + "grad_norm": 1.5016391277313232, + "learning_rate": 4.99692159912661e-05, + "loss": 0.6645, + "step": 1984 + }, + { + "epoch": 0.026749592616315924, + "grad_norm": 1.6850241422653198, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6769, + "step": 2015 + }, + { + "epoch": 0.027161124810413093, + "grad_norm": 1.521161437034607, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6599, + "step": 2046 + }, + { + "epoch": 0.02757265700451026, + "grad_norm": 1.8039467334747314, + "learning_rate": 4.995544899210594e-05, + "loss": 0.6693, + "step": 2077 + }, + { + "epoch": 0.02798418919860743, + "grad_norm": 1.688811182975769, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.661, + "step": 2108 + }, + { + "epoch": 0.028395721392704597, + "grad_norm": 1.669871211051941, + "learning_rate": 4.994486281210429e-05, + "loss": 0.6611, + "step": 2139 + }, + { + "epoch": 0.028807253586801765, + "grad_norm": 1.3887263536453247, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6677, + "step": 2170 + }, + { + "epoch": 0.029218785780898933, + "grad_norm": 1.3242331743240356, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6537, + "step": 2201 + }, + { + "epoch": 0.0296303179749961, + "grad_norm": 1.6638622283935547, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6653, + "step": 2232 + }, + { + "epoch": 0.03004185016909327, + "grad_norm": 1.5110372304916382, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6615, + "step": 2263 + }, + { + "epoch": 0.030453382363190437, + "grad_norm": 1.5244065523147583, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.6552, + "step": 2294 + }, + { + "epoch": 0.030864914557287605, + "grad_norm": 1.416624903678894, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6569, + "step": 2325 + }, + { + "epoch": 0.03127644675138477, + "grad_norm": 1.4467304944992065, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6613, + "step": 2356 + }, + { + "epoch": 0.03168797894548194, + "grad_norm": 1.4649864435195923, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6604, + "step": 2387 + }, + { + "epoch": 0.03209951113957911, + "grad_norm": 1.6256624460220337, + "learning_rate": 4.988329891293693e-05, + "loss": 0.655, + "step": 2418 + }, + { + "epoch": 0.03251104333367628, + "grad_norm": 1.3986462354660034, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6514, + "step": 2449 + }, + { + "epoch": 0.032922575527773446, + "grad_norm": 1.4490609169006348, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.65, + "step": 2480 + }, + { + "epoch": 0.033334107721870614, + "grad_norm": 1.4979615211486816, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.6579, + "step": 2511 + }, + { + "epoch": 0.03374563991596778, + "grad_norm": 1.6314005851745605, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6604, + "step": 2542 + }, + { + "epoch": 0.03415717211006495, + "grad_norm": 1.310655951499939, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6518, + "step": 2573 + }, + { + "epoch": 0.03456870430416212, + "grad_norm": 1.4176305532455444, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6518, + "step": 2604 + }, + { + "epoch": 0.034980236498259286, + "grad_norm": 1.6330211162567139, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6474, + "step": 2635 + }, + { + "epoch": 0.035391768692356454, + "grad_norm": 1.5854465961456299, + "learning_rate": 4.980947995086024e-05, + "loss": 0.6462, + "step": 2666 + }, + { + "epoch": 0.03580330088645362, + "grad_norm": 1.2944835424423218, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6503, + "step": 2697 + }, + { + "epoch": 0.03621483308055079, + "grad_norm": 1.4056243896484375, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6414, + "step": 2728 + }, + { + "epoch": 0.03662636527464796, + "grad_norm": 1.5649598836898804, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6449, + "step": 2759 + }, + { + "epoch": 0.037037897468745126, + "grad_norm": 1.5205659866333008, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6334, + "step": 2790 + }, + { + "epoch": 0.037449429662842294, + "grad_norm": 1.3248411417007446, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6448, + "step": 2821 + }, + { + "epoch": 0.03786096185693946, + "grad_norm": 1.2402842044830322, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6476, + "step": 2852 + }, + { + "epoch": 0.03827249405103663, + "grad_norm": 1.7939503192901611, + "learning_rate": 4.973018858007122e-05, + "loss": 0.6472, + "step": 2883 + }, + { + "epoch": 0.0386840262451338, + "grad_norm": 1.2774665355682373, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6374, + "step": 2914 + }, + { + "epoch": 0.03909555843923097, + "grad_norm": 1.6384273767471313, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6623, + "step": 2945 + }, + { + "epoch": 0.039507090633328135, + "grad_norm": 1.559270977973938, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6391, + "step": 2976 + }, + { + "epoch": 0.0399186228274253, + "grad_norm": 1.6405760049819946, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.6437, + "step": 3007 + }, + { + "epoch": 0.04033015502152247, + "grad_norm": 1.3599917888641357, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6436, + "step": 3038 + }, + { + "epoch": 0.04074168721561964, + "grad_norm": 1.3080962896347046, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6331, + "step": 3069 + }, + { + "epoch": 0.04115321940971681, + "grad_norm": 1.435195803642273, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6417, + "step": 3100 + }, + { + "epoch": 0.041564751603813975, + "grad_norm": 1.342986822128296, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6391, + "step": 3131 + }, + { + "epoch": 0.04197628379791114, + "grad_norm": 1.524418592453003, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6511, + "step": 3162 + }, + { + "epoch": 0.04238781599200831, + "grad_norm": 1.4617900848388672, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6367, + "step": 3193 + }, + { + "epoch": 0.04279934818610548, + "grad_norm": 1.3968737125396729, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.6387, + "step": 3224 + }, + { + "epoch": 0.04321088038020265, + "grad_norm": 1.258406400680542, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6356, + "step": 3255 + }, + { + "epoch": 0.043622412574299815, + "grad_norm": 1.5494844913482666, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6433, + "step": 3286 + }, + { + "epoch": 0.04403394476839698, + "grad_norm": 1.2248284816741943, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6376, + "step": 3317 + }, + { + "epoch": 0.04444547696249415, + "grad_norm": 1.4243425130844116, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6341, + "step": 3348 + }, + { + "epoch": 0.04485700915659132, + "grad_norm": 1.379364252090454, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6378, + "step": 3379 + }, + { + "epoch": 0.04526854135068849, + "grad_norm": 1.3460108041763306, + "learning_rate": 4.948079823064559e-05, + "loss": 0.639, + "step": 3410 + }, + { + "epoch": 0.045680073544785656, + "grad_norm": 1.382348656654358, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6327, + "step": 3441 + }, + { + "epoch": 0.046091605738882824, + "grad_norm": 1.40754234790802, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6234, + "step": 3472 + }, + { + "epoch": 0.04650313793297999, + "grad_norm": 1.502922773361206, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6404, + "step": 3503 + }, + { + "epoch": 0.04691467012707716, + "grad_norm": 1.2896559238433838, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6378, + "step": 3534 + }, + { + "epoch": 0.04732620232117433, + "grad_norm": 1.3114830255508423, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6373, + "step": 3565 + }, + { + "epoch": 0.047737734515271496, + "grad_norm": 1.4122483730316162, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6305, + "step": 3596 + }, + { + "epoch": 0.048149266709368664, + "grad_norm": 1.2669744491577148, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6413, + "step": 3627 + }, + { + "epoch": 0.04856079890346583, + "grad_norm": 1.2965583801269531, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6364, + "step": 3658 + }, + { + "epoch": 0.048972331097563, + "grad_norm": 1.3328967094421387, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6466, + "step": 3689 + }, + { + "epoch": 0.04938386329166017, + "grad_norm": 1.6291115283966064, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6348, + "step": 3720 + }, + { + "epoch": 0.049795395485757336, + "grad_norm": 1.3224235773086548, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6256, + "step": 3751 + }, + { + "epoch": 0.050206927679854504, + "grad_norm": 1.3253369331359863, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6303, + "step": 3782 + }, + { + "epoch": 0.05061845987395167, + "grad_norm": 1.2806897163391113, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.6403, + "step": 3813 + }, + { + "epoch": 0.05102999206804884, + "grad_norm": 1.3389384746551514, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6288, + "step": 3844 + }, + { + "epoch": 0.05144152426214601, + "grad_norm": 1.251550555229187, + "learning_rate": 4.919443027766935e-05, + "loss": 0.637, + "step": 3875 + }, + { + "epoch": 0.05185305645624318, + "grad_norm": 1.258484959602356, + "learning_rate": 4.91731463569418e-05, + "loss": 0.629, + "step": 3906 + }, + { + "epoch": 0.052264588650340345, + "grad_norm": 1.3574656248092651, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6262, + "step": 3937 + }, + { + "epoch": 0.05267612084443751, + "grad_norm": 1.327051043510437, + "learning_rate": 4.912976038673786e-05, + "loss": 0.629, + "step": 3968 + }, + { + "epoch": 0.05308765303853468, + "grad_norm": 1.3813196420669556, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6308, + "step": 3999 + }, + { + "epoch": 0.05349918523263185, + "grad_norm": 1.2064168453216553, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6321, + "step": 4030 + }, + { + "epoch": 0.05391071742672902, + "grad_norm": 1.2771285772323608, + "learning_rate": 4.906263980464644e-05, + "loss": 0.622, + "step": 4061 + }, + { + "epoch": 0.054322249620826185, + "grad_norm": 1.1788181066513062, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6268, + "step": 4092 + }, + { + "epoch": 0.05473378181492335, + "grad_norm": 1.3975950479507446, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6251, + "step": 4123 + }, + { + "epoch": 0.05514531400902052, + "grad_norm": 1.2083688974380493, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6287, + "step": 4154 + }, + { + "epoch": 0.05555684620311769, + "grad_norm": 1.2795994281768799, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6302, + "step": 4185 + }, + { + "epoch": 0.05596837839721486, + "grad_norm": 1.1056169271469116, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6256, + "step": 4216 + }, + { + "epoch": 0.056379910591312025, + "grad_norm": 1.415405511856079, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6334, + "step": 4247 + }, + { + "epoch": 0.056791442785409194, + "grad_norm": 1.2858442068099976, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6248, + "step": 4278 + }, + { + "epoch": 0.05720297497950636, + "grad_norm": 1.3914793729782104, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6229, + "step": 4309 + }, + { + "epoch": 0.05761450717360353, + "grad_norm": 1.1236270666122437, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6272, + "step": 4340 + }, + { + "epoch": 0.0580260393677007, + "grad_norm": 1.393020510673523, + "learning_rate": 4.882129447892753e-05, + "loss": 0.624, + "step": 4371 + }, + { + "epoch": 0.058437571561797866, + "grad_norm": 1.3223553895950317, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6267, + "step": 4402 + }, + { + "epoch": 0.058849103755895034, + "grad_norm": 1.2308725118637085, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6255, + "step": 4433 + }, + { + "epoch": 0.0592606359499922, + "grad_norm": 1.3741412162780762, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6246, + "step": 4464 + }, + { + "epoch": 0.05967216814408937, + "grad_norm": 1.2487592697143555, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6221, + "step": 4495 + }, + { + "epoch": 0.06008370033818654, + "grad_norm": 1.3239238262176514, + "learning_rate": 4.869052379269719e-05, + "loss": 0.621, + "step": 4526 + }, + { + "epoch": 0.060495232532283706, + "grad_norm": 1.1411608457565308, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6211, + "step": 4557 + }, + { + "epoch": 0.060906764726380874, + "grad_norm": 1.1687365770339966, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6159, + "step": 4588 + }, + { + "epoch": 0.06131829692047804, + "grad_norm": 1.25679349899292, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6254, + "step": 4619 + }, + { + "epoch": 0.06172982911457521, + "grad_norm": 1.2380110025405884, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6305, + "step": 4650 + }, + { + "epoch": 0.06214136130867238, + "grad_norm": 1.3054466247558594, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6202, + "step": 4681 + }, + { + "epoch": 0.06255289350276955, + "grad_norm": 1.3628545999526978, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6192, + "step": 4712 + }, + { + "epoch": 0.06296442569686671, + "grad_norm": 1.3576844930648804, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6213, + "step": 4743 + }, + { + "epoch": 0.06337595789096388, + "grad_norm": 1.3126753568649292, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6198, + "step": 4774 + }, + { + "epoch": 0.06378749008506104, + "grad_norm": 1.2266637086868286, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6229, + "step": 4805 + }, + { + "epoch": 0.06419902227915822, + "grad_norm": 1.4964330196380615, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6264, + "step": 4836 + }, + { + "epoch": 0.06461055447325538, + "grad_norm": 1.1138426065444946, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6258, + "step": 4867 + }, + { + "epoch": 0.06502208666735255, + "grad_norm": 1.2055357694625854, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6263, + "step": 4898 + }, + { + "epoch": 0.06543361886144972, + "grad_norm": 1.2684381008148193, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6212, + "step": 4929 + }, + { + "epoch": 0.06584515105554689, + "grad_norm": 1.2323859930038452, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6253, + "step": 4960 + }, + { + "epoch": 0.06625668324964405, + "grad_norm": 1.091818928718567, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6127, + "step": 4991 + }, + { + "epoch": 0.06666821544374123, + "grad_norm": 1.2208534479141235, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6179, + "step": 5022 + }, + { + "epoch": 0.06707974763783839, + "grad_norm": 1.3426082134246826, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6246, + "step": 5053 + }, + { + "epoch": 0.06749127983193556, + "grad_norm": 1.2961252927780151, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.6149, + "step": 5084 + }, + { + "epoch": 0.06790281202603272, + "grad_norm": 1.2990977764129639, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6303, + "step": 5115 + }, + { + "epoch": 0.0683143442201299, + "grad_norm": 1.2724366188049316, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6234, + "step": 5146 + }, + { + "epoch": 0.06872587641422706, + "grad_norm": 1.2031605243682861, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6212, + "step": 5177 + }, + { + "epoch": 0.06913740860832424, + "grad_norm": 1.2580938339233398, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6144, + "step": 5208 + }, + { + "epoch": 0.0695489408024214, + "grad_norm": 1.203706979751587, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6259, + "step": 5239 + }, + { + "epoch": 0.06996047299651857, + "grad_norm": 1.421762466430664, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6221, + "step": 5270 + }, + { + "epoch": 0.07037200519061573, + "grad_norm": 1.326038122177124, + "learning_rate": 4.793722210363262e-05, + "loss": 0.6152, + "step": 5301 + }, + { + "epoch": 0.07078353738471291, + "grad_norm": 1.361939787864685, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6206, + "step": 5332 + }, + { + "epoch": 0.07119506957881007, + "grad_norm": 1.2153042554855347, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6209, + "step": 5363 + }, + { + "epoch": 0.07160660177290724, + "grad_norm": 1.3737229108810425, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6121, + "step": 5394 + }, + { + "epoch": 0.0720181339670044, + "grad_norm": 1.1781859397888184, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6183, + "step": 5425 + }, + { + "epoch": 0.07242966616110158, + "grad_norm": 1.069472312927246, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.6117, + "step": 5456 + }, + { + "epoch": 0.07284119835519874, + "grad_norm": 1.380937099456787, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6175, + "step": 5487 + }, + { + "epoch": 0.07325273054929592, + "grad_norm": 1.1551166772842407, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6164, + "step": 5518 + }, + { + "epoch": 0.07366426274339308, + "grad_norm": 1.2816351652145386, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6168, + "step": 5549 + }, + { + "epoch": 0.07407579493749025, + "grad_norm": 1.0833333730697632, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6159, + "step": 5580 + }, + { + "epoch": 0.07448732713158741, + "grad_norm": 1.2533819675445557, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6141, + "step": 5611 + }, + { + "epoch": 0.07489885932568459, + "grad_norm": 1.1354937553405762, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.612, + "step": 5642 + }, + { + "epoch": 0.07531039151978175, + "grad_norm": 1.2579649686813354, + "learning_rate": 4.751783684659e-05, + "loss": 0.6147, + "step": 5673 + }, + { + "epoch": 0.07572192371387892, + "grad_norm": 1.2830709218978882, + "learning_rate": 4.748122674890348e-05, + "loss": 0.62, + "step": 5704 + }, + { + "epoch": 0.07613345590797609, + "grad_norm": 1.2012654542922974, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6199, + "step": 5735 + }, + { + "epoch": 0.07654498810207326, + "grad_norm": 1.2926653623580933, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6104, + "step": 5766 + }, + { + "epoch": 0.07695652029617042, + "grad_norm": 1.2043049335479736, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6261, + "step": 5797 + }, + { + "epoch": 0.0773680524902676, + "grad_norm": 1.1188119649887085, + "learning_rate": 4.733225355658999e-05, + "loss": 0.6056, + "step": 5828 + }, + { + "epoch": 0.07777958468436476, + "grad_norm": 1.1217443943023682, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.6124, + "step": 5859 + }, + { + "epoch": 0.07819111687846193, + "grad_norm": 1.1623468399047852, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6191, + "step": 5890 + }, + { + "epoch": 0.0786026490725591, + "grad_norm": 1.1147449016571045, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6091, + "step": 5921 + }, + { + "epoch": 0.07901418126665627, + "grad_norm": 1.0523818731307983, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6151, + "step": 5952 + }, + { + "epoch": 0.07942571346075343, + "grad_norm": 1.3112642765045166, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6106, + "step": 5983 + }, + { + "epoch": 0.0798372456548506, + "grad_norm": 1.1637545824050903, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6131, + "step": 6014 + }, + { + "epoch": 0.08024877784894777, + "grad_norm": 1.258862018585205, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6119, + "step": 6045 + }, + { + "epoch": 0.08066031004304494, + "grad_norm": 1.0981336832046509, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6149, + "step": 6076 + }, + { + "epoch": 0.0810718422371421, + "grad_norm": 1.1576476097106934, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6076, + "step": 6107 + }, + { + "epoch": 0.08148337443123928, + "grad_norm": 1.1076856851577759, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6105, + "step": 6138 + }, + { + "epoch": 0.08189490662533644, + "grad_norm": 14.014957427978516, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6189, + "step": 6169 + }, + { + "epoch": 0.08230643881943361, + "grad_norm": 1.1413066387176514, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6038, + "step": 6200 + }, + { + "epoch": 0.08271797101353077, + "grad_norm": 1.2316724061965942, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6125, + "step": 6231 + }, + { + "epoch": 0.08312950320762795, + "grad_norm": 1.0919370651245117, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6081, + "step": 6262 + }, + { + "epoch": 0.08354103540172511, + "grad_norm": 1.139345407485962, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6078, + "step": 6293 + }, + { + "epoch": 0.08395256759582229, + "grad_norm": 1.1303651332855225, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6071, + "step": 6324 + }, + { + "epoch": 0.08436409978991945, + "grad_norm": 1.2546321153640747, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.6092, + "step": 6355 + }, + { + "epoch": 0.08477563198401662, + "grad_norm": 1.195173740386963, + "learning_rate": 4.661238099862658e-05, + "loss": 0.5998, + "step": 6386 + }, + { + "epoch": 0.08518716417811378, + "grad_norm": 1.2116323709487915, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6099, + "step": 6417 + }, + { + "epoch": 0.08559869637221096, + "grad_norm": 1.131951928138733, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.6105, + "step": 6448 + }, + { + "epoch": 0.08601022856630812, + "grad_norm": 1.2176560163497925, + "learning_rate": 4.648464661063478e-05, + "loss": 0.6045, + "step": 6479 + }, + { + "epoch": 0.0864217607604053, + "grad_norm": 1.15361750125885, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.6067, + "step": 6510 + }, + { + "epoch": 0.08683329295450246, + "grad_norm": 1.1422772407531738, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.605, + "step": 6541 + }, + { + "epoch": 0.08724482514859963, + "grad_norm": 1.250319004058838, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6069, + "step": 6572 + }, + { + "epoch": 0.08765635734269679, + "grad_norm": 1.0947929620742798, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6099, + "step": 6603 + }, + { + "epoch": 0.08806788953679397, + "grad_norm": 1.2815848588943481, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6132, + "step": 6634 + }, + { + "epoch": 0.08847942173089113, + "grad_norm": 1.196083664894104, + "learning_rate": 4.622264489304762e-05, + "loss": 0.604, + "step": 6665 + }, + { + "epoch": 0.0888909539249883, + "grad_norm": 1.1473642587661743, + "learning_rate": 4.617813681048434e-05, + "loss": 0.6081, + "step": 6696 + }, + { + "epoch": 0.08930248611908546, + "grad_norm": 1.182445764541626, + "learning_rate": 4.61333897355256e-05, + "loss": 0.6108, + "step": 6727 + }, + { + "epoch": 0.08971401831318264, + "grad_norm": 1.2859700918197632, + "learning_rate": 4.608840417313604e-05, + "loss": 0.6074, + "step": 6758 + }, + { + "epoch": 0.0901255505072798, + "grad_norm": 1.1134952306747437, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6139, + "step": 6789 + }, + { + "epoch": 0.09053708270137698, + "grad_norm": 1.2447940111160278, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6049, + "step": 6820 + }, + { + "epoch": 0.09094861489547414, + "grad_norm": 1.042465329170227, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6113, + "step": 6851 + }, + { + "epoch": 0.09136014708957131, + "grad_norm": 1.2242035865783691, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6077, + "step": 6882 + }, + { + "epoch": 0.09177167928366847, + "grad_norm": 1.2531142234802246, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6081, + "step": 6913 + }, + { + "epoch": 0.09218321147776565, + "grad_norm": 1.194645881652832, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6158, + "step": 6944 + }, + { + "epoch": 0.09259474367186281, + "grad_norm": 1.1052149534225464, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6079, + "step": 6975 + }, + { + "epoch": 0.09300627586595998, + "grad_norm": 1.1275289058685303, + "learning_rate": 4.571999560773736e-05, + "loss": 0.6107, + "step": 7006 + }, + { + "epoch": 0.09341780806005714, + "grad_norm": 1.251237154006958, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6028, + "step": 7037 + }, + { + "epoch": 0.09382934025415432, + "grad_norm": 1.1206951141357422, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6035, + "step": 7068 + }, + { + "epoch": 0.09424087244825148, + "grad_norm": 1.2242387533187866, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.5959, + "step": 7099 + }, + { + "epoch": 0.09465240464234866, + "grad_norm": 1.0749527215957642, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6005, + "step": 7130 + }, + { + "epoch": 0.09506393683644582, + "grad_norm": 1.183052659034729, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6102, + "step": 7161 + }, + { + "epoch": 0.09547546903054299, + "grad_norm": 1.1045013666152954, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6056, + "step": 7192 + }, + { + "epoch": 0.09588700122464015, + "grad_norm": 1.2442799806594849, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6024, + "step": 7223 + }, + { + "epoch": 0.09629853341873733, + "grad_norm": 1.2083353996276855, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6062, + "step": 7254 + }, + { + "epoch": 0.09671006561283449, + "grad_norm": 1.1391205787658691, + "learning_rate": 4.528766329570536e-05, + "loss": 0.5981, + "step": 7285 + }, + { + "epoch": 0.09712159780693166, + "grad_norm": 1.1213765144348145, + "learning_rate": 4.523847434837447e-05, + "loss": 0.5964, + "step": 7316 + }, + { + "epoch": 0.09753313000102883, + "grad_norm": 1.1574114561080933, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6035, + "step": 7347 + }, + { + "epoch": 0.097944662195126, + "grad_norm": 1.0101516246795654, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6036, + "step": 7378 + }, + { + "epoch": 0.09835619438922316, + "grad_norm": 1.1439381837844849, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6049, + "step": 7409 + }, + { + "epoch": 0.09876772658332034, + "grad_norm": 1.0668189525604248, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6035, + "step": 7440 + }, + { + "epoch": 0.0991792587774175, + "grad_norm": 1.2038415670394897, + "learning_rate": 4.498911497712155e-05, + "loss": 0.6062, + "step": 7471 + }, + { + "epoch": 0.09959079097151467, + "grad_norm": 1.1965882778167725, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6041, + "step": 7502 + }, + { + "epoch": 0.10000232316561183, + "grad_norm": 1.5885149240493774, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6129, + "step": 7533 + }, + { + "epoch": 0.10041385535970901, + "grad_norm": 1.058237910270691, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.5957, + "step": 7564 + }, + { + "epoch": 0.10082538755380617, + "grad_norm": 1.2043278217315674, + "learning_rate": 4.478556380141218e-05, + "loss": 0.5943, + "step": 7595 + }, + { + "epoch": 0.10123691974790335, + "grad_norm": 1.1602888107299805, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.5971, + "step": 7626 + }, + { + "epoch": 0.1016484519420005, + "grad_norm": 1.1536785364151, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.604, + "step": 7657 + }, + { + "epoch": 0.10205998413609768, + "grad_norm": 1.1859934329986572, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6043, + "step": 7688 + }, + { + "epoch": 0.10247151633019484, + "grad_norm": 1.0551954507827759, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6008, + "step": 7719 + }, + { + "epoch": 0.10288304852429202, + "grad_norm": 1.015857458114624, + "learning_rate": 4.452610552959183e-05, + "loss": 0.5961, + "step": 7750 + }, + { + "epoch": 0.10329458071838918, + "grad_norm": 1.0759170055389404, + "learning_rate": 4.447355047201428e-05, + "loss": 0.6025, + "step": 7781 + }, + { + "epoch": 0.10370611291248635, + "grad_norm": 1.025030255317688, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.602, + "step": 7812 + }, + { + "epoch": 0.10411764510658351, + "grad_norm": 1.0884742736816406, + "learning_rate": 4.436778168330484e-05, + "loss": 0.5989, + "step": 7843 + }, + { + "epoch": 0.10452917730068069, + "grad_norm": 1.1207304000854492, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.6016, + "step": 7874 + }, + { + "epoch": 0.10494070949477785, + "grad_norm": 0.9768222570419312, + "learning_rate": 4.42611386459262e-05, + "loss": 0.605, + "step": 7905 + }, + { + "epoch": 0.10535224168887503, + "grad_norm": 1.0720235109329224, + "learning_rate": 4.420749078676133e-05, + "loss": 0.5971, + "step": 7936 + }, + { + "epoch": 0.10576377388297219, + "grad_norm": 1.1767033338546753, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.5877, + "step": 7967 + }, + { + "epoch": 0.10617530607706936, + "grad_norm": 1.3390216827392578, + "learning_rate": 4.409954541451762e-05, + "loss": 0.5973, + "step": 7998 + }, + { + "epoch": 0.10658683827116652, + "grad_norm": 1.7948216199874878, + "learning_rate": 4.404524911958764e-05, + "loss": 0.5922, + "step": 8029 + }, + { + "epoch": 0.1069983704652637, + "grad_norm": 1.177847981452942, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6002, + "step": 8060 + }, + { + "epoch": 0.10740990265936086, + "grad_norm": 1.1014128923416138, + "learning_rate": 4.393601237573607e-05, + "loss": 0.5945, + "step": 8091 + }, + { + "epoch": 0.10782143485345803, + "grad_norm": 1.067475438117981, + "learning_rate": 4.388107315953628e-05, + "loss": 0.5996, + "step": 8122 + }, + { + "epoch": 0.1082329670475552, + "grad_norm": 1.1241830587387085, + "learning_rate": 4.382592087299212e-05, + "loss": 0.5973, + "step": 8153 + }, + { + "epoch": 0.10864449924165237, + "grad_norm": 1.072426438331604, + "learning_rate": 4.377055613848964e-05, + "loss": 0.601, + "step": 8184 + }, + { + "epoch": 0.10905603143574953, + "grad_norm": 1.0138245820999146, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6034, + "step": 8215 + }, + { + "epoch": 0.1094675636298467, + "grad_norm": 0.9993209838867188, + "learning_rate": 4.365919182713416e-05, + "loss": 0.6024, + "step": 8246 + }, + { + "epoch": 0.10987909582394387, + "grad_norm": 1.157468318939209, + "learning_rate": 4.360319350701226e-05, + "loss": 0.5987, + "step": 8277 + }, + { + "epoch": 0.11029062801804104, + "grad_norm": 1.2073496580123901, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6023, + "step": 8308 + }, + { + "epoch": 0.1107021602121382, + "grad_norm": 1.0800622701644897, + "learning_rate": 4.349056769754021e-05, + "loss": 0.5993, + "step": 8339 + }, + { + "epoch": 0.11111369240623538, + "grad_norm": 1.1365702152252197, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.595, + "step": 8370 + }, + { + "epoch": 0.11152522460033254, + "grad_norm": 1.1367030143737793, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.5921, + "step": 8401 + }, + { + "epoch": 0.11193675679442971, + "grad_norm": 1.0323926210403442, + "learning_rate": 4.332006561018488e-05, + "loss": 0.5883, + "step": 8432 + }, + { + "epoch": 0.11234828898852688, + "grad_norm": 1.2352523803710938, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.5957, + "step": 8463 + }, + { + "epoch": 0.11275982118262405, + "grad_norm": 1.1763888597488403, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.5941, + "step": 8494 + }, + { + "epoch": 0.11317135337672121, + "grad_norm": 1.1331796646118164, + "learning_rate": 4.314770288177384e-05, + "loss": 0.5993, + "step": 8525 + }, + { + "epoch": 0.11358288557081839, + "grad_norm": 1.146270990371704, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6041, + "step": 8556 + }, + { + "epoch": 0.11399441776491555, + "grad_norm": 1.0433791875839233, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.5918, + "step": 8587 + }, + { + "epoch": 0.11440594995901272, + "grad_norm": 1.2013510465621948, + "learning_rate": 4.297349701798505e-05, + "loss": 0.6031, + "step": 8618 + }, + { + "epoch": 0.11481748215310988, + "grad_norm": 1.1263506412506104, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.597, + "step": 8649 + }, + { + "epoch": 0.11522901434720706, + "grad_norm": 1.1360443830490112, + "learning_rate": 4.285634454093198e-05, + "loss": 0.5909, + "step": 8680 + }, + { + "epoch": 0.11564054654130422, + "grad_norm": 1.0095175504684448, + "learning_rate": 4.279746571169086e-05, + "loss": 0.59, + "step": 8711 + }, + { + "epoch": 0.1160520787354014, + "grad_norm": 1.0894793272018433, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.5894, + "step": 8742 + }, + { + "epoch": 0.11646361092949856, + "grad_norm": 1.1603643894195557, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.5924, + "step": 8773 + }, + { + "epoch": 0.11687514312359573, + "grad_norm": 1.177464485168457, + "learning_rate": 4.261962684116106e-05, + "loss": 0.5946, + "step": 8804 + }, + { + "epoch": 0.11728667531769289, + "grad_norm": 0.9632905125617981, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.5944, + "step": 8835 + }, + { + "epoch": 0.11769820751179007, + "grad_norm": 1.0413568019866943, + "learning_rate": 4.250007230372134e-05, + "loss": 0.5844, + "step": 8866 + }, + { + "epoch": 0.11810973970588723, + "grad_norm": 1.0879931449890137, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.588, + "step": 8897 + }, + { + "epoch": 0.1185212718999844, + "grad_norm": 1.0681732892990112, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.5832, + "step": 8928 + }, + { + "epoch": 0.11893280409408157, + "grad_norm": 1.0609339475631714, + "learning_rate": 4.231926105340768e-05, + "loss": 0.5886, + "step": 8959 + }, + { + "epoch": 0.11934433628817874, + "grad_norm": 1.4936331510543823, + "learning_rate": 4.225859883654776e-05, + "loss": 0.5868, + "step": 8990 + }, + { + "epoch": 0.1197558684822759, + "grad_norm": 1.1860368251800537, + "learning_rate": 4.219774185874569e-05, + "loss": 0.5904, + "step": 9021 + }, + { + "epoch": 0.12016740067637308, + "grad_norm": 1.1657029390335083, + "learning_rate": 4.213669080676418e-05, + "loss": 0.5872, + "step": 9052 + }, + { + "epoch": 0.12057893287047024, + "grad_norm": 1.0721909999847412, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.5977, + "step": 9083 + }, + { + "epoch": 0.12099046506456741, + "grad_norm": 1.1832354068756104, + "learning_rate": 4.201400923825648e-05, + "loss": 0.5902, + "step": 9114 + }, + { + "epoch": 0.12140199725866457, + "grad_norm": 1.1306614875793457, + "learning_rate": 4.195238010617511e-05, + "loss": 0.5931, + "step": 9145 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.754968236539773e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-9156/training_args.bin b/checkpoint-9156/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f3d87fcba007ec2448c21323b9fdb9fa6105de5f --- /dev/null +++ b/checkpoint-9156/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf5225534ee746c36dba9f28281e0b858e6e9b4a308bdaf1de4f68e79e9ee83 +size 5304 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/model-00001-of-00007.safetensors b/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..33d4f7e14e7a8a2afbe7b7c06a024ee7f26f0cca --- /dev/null +++ b/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf30dec03b2688f7bed8ee5bff34fb048ce268ccc24d6c1ccf12106f6bbdd988 +size 4886466168 diff --git a/model-00002-of-00007.safetensors b/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/model-00003-of-00007.safetensors b/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/model-00004-of-00007.safetensors b/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/model-00005-of-00007.safetensors b/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/model-00006-of-00007.safetensors b/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c8b9d6763015190745a246c62d5b2cd6f92bbe8f --- /dev/null +++ b/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95eb8eb3fc2bb6c30768b266b4ef96f92214652eb07eb836fe234961e211ff85 +size 4999813120 diff --git a/model-00007-of-00007.safetensors b/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b5e21baf096d1a4fe82e5bdcf0b51239463e734f --- /dev/null +++ b/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec98d73751d8916ceeed0ae8aa58fbd1b2a78a65b277251427c8cdd6368bd1c8 +size 2571158184 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..5cc5f00a5b203e90a27a3bd60d1ec393b07971e8 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,410563 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 128000, + "content": "<|begin_of_text|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128001, + "content": "<|end_of_text|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128002, + "content": "<|reserved_special_token_0|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128003, + "content": "<|reserved_special_token_1|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128004, + "content": "<|finetune_right_pad_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128005, + "content": "<|reserved_special_token_2|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128006, + "content": "<|start_header_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128007, + "content": "<|end_header_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128008, + "content": "<|eom_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128009, + "content": "<|eot_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128010, + "content": "<|python_tag|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128011, + "content": "<|reserved_special_token_3|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128012, + "content": "<|reserved_special_token_4|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128013, + "content": "<|reserved_special_token_5|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128014, + "content": "<|reserved_special_token_6|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128015, + "content": "<|reserved_special_token_7|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128016, + "content": "<|reserved_special_token_8|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128017, + "content": "<|reserved_special_token_9|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128018, + "content": "<|reserved_special_token_10|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128019, + "content": "<|reserved_special_token_11|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128020, + "content": "<|reserved_special_token_12|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128021, + "content": "<|reserved_special_token_13|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128022, + "content": "<|reserved_special_token_14|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128023, + "content": "<|reserved_special_token_15|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128024, + "content": "<|reserved_special_token_16|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128025, + "content": "<|reserved_special_token_17|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128026, + "content": "<|reserved_special_token_18|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128027, + "content": "<|reserved_special_token_19|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128028, + "content": "<|reserved_special_token_20|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128029, + "content": "<|reserved_special_token_21|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128030, + "content": "<|reserved_special_token_22|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128031, + "content": "<|reserved_special_token_23|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128032, + "content": "<|reserved_special_token_24|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128033, + "content": "<|reserved_special_token_25|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128034, + "content": "<|reserved_special_token_26|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128035, + "content": "<|reserved_special_token_27|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128036, + "content": "<|reserved_special_token_28|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128037, + "content": "<|reserved_special_token_29|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128038, + "content": "<|reserved_special_token_30|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128039, + "content": "<|reserved_special_token_31|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128040, + "content": "<|reserved_special_token_32|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128041, + "content": "<|reserved_special_token_33|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128042, + "content": "<|reserved_special_token_34|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128043, + "content": "<|reserved_special_token_35|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128044, + "content": "<|reserved_special_token_36|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128045, + "content": "<|reserved_special_token_37|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128046, + "content": "<|reserved_special_token_38|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128047, + "content": "<|reserved_special_token_39|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128048, + "content": "<|reserved_special_token_40|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128049, + "content": "<|reserved_special_token_41|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128050, + "content": "<|reserved_special_token_42|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128051, + "content": "<|reserved_special_token_43|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128052, + "content": "<|reserved_special_token_44|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128053, + "content": "<|reserved_special_token_45|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128054, + "content": "<|reserved_special_token_46|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128055, + "content": "<|reserved_special_token_47|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128056, + "content": "<|reserved_special_token_48|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128057, + "content": "<|reserved_special_token_49|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128058, + "content": "<|reserved_special_token_50|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128059, + "content": "<|reserved_special_token_51|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128060, + "content": "<|reserved_special_token_52|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128061, + "content": "<|reserved_special_token_53|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128062, + "content": "<|reserved_special_token_54|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128063, + "content": "<|reserved_special_token_55|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128064, + "content": "<|reserved_special_token_56|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128065, + "content": "<|reserved_special_token_57|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128066, + "content": "<|reserved_special_token_58|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128067, + "content": "<|reserved_special_token_59|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128068, + "content": "<|reserved_special_token_60|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128069, + "content": "<|reserved_special_token_61|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128070, + "content": "<|reserved_special_token_62|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128071, + "content": "<|reserved_special_token_63|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128072, + "content": "<|reserved_special_token_64|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128073, + "content": "<|reserved_special_token_65|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128074, + "content": "<|reserved_special_token_66|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128075, + "content": "<|reserved_special_token_67|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128076, + "content": "<|reserved_special_token_68|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128077, + "content": "<|reserved_special_token_69|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128078, + "content": "<|reserved_special_token_70|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128079, + "content": "<|reserved_special_token_71|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128080, + "content": "<|reserved_special_token_72|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128081, + "content": "<|reserved_special_token_73|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128082, + "content": "<|reserved_special_token_74|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128083, + "content": "<|reserved_special_token_75|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128084, + "content": "<|reserved_special_token_76|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128085, + "content": "<|reserved_special_token_77|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128086, + "content": "<|reserved_special_token_78|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128087, + "content": "<|reserved_special_token_79|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128088, + "content": "<|reserved_special_token_80|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128089, + "content": "<|reserved_special_token_81|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128090, + "content": "<|reserved_special_token_82|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128091, + "content": "<|reserved_special_token_83|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128092, + "content": "<|reserved_special_token_84|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128093, + "content": "<|reserved_special_token_85|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128094, + "content": "<|reserved_special_token_86|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128095, + "content": "<|reserved_special_token_87|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128096, + "content": "<|reserved_special_token_88|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128097, + "content": "<|reserved_special_token_89|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128098, + "content": "<|reserved_special_token_90|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128099, + "content": "<|reserved_special_token_91|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128100, + "content": "<|reserved_special_token_92|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128101, + "content": "<|reserved_special_token_93|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128102, + "content": "<|reserved_special_token_94|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128103, + "content": "<|reserved_special_token_95|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128104, + "content": "<|reserved_special_token_96|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128105, + "content": "<|reserved_special_token_97|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128106, + "content": "<|reserved_special_token_98|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128107, + "content": "<|reserved_special_token_99|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128108, + "content": "<|reserved_special_token_100|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128109, + "content": "<|reserved_special_token_101|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128110, + "content": "<|reserved_special_token_102|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128111, + "content": "<|reserved_special_token_103|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128112, + "content": "<|reserved_special_token_104|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128113, + "content": "<|reserved_special_token_105|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128114, + "content": "<|reserved_special_token_106|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128115, + "content": "<|reserved_special_token_107|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128116, + "content": "<|reserved_special_token_108|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128117, + "content": "<|reserved_special_token_109|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128118, + "content": "<|reserved_special_token_110|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128119, + "content": "<|reserved_special_token_111|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128120, + "content": "<|reserved_special_token_112|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128121, + "content": "<|reserved_special_token_113|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128122, + "content": "<|reserved_special_token_114|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128123, + "content": "<|reserved_special_token_115|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128124, + "content": "<|reserved_special_token_116|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128125, + "content": "<|reserved_special_token_117|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128126, + "content": "<|reserved_special_token_118|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128127, + "content": "<|reserved_special_token_119|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128128, + "content": "<|reserved_special_token_120|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128129, + "content": "<|reserved_special_token_121|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128130, + "content": "<|reserved_special_token_122|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128131, + "content": "<|reserved_special_token_123|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128132, + "content": "<|reserved_special_token_124|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128133, + "content": "<|reserved_special_token_125|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128134, + "content": "<|reserved_special_token_126|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128135, + "content": "<|reserved_special_token_127|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128136, + "content": "<|reserved_special_token_128|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128137, + "content": "<|reserved_special_token_129|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128138, + "content": "<|reserved_special_token_130|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128139, + "content": "<|reserved_special_token_131|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128140, + "content": "<|reserved_special_token_132|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128141, + "content": "<|reserved_special_token_133|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128142, + "content": "<|reserved_special_token_134|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128143, + "content": "<|reserved_special_token_135|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128144, + "content": "<|reserved_special_token_136|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128145, + "content": "<|reserved_special_token_137|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128146, + "content": "<|reserved_special_token_138|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128147, + "content": "<|reserved_special_token_139|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128148, + "content": "<|reserved_special_token_140|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128149, + "content": "<|reserved_special_token_141|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128150, + "content": "<|reserved_special_token_142|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128151, + "content": "<|reserved_special_token_143|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128152, + "content": "<|reserved_special_token_144|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128153, + "content": "<|reserved_special_token_145|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128154, + "content": "<|reserved_special_token_146|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128155, + "content": "<|reserved_special_token_147|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128156, + "content": "<|reserved_special_token_148|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128157, + "content": "<|reserved_special_token_149|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128158, + "content": "<|reserved_special_token_150|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128159, + "content": "<|reserved_special_token_151|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128160, + "content": "<|reserved_special_token_152|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128161, + "content": "<|reserved_special_token_153|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128162, + "content": "<|reserved_special_token_154|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128163, + "content": "<|reserved_special_token_155|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128164, + "content": "<|reserved_special_token_156|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128165, + "content": "<|reserved_special_token_157|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128166, + "content": "<|reserved_special_token_158|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128167, + "content": "<|reserved_special_token_159|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128168, + "content": "<|reserved_special_token_160|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128169, + "content": "<|reserved_special_token_161|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128170, + "content": "<|reserved_special_token_162|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128171, + "content": "<|reserved_special_token_163|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128172, + "content": "<|reserved_special_token_164|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128173, + "content": "<|reserved_special_token_165|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128174, + "content": "<|reserved_special_token_166|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128175, + "content": "<|reserved_special_token_167|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128176, + "content": "<|reserved_special_token_168|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128177, + "content": "<|reserved_special_token_169|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128178, + "content": "<|reserved_special_token_170|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128179, + "content": "<|reserved_special_token_171|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128180, + "content": "<|reserved_special_token_172|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128181, + "content": "<|reserved_special_token_173|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128182, + "content": "<|reserved_special_token_174|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128183, + "content": "<|reserved_special_token_175|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128184, + "content": "<|reserved_special_token_176|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128185, + "content": "<|reserved_special_token_177|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128186, + "content": "<|reserved_special_token_178|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128187, + "content": "<|reserved_special_token_179|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128188, + "content": "<|reserved_special_token_180|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128189, + "content": "<|reserved_special_token_181|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128190, + "content": "<|reserved_special_token_182|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128191, + "content": "<|reserved_special_token_183|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128192, + "content": "<|reserved_special_token_184|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128193, + "content": "<|reserved_special_token_185|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128194, + "content": "<|reserved_special_token_186|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128195, + "content": "<|reserved_special_token_187|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128196, + "content": "<|reserved_special_token_188|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128197, + "content": "<|reserved_special_token_189|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128198, + "content": "<|reserved_special_token_190|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128199, + "content": "<|reserved_special_token_191|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128200, + "content": "<|reserved_special_token_192|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128201, + "content": "<|reserved_special_token_193|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128202, + "content": "<|reserved_special_token_194|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128203, + "content": "<|reserved_special_token_195|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128204, + "content": "<|reserved_special_token_196|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128205, + "content": "<|reserved_special_token_197|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128206, + "content": "<|reserved_special_token_198|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128207, + "content": "<|reserved_special_token_199|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128208, + "content": "<|reserved_special_token_200|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128209, + "content": "<|reserved_special_token_201|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128210, + "content": "<|reserved_special_token_202|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128211, + "content": "<|reserved_special_token_203|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128212, + "content": "<|reserved_special_token_204|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128213, + "content": "<|reserved_special_token_205|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128214, + "content": "<|reserved_special_token_206|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128215, + "content": "<|reserved_special_token_207|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128216, + "content": "<|reserved_special_token_208|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128217, + "content": "<|reserved_special_token_209|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128218, + "content": "<|reserved_special_token_210|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128219, + "content": "<|reserved_special_token_211|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128220, + "content": "<|reserved_special_token_212|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128221, + "content": "<|reserved_special_token_213|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128222, + "content": "<|reserved_special_token_214|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128223, + "content": "<|reserved_special_token_215|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128224, + "content": "<|reserved_special_token_216|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128225, + "content": "<|reserved_special_token_217|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128226, + "content": "<|reserved_special_token_218|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128227, + "content": "<|reserved_special_token_219|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128228, + "content": "<|reserved_special_token_220|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128229, + "content": "<|reserved_special_token_221|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128230, + "content": "<|reserved_special_token_222|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128231, + "content": "<|reserved_special_token_223|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128232, + "content": "<|reserved_special_token_224|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128233, + "content": "<|reserved_special_token_225|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128234, + "content": "<|reserved_special_token_226|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128235, + "content": "<|reserved_special_token_227|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128236, + "content": "<|reserved_special_token_228|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128237, + "content": "<|reserved_special_token_229|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128238, + "content": "<|reserved_special_token_230|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128239, + "content": "<|reserved_special_token_231|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128240, + "content": "<|reserved_special_token_232|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128241, + "content": "<|reserved_special_token_233|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128242, + "content": "<|reserved_special_token_234|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128243, + "content": "<|reserved_special_token_235|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128244, + "content": "<|reserved_special_token_236|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128245, + "content": "<|reserved_special_token_237|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128246, + "content": "<|reserved_special_token_238|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128247, + "content": "<|reserved_special_token_239|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128248, + "content": "<|reserved_special_token_240|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128249, + "content": "<|reserved_special_token_241|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128250, + "content": "<|reserved_special_token_242|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128251, + "content": "<|reserved_special_token_243|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128252, + "content": "<|reserved_special_token_244|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128253, + "content": "<|reserved_special_token_245|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128254, + "content": "<|reserved_special_token_246|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128255, + "content": "<|reserved_special_token_247|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "Split", + "pattern": { + "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + }, + "behavior": "Isolated", + "invert": false + }, + { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": false + } + ] + }, + "post_processor": { + "type": "Sequence", + "processors": [ + { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": false, + "use_regex": true + }, + { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 1 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "<|begin_of_text|>": { + "id": "<|begin_of_text|>", + "ids": [ + 128000 + ], + "tokens": [ + "<|begin_of_text|>" + ] + } + } + } + ] + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": true, + "vocab": { + "!": 0, + "\"": 1, + "#": 2, + "$": 3, + "%": 4, + "&": 5, + "'": 6, + "(": 7, + ")": 8, + "*": 9, + "+": 10, + ",": 11, + "-": 12, + ".": 13, + "/": 14, + "0": 15, + "1": 16, + "2": 17, + "3": 18, + "4": 19, + "5": 20, + "6": 21, + "7": 22, + "8": 23, + "9": 24, + ":": 25, + ";": 26, + "<": 27, + "=": 28, + ">": 29, + "?": 30, + "@": 31, + "A": 32, + "B": 33, + "C": 34, + "D": 35, + "E": 36, + "F": 37, + "G": 38, + "H": 39, + "I": 40, + "J": 41, + "K": 42, + "L": 43, + "M": 44, + "N": 45, + "O": 46, + "P": 47, + "Q": 48, + "R": 49, + "S": 50, + "T": 51, + "U": 52, + "V": 53, + "W": 54, + "X": 55, + "Y": 56, + "Z": 57, + "[": 58, + "\\": 59, + "]": 60, + "^": 61, + "_": 62, + "`": 63, + "a": 64, + "b": 65, + "c": 66, + "d": 67, + "e": 68, + "f": 69, + "g": 70, + "h": 71, + "i": 72, + "j": 73, + "k": 74, + "l": 75, + "m": 76, + "n": 77, + "o": 78, + "p": 79, + "q": 80, + "r": 81, + "s": 82, + "t": 83, + "u": 84, + "v": 85, + "w": 86, + "x": 87, + "y": 88, + "z": 89, + "{": 90, + "|": 91, + "}": 92, + "~": 93, + "¡": 94, + "¢": 95, + "£": 96, + "¤": 97, + "¥": 98, + "¦": 99, + "§": 100, + "¨": 101, + "©": 102, + "ª": 103, + "«": 104, + "¬": 105, + "®": 106, + "¯": 107, + "°": 108, + "±": 109, + "²": 110, + "³": 111, + "´": 112, + "µ": 113, + "¶": 114, + "·": 115, + "¸": 116, + "¹": 117, + "º": 118, + "»": 119, + "¼": 120, + "½": 121, + "¾": 122, + "¿": 123, + "À": 124, + "Á": 125, + "Â": 126, + "Ã": 127, + "Ä": 128, + "Å": 129, + "Æ": 130, + "Ç": 131, + "È": 132, + "É": 133, + "Ê": 134, + "Ë": 135, + "Ì": 136, + "Í": 137, + "Î": 138, + "Ï": 139, + "Ð": 140, + "Ñ": 141, + "Ò": 142, + "Ó": 143, + "Ô": 144, + "Õ": 145, + "Ö": 146, + "×": 147, + "Ø": 148, + "Ù": 149, + "Ú": 150, + "Û": 151, + "Ü": 152, + "Ý": 153, + "Þ": 154, + "ß": 155, + "à": 156, + "á": 157, + "â": 158, + "ã": 159, + "ä": 160, + "å": 161, + "æ": 162, + "ç": 163, + "è": 164, + "é": 165, + "ê": 166, + "ë": 167, + "ì": 168, + "í": 169, + "î": 170, + "ï": 171, + "ð": 172, + "ñ": 173, + "ò": 174, + "ó": 175, + "ô": 176, + "õ": 177, + "ö": 178, + "÷": 179, + "ø": 180, + "ù": 181, + "ú": 182, + "û": 183, + "ü": 184, + "ý": 185, + "þ": 186, + "ÿ": 187, + "Ā": 188, + "ā": 189, + "Ă": 190, + "ă": 191, + "Ą": 192, + "ą": 193, + "Ć": 194, + "ć": 195, + "Ĉ": 196, + "ĉ": 197, + "Ċ": 198, + "ċ": 199, + "Č": 200, + "č": 201, + "Ď": 202, + "ď": 203, + "Đ": 204, + "đ": 205, + "Ē": 206, + "ē": 207, + "Ĕ": 208, + "ĕ": 209, + "Ė": 210, + "ė": 211, + "Ę": 212, + "ę": 213, + "Ě": 214, + "ě": 215, + "Ĝ": 216, + "ĝ": 217, + "Ğ": 218, + "ğ": 219, + "Ġ": 220, + "ġ": 221, + "Ģ": 222, + "ģ": 223, + "Ĥ": 224, + "ĥ": 225, + "Ħ": 226, + "ħ": 227, + "Ĩ": 228, + "ĩ": 229, + "Ī": 230, + "ī": 231, + "Ĭ": 232, + "ĭ": 233, + "Į": 234, + "į": 235, + "İ": 236, + "ı": 237, + "IJ": 238, + "ij": 239, + "Ĵ": 240, + "ĵ": 241, + "Ķ": 242, + "ķ": 243, + "ĸ": 244, + "Ĺ": 245, + "ĺ": 246, + "Ļ": 247, + "ļ": 248, + "Ľ": 249, + "ľ": 250, + "Ŀ": 251, + "ŀ": 252, + "Ł": 253, + "ł": 254, + "Ń": 255, + "ĠĠ": 256, + "ĠĠĠĠ": 257, + "in": 258, + "Ġt": 259, + "ĠĠĠĠĠĠĠĠ": 260, + "er": 261, + "ĠĠĠ": 262, + "on": 263, + "Ġa": 264, + "re": 265, + "at": 266, + "st": 267, + "en": 268, + "or": 269, + "Ġth": 270, + "ĊĊ": 271, + "Ġc": 272, + "le": 273, + "Ġs": 274, + "it": 275, + "an": 276, + "ar": 277, + "al": 278, + "Ġthe": 279, + ";Ċ": 280, + "Ġp": 281, + "Ġf": 282, + "ou": 283, + "Ġ=": 284, + "is": 285, + "ĠĠĠĠĠĠĠ": 286, + "ing": 287, + "es": 288, + "Ġw": 289, + "ion": 290, + "ed": 291, + "ic": 292, + "Ġb": 293, + "Ġd": 294, + "et": 295, + "Ġm": 296, + "Ġo": 297, + "ĉĉ": 298, + "ro": 299, + "as": 300, + "el": 301, + "ct": 302, + "nd": 303, + "Ġin": 304, + "Ġh": 305, + "ent": 306, + "id": 307, + "Ġn": 308, + "am": 309, + "ĠĠĠĠĠĠĠĠĠĠĠ": 310, + "Ġto": 311, + "Ġre": 312, + "--": 313, + "Ġ{": 314, + "Ġof": 315, + "om": 316, + ");Ċ": 317, + "im": 318, + "čĊ": 319, + "Ġ(": 320, + "il": 321, + "//": 322, + "Ġand": 323, + "ur": 324, + "se": 325, + "Ġl": 326, + "ex": 327, + "ĠS": 328, + "ad": 329, + "Ġ\"": 330, + "ch": 331, + "ut": 332, + "if": 333, + "**": 334, + "Ġ}": 335, + "em": 336, + "ol": 337, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 338, + "th": 339, + ")Ċ": 340, + "Ġ{Ċ": 341, + "Ġg": 342, + "ig": 343, + "iv": 344, + ",Ċ": 345, + "ce": 346, + "od": 347, + "Ġv": 348, + "ate": 349, + "ĠT": 350, + "ag": 351, + "ay": 352, + "Ġ*": 353, + "ot": 354, + "us": 355, + "ĠC": 356, + "Ġst": 357, + "ĠI": 358, + "un": 359, + "ul": 360, + "ue": 361, + "ĠA": 362, + "ow": 363, + "Ġ'": 364, + "ew": 365, + "Ġ<": 366, + "ation": 367, + "()": 368, + "Ġfor": 369, + "ab": 370, + "ort": 371, + "um": 372, + "ame": 373, + "Ġis": 374, + "pe": 375, + "tr": 376, + "ck": 377, + "âĢ": 378, + "Ġy": 379, + "ist": 380, + "----": 381, + ".ĊĊ": 382, + "he": 383, + "Ġe": 384, + "lo": 385, + "ĠM": 386, + "Ġbe": 387, + "ers": 388, + "Ġon": 389, + "Ġcon": 390, + "ap": 391, + "ub": 392, + "ĠP": 393, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 394, + "ass": 395, + "int": 396, + ">Ċ": 397, + "ly": 398, + "urn": 399, + "Ġ$": 400, + ";ĊĊ": 401, + "av": 402, + "port": 403, + "ir": 404, + "->": 405, + "nt": 406, + "ction": 407, + "end": 408, + "Ġde": 409, + "00": 410, + "ith": 411, + "out": 412, + "turn": 413, + "our": 414, + "ĠĠĠĠĠ": 415, + "lic": 416, + "res": 417, + "pt": 418, + "==": 419, + "Ġthis": 420, + "Ġwh": 421, + "Ġif": 422, + "ĠD": 423, + "ver": 424, + "age": 425, + "ĠB": 426, + "ht": 427, + "ext": 428, + "=\"": 429, + "Ġthat": 430, + "****": 431, + "ĠR": 432, + "Ġit": 433, + "ess": 434, + "ĠF": 435, + "Ġr": 436, + "os": 437, + "and": 438, + "Ġas": 439, + "ect": 440, + "ke": 441, + "rom": 442, + "Ġ//": 443, + "con": 444, + "ĠL": 445, + "(\"": 446, + "qu": 447, + "lass": 448, + "Ġwith": 449, + "iz": 450, + "de": 451, + "ĠN": 452, + "Ġal": 453, + "op": 454, + "up": 455, + "get": 456, + "Ġ}Ċ": 457, + "ile": 458, + "Ġan": 459, + "ata": 460, + "ore": 461, + "ri": 462, + "Ġpro": 463, + ";čĊ": 464, + "ĉĉĉĉ": 465, + "ter": 466, + "ain": 467, + "ĠW": 468, + "ĠE": 469, + "Ġcom": 470, + "Ġreturn": 471, + "art": 472, + "ĠH": 473, + "ack": 474, + "import": 475, + "ublic": 476, + "Ġor": 477, + "est": 478, + "ment": 479, + "ĠG": 480, + "able": 481, + "Ġ-": 482, + "ine": 483, + "ill": 484, + "ind": 485, + "ere": 486, + "::": 487, + "ity": 488, + "Ġ+": 489, + "Ġtr": 490, + "elf": 491, + "ight": 492, + "('": 493, + "orm": 494, + "ult": 495, + "str": 496, + "..": 497, + "\",": 498, + "Ġyou": 499, + "ype": 500, + "pl": 501, + "Ġnew": 502, + "Ġj": 503, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 504, + "Ġfrom": 505, + "Ġex": 506, + "ĠO": 507, + "20": 508, + "ld": 509, + "Ġ[": 510, + "oc": 511, + ":Ċ": 512, + "Ġse": 513, + "Ġle": 514, + "--------": 515, + ".s": 516, + "{Ċ": 517, + "',": 518, + "ant": 519, + "Ġat": 520, + "ase": 521, + ".c": 522, + "Ġch": 523, + "": 524, + "ave": 525, + "ang": 526, + "Ġare": 527, + "Ġint": 528, + "âĢĻ": 529, + "_t": 530, + "ert": 531, + "ial": 532, + "act": 533, + "}Ċ": 534, + "ive": 535, + "ode": 536, + "ost": 537, + "Ġclass": 538, + "Ġnot": 539, + "og": 540, + "ord": 541, + "alue": 542, + "all": 543, + "ff": 544, + "();Ċ": 545, + "ont": 546, + "ime": 547, + "are": 548, + "ĠU": 549, + "Ġpr": 550, + "Ġ:": 551, + "ies": 552, + "ize": 553, + "ure": 554, + "Ġby": 555, + "ire": 556, + "Ġ}ĊĊ": 557, + ".p": 558, + "Ġsh": 559, + "ice": 560, + "ast": 561, + "ption": 562, + "tring": 563, + "ok": 564, + "__": 565, + "cl": 566, + "##": 567, + "Ġhe": 568, + "ard": 569, + ").": 570, + "Ġ@": 571, + "iew": 572, + "ĉĉĉ": 573, + "Ġwas": 574, + "ip": 575, + "this": 576, + "Ġu": 577, + "ĠThe": 578, + "ide": 579, + "ace": 580, + "ib": 581, + "ac": 582, + "rou": 583, + "Ġwe": 584, + "ject": 585, + "Ġpublic": 586, + "ak": 587, + "ve": 588, + "ath": 589, + "oid": 590, + "Ġ=>": 591, + "ust": 592, + "que": 593, + "Ġres": 594, + "))": 595, + "'s": 596, + "Ġk": 597, + "ans": 598, + "yst": 599, + "unction": 600, + "********": 601, + "Ġi": 602, + "Ġus": 603, + "pp": 604, + "10": 605, + "one": 606, + "ail": 607, + "====": 608, + "name": 609, + "Ġstr": 610, + "Ġ/": 611, + "Ġ&": 612, + "ach": 613, + "div": 614, + "ystem": 615, + "ell": 616, + "Ġhave": 617, + "err": 618, + "ould": 619, + "ull": 620, + "pon": 621, + "ĠJ": 622, + "_p": 623, + "Ġ==": 624, + "ign": 625, + "St": 626, + ".Ċ": 627, + "Ġpl": 628, + ");ĊĊ": 629, + "form": 630, + "put": 631, + "ount": 632, + "}ĊĊ": 633, + "dd": 634, + "ite": 635, + "Ġget": 636, + "rr": 637, + "ome": 638, + "ĠâĢ": 639, + "aram": 640, + "cc": 641, + "Ġ*/": 642, + "ER": 643, + "In": 644, + "les": 645, + "_s": 646, + "ong": 647, + "ie": 648, + "Ġcan": 649, + "ĠV": 650, + "erv": 651, + "pr": 652, + "Ġun": 653, + "row": 654, + "ber": 655, + "Ġdo": 656, + "ll": 657, + "Ġel": 658, + "Ġself": 659, + "ated": 660, + "ary": 661, + "Ġ.": 662, + "']": 663, + "ud": 664, + "Ġen": 665, + "ĠTh": 666, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 667, + "te": 668, + "_c": 669, + "uct": 670, + "Ġab": 671, + "ork": 672, + ".get": 673, + "Ġ#": 674, + "aw": 675, + "ress": 676, + "ob": 677, + "Name": 678, + "201": 679, + "app": 680, + "['": 681, + "Ġall": 682, + "ory": 683, + "ition": 684, + "ance": 685, + "ear": 686, + "Ġcont": 687, + "vent": 688, + "ia": 689, + "Ġwill": 690, + "IN": 691, + "ĠĠĠĠĠĠĠĠĠ": 692, + "return": 693, + "Ġ": 694, + "data": 695, + ")ĊĊ": 696, + "Re": 697, + "ple": 698, + "ild": 699, + "ther": 700, + "Ġyour": 701, + "\"Ċ": 702, + "($": 703, + "Ġout": 704, + "),": 705, + "Ġhas": 706, + "String": 707, + "so": 708, + "Ġup": 709, + "ax": 710, + "Ġdef": 711, + "Ġbo": 712, + "ge": 713, + "alse": 714, + "ON": 715, + "per": 716, + "12": 717, + "ich": 718, + "Ġbut": 719, + "ĠĊ": 720, + "Ġ_": 721, + "_m": 722, + "add": 723, + "quest": 724, + "odel": 725, + "self": 726, + "ery": 727, + "ft": 728, + "ens": 729, + "////": 730, + "ake": 731, + ".C": 732, + "Ġgo": 733, + "Ġfunction": 734, + "ĠK": 735, + "ivate": 736, + "Ġim": 737, + "Ġconst": 738, + ".t": 739, + "Ġ*/Ċ": 740, + ");čĊ": 741, + "Ġvoid": 742, + "Ġset": 743, + "ĠSystem": 744, + "cri": 745, + "()Ċ": 746, + "li": 747, + "ĉif": 748, + ".m": 749, + "ally": 750, + "set": 751, + "ep": 752, + "âĢĻs": 753, + "bo": 754, + "def": 755, + "',Ċ": 756, + "Ġme": 757, + "Ġ!": 758, + "atch": 759, + "\">": 760, + "\",Ċ": 761, + "ec": 762, + "ĠIn": 763, + "ph": 764, + "Ġ|": 765, + "_f": 766, + "Ġvar": 767, + "ence": 768, + "Id": 769, + "ree": 770, + "ink": 771, + "lect": 772, + "ug": 773, + "eth": 774, + "Ġelse": 775, + "----------------": 776, + "19": 777, + "cont": 778, + "Ġso": 779, + "atic": 780, + "Ġlo": 781, + "pro": 782, + "ton": 783, + "ss": 784, + "own": 785, + "abel": 786, + "oint": 787, + "ous": 788, + "eld": 789, + "ST": 790, + "The": 791, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 792, + "RE": 793, + "\":": 794, + "olor": 795, + "tp": 796, + "eg": 797, + "key": 798, + "ude": 799, + "ĠSt": 800, + "ound": 801, + "Ġar": 802, + "\");Ċ": 803, + "ener": 804, + "ser": 805, + "11": 806, + "bject": 807, + "essage": 808, + "fer": 809, + "Ġmore": 810, + "ations": 811, + "ents": 812, + "Ġhis": 813, + "Ġthey": 814, + ".S": 815, + "ĠY": 816, + "use": 817, + "ne": 818, + "ish": 819, + "old": 820, + "_d": 821, + "io": 822, + "ield": 823, + "Ġper": 824, + "Cont": 825, + "ings": 826, + "####": 827, + "Ġdata": 828, + "Ġsa": 829, + "ef": 830, + "fo": 831, + "Ġone": 832, + "eng": 833, + "Ġdis": 834, + "AT": 835, + "Ġname": 836, + "Ġtrue": 837, + "val": 838, + "led": 839, + ".f": 840, + "Ġne": 841, + "Ġend": 842, + "32": 843, + ".T": 844, + "16": 845, + "cre": 846, + "ark": 847, + "log": 848, + "Ex": 849, + "error": 850, + "_id": 851, + "urre": 852, + "ange": 853, + "Ġnull": 854, + "rray": 855, + "Ġmy": 856, + "pan": 857, + "ict": 858, + "ator": 859, + "View": 860, + "List": 861, + "ĉreturn": 862, + "âĢĿ": 863, + "Ġpre": 864, + "Ġx": 865, + "clude": 866, + "arg": 867, + "15": 868, + "ov": 869, + ".h": 870, + "Ġ>": 871, + "Ġtheir": 872, + "')": 873, + "irst": 874, + "ick": 875, + "gh": 876, + "LE": 877, + "OR": 878, + "Ġprivate": 879, + "tem": 880, + "čĊčĊ": 881, + "user": 882, + "Ġ)": 883, + "com": 884, + ".A": 885, + "\";Ċ": 886, + "Ġid": 887, + "read": 888, + "Ġwho": 889, + "_b": 890, + "\">Ċ": 891, + "Ġtime": 892, + "Ġman": 893, + "ry": 894, + "========": 895, + "roup": 896, + "rop": 897, + "public": 898, + "vel": 899, + "umber": 900, + "ble": 901, + "Ġwhich": 902, + "****************": 903, + "Ġany": 904, + "Ġfalse": 905, + "we": 906, + "Ġvalue": 907, + "Ġli": 908, + "\")": 909, + "nder": 910, + "gr": 911, + "Ġno": 912, + "param": 913, + "25": 914, + "fig": 915, + ".com": 916, + "Ġapp": 917, + "_l": 918, + "ions": 919, + ".D": 920, + "ĠCh": 921, + "Ġabout": 922, + "Ġadd": 923, + "Ġsu": 924, + "Ġstring": 925, + "ID": 926, + "Ġover": 927, + "string": 928, + ".l": 929, + "ource": 930, + "000": 931, + "_C": 932, + "]Ċ": 933, + "Ġqu": 934, + "ĠString": 935, + "ca": 936, + "SE": 937, + "Ġro": 938, + "sh": 939, + "ual": 940, + "Type": 941, + "son": 942, + "new": 943, + "ern": 944, + "Ġag": 945, + "AR": 946, + "];Ċ": 947, + "].": 948, + "Ġ?": 949, + "ical": 950, + "Ġdes": 951, + "uth": 952, + "ix": 953, + "ays": 954, + "Ġtype": 955, + "'t": 956, + "ault": 957, + "Ġinter": 958, + "var": 959, + ".b": 960, + "Ġpart": 961, + ".d": 962, + "urrent": 963, + "IT": 964, + "EN": 965, + "30": 966, + "enc": 967, + "(f": 968, + "ra": 969, + "value": 970, + "cho": 971, + "18": 972, + "utton": 973, + "ose": 974, + "14": 975, + "Ġ!=": 976, + "ater": 977, + "é": 978, + "reate": 979, + "oll": 980, + "pos": 981, + "yle": 982, + "ng": 983, + "AL": 984, + "using": 985, + "ames": 986, + "Ġ{čĊ": 987, + "ates": 988, + "ely": 989, + "Ġwork": 990, + "Ġem": 991, + "inal": 992, + "Ġsp": 993, + "Ġwhen": 994, + ".set": 995, + "ĠĠĠĠĠĠ": 996, + "):Ċ": 997, + "to": 998, + "quire": 999, + "indow": 1000, + "lement": 1001, + "pect": 1002, + "ash": 1003, + "[i": 1004, + "Ġuse": 1005, + ".F": 1006, + "pec": 1007, + "Ġad": 1008, + "ove": 1009, + "ception": 1010, + "ength": 1011, + "include": 1012, + "ader": 1013, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1014, + "atus": 1015, + "Th": 1016, + "itle": 1017, + "rit": 1018, + "void": 1019, + "().": 1020, + "(Ċ": 1021, + "Ġoff": 1022, + "Ġother": 1023, + "Ġ&&": 1024, + "';Ċ": 1025, + "ms": 1026, + "Ġbeen": 1027, + "Ġte": 1028, + "ml": 1029, + "co": 1030, + "nc": 1031, + "13": 1032, + "ervice": 1033, + "Ġ%": 1034, + "**Ċ": 1035, + "ann": 1036, + "ade": 1037, + "ĊĊĊĊ": 1038, + "lock": 1039, + "const": 1040, + "100": 1041, + "ponse": 1042, + "Ġsup": 1043, + "++": 1044, + "date": 1045, + "Ġacc": 1046, + "Ġhad": 1047, + "Ġbu": 1048, + "200": 1049, + "ĠRe": 1050, + "Ġwere": 1051, + "Ġfile": 1052, + "Ġwould": 1053, + "ĠâĢľ": 1054, + "ven": 1055, + "iss": 1056, + "Ġour": 1057, + "class": 1058, + "raw": 1059, + "Ġyear": 1060, + "Data": 1061, + "Ġval": 1062, + "Ġsome": 1063, + "fter": 1064, + "ys": 1065, + "Ġ///": 1066, + "round": 1067, + "view": 1068, + "Ġpe": 1069, + "Ġthere": 1070, + "Ġsaid": 1071, + "du": 1072, + "of": 1073, + "line": 1074, + "/*": 1075, + "duct": 1076, + "Ġher": 1077, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1078, + "Res": 1079, + "Ġco": 1080, + "Ġcomm": 1081, + "ise": 1082, + "min": 1083, + "ĠĠĠĠĊ": 1084, + "#include": 1085, + "ethod": 1086, + ".P": 1087, + "ute": 1088, + "Ġass": 1089, + "Int": 1090, + "ask": 1091, + "loc": 1092, + "Ġlike": 1093, + "ody": 1094, + "Ġlet": 1095, + "load": 1096, + "Ġam": 1097, + "rol": 1098, + "Ġgr": 1099, + "yp": 1100, + "Ġalso": 1101, + "ĠIt": 1102, + "url": 1103, + "ific": 1104, + "ors": 1105, + "_P": 1106, + "_n": 1107, + "igh": 1108, + "Ġthan": 1109, + "Com": 1110, + "AN": 1111, + "UL": 1112, + "ating": 1113, + "17": 1114, + "ĠThis": 1115, + "ref": 1116, + "_S": 1117, + "Ġstatic": 1118, + "roll": 1119, + "Ġjust": 1120, + "Ġresult": 1121, + "ian": 1122, + "idth": 1123, + "Ġthem": 1124, + "));Ċ": 1125, + "der": 1126, + "reak": 1127, + "Con": 1128, + "://": 1129, + "ule": 1130, + "...": 1131, + "arch": 1132, + "ement": 1133, + "Ġ<<": 1134, + "50": 1135, + "ush": 1136, + "ense": 1137, + "arr": 1138, + "Ġinto": 1139, + "cess": 1140, + "amp": 1141, + "ied": 1142, + "ument": 1143, + "Ġ\\": 1144, + "],": 1145, + "wo": 1146, + "als": 1147, + "Ġwhat": 1148, + "anc": 1149, + "Value": 1150, + "='": 1151, + "olum": 1152, + "Ġpos": 1153, + "ages": 1154, + "ayer": 1155, + "Ġsc": 1156, + "ues": 1157, + "\")Ċ": 1158, + "_T": 1159, + "Ġlist": 1160, + "(s": 1161, + "Ġcase": 1162, + "Ch": 1163, + "ĉĉĉĉĉ": 1164, + "////////": 1165, + "ponent": 1166, + "Ġz": 1167, + "Ġkn": 1168, + "let": 1169, + "DE": 1170, + "red": 1171, + "Ġfe": 1172, + "Ġ},Ċ": 1173, + "Ġ,": 1174, + "(t": 1175, + "Ġfirst": 1176, + "');Ċ": 1177, + "word": 1178, + "Ġimport": 1179, + "Ġact": 1180, + "Ġchar": 1181, + "CT": 1182, + "ĠTr": 1183, + "ople": 1184, + "={": 1185, + "ĉf": 1186, + "24": 1187, + "ient": 1188, + "cent": 1189, + ".j": 1190, + "lection": 1191, + "))Ċ": 1192, + "Ġonly": 1193, + "Ġprint": 1194, + "mer": 1195, + ".W": 1196, + "ock": 1197, + "Ġ--": 1198, + "Text": 1199, + "Ġop": 1200, + "ank": 1201, + "Ġits": 1202, + "Ġback": 1203, + "[\"": 1204, + "Ġneed": 1205, + "Ġcl": 1206, + "Ġsub": 1207, + "Ġla": 1208, + "((": 1209, + ".\"": 1210, + "Object": 1211, + "Ġstart": 1212, + "file": 1213, + "(self": 1214, + "ner": 1215, + "ey": 1216, + "Ġuser": 1217, + "Ġent": 1218, + "ĠCom": 1219, + "its": 1220, + "ĠCon": 1221, + "ouble": 1222, + "ower": 1223, + "item": 1224, + "very": 1225, + "ĠWe": 1226, + "64": 1227, + "lick": 1228, + "ĠQ": 1229, + "php": 1230, + "ttp": 1231, + "':": 1232, + "ics": 1233, + "Ġunder": 1234, + "Ġ*Ċ": 1235, + ".L": 1236, + ");": 1237, + "ices": 1238, + "Ġreg": 1239, + ")čĊ": 1240, + "ĉpublic": 1241, + "SS": 1242, + "Ġthen": 1243, + "reat": 1244, + "ious": 1245, + ".G": 1246, + "ek": 1247, + "irect": 1248, + "heck": 1249, + "cript": 1250, + "ning": 1251, + "ĠUn": 1252, + "Ġmay": 1253, + "ĠWh": 1254, + "Bo": 1255, + "Item": 1256, + "struct": 1257, + ".st": 1258, + "ream": 1259, + "ible": 1260, + "loat": 1261, + "Ġorg": 1262, + "und": 1263, + "sum": 1264, + "_in": 1265, + "../": 1266, + "_M": 1267, + "Ġhow": 1268, + "rite": 1269, + "'Ċ": 1270, + "To": 1271, + "40": 1272, + "ww": 1273, + "Ġpeople": 1274, + "index": 1275, + ".n": 1276, + "http": 1277, + "(m": 1278, + "ector": 1279, + "Ġind": 1280, + "Ġjav": 1281, + "],Ċ": 1282, + "ĠHe": 1283, + "_st": 1284, + "ful": 1285, + "ole": 1286, + "){Ċ": 1287, + "Ġshould": 1288, + "opy": 1289, + "elp": 1290, + "ier": 1291, + "_name": 1292, + "erson": 1293, + "ION": 1294, + "ote": 1295, + "Ġtest": 1296, + "Ġbet": 1297, + "rror": 1298, + "ular": 1299, + "ãĢ": 1300, + "ĠÐ": 1301, + "bs": 1302, + "ting": 1303, + "Ġmake": 1304, + "Tr": 1305, + "Ġafter": 1306, + "arget": 1307, + "RO": 1308, + "olumn": 1309, + "rc": 1310, + "_re": 1311, + "define": 1312, + "22": 1313, + "Ġright": 1314, + "right": 1315, + "day": 1316, + "Ġlong": 1317, + "[]": 1318, + "(p": 1319, + "td": 1320, + "cond": 1321, + "ĠPro": 1322, + "Ġrem": 1323, + "ptions": 1324, + "vid": 1325, + ".g": 1326, + "Ġext": 1327, + "Ġ__": 1328, + "')Ċ": 1329, + "pace": 1330, + "mp": 1331, + "Ġmin": 1332, + "stance": 1333, + "air": 1334, + "action": 1335, + "wh": 1336, + "type": 1337, + "util": 1338, + "ait": 1339, + "": 1340, + "IC": 1341, + "text": 1342, + "Ġph": 1343, + "Ġfl": 1344, + ".M": 1345, + "ccess": 1346, + "br": 1347, + "fore": 1348, + "ersion": 1349, + "),Ċ": 1350, + ".re": 1351, + "ateg": 1352, + "Ġloc": 1353, + "ins": 1354, + "-s": 1355, + "trib": 1356, + "ĠInt": 1357, + "Ġarray": 1358, + ",\"": 1359, + "Pro": 1360, + "(c": 1361, + "ession": 1362, + ">ĊĊ": 1363, + "Ġshe": 1364, + "\"]": 1365, + "aph": 1366, + "Ġexp": 1367, + "erty": 1368, + "ĠSe": 1369, + "Ġpar": 1370, + "unc": 1371, + "ET": 1372, + "Ġread": 1373, + "print": 1374, + "Ġrel": 1375, + "Ġform": 1376, + "Ġdr": 1377, + "Exception": 1378, + "input": 1379, + "Ġtrans": 1380, + "########": 1381, + "order": 1382, + "By": 1383, + "Ġaw": 1384, + "ities": 1385, + "uff": 1386, + "play": 1387, + ".add": 1388, + "ĠâĢĵ": 1389, + "Ġwant": 1390, + "Ġcomp": 1391, + "ments": 1392, + "Ġ||": 1393, + "az": 1394, + "be": 1395, + "Ġnumber": 1396, + "Ġrequire": 1397, + "ĠEx": 1398, + "60": 1399, + "Ġcol": 1400, + "Ġkey": 1401, + "ember": 1402, + "Ġtwo": 1403, + "Ġsize": 1404, + "Ġwhere": 1405, + "UT": 1406, + "result": 1407, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1408, + "ough": 1409, + "orld": 1410, + "ood": 1411, + "uch": 1412, + "ative": 1413, + "ger": 1414, + "arent": 1415, + "Ġ/*": 1416, + "Ġarg": 1417, + "Ġwhile": 1418, + "23": 1419, + "(this": 1420, + "Ġrec": 1421, + "Ġdif": 1422, + "State": 1423, + "Ġspec": 1424, + "ride": 1425, + "_F": 1426, + "Ġlook": 1427, + "AM": 1428, + "ility": 1429, + "eter": 1430, + "âĢĻt": 1431, + "ĊĊĊ": 1432, + "ayout": 1433, + "--------------------------------": 1434, + "ager": 1435, + "Ġcould": 1436, + "Ġbr": 1437, + "ends": 1438, + "ures": 1439, + "Ġknow": 1440, + "ets": 1441, + "ĠIf": 1442, + "ĠSh": 1443, + ".w": 1444, + "back": 1445, + "Ġser": 1446, + "Ġ+=": 1447, + "Ġfr": 1448, + "());Ċ": 1449, + "Ġhand": 1450, + "Ind": 1451, + "ULL": 1452, + "Im": 1453, + "();ĊĊ": 1454, + "Ġmost": 1455, + "Ġtry": 1456, + "Ġnow": 1457, + "rough": 1458, + ">čĊ": 1459, + "ackage": 1460, + "Ġhim": 1461, + "._": 1462, + "ify": 1463, + "Ġbreak": 1464, + "Ġ);Ċ": 1465, + "ren": 1466, + "#define": 1467, + "itt": 1468, + "Ġap": 1469, + "ĉc": 1470, + "(n": 1471, + "ĠYou": 1472, + ":ĊĊ": 1473, + "-m": 1474, + "Ġevery": 1475, + "ustom": 1476, + "lient": 1477, + "ocument": 1478, + "cription": 1479, + "Error": 1480, + "-b": 1481, + "о": 1482, + "][": 1483, + "99": 1484, + "trans": 1485, + "Ġpoint": 1486, + "Ġstd": 1487, + "Ġfil": 1488, + "Time": 1489, + "80": 1490, + "Ġmod": 1491, + "Ġ->": 1492, + "Ġerror": 1493, + "ah": 1494, + "Ġtext": 1495, + "roller": 1496, + "lose": 1497, + "ql": 1498, + "Ġpol": 1499, + ">": 1500, + "Ġshow": 1501, + "User": 1502, + "ased": 1503, + "Ġ{ĊĊ": 1504, + "Ġfind": 1505, + "а": 1506, + "ED": 1507, + "span": 1508, + "enu": 1509, + "Ġcurrent": 1510, + "Ġused": 1511, + "cept": 1512, + "clud": 1513, + "Ġplay": 1514, + "Ġlog": 1515, + "ution": 1516, + "fl": 1517, + "Ġsee": 1518, + "indows": 1519, + "Ġhelp": 1520, + "Ġthese": 1521, + "Ġpass": 1522, + "Ġdown": 1523, + "Ġeven": 1524, + "ason": 1525, + "uild": 1526, + "from": 1527, + "(d": 1528, + "Ġbl": 1529, + "label": 1530, + "else": 1531, + "е": 1532, + "Ġ(!": 1533, + "ized": 1534, + "(),": 1535, + "Ġob": 1536, + "Ġitem": 1537, + "ump": 1538, + "UR": 1539, + "orn": 1540, + "Ġdon": 1541, + "Se": 1542, + "man": 1543, + "27": 1544, + "ample": 1545, + "tn": 1546, + "================": 1547, + "He": 1548, + "gram": 1549, + "Ġdid": 1550, + "wn": 1551, + "_h": 1552, + "iver": 1553, + "Ġsm": 1554, + "Ġthrough": 1555, + "ĠAn": 1556, + "che": 1557, + "Ġinv": 1558, + "ouse": 1559, + "Ġes": 1560, + "ĠNew": 1561, + "export": 1562, + "mary": 1563, + "uto": 1564, + "ler": 1565, + "Ġlast": 1566, + "Ġevent": 1567, + "try": 1568, + "ï¼": 1569, + "ily": 1570, + "igned": 1571, + "ines": 1572, + "ollow": 1573, + "icense": 1574, + "sole": 1575, + "lear": 1576, + "(int": 1577, + "Ġagain": 1578, + "Ġhigh": 1579, + "html": 1580, + "Index": 1581, + "uthor": 1582, + "Ġ/**Ċ": 1583, + "Ġline": 1584, + "Event": 1585, + "_D": 1586, + "Ġdoes": 1587, + "itial": 1588, + "Ġcr": 1589, + "ars": 1590, + "28": 1591, + "Ġtem": 1592, + "cause": 1593, + "face": 1594, + "Ġ`": 1595, + "_A": 1596, + "Button": 1597, + "ature": 1598, + "ected": 1599, + "ES": 1600, + "ister": 1601, + "ĉĊ": 1602, + "Ġbefore": 1603, + "ale": 1604, + "other": 1605, + "Ġbecause": 1606, + "roid": 1607, + "Ġed": 1608, + "ik": 1609, + "reg": 1610, + "ĠDe": 1611, + "Ġdist": 1612, + "},Ċ": 1613, + "Ġstate": 1614, + "Ġcons": 1615, + "rint": 1616, + "att": 1617, + "Ġhere": 1618, + "ined": 1619, + "Ġfinal": 1620, + "Ġ\"\"": 1621, + "Key": 1622, + "LO": 1623, + "Ġdel": 1624, + "pty": 1625, + "thing": 1626, + "26": 1627, + "ĠAnd": 1628, + "Ġrun": 1629, + "ĠX": 1630, + "ym": 1631, + ".app": 1632, + "Ġvery": 1633, + "ces": 1634, + "_N": 1635, + "ared": 1636, + "ward": 1637, + "list": 1638, + "ited": 1639, + "olog": 1640, + "itch": 1641, + "Box": 1642, + "ife": 1643, + "33": 1644, + "Ġac": 1645, + "Ġmodel": 1646, + "Ġmon": 1647, + "Ġway": 1648, + "lete": 1649, + "Ġcall": 1650, + "Ġatt": 1651, + "Ġcal": 1652, + "vert": 1653, + "Ġdec": 1654, + "lease": 1655, + "oun": 1656, + "Ġ});Ċ": 1657, + "fr": 1658, + "formation": 1659, + "etail": 1660, + "Ġnum": 1661, + "aj": 1662, + "query": 1663, + "Ġwell": 1664, + "Ġobject": 1665, + "ĠAs": 1666, + "Ġyears": 1667, + "Color": 1668, + "IS": 1669, + "Ġdefault": 1670, + "Wh": 1671, + "Ġins": 1672, + "aint": 1673, + "Ġjava": 1674, + "Ġsim": 1675, + "ĠAr": 1676, + "mon": 1677, + "til": 1678, + "();čĊ": 1679, + "):": 1680, + "Set": 1681, + "29": 1682, + "atter": 1683, + "Ġview": 1684, + "Ġpres": 1685, + "array": 1686, + "We": 1687, + "At": 1688, + "Ġbel": 1689, + "Ġmany": 1690, + "21": 1691, + "Man": 1692, + "ender": 1693, + "Ġbeing": 1694, + "Ġgood": 1695, + "ĉĉĉĉĉĉ": 1696, + "ational": 1697, + "ware": 1698, + ".log": 1699, + "{čĊ": 1700, + "Ġusing": 1701, + "_B": 1702, + "Ġ:=": 1703, + "_w": 1704, + "ists": 1705, + "lish": 1706, + "Ġstud": 1707, + "ĠAl": 1708, + "Ġgu": 1709, + "config": 1710, + "uring": 1711, + "time": 1712, + "oken": 1713, + "amespace": 1714, + "Ġrequest": 1715, + "Ġchild": 1716, + "ĠÃ": 1717, + "lob": 1718, + "Ġparam": 1719, + "Ġ}čĊ": 1720, + "01": 1721, + "Ġecho": 1722, + "function": 1723, + "********************************": 1724, + "ps": 1725, + "Element": 1726, + "alk": 1727, + "lication": 1728, + "by": 1729, + "Size": 1730, + "rawing": 1731, + "Ġperson": 1732, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1733, + "\\n": 1734, + "object": 1735, + "ince": 1736, + "En": 1737, + "File": 1738, + "uf": 1739, + "ffect": 1740, + "AC": 1741, + "Ġstyle": 1742, + "summary": 1743, + "Ġque": 1744, + "_r": 1745, + "Ġ($": 1746, + "Model": 1747, + "ident": 1748, + "Ġmethod": 1749, + "IL": 1750, + "ott": 1751, + "less": 1752, + "ING": 1753, + "Ġ()": 1754, + "Ġexpect": 1755, + "ync": 1756, + "package": 1757, + "35": 1758, + "urs": 1759, + "Ġprot": 1760, + "./": 1761, + "pre": 1762, + "Ġ)Ċ": 1763, + "ma": 1764, + "Ġsur": 1765, + "Ġfound": 1766, + "Info": 1767, + "par": 1768, + "imes": 1769, + ".e": 1770, + "ains": 1771, + "Ġpost": 1772, + "-d": 1773, + "45": 1774, + "olean": 1775, + "Ġsl": 1776, + "PE": 1777, + "Ġsuch": 1778, + "select": 1779, + "ainer": 1780, + "Ġthink": 1781, + "Ġdiffer": 1782, + ".r": 1783, + "/**Ċ": 1784, + "FF": 1785, + "ool": 1786, + "plate": 1787, + "qual": 1788, + "ĠFor": 1789, + "Ġmuch": 1790, + "uc": 1791, + "(new": 1792, + "odule": 1793, + "Ġsom": 1794, + "Ġhttp": 1795, + "ĠList": 1796, + "Ġcount": 1797, + "Ġinst": 1798, + "char": 1799, + "mit": 1800, + ".id": 1801, + "aking": 1802, + "Ġgener": 1803, + "px": 1804, + "vice": 1805, + "37": 1806, + "_data": 1807, + "ĠNULL": 1808, + "}čĊ": 1809, + "idd": 1810, + "ãĢĤ": 1811, + "Ġmed": 1812, + "org": 1813, + "ider": 1814, + "ache": 1815, + "work": 1816, + "Ġcheck": 1817, + "ween": 1818, + "Ġ((": 1819, + "the": 1820, + "ants": 1821, + "><": 1822, + ".B": 1823, + "-c": 1824, + "Ġopen": 1825, + "Ġest": 1826, + "ĠĠĠĠĠĠĠĠĊ": 1827, + "Ġnext": 1828, + "IM": 1829, + "ÑĤ": 1830, + "OT": 1831, + "ó": 1832, + "Ġfollow": 1833, + "content": 1834, + "ĠĠĠĠĠĠĠĠĠĠĠĠ": 1835, + "Ġinclud": 1836, + "HE": 1837, + "ĠRes": 1838, + "Ġhref": 1839, + "и": 1840, + "Ġcar": 1841, + "ypes": 1842, + "image": 1843, + "Un": 1844, + "Ġbool": 1845, + "AD": 1846, + "Ġgame": 1847, + ".Form": 1848, + "rows": 1849, + "*/": 1850, + "velop": 1851, + ".Drawing": 1852, + "Ġpath": 1853, + "ision": 1854, + "Ġeach": 1855, + "ĠPl": 1856, + "_type": 1857, + "Path": 1858, + "nection": 1859, + "Ġav": 1860, + "').": 1861, + "Ġsupport": 1862, + "ENT": 1863, + "rem": 1864, + "\").": 1865, + "Ġown": 1866, + "Ġcor": 1867, + "count": 1868, + "miss": 1869, + "ually": 1870, + "Ġmem": 1871, + "std": 1872, + "ience": 1873, + "search": 1874, + "\"ĊĊ": 1875, + "Form": 1876, + "Ġsex": 1877, + "ename": 1878, + "Ġsign": 1879, + "Ġet": 1880, + "ĠĠĠĠĠĠĠĠĠĠ": 1881, + "','": 1882, + "ĠApp": 1883, + "Ġthose": 1884, + "off": 1885, + "Ġerr": 1886, + "Ġsystem": 1887, + "Ġbest": 1888, + "code": 1889, + "Ġsame": 1890, + "Ġdi": 1891, + "uss": 1892, + "Ġcreate": 1893, + "ather": 1894, + "Array": 1895, + ".in": 1896, + "fe": 1897, + "Service": 1898, + "UN": 1899, + "ats": 1900, + "ĠZ": 1901, + "alth": 1902, + "Ġmade": 1903, + "true": 1904, + "AB": 1905, + "Ġmark": 1906, + "rid": 1907, + "ified": 1908, + ",čĊ": 1909, + "yn": 1910, + "press": 1911, + "Ġgroup": 1912, + "Ġfin": 1913, + "ĠLicense": 1914, + "Field": 1915, + "eger": 1916, + "Ġworld": 1917, + "iness": 1918, + "ty": 1919, + "Ġprocess": 1920, + "(b": 1921, + "Ġcre": 1922, + "arn": 1923, + "ives": 1924, + "Ġmain": 1925, + "ideo": 1926, + "36": 1927, + "_g": 1928, + "AG": 1929, + "valid": 1930, + "img": 1931, + "PI": 1932, + "Ġcolor": 1933, + "Ġreport": 1934, + "Ġtake": 1935, + "rib": 1936, + "OM": 1937, + "Ġday": 1938, + "Request": 1939, + "Ġsk": 1940, + "bers": 1941, + "ĉs": 1942, + ".Add": 1943, + "oot": 1944, + "Image": 1945, + "Ġcomple": 1946, + "ollection": 1947, + "Ġtop": 1948, + "Ġfree": 1949, + "AS": 1950, + "De": 1951, + "ĠOn": 1952, + "IG": 1953, + "90": 1954, + "eta": 1955, + "Date": 1956, + "Ġaction": 1957, + "34": 1958, + "Over": 1959, + "itor": 1960, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1961, + "not": 1962, + "Ġindex": 1963, + "her": 1964, + "icon": 1965, + "On": 1966, + ";čĊčĊ": 1967, + "ivity": 1968, + "mand": 1969, + ".Windows": 1970, + "OL": 1971, + "Ġreal": 1972, + "Ġmax": 1973, + "land": 1974, + "....": 1975, + "raph": 1976, + "Ġbuild": 1977, + "leg": 1978, + "assword": 1979, + "?ĊĊ": 1980, + "â̦": 1981, + "ook": 1982, + "uck": 1983, + "Ġmessage": 1984, + "test": 1985, + "ivers": 1986, + "38": 1987, + "Ġinput": 1988, + "Ġart": 1989, + "Ġbetween": 1990, + "Get": 1991, + "enter": 1992, + "ground": 1993, + "ene": 1994, + "á": 1995, + ".length": 1996, + "Node": 1997, + "(i": 1998, + "Class": 1999, + "for": 2000, + "ĠâĢĶ": 2001, + "ten": 2002, + "oin": 2003, + "Ġke": 2004, + "ui": 2005, + "ĠIN": 2006, + "Ġtable": 2007, + "sub": 2008, + "ĠLe": 2009, + "Ġhead": 2010, + "Ġmust": 2011, + "////////////////": 2012, + ".util": 2013, + "Context": 2014, + "Ġorder": 2015, + "Ġmov": 2016, + "over": 2017, + "Ġcontin": 2018, + "Ġsay": 2019, + "static": 2020, + ".Text": 2021, + "ĠclassName": 2022, + "pany": 2023, + "Ġter": 2024, + "head": 2025, + "rg": 2026, + "Ġproduct": 2027, + "This": 2028, + ".âĢĿ": 2029, + "ĠBut": 2030, + "70": 2031, + "loy": 2032, + "Ġdouble": 2033, + "sg": 2034, + "Ġplace": 2035, + ".x": 2036, + "message": 2037, + "Ġinformation": 2038, + "private": 2039, + "Ġoper": 2040, + "ced": 2041, + "db": 2042, + "\">": 2043, + "Param": 2044, + "icle": 2045, + "Ġweek": 2046, + "Ġprop": 2047, + "table": 2048, + "idget": 2049, + "place": 2050, + "Prop": 2051, + "ĠAll": 2052, + "els": 2053, + "box": 2054, + ".ĊĊĊĊ": 2055, + ".R": 2056, + "ĠTo": 2057, + "iter": 2058, + "Sh": 2059, + "uration": 2060, + "older": 2061, + "_list": 2062, + "come": 2063, + "Ġsw": 2064, + "ization": 2065, + "ĉfor": 2066, + "bl": 2067, + "Ġprogram": 2068, + "(e": 2069, + "ape": 2070, + "check": 2071, + ".Forms": 2072, + "Ġund": 2073, + "ategory": 2074, + "75": 2075, + "ags": 2076, + "Ġresponse": 2077, + "US": 2078, + "request": 2079, + "Ġstruct": 2080, + "escription": 2081, + "Ġcode": 2082, + "_H": 2083, + "uffer": 2084, + "Ġwithout": 2085, + "lobal": 2086, + "Manager": 2087, + "ilter": 2088, + "PO": 2089, + "ĉthis": 2090, + "option": 2091, + "Ġsol": 2092, + "Ġ===": 2093, + "akes": 2094, + "Controller": 2095, + "44": 2096, + "Message": 2097, + "Ġref": 2098, + "ever": 2099, + "ĠSo": 2100, + "aining": 2101, + ".append": 2102, + "Ġstill": 2103, + "Ġprovid": 2104, + "Ġassert": 2105, + "med": 2106, + "Ġcap": 2107, + "usiness": 2108, + "Ġrep": 2109, + "tings": 2110, + "ved": 2111, + ".N": 2112, + "api": 2113, + "OD": 2114, + "Ġfield": 2115, + "iven": 2116, + "oto": 2117, + "âĢľ": 2118, + "col": 2119, + "(x": 2120, + "ght": 2121, + "Result": 2122, + "Code": 2123, + ".is": 2124, + "link": 2125, + "Ġcour": 2126, + "An": 2127, + "Ġteam": 2128, + "ĉint": 2129, + "ift": 2130, + "55": 2131, + "Ġsecond": 2132, + "Ġgoing": 2133, + "Ġrange": 2134, + "_E": 2135, + "ness": 2136, + "39": 2137, + "Ġfam": 2138, + "Ġnil": 2139, + "ĠCont": 2140, + "ailable": 2141, + "utes": 2142, + "atab": 2143, + "Ġfact": 2144, + "Ġvis": 2145, + "(&": 2146, + "ĠAN": 2147, + "31": 2148, + "Al": 2149, + "title": 2150, + "Ġandroid": 2151, + "CE": 2152, + "\\\"": 2153, + "irt": 2154, + "Ġwrit": 2155, + "н": 2156, + "ĉm": 2157, + "ftware": 2158, + "ond": 2159, + "Ġret": 2160, + "osition": 2161, + "Ġhome": 2162, + "Ġleft": 2163, + "args": 2164, + "meric": 2165, + "48": 2166, + "Ġdirect": 2167, + "oci": 2168, + "Pl": 2169, + "As": 2170, + "ret": 2171, + "ado": 2172, + "Of": 2173, + "chn": 2174, + "ĠGet": 2175, + "ee": 2176, + "ross": 2177, + "();": 2178, + "____": 2179, + ".ph": 2180, + "It": 2181, + "oute": 2182, + "Ġexper": 2183, + "chool": 2184, + "www": 2185, + "},": 2186, + "Ġallow": 2187, + "ĠÂ": 2188, + "())": 2189, + "size": 2190, + "ism": 2191, + "ai": 2192, + "tract": 2193, + "ane": 2194, + "...ĊĊ": 2195, + "context": 2196, + "Ġbeg": 2197, + "CH": 2198, + "Ġpage": 2199, + "hip": 2200, + "no": 2201, + "core": 2202, + "sp": 2203, + "Ġdifferent": 2204, + "iable": 2205, + "ĠMe": 2206, + "_IN": 2207, + "button": 2208, + "ĠIs": 2209, + "ervices": 2210, + "Ġca": 2211, + "Ġaround": 2212, + "App": 2213, + "ration": 2214, + "Ġrece": 2215, + "Ġreally": 2216, + "Ġimage": 2217, + "Ġtarget": 2218, + "Ġdep": 2219, + "opyright": 2220, + "tra": 2221, + "ingle": 2222, + "ital": 2223, + "Layout": 2224, + "Ġboth": 2225, + "Override": 2226, + "arm": 2227, + "=>": 2228, + "aterial": 2229, + "iled": 2230, + "Ġput": 2231, + "Qu": 2232, + "ÑĢ": 2233, + "ung": 2234, + "map": 2235, + "ĉĉĉĉĉĉĉĉ": 2236, + "Ġlevel": 2237, + "Component": 2238, + "book": 2239, + "creen": 2240, + "_RE": 2241, + "Ġconfig": 2242, + "ãģ": 2243, + "Or": 2244, + ".data": 2245, + "Ġdocument": 2246, + "\",\"": 2247, + "tribute": 2248, + "ux": 2249, + "Log": 2250, + "ference": 2251, + "post": 2252, + "_e": 2253, + "Ġlocal": 2254, + "andom": 2255, + "assert": 2256, + "Val": 2257, + "lected": 2258, + "ina": 2259, + "atabase": 2260, + "Add": 2261, + "Ġcontent": 2262, + ".print": 2263, + "signed": 2264, + "ric": 2265, + ".\"ĊĊ": 2266, + "Ġfa": 2267, + "!ĊĊ": 2268, + "-f": 2269, + "ived": 2270, + "Ġquest": 2271, + ".ex": 2272, + "Ġfloat": 2273, + "Ġdevelop": 2274, + "оÐ": 2275, + "Map": 2276, + "ading": 2277, + "Ġposs": 2278, + "UE": 2279, + "namespace": 2280, + "_O": 2281, + "ĉb": 2282, + ".Get": 2283, + ">(": 2284, + "json": 2285, + "etails": 2286, + "66": 2287, + "Ġtoo": 2288, + "Ġextends": 2289, + "ĠNone": 2290, + "Ġfore": 2291, + "(String": 2292, + "format": 2293, + "Ġgreat": 2294, + "inter": 2295, + "cale": 2296, + "Ñģ": 2297, + "ron": 2298, + "iving": 2299, + "Ent": 2300, + "ency": 2301, + "xt": 2302, + "oy": 2303, + "05": 2304, + "Ġmonth": 2305, + "Ġhapp": 2306, + "Ġsuper": 2307, + "bar": 2308, + "default": 2309, + "_de": 2310, + "ords": 2311, + "ln": 2312, + "({Ċ": 2313, + "ĠInd": 2314, + "ases": 2315, + "Ġtitle": 2316, + "Ġcontext": 2317, + "08": 2318, + "oh": 2319, + "-p": 2320, + "Em": 2321, + "Ġmet": 2322, + "Test": 2323, + "Ġlife": 2324, + "_v": 2325, + "ĠUS": 2326, + "UI": 2327, + "ocation": 2328, + "md": 2329, + "Ġ[Ċ": 2330, + "Ġ]": 2331, + "sw": 2332, + "Ġincre": 2333, + "script": 2334, + "ential": 2335, + "ways": 2336, + ".de": 2337, + "Ġsrc": 2338, + "Ġcatch": 2339, + "ĠAmeric": 2340, + "//Ċ": 2341, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 2342, + "Ġpay": 2343, + "plit": 2344, + "âĢĶ": 2345, + "Ġcoun": 2346, + "obj": 2347, + ".php": 2348, + "Ġchange": 2349, + "ething": 2350, + "'re": 2351, + "aster": 2352, + "los": 2353, + "lation": 2354, + "ĠĠĊ": 2355, + "Le": 2356, + "ä": 2357, + "({": 2358, + "ready": 2359, + "ĠNo": 2360, + "Ġposition": 2361, + "Ġold": 2362, + "Ġbook": 2363, + "abled": 2364, + "bug": 2365, + "202": 2366, + "Hand": 2367, + "};ĊĊ": 2368, + "isplay": 2369, + "aving": 2370, + "04": 2371, + "Ġgover": 2372, + "Ġversion": 2373, + "System": 2374, + "nect": 2375, + "response": 2376, + "Style": 2377, + "Up": 2378, + "angu": 2379, + "Ġthree": 2380, + "init": 2381, + "ero": 2382, + "Ġlaw": 2383, + "endif": 2384, + "Ġbase": 2385, + "email": 2386, + "(l": 2387, + "_V": 2388, + "Ġconf": 2389, + "ATE": 2390, + "Ġduring": 2391, + "tes": 2392, + "Ġconsole": 2393, + "ĠPr": 2394, + "Ġspe": 2395, + "ves": 2396, + "65": 2397, + "path": 2398, + "ialog": 2399, + "dition": 2400, + "_to": 2401, + "ards": 2402, + "Ġagainst": 2403, + "etwork": 2404, + "ĠPh": 2405, + "_L": 2406, + "cur": 2407, + "imit": 2408, + "With": 2409, + "Ġpower": 2410, + "ium": 2411, + "';ĊĊ": 2412, + "Ġwom": 2413, + "left": 2414, + "ources": 2415, + "atri": 2416, + "ĠIm": 2417, + "ĠMan": 2418, + "orth": 2419, + "${": 2420, + "88": 2421, + "quals": 2422, + "ese": 2423, + "_size": 2424, + "Ġiss": 2425, + "otal": 2426, + "-g": 2427, + "ique": 2428, + "rame": 2429, + "Ġwidth": 2430, + "erg": 2431, + ")(": 2432, + "ittle": 2433, + "TR": 2434, + "ĠThey": 2435, + "ences": 2436, + "02": 2437, + "rl": 2438, + "ons": 2439, + "Ġlabel": 2440, + ".y": 2441, + "-t": 2442, + "update": 2443, + "anel": 2444, + "sc": 2445, + ".to": 2446, + "Ġproject": 2447, + "ü": 2448, + "Ġelement": 2449, + "Ġsuccess": 2450, + "ĉĉĊ": 2451, + ".sh": 2452, + "ram": 2453, + "ched": 2454, + "())Ċ": 2455, + "Ġ(Ċ": 2456, + "Ġdate": 2457, + "Ġtot": 2458, + "_ST": 2459, + "All": 2460, + "ification": 2461, + "ĉvar": 2462, + "Ġtri": 2463, + "chem": 2464, + "my": 2465, + "Ġbig": 2466, + "ĠAd": 2467, + "ĠAt": 2468, + "ots": 2469, + "num": 2470, + "Act": 2471, + "Ġmap": 2472, + "era": 2473, + "cope": 2474, + ".$": 2475, + ",âĢĿ": 2476, + "Ġpop": 2477, + "Ġfew": 2478, + "Ġlen": 2479, + "uid": 2480, + "eters": 2481, + "ules": 2482, + "ÃŃ": 2483, + "source": 2484, + "https": 2485, + "Ġdem": 2486, + "Ġear": 2487, + "################": 2488, + "Ġmatch": 2489, + "ories": 2490, + "49": 2491, + "aces": 2492, + "ĠCl": 2493, + "Ġnode": 2494, + "78": 2495, + "irc": 2496, + "local": 2497, + "unity": 2498, + "};Ċ": 2499, + "Ġanother": 2500, + "<<": 2501, + "ogle": 2502, + "Ġsit": 2503, + "ework": 2504, + "TE": 2505, + ".I": 2506, + "NS": 2507, + "ology": 2508, + "ought": 2509, + ".Cont": 2510, + ">>": 2511, + "Ġcare": 2512, + "state": 2513, + "ĉprivate": 2514, + "Ġeffect": 2515, + "++)": 2516, + "_file": 2517, + "ending": 2518, + "Line": 2519, + "For": 2520, + "ior": 2521, + "ĠSc": 2522, + "Ġfun": 2523, + ".Size": 2524, + "ĉelse": 2525, + "])": 2526, + "start": 2527, + "vious": 2528, + "Ġ},": 2529, + "ours": 2530, + "Ġleg": 2531, + "Ġservice": 2532, + "Ġsince": 2533, + "iron": 2534, + "Label": 2535, + "Ġnon": 2536, + "Ġlos": 2537, + "iction": 2538, + "Ġfull": 2539, + "acter": 2540, + "board": 2541, + "gress": 2542, + "Ġturn": 2543, + "ither": 2544, + "09": 2545, + ".size": 2546, + "Ġbody": 2547, + "resh": 2548, + "eturn": 2549, + "199": 2550, + "(_": 2551, + "yles": 2552, + "ormal": 2553, + "pi": 2554, + "Ġsomething": 2555, + "!--": 2556, + "uint": 2557, + "Ġprodu": 2558, + "Ġstand": 2559, + "Ġproble": 2560, + "Ġavailable": 2561, + "mt": 2562, + "ĠBl": 2563, + "Ġ...": 2564, + "Ġblock": 2565, + "Input": 2566, + "Ġkeep": 2567, + "Count": 2568, + "open": 2569, + "Ġ['": 2570, + "Ġthrow": 2571, + "uilder": 2572, + "Action": 2573, + "Ġthings": 2574, + "True": 2575, + "Ġurl": 2576, + "ĠBo": 2577, + "printf": 2578, + "Ġred": 2579, + "js": 2580, + ".create": 2581, + "ĠOr": 2582, + "Status": 2583, + "Instance": 2584, + "Ġcontrol": 2585, + "Ġcome": 2586, + "Ġcustom": 2587, + "location": 2588, + "07": 2589, + "model": 2590, + "ĠčĊ": 2591, + "Ġsource": 2592, + "Ġeas": 2593, + ".out": 2594, + "]ĊĊ": 2595, + "oney": 2596, + "Ġawait": 2597, + "Ġpartic": 2598, + "AP": 2599, + "ublish": 2600, + "odes": 2601, + "_pro": 2602, + "ply": 2603, + "riter": 2604, + "Ġprov": 2605, + "Ġmill": 2606, + "HT": 2607, + "])Ċ": 2608, + "Ġchang": 2609, + "Ġask": 2610, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 2611, + "Ġoutput": 2612, + "Ġemail": 2613, + "68": 2614, + ".push": 2615, + "Ġ}čĊčĊ": 2616, + "ination": 2617, + "47": 2618, + "atrix": 2619, + "Table": 2620, + "uccess": 2621, + "]);Ċ": 2622, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 2623, + "Ġdisc": 2624, + "([": 2625, + "Ġbusiness": 2626, + "height": 2627, + ".html": 2628, + "ta": 2629, + "field": 2630, + "Ġrequired": 2631, + "_R": 2632, + "Ġgovern": 2633, + "}čĊčĊ": 2634, + "lex": 2635, + "500": 2636, + ".,": 2637, + "ĠSet": 2638, + "urch": 2639, + "///": 2640, + "ts": 2641, + "af": 2642, + "Ġmight": 2643, + "istory": 2644, + "Str": 2645, + "Ġnever": 2646, + "Response": 2647, + "arse": 2648, + "ada": 2649, + "ĠHow": 2650, + "Ġ*)": 2651, + "Ġ;": 2652, + "Ġhard": 2653, + "Ad": 2654, + "Ġintern": 2655, + "used": 2656, + "(data": 2657, + "mod": 2658, + "annel": 2659, + "Ġnp": 2660, + "ugg": 2661, + "Ġ/>Ċ": 2662, + "Ġcalled": 2663, + "body": 2664, + "Ġcho": 2665, + "(r": 2666, + "_set": 2667, + "ird": 2668, + "Ġ>=": 2669, + "Ġ};Ċ": 2670, + "Ġoptions": 2671, + "ĠGener": 2672, + "Ġheight": 2673, + "Point": 2674, + "You": 2675, + "ety": 2676, + "Click": 2677, + "Ġsmall": 2678, + "Ġide": 2679, + "Ġaccess": 2680, + "anguage": 2681, + "Ġprotected": 2682, + "Ġjob": 2683, + "ĠThere": 2684, + "Def": 2685, + "Ġaddress": 2686, + "Ġuint": 2687, + "Not": 2688, + "oo": 2689, + "aps": 2690, + "
&": 5909, + "CON": 5910, + "Ġrepl": 5911, + "Ġregular": 5912, + "Storage": 5913, + "ramework": 5914, + "Ġgoal": 5915, + "Ġtouch": 5916, + ".widget": 5917, + "Ġbuilt": 5918, + "des": 5919, + "Part": 5920, + "(re": 5921, + "Ġworth": 5922, + "hib": 5923, + "game": 5924, + "91": 5925, + "192": 5926, + "Ġв": 5927, + "acion": 5928, + "ĠWhite": 5929, + "(type": 5930, + "(`": 5931, + "81": 5932, + "Ġnatural": 5933, + "Ġinj": 5934, + "Ġcalcul": 5935, + "ĠApril": 5936, + ".List": 5937, + "Ġassociated": 5938, + "ĉSystem": 5939, + "~~": 5940, + "=[": 5941, + "Ġstorage": 5942, + "Ġbytes": 5943, + "Ġtravel": 5944, + "Ġsou": 5945, + "Ġpassed": 5946, + "!=": 5947, + "ascript": 5948, + ".open": 5949, + "Ġgrid": 5950, + "Ġbus": 5951, + "Ġrecogn": 5952, + "Ab": 5953, + "Ġhon": 5954, + "ĠCenter": 5955, + "Ġprec": 5956, + "build": 5957, + "73": 5958, + "HTML": 5959, + "ĠSan": 5960, + "Ġcountries": 5961, + "aled": 5962, + "token": 5963, + "kt": 5964, + "Ġqual": 5965, + "Last": 5966, + "adow": 5967, + "Ġmanufact": 5968, + "idad": 5969, + "jango": 5970, + "Next": 5971, + "xf": 5972, + ".a": 5973, + "Ġporno": 5974, + "ĠPM": 5975, + "erve": 5976, + "iting": 5977, + "_th": 5978, + "ci": 5979, + "=None": 5980, + "gs": 5981, + "Ġlogin": 5982, + "atives": 5983, + "']);Ċ": 5984, + "Äħ": 5985, + "Ġill": 5986, + "IA": 5987, + "children": 5988, + "DO": 5989, + "Ġlevels": 5990, + "Ġ{{": 5991, + "Ġlooks": 5992, + "Ġ\"#": 5993, + "ToString": 5994, + "Ġnecessary": 5995, + "ĠĠĠĊ": 5996, + "cell": 5997, + "Entry": 5998, + "Ġ'#": 5999, + "Ġextrem": 6000, + "Selector": 6001, + "Ġplaceholder": 6002, + "Load": 6003, + "Ġreleased": 6004, + "ORE": 6005, + "Enumer": 6006, + "ĠTV": 6007, + "SET": 6008, + "inq": 6009, + "Press": 6010, + "ĠDepartment": 6011, + "Ġproperties": 6012, + "Ġrespond": 6013, + "Search": 6014, + "ael": 6015, + "Ġrequ": 6016, + "ĠBook": 6017, + "/Ċ": 6018, + "(st": 6019, + "Ġfinancial": 6020, + "icket": 6021, + "_input": 6022, + "Ġthreat": 6023, + "(in": 6024, + "Strip": 6025, + "ìĿ": 6026, + "ção": 6027, + "71": 6028, + "Ġevidence": 6029, + "));": 6030, + "ĠBro": 6031, + "Ġ[];Ċ": 6032, + "Ġou": 6033, + "buf": 6034, + "Script": 6035, + "dat": 6036, + "Ġrule": 6037, + "#import": 6038, + "=\"/": 6039, + "Serial": 6040, + "Ġstarting": 6041, + "[index": 6042, + "ae": 6043, + "Ġcontrib": 6044, + "session": 6045, + "_new": 6046, + "utable": 6047, + "ober": 6048, + "Ġ\"./": 6049, + "Ġlogger": 6050, + "Ġrecently": 6051, + "Ġreturned": 6052, + "ččĊ": 6053, + ")))Ċ": 6054, + "itions": 6055, + "Ġseek": 6056, + "Ġcommunic": 6057, + "Ġ\".": 6058, + "Ġusername": 6059, + "ECT": 6060, + "DS": 6061, + "Ġotherwise": 6062, + "ĠGerman": 6063, + ".aw": 6064, + "Adapter": 6065, + "ixel": 6066, + "Ġsystems": 6067, + "Ġdrop": 6068, + "83": 6069, + "Ġstructure": 6070, + "Ġ$(\"#": 6071, + "encies": 6072, + "anning": 6073, + "ĠLink": 6074, + "ĠResponse": 6075, + "Ġstri": 6076, + "ż": 6077, + "ĠDB": 6078, + "æĹ": 6079, + "android": 6080, + "submit": 6081, + "otion": 6082, + "92": 6083, + "(@": 6084, + ".test": 6085, + "82": 6086, + "ĊĊĊĊĊĊĊĊ": 6087, + "];čĊ": 6088, + "Ġdirectly": 6089, + "Ġ\"%": 6090, + "ris": 6091, + "elta": 6092, + "AIL": 6093, + "){čĊ": 6094, + "mine": 6095, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 6096, + "(k": 6097, + "bon": 6098, + "asic": 6099, + "pite": 6100, + "___": 6101, + "Max": 6102, + "Ġerrors": 6103, + "ĠWhile": 6104, + "Ġarguments": 6105, + "Ġensure": 6106, + "Right": 6107, + "-based": 6108, + "Web": 6109, + "Ġ-=": 6110, + "Ġintrodu": 6111, + "ĠInst": 6112, + "ĠWash": 6113, + "ordin": 6114, + "join": 6115, + "Database": 6116, + "Ġgrad": 6117, + "Ġusually": 6118, + "ITE": 6119, + "Props": 6120, + "?>Ċ": 6121, + "ĠGo": 6122, + "@Override": 6123, + "REF": 6124, + "Ġip": 6125, + "ĠAustral": 6126, + "Ġist": 6127, + "ViewById": 6128, + "Ġserious": 6129, + "Ġcustomer": 6130, + ".prototype": 6131, + "odo": 6132, + "cor": 6133, + "Ġdoor": 6134, + "ĠWITHOUT": 6135, + "Ġplant": 6136, + "Ġbegan": 6137, + "Ġdistance": 6138, + "()).": 6139, + "Ġchance": 6140, + "Ġord": 6141, + "came": 6142, + "pragma": 6143, + "Ġprotect": 6144, + "ragment": 6145, + "ĠNode": 6146, + "ening": 6147, + "Ñĩ": 6148, + "Ġroute": 6149, + "ĠSchool": 6150, + "hi": 6151, + "Ġneighb": 6152, + "After": 6153, + "licit": 6154, + "Ġcontr": 6155, + "Ġprimary": 6156, + "AA": 6157, + ".WriteLine": 6158, + "utils": 6159, + "Ġbi": 6160, + "Red": 6161, + ".Linq": 6162, + ".object": 6163, + "Ġleaders": 6164, + "unities": 6165, + "Ġgun": 6166, + "onth": 6167, + "ĠDev": 6168, + "FILE": 6169, + "Ġcomments": 6170, + "_len": 6171, + "arrow": 6172, + "amount": 6173, + "Range": 6174, + "sert": 6175, + "GridView": 6176, + "Ġupdated": 6177, + "ĠMo": 6178, + "Ġinform": 6179, + "ociety": 6180, + "ala": 6181, + "Access": 6182, + "Ġhab": 6183, + "Ġcreat": 6184, + "_arg": 6185, + "ĠJanuary": 6186, + "ĠDay": 6187, + "\")čĊ": 6188, + "uple": 6189, + "document": 6190, + "gorith": 6191, + "menu": 6192, + "ĠOver": 6193, + "bb": 6194, + ".title": 6195, + "_out": 6196, + "Ġled": 6197, + "uri": 6198, + "Ġ?>": 6199, + "gl": 6200, + "Ġbank": 6201, + "ayment": 6202, + "ĉprintf": 6203, + "MD": 6204, + "Ġsample": 6205, + "Ġhands": 6206, + "ĠVersion": 6207, + "uario": 6208, + "Ġoffers": 6209, + "ityEngine": 6210, + "Ġshape": 6211, + "Ġsleep": 6212, + "_point": 6213, + "Settings": 6214, + "Ġachie": 6215, + "Ġsold": 6216, + "ota": 6217, + ".bind": 6218, + "Am": 6219, + "Ġsafe": 6220, + "Store": 6221, + "Ġshared": 6222, + "Ġpriv": 6223, + "_VAL": 6224, + "Ġsens": 6225, + "){": 6226, + "Ġremember": 6227, + "shared": 6228, + "element": 6229, + "Ġshoot": 6230, + "Vert": 6231, + "cout": 6232, + "Ġenv": 6233, + "_label": 6234, + "Ġ>Ċ": 6235, + "run": 6236, + "Ġscene": 6237, + "(array": 6238, + "device": 6239, + "_title": 6240, + "agon": 6241, + "]čĊ": 6242, + "aby": 6243, + "Ġbecame": 6244, + "boolean": 6245, + "Ġpark": 6246, + "ĠCode": 6247, + "upload": 6248, + "riday": 6249, + "ĠSeptember": 6250, + "Fe": 6251, + "Ġsen": 6252, + "cing": 6253, + "FL": 6254, + "Col": 6255, + "uts": 6256, + "_page": 6257, + "inn": 6258, + "Ġimplied": 6259, + "aling": 6260, + "Ġyourself": 6261, + ".Count": 6262, + "conf": 6263, + "Ġaud": 6264, + "_init": 6265, + ".)": 6266, + "Ġwrote": 6267, + "003": 6268, + "NG": 6269, + ".Error": 6270, + "ä»": 6271, + ".for": 6272, + "Ġequal": 6273, + "ĠRequest": 6274, + "Ġserial": 6275, + "Ġallows": 6276, + "XX": 6277, + "Ġmiddle": 6278, + "chor": 6279, + "195": 6280, + "94": 6281, + "ø": 6282, + "erval": 6283, + ".Column": 6284, + "reading": 6285, + "Ġescort": 6286, + "ĠAugust": 6287, + "Ġquickly": 6288, + "Ġweap": 6289, + "ĠCG": 6290, + "ropri": 6291, + "ho": 6292, + "Ġcop": 6293, + "(struct": 6294, + "ĠBig": 6295, + "Ġvs": 6296, + "Ġfrequ": 6297, + ".Value": 6298, + "Ġactions": 6299, + "Ġproper": 6300, + "Ġinn": 6301, + "Ġobjects": 6302, + "Ġmatrix": 6303, + "avascript": 6304, + "Ġones": 6305, + ".group": 6306, + "Ġgreen": 6307, + "Ġpaint": 6308, + "ools": 6309, + "ycl": 6310, + "encode": 6311, + "olt": 6312, + "comment": 6313, + ".api": 6314, + "Dir": 6315, + "Ġune": 6316, + "izont": 6317, + ".position": 6318, + "Ġdesigned": 6319, + "_val": 6320, + "avi": 6321, + "iring": 6322, + "tab": 6323, + "Ġlayer": 6324, + "Ġviews": 6325, + "Ġreve": 6326, + "rael": 6327, + "ĠON": 6328, + "rics": 6329, + "160": 6330, + "np": 6331, + "Ġcore": 6332, + "());čĊ": 6333, + "Main": 6334, + "Ġexpert": 6335, + "ĉĉčĊ": 6336, + "_en": 6337, + "Ġ/>": 6338, + "utter": 6339, + "IAL": 6340, + "ails": 6341, + "ĠKing": 6342, + "*/ĊĊ": 6343, + "ĠMet": 6344, + "_end": 6345, + "addr": 6346, + "ora": 6347, + "Ġir": 6348, + "Min": 6349, + "Ġsurpr": 6350, + "Ġrepe": 6351, + "Ġdirectory": 6352, + "PUT": 6353, + "-S": 6354, + "Ġelection": 6355, + "haps": 6356, + ".pre": 6357, + "cm": 6358, + "Values": 6359, + "Ġ\"Ċ": 6360, + "column": 6361, + "ivil": 6362, + "Login": 6363, + "inue": 6364, + "93": 6365, + "Ġbeautiful": 6366, + "Ġsecret": 6367, + "(event": 6368, + "Ġchat": 6369, + "ums": 6370, + "Ġorigin": 6371, + "Ġeffects": 6372, + "Ġmanagement": 6373, + "illa": 6374, + "tk": 6375, + "Ġsetting": 6376, + "ĠCour": 6377, + "Ġmassage": 6378, + "ĉend": 6379, + "Ġhappy": 6380, + "Ġfinish": 6381, + "Ġcamera": 6382, + "ĠVer": 6383, + "ĠDemocr": 6384, + "ĠHer": 6385, + "(Q": 6386, + "cons": 6387, + "ita": 6388, + "Ġ'.": 6389, + "{}": 6390, + "ĉC": 6391, + "Ġstuff": 6392, + "194": 6393, + "Ġ:Ċ": 6394, + "ĠAR": 6395, + "Task": 6396, + "hidden": 6397, + "eros": 6398, + "IGN": 6399, + "atio": 6400, + "ĠHealth": 6401, + "olute": 6402, + "Enter": 6403, + "'>": 6404, + "ĠTwitter": 6405, + "ĠCounty": 6406, + "scribe": 6407, + "Ġ=>Ċ": 6408, + "Ġhy": 6409, + "fit": 6410, + "Ġmilitary": 6411, + "Ġsale": 6412, + "required": 6413, + "non": 6414, + "bootstrap": 6415, + "hold": 6416, + "rim": 6417, + "-old": 6418, + "ĠDown": 6419, + "Ġmention": 6420, + "contact": 6421, + "_group": 6422, + "oday": 6423, + "Ġtown": 6424, + "Ġsolution": 6425, + "uate": 6426, + "elling": 6427, + "]->": 6428, + "otes": 6429, + "ental": 6430, + "omen": 6431, + "ospital": 6432, + "ĠSup": 6433, + "_EN": 6434, + "Ġslow": 6435, + "SESSION": 6436, + "Ġblue": 6437, + "ago": 6438, + "Ġlives": 6439, + "Ġ^": 6440, + ".un": 6441, + "inst": 6442, + "enge": 6443, + "Ġcustomers": 6444, + "Ġcast": 6445, + "udget": 6446, + "ï¼ģ": 6447, + "icens": 6448, + "Ġdetermin": 6449, + "Selected": 6450, + "_pl": 6451, + "ueue": 6452, + "Ġdark": 6453, + "//ĊĊ": 6454, + "si": 6455, + "thern": 6456, + "ĠJapan": 6457, + "/w": 6458, + "PU": 6459, + "ĠEast": 6460, + "ovie": 6461, + "Ġpackage": 6462, + "Ġnor": 6463, + "Ġapi": 6464, + "bot": 6465, + "\"];Ċ": 6466, + "_post": 6467, + "ulate": 6468, + "Ġclub": 6469, + "'));Ċ": 6470, + "Ġloop": 6471, + "PIO": 6472, + "ione": 6473, + "shot": 6474, + "Initial": 6475, + "Ġplayed": 6476, + "register": 6477, + "rought": 6478, + "_max": 6479, + "acement": 6480, + "match": 6481, + "raphics": 6482, + "AST": 6483, + "Ġexisting": 6484, + "Ġcomplex": 6485, + "DA": 6486, + ".Ch": 6487, + ".common": 6488, + "mo": 6489, + "Ġ'../../": 6490, + "ito": 6491, + "Ġanalysis": 6492, + "Ġdeliver": 6493, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĊ": 6494, + "idx": 6495, + "Ãł": 6496, + "ongo": 6497, + "ĠEnglish": 6498, + "Ċ": 10197, + "_default": 10198, + "ĠDatabase": 10199, + "rep": 10200, + "ESS": 10201, + "nergy": 10202, + ".Find": 10203, + "_mask": 10204, + "Ġrise": 10205, + "Ġkernel": 10206, + "::$": 10207, + ".Q": 10208, + "Ġoffering": 10209, + "decl": 10210, + "ĠCS": 10211, + "Ġlisted": 10212, + "Ġmostly": 10213, + "enger": 10214, + "Ġblocks": 10215, + "olo": 10216, + "Ġgoverning": 10217, + "\\F": 10218, + "Ġconcent": 10219, + ".getText": 10220, + "Ġmb": 10221, + "Ġoccurred": 10222, + "Ġchanging": 10223, + "Scene": 10224, + "_CODE": 10225, + "Beh": 10226, + "\"The": 10227, + "Ġtile": 10228, + "ĠAssociation": 10229, + "ĉP": 10230, + "alty": 10231, + "_ad": 10232, + "odies": 10233, + "iated": 10234, + "Ġprepared": 10235, + "possible": 10236, + "Ġmort": 10237, + "TEST": 10238, + "142": 10239, + "Ġignore": 10240, + "Ġcalc": 10241, + "Ġrs": 10242, + "ĠassertEquals": 10243, + "Ġsz": 10244, + "ĠTHIS": 10245, + ".\"Ċ": 10246, + "Ġcanvas": 10247, + "java": 10248, + "Ġdut": 10249, + "VALID": 10250, + ".sql": 10251, + ".input": 10252, + "Ġaux": 10253, + "Sup": 10254, + "Ġartist": 10255, + "Vec": 10256, + "_TIME": 10257, + ".stringify": 10258, + "etween": 10259, + "ĠCategory": 10260, + "Ġ[-": 10261, + "ĠDevExpress": 10262, + "ĠJul": 10263, + "Ġring": 10264, + ".ed": 10265, + "YY": 10266, + "Let": 10267, + "TextField": 10268, + "Ġflat": 10269, + "_print": 10270, + "ĠOTHER": 10271, + "adian": 10272, + "Ġchecked": 10273, + "ele": 10274, + "Align": 10275, + "standing": 10276, + "Ġ[],": 10277, + "Ġlab": 10278, + "ucky": 10279, + "ĠChristmas": 10280, + "(image": 10281, + ".module": 10282, + "Ġlots": 10283, + "Ġslightly": 10284, + "(final": 10285, + "erge": 10286, + "è¿": 10287, + "147": 10288, + "ĠPolice": 10289, + "143": 10290, + "ĠRight": 10291, + "Ġaward": 10292, + "ĠOS": 10293, + "Ġ{}ĊĊ": 10294, + "Ġptr": 10295, + "oves": 10296, + "icated": 10297, + "ем": 10298, + "Ġmanage": 10299, + "oliday": 10300, + "Amount": 10301, + "oolStrip": 10302, + "tbody": 10303, + "Nav": 10304, + "wrap": 10305, + "BB": 10306, + "Ġwatching": 10307, + "arios": 10308, + "Ġoptional": 10309, + "_K": 10310, + "ĠLicensed": 10311, + ".Map": 10312, + "Timer": 10313, + "ĠAP": 10314, + "ĠRev": 10315, + "(o": 10316, + ",c": 10317, + "umin": 10318, + "etailed": 10319, + "ĠHy": 10320, + "Ġblank": 10321, + "agger": 10322, + "ĠSelf": 10323, + "()[": 10324, + ".make": 10325, + "earn": 10326, + "channel": 10327, + ";Ċ": 10342, + "World": 10343, + "Ġpython": 10344, + "Ġlif": 10345, + "Ġtrav": 10346, + "Ġconven": 10347, + "company": 10348, + "ĠClub": 10349, + "138": 10350, + "Ver": 10351, + "Btn": 10352, + "Ġzone": 10353, + "products": 10354, + "ĠEduc": 10355, + "Ġverify": 10356, + "ĠMil": 10357, + "ono": 10358, + "]);ĊĊ": 10359, + "ENCE": 10360, + "Ġpacket": 10361, + "Ġcer": 10362, + "Ġenumer": 10363, + "Ġpars": 10364, + "formed": 10365, + "Ġoccup": 10366, + "tre": 10367, + "Ġexercise": 10368, + "Day": 10369, + "_sum": 10370, + "Ġasking": 10371, + "aption": 10372, + "Ġorders": 10373, + "Ġspending": 10374, + "ĠERR": 10375, + ".Dis": 10376, + "ĠUtil": 10377, + "âĢľI": 10378, + "\\'": 10379, + "?)": 10380, + "/>Ċ": 10381, + "Ġemot": 10382, + "Ġinfluence": 10383, + "ĠAfrica": 10384, + "atters": 10385, + "Ùħ": 10386, + ".session": 10387, + "Ġchief": 10388, + "ĉĉĉĉĉĉĉĉĉĉĉ": 10389, + "Ġtom": 10390, + "cluded": 10391, + "serial": 10392, + "_handler": 10393, + ".Type": 10394, + "aped": 10395, + "Ġpolicies": 10396, + "-ex": 10397, + "-tr": 10398, + "blank": 10399, + "merce": 10400, + "Ġcoverage": 10401, + "Ġrc": 10402, + "_matrix": 10403, + "_box": 10404, + "Ġcharges": 10405, + "ĠBoston": 10406, + "Pe": 10407, + "Ġcircum": 10408, + "Ġfilled": 10409, + "148": 10410, + "Ġnorth": 10411, + "ictureBox": 10412, + "ĉres": 10413, + "è®": 10414, + "Ġtermin": 10415, + "Ġ[â̦": 10416, + "IRECT": 10417, + "Ġber": 10418, + "Ġ\"../../": 10419, + "retch": 10420, + ".code": 10421, + "_col": 10422, + "ĠGovernment": 10423, + "Ġargv": 10424, + "ĠLord": 10425, + "asi": 10426, + "Exec": 10427, + "ĉlet": 10428, + "vertis": 10429, + "Ġdiscussion": 10430, + "enance": 10431, + "outube": 10432, + "typeof": 10433, + "Ġserved": 10434, + "ĠPut": 10435, + "ĉx": 10436, + "Ġsweet": 10437, + "Before": 10438, + "ategy": 10439, + ".of": 10440, + "ĠMaterial": 10441, + "Sort": 10442, + "ONT": 10443, + "igital": 10444, + "Why": 10445, + "Ġsust": 10446, + "Ġç": 10447, + "abet": 10448, + "Ġsegment": 10449, + "Ġ[],Ċ": 10450, + "ĠMuslim": 10451, + "ĠfindViewById": 10452, + "cut": 10453, + "_TEXT": 10454, + "ĠMary": 10455, + "Ġloved": 10456, + "Ġlie": 10457, + "ĠJO": 10458, + "Ġisset": 10459, + "month": 10460, + "Ġprime": 10461, + "ti": 10462, + "ĠCarol": 10463, + "Use": 10464, + "146": 10465, + "ĠPop": 10466, + "ĠSave": 10467, + "Interval": 10468, + "execute": 10469, + "dy": 10470, + "ĠIran": 10471, + "_cont": 10472, + "ĉT": 10473, + "Ġphase": 10474, + "checkbox": 10475, + "week": 10476, + "Ġhide": 10477, + "Ġtil": 10478, + "Ġju": 10479, + "Custom": 10480, + "burg": 10481, + "/M": 10482, + "TON": 10483, + "Ġquant": 10484, + "Ġrub": 10485, + "ixels": 10486, + "Ġinstalled": 10487, + "Ġdump": 10488, + "Ġproperly": 10489, + "(List": 10490, + "Ġdecide": 10491, + "apply": 10492, + "Has": 10493, + "Ġkeeping": 10494, + "Ġcitizens": 10495, + "Ġjoint": 10496, + "pool": 10497, + "Socket": 10498, + "_op": 10499, + "Ġweapon": 10500, + "gnore": 10501, + "ĠExec": 10502, + "otten": 10503, + "ĠMS": 10504, + "Ġ(-": 10505, + "ĠReview": 10506, + "Ġexamples": 10507, + "Ġtight": 10508, + "!(": 10509, + "DP": 10510, + "ĠMessageBox": 10511, + "Ġphotograph": 10512, + "164": 10513, + "URI": 10514, + "ét": 10515, + "low": 10516, + "ĠGrand": 10517, + ".persistence": 10518, + "Ġmaintain": 10519, + "Ġnums": 10520, + "Ġzip": 10521, + "ials": 10522, + "ĠGets": 10523, + "peg": 10524, + "ĠBuffer": 10525, + "~~~~": 10526, + "rastructure": 10527, + "ĠPL": 10528, + "uen": 10529, + "obby": 10530, + "sizeof": 10531, + "Ġpic": 10532, + "Ġseed": 10533, + "Ġexperienced": 10534, + "Ġodd": 10535, + "Ġkick": 10536, + "Ġprocedure": 10537, + "avigator": 10538, + "-on": 10539, + ",j": 10540, + "ĠAlthough": 10541, + "ĠuserId": 10542, + "accept": 10543, + "Blue": 10544, + "IColor": 10545, + "layer": 10546, + "available": 10547, + "Ġends": 10548, + ".table": 10549, + "Ġdataset": 10550, + "bus": 10551, + "Ġexplain": 10552, + "(pro": 10553, + "ĠCommittee": 10554, + "Ġnoted": 10555, + "]:Ċ": 10556, + "Dim": 10557, + "stdio": 10558, + "154": 10559, + ".\",Ċ": 10560, + "_source": 10561, + "181": 10562, + "ĠWeek": 10563, + "ĠEdge": 10564, + "Ġoperating": 10565, + "Ġeste": 10566, + "ipl": 10567, + "330": 10568, + "agination": 10569, + "Ġproceed": 10570, + "Ġanimation": 10571, + ".Models": 10572, + "ĠWatch": 10573, + "iat": 10574, + "Ġoppon": 10575, + "/A": 10576, + "Report": 10577, + "Ġsounds": 10578, + "_buf": 10579, + "IELD": 10580, + "Ġbund": 10581, + "ĉget": 10582, + ".pr": 10583, + "(tmp": 10584, + "Ġkid": 10585, + ">ĊĊĊ": 10586, + "Ġyang": 10587, + "NotFound": 10588, + "ÑĨ": 10589, + "math": 10590, + "@gmail": 10591, + "ĠLIMIT": 10592, + "redients": 10593, + "Ġvent": 10594, + "avigate": 10595, + "Look": 10596, + "Ġreligious": 10597, + "Ġrand": 10598, + "rio": 10599, + "(GL": 10600, + "_ip": 10601, + "uan": 10602, + "iciency": 10603, + "ĠChange": 10604, + ">čĊčĊ": 10605, + "ĠEntity": 10606, + "Ġrencontre": 10607, + "ĠRet": 10608, + "plan": 10609, + "én": 10610, + "BOOL": 10611, + "uries": 10612, + "train": 10613, + "Definition": 10614, + "============": 10615, + "zz": 10616, + "450": 10617, + "Animation": 10618, + "ĠOK": 10619, + "_menu": 10620, + ".bl": 10621, + "_score": 10622, + "Ġacad": 10623, + "(System": 10624, + "Ġrefresh": 10625, + "'=>$": 10626, + ".Graphics": 10627, + "amento": 10628, + "pid": 10629, + "tc": 10630, + "Ġtips": 10631, + "Ġhomes": 10632, + "Ġfuel": 10633, + "âĸ": 10634, + "_helper": 10635, + "ĠĠčĊ": 10636, + "ĠRoom": 10637, + ".Close": 10638, + "_attr": 10639, + "ĠMount": 10640, + "ĠEv": 10641, + "arser": 10642, + "_top": 10643, + "eah": 10644, + "ĠDelete": 10645, + "ãĢį": 10646, + "uke": 10647, + "Ġusage": 10648, + "aria": 10649, + "_dev": 10650, + "Ġtexture": 10651, + "Ġconversation": 10652, + "eper": 10653, + "Bean": 10654, + "done": 10655, + "nonatomic": 10656, + "ĠSecond": 10657, + "Ġshooting": 10658, + "_pre": 10659, + "Components": 10660, + "Ġ]ĊĊ": 10661, + "__,": 10662, + "stitution": 10663, + ".Char": 10664, + ">();ĊĊ": 10665, + "Ġpresented": 10666, + "Ġwa": 10667, + "oker": 10668, + "-ĊĊ": 10669, + "iner": 10670, + "Ġbecoming": 10671, + "Ġincident": 10672, + "Att": 10673, + "162": 10674, + "Ġrevealed": 10675, + "forc": 10676, + "Ġboot": 10677, + ".page": 10678, + "Enumerator": 10679, + "165": 10680, + "_->": 10681, + "Photo": 10682, + "Ġspring": 10683, + ".\",": 10684, + "ĠDictionary": 10685, + "BJECT": 10686, + "Ġlocations": 10687, + "Ġsamples": 10688, + "InputStream": 10689, + "ĠBrown": 10690, + "Ġstats": 10691, + "quality": 10692, + "Ñħ": 10693, + "-dis": 10694, + "Ġhelping": 10695, + "Ġped": 10696, + "224": 10697, + "(se": 10698, + "ĠWho": 10699, + "alian": 10700, + "internal": 10701, + "Ġft": 10702, + ">().": 10703, + "->{": 10704, + "Ġmine": 10705, + "Ġsector": 10706, + "Ġgro": 10707, + "Ġopportunities": 10708, + "Ġü": 10709, + "Ġmp": 10710, + "Ġalleged": 10711, + "Ġdoubt": 10712, + "Mouse": 10713, + "About": 10714, + "_part": 10715, + "Ġchair": 10716, + "Ġstopped": 10717, + "161": 10718, + "loop": 10719, + "entities": 10720, + "Ġapps": 10721, + "ansion": 10722, + "Ġmental": 10723, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 10724, + "FR": 10725, + "Ġdefend": 10726, + "care": 10727, + "Ġideal": 10728, + "/api": 10729, + "urface": 10730, + "011": 10731, + "Ġele": 10732, + "ulator": 10733, + "ĠRights": 10734, + "anguages": 10735, + "Ġfunds": 10736, + "Ġadapt": 10737, + "Attributes": 10738, + "Ġdeploy": 10739, + "opts": 10740, + "Ġvalidation": 10741, + "Ġconcerns": 10742, + "uce": 10743, + ".num": 10744, + "ulture": 10745, + "ila": 10746, + "Ġcup": 10747, + "Ġpure": 10748, + ".Fore": 10749, + "183": 10750, + "ĠHashMap": 10751, + ".valueOf": 10752, + "asm": 10753, + "MO": 10754, + "Ġcs": 10755, + "Ġstores": 10756, + "Ġ************************************************************************": 10757, + "Ġcommunication": 10758, + "mem": 10759, + ".EventHandler": 10760, + ".Status": 10761, + "_right": 10762, + ".setOn": 10763, + "Sheet": 10764, + "Ġidentify": 10765, + "enerated": 10766, + "ordered": 10767, + "Ġ\"[": 10768, + "Ġswe": 10769, + "Condition": 10770, + "ĠAccording": 10771, + "Ġprepare": 10772, + "Ġrob": 10773, + "Pool": 10774, + "Ġsport": 10775, + "rv": 10776, + "ĠRouter": 10777, + "Ġalternative": 10778, + "([]": 10779, + "ĠChicago": 10780, + "ipher": 10781, + "ische": 10782, + "ĠDirector": 10783, + "kl": 10784, + "ĠWil": 10785, + "keys": 10786, + "Ġmysql": 10787, + "Ġwelcome": 10788, + "king": 10789, + "ĠManager": 10790, + "Ġcaught": 10791, + ")}Ċ": 10792, + "Score": 10793, + "_PR": 10794, + "Ġsurvey": 10795, + "hab": 10796, + "Headers": 10797, + "ADER": 10798, + "Ġdecor": 10799, + "Ġturns": 10800, + "Ġradius": 10801, + "errupt": 10802, + "Cor": 10803, + "Ġmel": 10804, + "Ġintr": 10805, + "(q": 10806, + "ĠAC": 10807, + "amos": 10808, + "MAX": 10809, + "ĠGrid": 10810, + "ĠJesus": 10811, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 10812, + ".DE": 10813, + "Ġts": 10814, + "Ġlinked": 10815, + "free": 10816, + "ĠQt": 10817, + "Ġ/**čĊ": 10818, + "Ġfaster": 10819, + "ctr": 10820, + "_J": 10821, + "DT": 10822, + ".Check": 10823, + "Ġcombination": 10824, + "Ġintended": 10825, + "-the": 10826, + "-type": 10827, + "182": 10828, + "ectors": 10829, + "ami": 10830, + "uting": 10831, + "Ġuma": 10832, + "XML": 10833, + "UCT": 10834, + "Ap": 10835, + "ĠRandom": 10836, + "Ġran": 10837, + ".sort": 10838, + "Ġsorted": 10839, + ".Un": 10840, + "401": 10841, + "_PER": 10842, + "itory": 10843, + "Ġpriority": 10844, + "ĠGal": 10845, + "ĠOld": 10846, + "hot": 10847, + "ĠDisplay": 10848, + "(sub": 10849, + "_TH": 10850, + "_Y": 10851, + "ĠCare": 10852, + "loading": 10853, + "Kind": 10854, + "_handle": 10855, + ",,": 10856, + "rase": 10857, + "_replace": 10858, + ".addEventListener": 10859, + "ĠRT": 10860, + "172": 10861, + "Ġentered": 10862, + "gers": 10863, + "Ġich": 10864, + "(start": 10865, + "205": 10866, + "/app": 10867, + "Ġbrother": 10868, + "Memory": 10869, + "Outlet": 10870, + "Ġutf": 10871, + "prec": 10872, + "Ġnavigation": 10873, + "ORK": 10874, + "Ġdst": 10875, + "Detail": 10876, + "Ġaudience": 10877, + "Ġdur": 10878, + "Ġcluster": 10879, + "unched": 10880, + "Ġ],": 10881, + "Ġcomfortable": 10882, + ".values": 10883, + "ĠTotal": 10884, + "Ġsnap": 10885, + "Ġstandards": 10886, + "Ġperformed": 10887, + "hand": 10888, + "(\"@": 10889, + "åŃ": 10890, + "Ġphil": 10891, + "ibr": 10892, + "trim": 10893, + "Ġforget": 10894, + "157": 10895, + "Ġdoctor": 10896, + ".TextBox": 10897, + "377": 10898, + "icons": 10899, + ",s": 10900, + "ĠOp": 10901, + "Sm": 10902, + "Stop": 10903, + "ĉList": 10904, + "ĉu": 10905, + "Comment": 10906, + "_VERSION": 10907, + ".Xtra": 10908, + "Person": 10909, + "rb": 10910, + "LOB": 10911, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĊ": 10912, + "ĠCentral": 10913, + "270": 10914, + "ICK": 10915, + "raq": 10916, + "Ġputting": 10917, + "Ġmd": 10918, + "ĠLove": 10919, + "Program": 10920, + "Border": 10921, + "oor": 10922, + "Ġallowing": 10923, + "after": 10924, + "Ġentries": 10925, + "ĠMaybe": 10926, + "]).": 10927, + "ĠShort": 10928, + ")\\": 10929, + ".now": 10930, + "friend": 10931, + "Ġprefer": 10932, + "ĠGPIO": 10933, + "osis": 10934, + "ĠGameObject": 10935, + "Ġskip": 10936, + "Ġcompetition": 10937, + "_match": 10938, + "lications": 10939, + "_CONT": 10940, + ".groupBox": 10941, + "Ġals": 10942, + "666": 10943, + "\"We": 10944, + "_eq": 10945, + "lan": 10946, + "_search": 10947, + "ĠMusic": 10948, + "asis": 10949, + "Ġbind": 10950, + "ĠIsland": 10951, + "rum": 10952, + "(E": 10953, + "Ġseat": 10954, + "Video": 10955, + "Ġack": 10956, + "reek": 10957, + "={()": 10958, + "Ġrating": 10959, + "Ġrestaurant": 10960, + "456": 10961, + "DEX": 10962, + "(buf": 10963, + "pping": 10964, + "uality": 10965, + "Ġleague": 10966, + "176": 10967, + "Ġfocused": 10968, + "apon": 10969, + "$data": 10970, + "CLUD": 10971, + "CLUDING": 10972, + "Ġabsolute": 10973, + "(query": 10974, + "Ġtells": 10975, + "Ang": 10976, + "Ġcommunities": 10977, + "Ġhonest": 10978, + "oking": 10979, + "Ġapart": 10980, + "arity": 10981, + "/$": 10982, + "_module": 10983, + "ĠEnc": 10984, + ".an": 10985, + ".Config": 10986, + "Cre": 10987, + "Ġshock": 10988, + "ĠArab": 10989, + "IENT": 10990, + "/re": 10991, + "Ġretrie": 10992, + "ycler": 10993, + "isa": 10994, + "ĠOrgan": 10995, + ".graph": 10996, + "Ġí": 10997, + "ĠBAS": 10998, + "Enum": 10999, + "Ġpossibly": 11000, + "ÑĢаÐ": 11001, + "ĠJapanese": 11002, + "Ġcraft": 11003, + "ĠPlace": 11004, + "Ġtalent": 11005, + "Ġfunding": 11006, + "Ġconfirmed": 11007, + "Ġcycle": 11008, + "/x": 11009, + "GE": 11010, + "Ġhearing": 11011, + "Ġplants": 11012, + "Ġmouth": 11013, + "pages": 11014, + "oria": 11015, + "ĠRemove": 11016, + "_total": 11017, + "Ġod": 11018, + "ollapse": 11019, + "door": 11020, + "Ġbought": 11021, + "Ġaddr": 11022, + "ARCH": 11023, + "_dim": 11024, + "dden": 11025, + "Ġdecades": 11026, + "REQUEST": 11027, + "Ġversions": 11028, + "fire": 11029, + "006": 11030, + "Ġmoves": 11031, + "fb": 11032, + "Ġcoffee": 11033, + ".connect": 11034, + "ĠRow": 11035, + "Ġschema": 11036, + "Scope": 11037, + "-Type": 11038, + "Ġfighting": 11039, + "Ġretail": 11040, + "Ġmodified": 11041, + "TF": 11042, + "Files": 11043, + "nie": 11044, + "_command": 11045, + "stone": 11046, + "ĠÑĤ": 11047, + "_thread": 11048, + "Ġbond": 11049, + "ĠDevelopment": 11050, + "Ġpt": 11051, + "FORM": 11052, + "plet": 11053, + "Ġidentified": 11054, + "cpp": 11055, + "206": 11056, + "225": 11057, + "Ġcoding": 11058, + "oked": 11059, + "ĠMaster": 11060, + "IDTH": 11061, + "Ġresidents": 11062, + "redit": 11063, + "ĠPhoto": 11064, + "=-": 11065, + "unte": 11066, + "ateur": 11067, + "159": 11068, + "_STATE": 11069, + "ĠSing": 11070, + "Ġsheet": 11071, + ".val": 11072, + "orse": 11073, + "Ġhers": 11074, + "Ġdetermined": 11075, + "Common": 11076, + "Ġwed": 11077, + "_queue": 11078, + "PH": 11079, + "ĠAtl": 11080, + "cred": 11081, + "/LICENSE": 11082, + "Ġmes": 11083, + "Ġadvanced": 11084, + ".java": 11085, + ".Sh": 11086, + "Go": 11087, + "kill": 11088, + "fp": 11089, + "_settings": 11090, + "Ġpal": 11091, + "Ġtruck": 11092, + "Ġcombined": 11093, + "Ġ\"${": 11094, + "ĠCorpor": 11095, + "Ġjoined": 11096, + "ĠJose": 11097, + "ĠCup": 11098, + "uns": 11099, + "estival": 11100, + "levision": 11101, + "Ġbroken": 11102, + "Ġmarriage": 11103, + "ĠWestern": 11104, + "Ġrepresents": 11105, + "ĠTitle": 11106, + "Ġss": 11107, + ".Ass": 11108, + "ongoose": 11109, + "iento": 11110, + "<>();Ċ": 11111, + "Ġabsolutely": 11112, + "Ġsmooth": 11113, + "TERN": 11114, + "ĠUnless": 11115, + "Word": 11116, + "Ġmerge": 11117, + "igan": 11118, + "ĠVol": 11119, + "Ġnn": 11120, + ".getId": 11121, + "Ġз": 11122, + "171": 11123, + "Ġsexy": 11124, + "Ġseeking": 11125, + "Single": 11126, + ".this": 11127, + "179": 11128, + "Ġkom": 11129, + "bound": 11130, + ";\"": 11131, + "ĠfontSize": 11132, + "_df": 11133, + "Ġinjury": 11134, + "(H": 11135, + "Ġissued": 11136, + "_END": 11137, + ":self": 11138, + "020": 11139, + "Ġpatch": 11140, + "Ġleaves": 11141, + "Ġadopt": 11142, + "FileName": 11143, + "ãĢIJ": 11144, + "Ġexecutive": 11145, + "ĠByte": 11146, + "]))Ċ": 11147, + "Ġnu": 11148, + "outing": 11149, + "cluding": 11150, + "-R": 11151, + ".options": 11152, + "Ġsubstant": 11153, + "avax": 11154, + "ĠBUT": 11155, + "Ġtechnical": 11156, + "Ġtwice": 11157, + "Ġmás": 11158, + "Ġunivers": 11159, + "yr": 11160, + "Ġdrag": 11161, + "ĠDC": 11162, + "Ġsed": 11163, + "Ġbot": 11164, + "ĠPal": 11165, + "ĠHall": 11166, + "forcement": 11167, + "Ġauch": 11168, + ".mod": 11169, + "notation": 11170, + "_files": 11171, + ".line": 11172, + "_flag": 11173, + "[name": 11174, + "Ġresolution": 11175, + "Ġbott": 11176, + "(\"[": 11177, + "ende": 11178, + "(arr": 11179, + "Free": 11180, + "(@\"": 11181, + "ĠDistrict": 11182, + "PEC": 11183, + ":-": 11184, + "Picker": 11185, + "ĠJo": 11186, + "ĠĠĠĠĠĊ": 11187, + "ĠRiver": 11188, + "_rows": 11189, + "Ġhelpful": 11190, + "Ġmassive": 11191, + "---Ċ": 11192, + "Ġmeasures": 11193, + "007": 11194, + "ĠRuntime": 11195, + "Ġworry": 11196, + "ĠSpec": 11197, + "ĉD": 11198, + "ãĢij": 11199, + "Ġ){Ċ": 11200, + "Ġworse": 11201, + "(filename": 11202, + "Ġlay": 11203, + "Ġmagic": 11204, + "ĠTheir": 11205, + "oul": 11206, + "stroy": 11207, + "ĠWhere": 11208, + "280": 11209, + "Ġsudden": 11210, + "Ġdefe": 11211, + "Ġbinding": 11212, + "Ġflight": 11213, + "ĠOnInit": 11214, + "ĠWomen": 11215, + "ĠPolicy": 11216, + "Ġdrugs": 11217, + "ishing": 11218, + "('../": 11219, + "ĠMel": 11220, + "peat": 11221, + "tor": 11222, + "Ġproposed": 11223, + "Ġstated": 11224, + "_RES": 11225, + "Ġeast": 11226, + "212": 11227, + "ĠCONDITION": 11228, + "_desc": 11229, + "Ġwinning": 11230, + "folio": 11231, + "Mapper": 11232, + "ĠPan": 11233, + "ĠAnge": 11234, + ".servlet": 11235, + "Ġcopies": 11236, + "LM": 11237, + "Ġvm": 11238, + "åį": 11239, + "Ġdictionary": 11240, + "Seg": 11241, + "177": 11242, + "elines": 11243, + "ĠSend": 11244, + "Ġiron": 11245, + "ĠFort": 11246, + "166": 11247, + ".domain": 11248, + "Ġdebate": 11249, + "NotNull": 11250, + "eq": 11251, + "acher": 11252, + "lf": 11253, + "ĉfmt": 11254, + "Ġlawy": 11255, + "178": 11256, + "ÄŁ": 11257, + "ĠMen": 11258, + "Ġtrim": 11259, + "(NULL": 11260, + "Ġ!!": 11261, + "Ġpad": 11262, + "Ġfollows": 11263, + "\"][\"": 11264, + "requ": 11265, + "ĠEp": 11266, + ".github": 11267, + "(img": 11268, + "eto": 11269, + "('\\": 11270, + "Services": 11271, + "umbnail": 11272, + "_main": 11273, + "pleted": 11274, + "fortunately": 11275, + "Ġwindows": 11276, + "Ġplane": 11277, + "ĠConnection": 11278, + ".local": 11279, + "uard": 11280, + "}\\": 11281, + "==\"": 11282, + "andon": 11283, + "ĠRoy": 11284, + "west": 11285, + "158": 11286, + "iginal": 11287, + "emies": 11288, + "itz": 11289, + "'):Ċ": 11290, + "ĠPeter": 11291, + "Ġtough": 11292, + "Ġreduced": 11293, + "Ġcalculate": 11294, + "Ġrapid": 11295, + "customer": 11296, + "Ġefficient": 11297, + "Ġmedium": 11298, + "Ġfell": 11299, + ".ref": 11300, + "ĠCas": 11301, + "Ġfeedback": 11302, + "Speed": 11303, + "(output": 11304, + "aje": 11305, + "Ġcategories": 11306, + "Ġfee": 11307, + "};": 11308, + "Ġdeleted": 11309, + "reh": 11310, + "Ġproof": 11311, + "Desc": 11312, + "Build": 11313, + "Ġsides": 11314, + ".ArrayList": 11315, + "-%": 11316, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 11317, + "ر": 11318, + ".match": 11319, + "ли": 11320, + "Ġfeels": 11321, + "Ġachieve": 11322, + "Ġclim": 11323, + "_ON": 11324, + "ĠCD": 11325, + "Ġteacher": 11326, + "_current": 11327, + "bn": 11328, + "_PL": 11329, + "isting": 11330, + "Enable": 11331, + "GEN": 11332, + "Ġtv": 11333, + "Ġsock": 11334, + "Ġplays": 11335, + "Ġdiscount": 11336, + "ĠKE": 11337, + "ĠDebug": 11338, + "Fore": 11339, + "ĠIraq": 11340, + "Ġappearance": 11341, + "Mon": 11342, + "Ġstyled": 11343, + "ĠHuman": 11344, + "iot": 11345, + "ĠHistory": 11346, + "Ġsac": 11347, + "ĠCollection": 11348, + "Ġrecommended": 11349, + ".Selected": 11350, + "Ġorganizations": 11351, + "Ġdiscovered": 11352, + "cohol": 11353, + "adas": 11354, + "ĠThomas": 11355, + "May": 11356, + "Ġconserv": 11357, + "Ġdomin": 11358, + "ĠFollow": 11359, + "ĠSection": 11360, + "ĠThanks": 11361, + "Username": 11362, + "Ġrecipe": 11363, + "Ġwonderful": 11364, + ".sleep": 11365, + "_if": 11366, + "ĉĊĉĊ": 11367, + "orno": 11368, + "Ġru": 11369, + "_target": 11370, + ".\"\"": 11371, + "à¦": 11372, + "EventArgs": 11373, + "Ġinputs": 11374, + "Ġfif": 11375, + "Ġvision": 11376, + "cy": 11377, + "ĠSeries": 11378, + ")(((": 11379, + "Ġtrading": 11380, + "Ġmarker": 11381, + "Begin": 11382, + "Ġtypically": 11383, + "Ġcauses": 11384, + "dropdown": 11385, + "_DEBUG": 11386, + "260": 11387, + "Ġdetect": 11388, + "country": 11389, + "!\");Ċ": 11390, + "ĉR": 11391, + "appy": 11392, + "Ġcref": 11393, + "('<": 11394, + "\"=>": 11395, + "ĠLE": 11396, + "reader": 11397, + "Ġadministr": 11398, + "õ": 11399, + "ucket": 11400, + "Ġfashion": 11401, + ".char": 11402, + "izar": 11403, + "Ġdisable": 11404, + "Ġsuc": 11405, + "ĠLive": 11406, + "issue": 11407, + "Ġmetadata": 11408, + "flags": 11409, + "ĠðŁ": 11410, + "Ġcommitted": 11411, + "Ġva": 11412, + "Ġrough": 11413, + "Ġ'''Ċ": 11414, + "Ġhighlight": 11415, + "_vars": 11416, + "VO": 11417, + "Ġencoding": 11418, + "-Z": 11419, + "_sign": 11420, + "$(\"#": 11421, + "Ġrain": 11422, + "reatest": 11423, + "ĠEND": 11424, + "Selection": 11425, + "Ġcandidates": 11426, + "Ġsav": 11427, + ".Empty": 11428, + "Ġdecisions": 11429, + "Ġcollabor": 11430, + "ridge": 11431, + "feed": 11432, + "ression": 11433, + "Ġpersons": 11434, + "VM": 11435, + "008": 11436, + "ega": 11437, + "_BIT": 11438, + "According": 11439, + "acked": 11440, + "Ġdollars": 11441, + "_loss": 11442, + "ĠCost": 11443, + "}\"Ċ": 11444, + "Notification": 11445, + "Ġprostit": 11446, + "Ġauthority": 11447, + ".rec": 11448, + "Ġspokes": 11449, + "ĠToday": 11450, + "istant": 11451, + "ĠHead": 11452, + "âĢĿ.": 11453, + "ertainment": 11454, + "cean": 11455, + "culate": 11456, + "Ġven": 11457, + "However": 11458, + "_arr": 11459, + "Ġtokens": 11460, + "Graph": 11461, + "ĠJud": 11462, + "ĠVirgin": 11463, + "ĠSerial": 11464, + "unning": 11465, + "Mutable": 11466, + "agers": 11467, + ".csv": 11468, + "Ġdeveloping": 11469, + "Ġinstructions": 11470, + "Ġpromise": 11471, + "Ġrequested": 11472, + "_encode": 11473, + "/\"": 11474, + "ĠIcon": 11475, + "uilt": 11476, + "-day": 11477, + "Ġintelligence": 11478, + ".IS": 11479, + "ĠObservable": 11480, + "ĠHard": 11481, + "Bool": 11482, + "211": 11483, + "idential": 11484, + ".Anchor": 11485, + "Ġselling": 11486, + "CI": 11487, + "AGES": 11488, + "tle": 11489, + "bur": 11490, + "UFFER": 11491, + "RY": 11492, + "Ġbigger": 11493, + "Ġrat": 11494, + "Ġfamous": 11495, + "Ġtypename": 11496, + "Ġexplained": 11497, + "}}Ċ": 11498, + "Ġnuclear": 11499, + "-N": 11500, + "Ġcrisis": 11501, + "ĠEnter": 11502, + "Ġanswers": 11503, + "/${": 11504, + "/pl": 11505, + "Ġsequ": 11506, + "_next": 11507, + "mask": 11508, + "Ġstanding": 11509, + "Ġplenty": 11510, + "ĠCross": 11511, + "ĉret": 11512, + "dro": 11513, + "ĠCast": 11514, + "167": 11515, + "=true": 11516, + "ĠChris": 11517, + "icio": 11518, + "ĠMike": 11519, + "Decimal": 11520, + "addComponent": 11521, + "Len": 11522, + "Ġcock": 11523, + "Ġ#{": 11524, + "URN": 11525, + "": 11657, + "Ġ*=": 11658, + "ĠPS": 11659, + "Ġdangerous": 11660, + "[p": 11661, + "OME": 11662, + "Other": 11663, + "ĠStringBuilder": 11664, + "Points": 11665, + "heading": 11666, + "Ġcurrency": 11667, + "Ġpercentage": 11668, + "_API": 11669, + "Ġclassic": 11670, + "thead": 11671, + "ĠMO": 11672, + "FE": 11673, + "Idx": 11674, + "await": 11675, + "Ġè": 11676, + "Ġaccident": 11677, + "Ġvariant": 11678, + "Ġmyst": 11679, + "ĠLand": 11680, + "ĠBre": 11681, + "Ġharm": 11682, + "ĠAcc": 11683, + "Ġcharged": 11684, + "iones": 11685, + "Visibility": 11686, + "arry": 11687, + "ĠLanguage": 11688, + "Ġwalking": 11689, + "\".ĊĊ": 11690, + "ifer": 11691, + "Ġleadership": 11692, + ".From": 11693, + "ynam": 11694, + "Ġtimestamp": 11695, + "ipt": 11696, + "ĠHas": 11697, + "REFER": 11698, + "ĠIts": 11699, + "Ġlistener": 11700, + "UTE": 11701, + "213": 11702, + "_description": 11703, + "Ġexperiences": 11704, + "Ġcreates": 11705, + "RS": 11706, + "cart": 11707, + "black": 11708, + "Ġchoices": 11709, + "war": 11710, + "750": 11711, + "Ġ'''": 11712, + "Ġordered": 11713, + "Ġevening": 11714, + "Ġpil": 11715, + "Ġtun": 11716, + "ĠBad": 11717, + "(app": 11718, + "random": 11719, + "Ġexplicit": 11720, + "Ġarrived": 11721, + "Ġfly": 11722, + "Ġeconom": 11723, + "-mail": 11724, + "Ġlists": 11725, + "Ġarchitect": 11726, + "234": 11727, + "ĠPay": 11728, + "Ġds": 11729, + "ĠSol": 11730, + "Ġvehicles": 11731, + "Hz": 11732, + "-com": 11733, + "Ġking": 11734, + "_equal": 11735, + "ĠHelp": 11736, + "Ġabuse": 11737, + "480": 11738, + "169": 11739, + "--;Ċ": 11740, + "Ġextr": 11741, + "Ġchemical": 11742, + "ä¿": 11743, + "Ġorient": 11744, + "Ġbreath": 11745, + "ĠSpace": 11746, + "(element": 11747, + "wait": 11748, + "DED": 11749, + "igma": 11750, + "Ġentr": 11751, + "Ġsob": 11752, + "-name": 11753, + "Ġaffected": 11754, + "ika": 11755, + "Ġcoal": 11756, + "_work": 11757, + "Ġhundreds": 11758, + "Ġpolitics": 11759, + "subject": 11760, + "Ġconsumer": 11761, + "ANGE": 11762, + "Ġrepeated": 11763, + "Send": 11764, + "Ġ#[": 11765, + "Ġprotocol": 11766, + "Ġleads": 11767, + "useum": 11768, + "Every": 11769, + "808": 11770, + "174": 11771, + "Import": 11772, + "(count": 11773, + "Ġchallenges": 11774, + "Ġnovel": 11775, + "Ġdepart": 11776, + "bits": 11777, + ".Current": 11778, + "Ġ`${": 11779, + "oting": 11780, + "(\\": 11781, + "Ġcreative": 11782, + "Ġbuff": 11783, + "Ġintroduced": 11784, + "usic": 11785, + "modules": 11786, + "Are": 11787, + "-doc": 11788, + "language": 11789, + "_cache": 11790, + "Ġtod": 11791, + "?>": 11792, + "omething": 11793, + "Ġhun": 11794, + "åº": 11795, + "aters": 11796, + "Intent": 11797, + "Ġimplemented": 11798, + "ĠCase": 11799, + "Children": 11800, + "Ġnotification": 11801, + "Renderer": 11802, + "Wrapper": 11803, + "Objects": 11804, + "tl": 11805, + ".Contains": 11806, + "Plugin": 11807, + ".row": 11808, + "Ġforg": 11809, + "Ġpermit": 11810, + "Ġtargets": 11811, + "ĠIF": 11812, + "Ġtip": 11813, + "sex": 11814, + "Ġsupports": 11815, + "Ġfold": 11816, + "photo": 11817, + "},čĊ": 11818, + "Ġgoogle": 11819, + "$('#": 11820, + "Ġsharing": 11821, + "Ġgoods": 11822, + "vs": 11823, + "ĠDan": 11824, + "Rate": 11825, + "ĠMartin": 11826, + "Ġmanner": 11827, + "lie": 11828, + ".The": 11829, + "Internal": 11830, + "ĠCONTR": 11831, + "Mock": 11832, + "RIGHT": 11833, + "Ġ'{": 11834, + "Ġcontrols": 11835, + "Mat": 11836, + "Ġmand": 11837, + "Ġextended": 11838, + "Ok": 11839, + "Ġembed": 11840, + "Ġplanet": 11841, + "ĠNon": 11842, + "-ch": 11843, + ")\",": 11844, + "epar": 11845, + "Ġbelieved": 11846, + "ĠEnvironment": 11847, + "ĠFriend": 11848, + "-res": 11849, + "Ġhandling": 11850, + "nic": 11851, + "-level": 11852, + "scri": 11853, + "Xml": 11854, + "BE": 11855, + "ungen": 11856, + "Ġalter": 11857, + "[idx": 11858, + "Pop": 11859, + "cam": 11860, + "Ġ(((": 11861, + "Ġshipping": 11862, + "Ġbattery": 11863, + "iddleware": 11864, + "MC": 11865, + "Ġimpl": 11866, + "otation": 11867, + "ĠLab": 11868, + "