diff --git a/checkpoint-12208/config.json b/checkpoint-12208/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-12208/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-12208/generation_config.json b/checkpoint-12208/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-12208/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-12208/model-00001-of-00007.safetensors b/checkpoint-12208/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..914be5cd3e32206da4f141053eee11451a4b2f41 --- /dev/null +++ b/checkpoint-12208/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6e41dc86657c1d2149738fb30989a534d038c0963531fbe129454b5636019e9 +size 4886466168 diff --git a/checkpoint-12208/model-00002-of-00007.safetensors b/checkpoint-12208/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-12208/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-12208/model-00003-of-00007.safetensors b/checkpoint-12208/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-12208/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-12208/model-00004-of-00007.safetensors b/checkpoint-12208/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-12208/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-12208/model-00005-of-00007.safetensors b/checkpoint-12208/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-12208/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-12208/model-00006-of-00007.safetensors b/checkpoint-12208/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..96e2384dc79a4c09d2c23cfb45bc1ed0575cfe8d --- /dev/null +++ b/checkpoint-12208/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:408c757804e15f394790e0cfec6017deb32b82842faab9eecf7ab9bcc32438c4 +size 4999813120 diff --git a/checkpoint-12208/model-00007-of-00007.safetensors b/checkpoint-12208/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b8afab9edb93995d47ee4f0121b293f7e996b1aa --- /dev/null +++ b/checkpoint-12208/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ee6209ba80ffef498bffbf5f595cd96fb8285b879825313838ee31452ff8a19 +size 2571158184 diff --git a/checkpoint-12208/model.safetensors.index.json b/checkpoint-12208/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-12208/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-12208/optimizer.pt b/checkpoint-12208/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e4919ebd9b5248421099f739ad41e393e8791a8 --- /dev/null +++ b/checkpoint-12208/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54483e5c3c8e397bc48b5f28fd24bcdc890e169b837ca1fe9b352ca2db48f80c +size 15385036334 diff --git a/checkpoint-12208/rng_state.pth b/checkpoint-12208/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-12208/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-12208/scheduler.pt b/checkpoint-12208/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed3b066b7a770b71a74f026fa108a814ac17f832 --- /dev/null +++ b/checkpoint-12208/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abe644ed33a3c4139223f0857a985127f3e6fbaa8c89fa14b57671b49ca52c21 +size 1064 diff --git a/checkpoint-12208/trainer_state.json b/checkpoint-12208/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..26a573156a093e7638afdb61ab0cac6c0ad5e8bc --- /dev/null +++ b/checkpoint-12208/trainer_state.json @@ -0,0 +1,2784 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.08560019352599454, + "eval_steps": 500, + "global_step": 12208, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021736615328520894, + "grad_norm": 5.709163665771484, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.9228, + "step": 31 + }, + { + "epoch": 0.0004347323065704179, + "grad_norm": 3.9758756160736084, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8276, + "step": 62 + }, + { + "epoch": 0.0006520984598556268, + "grad_norm": 3.721677303314209, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7897, + "step": 93 + }, + { + "epoch": 0.0008694646131408358, + "grad_norm": 3.384953498840332, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7641, + "step": 124 + }, + { + "epoch": 0.0010868307664260446, + "grad_norm": 3.19750714302063, + "learning_rate": 5.078636959370905e-06, + "loss": 0.758, + "step": 155 + }, + { + "epoch": 0.0013041969197112536, + "grad_norm": 3.49003267288208, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7644, + "step": 186 + }, + { + "epoch": 0.0015215630729964625, + "grad_norm": 3.323367118835449, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7606, + "step": 217 + }, + { + "epoch": 0.0017389292262816715, + "grad_norm": 3.465822219848633, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7505, + "step": 248 + }, + { + "epoch": 0.0019562953795668804, + "grad_norm": 2.7967450618743896, + "learning_rate": 9.141546526867629e-06, + "loss": 0.7593, + "step": 279 + }, + { + "epoch": 0.002173661532852089, + "grad_norm": 3.5493738651275635, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7514, + "step": 310 + }, + { + "epoch": 0.0023910276861372984, + "grad_norm": 3.514606237411499, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7497, + "step": 341 + }, + { + "epoch": 0.0026083938394225073, + "grad_norm": 3.0246028900146484, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7473, + "step": 372 + }, + { + "epoch": 0.002825759992707716, + "grad_norm": 3.01147723197937, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.739, + "step": 403 + }, + { + "epoch": 0.003043126145992925, + "grad_norm": 3.566333293914795, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7447, + "step": 434 + }, + { + "epoch": 0.003260492299278134, + "grad_norm": 3.8698947429656982, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7447, + "step": 465 + }, + { + "epoch": 0.003477858452563343, + "grad_norm": 2.567028760910034, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7496, + "step": 496 + }, + { + "epoch": 0.003695224605848552, + "grad_norm": 2.8029377460479736, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.7324, + "step": 527 + }, + { + "epoch": 0.003912590759133761, + "grad_norm": 2.862530469894409, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7371, + "step": 558 + }, + { + "epoch": 0.00412995691241897, + "grad_norm": 2.9063901901245117, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.7398, + "step": 589 + }, + { + "epoch": 0.004347323065704178, + "grad_norm": 3.187814235687256, + "learning_rate": 2.031454783748362e-05, + "loss": 0.7362, + "step": 620 + }, + { + "epoch": 0.004564689218989388, + "grad_norm": 2.6158969402313232, + "learning_rate": 2.13302752293578e-05, + "loss": 0.7405, + "step": 651 + }, + { + "epoch": 0.004782055372274597, + "grad_norm": 2.702423334121704, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7313, + "step": 682 + }, + { + "epoch": 0.004999421525559805, + "grad_norm": 2.88313889503479, + "learning_rate": 2.336173001310616e-05, + "loss": 0.7337, + "step": 713 + }, + { + "epoch": 0.0052167876788450146, + "grad_norm": 2.8978841304779053, + "learning_rate": 2.437745740498034e-05, + "loss": 0.73, + "step": 744 + }, + { + "epoch": 0.005434153832130223, + "grad_norm": 2.791414976119995, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.7297, + "step": 775 + }, + { + "epoch": 0.005651519985415432, + "grad_norm": 2.432821035385132, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7326, + "step": 806 + }, + { + "epoch": 0.0058688861387006415, + "grad_norm": 2.430279493331909, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.7333, + "step": 837 + }, + { + "epoch": 0.00608625229198585, + "grad_norm": 2.5513761043548584, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.7296, + "step": 868 + }, + { + "epoch": 0.006303618445271059, + "grad_norm": 2.478562116622925, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.7303, + "step": 899 + }, + { + "epoch": 0.006520984598556268, + "grad_norm": 2.5496723651885986, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.7303, + "step": 930 + }, + { + "epoch": 0.006738350751841477, + "grad_norm": 2.114060640335083, + "learning_rate": 3.148754914809961e-05, + "loss": 0.7184, + "step": 961 + }, + { + "epoch": 0.006955716905126686, + "grad_norm": 2.325977325439453, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.7268, + "step": 992 + }, + { + "epoch": 0.0071730830584118945, + "grad_norm": 2.0549662113189697, + "learning_rate": 3.351900393184797e-05, + "loss": 0.724, + "step": 1023 + }, + { + "epoch": 0.007390449211697104, + "grad_norm": 1.913522481918335, + "learning_rate": 3.453473132372215e-05, + "loss": 0.7206, + "step": 1054 + }, + { + "epoch": 0.007607815364982313, + "grad_norm": 2.0651443004608154, + "learning_rate": 3.555045871559633e-05, + "loss": 0.7239, + "step": 1085 + }, + { + "epoch": 0.007825181518267521, + "grad_norm": 2.2482309341430664, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.728, + "step": 1116 + }, + { + "epoch": 0.00804254767155273, + "grad_norm": 2.349695920944214, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.721, + "step": 1147 + }, + { + "epoch": 0.00825991382483794, + "grad_norm": 2.218843460083008, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.7248, + "step": 1178 + }, + { + "epoch": 0.00847727997812315, + "grad_norm": 2.0839340686798096, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.7233, + "step": 1209 + }, + { + "epoch": 0.008694646131408357, + "grad_norm": 2.0757343769073486, + "learning_rate": 4.062909567496724e-05, + "loss": 0.7166, + "step": 1240 + }, + { + "epoch": 0.008912012284693566, + "grad_norm": 2.052342653274536, + "learning_rate": 4.164482306684142e-05, + "loss": 0.7256, + "step": 1271 + }, + { + "epoch": 0.009129378437978775, + "grad_norm": 2.1202704906463623, + "learning_rate": 4.26605504587156e-05, + "loss": 0.7131, + "step": 1302 + }, + { + "epoch": 0.009346744591263985, + "grad_norm": 2.1945958137512207, + "learning_rate": 4.367627785058978e-05, + "loss": 0.708, + "step": 1333 + }, + { + "epoch": 0.009564110744549194, + "grad_norm": 2.106307029724121, + "learning_rate": 4.469200524246396e-05, + "loss": 0.7139, + "step": 1364 + }, + { + "epoch": 0.009781476897834401, + "grad_norm": 2.3779594898223877, + "learning_rate": 4.570773263433814e-05, + "loss": 0.7205, + "step": 1395 + }, + { + "epoch": 0.00999884305111961, + "grad_norm": 2.001551866531372, + "learning_rate": 4.672346002621232e-05, + "loss": 0.7162, + "step": 1426 + }, + { + "epoch": 0.01021620920440482, + "grad_norm": 2.6994752883911133, + "learning_rate": 4.77391874180865e-05, + "loss": 0.7215, + "step": 1457 + }, + { + "epoch": 0.010433575357690029, + "grad_norm": 2.071122884750366, + "learning_rate": 4.875491480996068e-05, + "loss": 0.7095, + "step": 1488 + }, + { + "epoch": 0.010650941510975238, + "grad_norm": 2.3666610717773438, + "learning_rate": 4.977064220183487e-05, + "loss": 0.7089, + "step": 1519 + }, + { + "epoch": 0.010868307664260446, + "grad_norm": 2.130204677581787, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.7043, + "step": 1550 + }, + { + "epoch": 0.011085673817545655, + "grad_norm": 1.7413716316223145, + "learning_rate": 4.999955597496219e-05, + "loss": 0.7062, + "step": 1581 + }, + { + "epoch": 0.011303039970830864, + "grad_norm": 1.7087843418121338, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.7175, + "step": 1612 + }, + { + "epoch": 0.011520406124116074, + "grad_norm": 2.574871301651001, + "learning_rate": 4.999799067923527e-05, + "loss": 0.7076, + "step": 1643 + }, + { + "epoch": 0.011737772277401283, + "grad_norm": 2.0530433654785156, + "learning_rate": 4.999678487776908e-05, + "loss": 0.708, + "step": 1674 + }, + { + "epoch": 0.011955138430686492, + "grad_norm": 1.9184463024139404, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.7011, + "step": 1705 + }, + { + "epoch": 0.0121725045839717, + "grad_norm": 1.768115520477295, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6995, + "step": 1736 + }, + { + "epoch": 0.012389870737256909, + "grad_norm": 1.6692063808441162, + "learning_rate": 4.999147503179668e-05, + "loss": 0.7019, + "step": 1767 + }, + { + "epoch": 0.012607236890542118, + "grad_norm": 1.803944706916809, + "learning_rate": 4.998914100252672e-05, + "loss": 0.703, + "step": 1798 + }, + { + "epoch": 0.012824603043827328, + "grad_norm": 1.8957322835922241, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6979, + "step": 1829 + }, + { + "epoch": 0.013041969197112537, + "grad_norm": 1.544054627418518, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.7012, + "step": 1860 + }, + { + "epoch": 0.013259335350397744, + "grad_norm": 1.7066351175308228, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6949, + "step": 1891 + }, + { + "epoch": 0.013476701503682954, + "grad_norm": 1.7045214176177979, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.6871, + "step": 1922 + }, + { + "epoch": 0.013694067656968163, + "grad_norm": 2.0414485931396484, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6944, + "step": 1953 + }, + { + "epoch": 0.013911433810253372, + "grad_norm": 1.9210485219955444, + "learning_rate": 4.99692159912661e-05, + "loss": 0.7013, + "step": 1984 + }, + { + "epoch": 0.014128799963538581, + "grad_norm": 1.8733997344970703, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6991, + "step": 2015 + }, + { + "epoch": 0.014346166116823789, + "grad_norm": 1.7118934392929077, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6873, + "step": 2046 + }, + { + "epoch": 0.014563532270108998, + "grad_norm": 1.889208197593689, + "learning_rate": 4.995544899210594e-05, + "loss": 0.692, + "step": 2077 + }, + { + "epoch": 0.014780898423394207, + "grad_norm": 2.0081522464752197, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.6823, + "step": 2108 + }, + { + "epoch": 0.014998264576679417, + "grad_norm": 1.7950328588485718, + "learning_rate": 4.994486281210429e-05, + "loss": 0.686, + "step": 2139 + }, + { + "epoch": 0.015215630729964626, + "grad_norm": 1.758333444595337, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6827, + "step": 2170 + }, + { + "epoch": 0.015432996883249834, + "grad_norm": 1.496063470840454, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6896, + "step": 2201 + }, + { + "epoch": 0.015650363036535043, + "grad_norm": 1.6103190183639526, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6865, + "step": 2232 + }, + { + "epoch": 0.015867729189820254, + "grad_norm": 1.7069604396820068, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6836, + "step": 2263 + }, + { + "epoch": 0.01608509534310546, + "grad_norm": 1.8261148929595947, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.692, + "step": 2294 + }, + { + "epoch": 0.01630246149639067, + "grad_norm": 1.7522642612457275, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6832, + "step": 2325 + }, + { + "epoch": 0.01651982764967588, + "grad_norm": 1.6746042966842651, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6829, + "step": 2356 + }, + { + "epoch": 0.016737193802961087, + "grad_norm": 1.6785473823547363, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6842, + "step": 2387 + }, + { + "epoch": 0.0169545599562463, + "grad_norm": 1.936916470527649, + "learning_rate": 4.988329891293693e-05, + "loss": 0.6798, + "step": 2418 + }, + { + "epoch": 0.017171926109531506, + "grad_norm": 1.5104129314422607, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6743, + "step": 2449 + }, + { + "epoch": 0.017389292262816713, + "grad_norm": 1.3770678043365479, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.6874, + "step": 2480 + }, + { + "epoch": 0.017606658416101924, + "grad_norm": 1.4900861978530884, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.679, + "step": 2511 + }, + { + "epoch": 0.017824024569387132, + "grad_norm": 1.5600172281265259, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6719, + "step": 2542 + }, + { + "epoch": 0.018041390722672343, + "grad_norm": 1.660237431526184, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6773, + "step": 2573 + }, + { + "epoch": 0.01825875687595755, + "grad_norm": 1.4618791341781616, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6767, + "step": 2604 + }, + { + "epoch": 0.018476123029242758, + "grad_norm": 1.5567563772201538, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6706, + "step": 2635 + }, + { + "epoch": 0.01869348918252797, + "grad_norm": 1.4879790544509888, + "learning_rate": 4.980947995086024e-05, + "loss": 0.676, + "step": 2666 + }, + { + "epoch": 0.018910855335813177, + "grad_norm": 1.5527766942977905, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6734, + "step": 2697 + }, + { + "epoch": 0.019128221489098388, + "grad_norm": 1.7922642230987549, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6813, + "step": 2728 + }, + { + "epoch": 0.019345587642383595, + "grad_norm": 1.6453475952148438, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6749, + "step": 2759 + }, + { + "epoch": 0.019562953795668803, + "grad_norm": 1.4713648557662964, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6761, + "step": 2790 + }, + { + "epoch": 0.019780319948954014, + "grad_norm": 1.5513399839401245, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6739, + "step": 2821 + }, + { + "epoch": 0.01999768610223922, + "grad_norm": 1.431796908378601, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6639, + "step": 2852 + }, + { + "epoch": 0.020215052255524432, + "grad_norm": 1.4959752559661865, + "learning_rate": 4.973018858007122e-05, + "loss": 0.662, + "step": 2883 + }, + { + "epoch": 0.02043241840880964, + "grad_norm": 1.4675205945968628, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6659, + "step": 2914 + }, + { + "epoch": 0.020649784562094847, + "grad_norm": 1.7111692428588867, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6732, + "step": 2945 + }, + { + "epoch": 0.020867150715380058, + "grad_norm": 1.542748212814331, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6666, + "step": 2976 + }, + { + "epoch": 0.021084516868665266, + "grad_norm": 1.5013272762298584, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.667, + "step": 3007 + }, + { + "epoch": 0.021301883021950477, + "grad_norm": 1.57589852809906, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6668, + "step": 3038 + }, + { + "epoch": 0.021519249175235684, + "grad_norm": 1.612727403640747, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6653, + "step": 3069 + }, + { + "epoch": 0.021736615328520892, + "grad_norm": 1.5110771656036377, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6694, + "step": 3100 + }, + { + "epoch": 0.021953981481806103, + "grad_norm": 1.5836044549942017, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6634, + "step": 3131 + }, + { + "epoch": 0.02217134763509131, + "grad_norm": 1.5767654180526733, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6633, + "step": 3162 + }, + { + "epoch": 0.02238871378837652, + "grad_norm": 1.6058984994888306, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6646, + "step": 3193 + }, + { + "epoch": 0.02260607994166173, + "grad_norm": 1.4564005136489868, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.663, + "step": 3224 + }, + { + "epoch": 0.022823446094946936, + "grad_norm": 1.5274450778961182, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6554, + "step": 3255 + }, + { + "epoch": 0.023040812248232147, + "grad_norm": 1.524122953414917, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6666, + "step": 3286 + }, + { + "epoch": 0.023258178401517355, + "grad_norm": 1.5017430782318115, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6661, + "step": 3317 + }, + { + "epoch": 0.023475544554802566, + "grad_norm": 1.4247208833694458, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6563, + "step": 3348 + }, + { + "epoch": 0.023692910708087773, + "grad_norm": 1.3451093435287476, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6555, + "step": 3379 + }, + { + "epoch": 0.023910276861372984, + "grad_norm": 1.418442726135254, + "learning_rate": 4.948079823064559e-05, + "loss": 0.6587, + "step": 3410 + }, + { + "epoch": 0.024127643014658192, + "grad_norm": 1.5460575819015503, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6602, + "step": 3441 + }, + { + "epoch": 0.0243450091679434, + "grad_norm": 1.4053966999053955, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6578, + "step": 3472 + }, + { + "epoch": 0.02456237532122861, + "grad_norm": 1.420806646347046, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6544, + "step": 3503 + }, + { + "epoch": 0.024779741474513818, + "grad_norm": 1.3220508098602295, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6642, + "step": 3534 + }, + { + "epoch": 0.02499710762779903, + "grad_norm": 1.3819468021392822, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6534, + "step": 3565 + }, + { + "epoch": 0.025214473781084237, + "grad_norm": 1.4046236276626587, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6547, + "step": 3596 + }, + { + "epoch": 0.025431839934369444, + "grad_norm": 1.965153455734253, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6582, + "step": 3627 + }, + { + "epoch": 0.025649206087654655, + "grad_norm": 1.6758291721343994, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6536, + "step": 3658 + }, + { + "epoch": 0.025866572240939863, + "grad_norm": 1.4467246532440186, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6564, + "step": 3689 + }, + { + "epoch": 0.026083938394225074, + "grad_norm": 1.5305490493774414, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6626, + "step": 3720 + }, + { + "epoch": 0.02630130454751028, + "grad_norm": 1.3826133012771606, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6624, + "step": 3751 + }, + { + "epoch": 0.02651867070079549, + "grad_norm": 1.4920278787612915, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6417, + "step": 3782 + }, + { + "epoch": 0.0267360368540807, + "grad_norm": 14.533783912658691, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.647, + "step": 3813 + }, + { + "epoch": 0.026953403007365907, + "grad_norm": 1.4659409523010254, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6514, + "step": 3844 + }, + { + "epoch": 0.02717076916065112, + "grad_norm": 1.4820138216018677, + "learning_rate": 4.919443027766935e-05, + "loss": 0.6477, + "step": 3875 + }, + { + "epoch": 0.027388135313936326, + "grad_norm": 1.2543880939483643, + "learning_rate": 4.91731463569418e-05, + "loss": 0.6505, + "step": 3906 + }, + { + "epoch": 0.027605501467221533, + "grad_norm": 1.640926718711853, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6488, + "step": 3937 + }, + { + "epoch": 0.027822867620506744, + "grad_norm": 1.3990615606307983, + "learning_rate": 4.912976038673786e-05, + "loss": 0.658, + "step": 3968 + }, + { + "epoch": 0.028040233773791952, + "grad_norm": 1.2064067125320435, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6487, + "step": 3999 + }, + { + "epoch": 0.028257599927077163, + "grad_norm": 1.5103769302368164, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6569, + "step": 4030 + }, + { + "epoch": 0.02847496608036237, + "grad_norm": 1.5314087867736816, + "learning_rate": 4.906263980464644e-05, + "loss": 0.6638, + "step": 4061 + }, + { + "epoch": 0.028692332233647578, + "grad_norm": 1.4619168043136597, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6505, + "step": 4092 + }, + { + "epoch": 0.02890969838693279, + "grad_norm": 1.3979246616363525, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6505, + "step": 4123 + }, + { + "epoch": 0.029127064540217996, + "grad_norm": 1.3866580724716187, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6461, + "step": 4154 + }, + { + "epoch": 0.029344430693503207, + "grad_norm": 1.4662801027297974, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6448, + "step": 4185 + }, + { + "epoch": 0.029561796846788415, + "grad_norm": 1.3171806335449219, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6525, + "step": 4216 + }, + { + "epoch": 0.029779163000073623, + "grad_norm": 1.3289718627929688, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6445, + "step": 4247 + }, + { + "epoch": 0.029996529153358834, + "grad_norm": 1.3527250289916992, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6449, + "step": 4278 + }, + { + "epoch": 0.03021389530664404, + "grad_norm": 1.314674735069275, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6559, + "step": 4309 + }, + { + "epoch": 0.030431261459929252, + "grad_norm": 1.2237507104873657, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6468, + "step": 4340 + }, + { + "epoch": 0.03064862761321446, + "grad_norm": 1.423965573310852, + "learning_rate": 4.882129447892753e-05, + "loss": 0.6465, + "step": 4371 + }, + { + "epoch": 0.030865993766499667, + "grad_norm": 1.271371603012085, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6471, + "step": 4402 + }, + { + "epoch": 0.031083359919784878, + "grad_norm": 1.5492364168167114, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6444, + "step": 4433 + }, + { + "epoch": 0.031300726073070086, + "grad_norm": 1.5139328241348267, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6512, + "step": 4464 + }, + { + "epoch": 0.03151809222635529, + "grad_norm": 1.4510358572006226, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6485, + "step": 4495 + }, + { + "epoch": 0.03173545837964051, + "grad_norm": 1.3228867053985596, + "learning_rate": 4.869052379269719e-05, + "loss": 0.6416, + "step": 4526 + }, + { + "epoch": 0.031952824532925715, + "grad_norm": 1.2731959819793701, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6477, + "step": 4557 + }, + { + "epoch": 0.03217019068621092, + "grad_norm": 1.2540090084075928, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6412, + "step": 4588 + }, + { + "epoch": 0.03238755683949613, + "grad_norm": 1.662154197692871, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6464, + "step": 4619 + }, + { + "epoch": 0.03260492299278134, + "grad_norm": 1.5419702529907227, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6421, + "step": 4650 + }, + { + "epoch": 0.03282228914606655, + "grad_norm": 1.6409112215042114, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6468, + "step": 4681 + }, + { + "epoch": 0.03303965529935176, + "grad_norm": 1.1416597366333008, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6396, + "step": 4712 + }, + { + "epoch": 0.03325702145263697, + "grad_norm": 1.215846061706543, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6517, + "step": 4743 + }, + { + "epoch": 0.033474387605922175, + "grad_norm": 1.3075084686279297, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6389, + "step": 4774 + }, + { + "epoch": 0.03369175375920738, + "grad_norm": 1.6089972257614136, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6371, + "step": 4805 + }, + { + "epoch": 0.0339091199124926, + "grad_norm": 1.3927685022354126, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6355, + "step": 4836 + }, + { + "epoch": 0.034126486065777804, + "grad_norm": 1.197952389717102, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6415, + "step": 4867 + }, + { + "epoch": 0.03434385221906301, + "grad_norm": 1.2738877534866333, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6442, + "step": 4898 + }, + { + "epoch": 0.03456121837234822, + "grad_norm": 1.5164271593093872, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6406, + "step": 4929 + }, + { + "epoch": 0.03477858452563343, + "grad_norm": 1.242473840713501, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6441, + "step": 4960 + }, + { + "epoch": 0.03499595067891864, + "grad_norm": 1.2919869422912598, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6487, + "step": 4991 + }, + { + "epoch": 0.03521331683220385, + "grad_norm": 2.6493895053863525, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6398, + "step": 5022 + }, + { + "epoch": 0.035430682985489056, + "grad_norm": 1.2134305238723755, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6398, + "step": 5053 + }, + { + "epoch": 0.035648049138774264, + "grad_norm": 1.1468703746795654, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.633, + "step": 5084 + }, + { + "epoch": 0.03586541529205947, + "grad_norm": 1.3923726081848145, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6295, + "step": 5115 + }, + { + "epoch": 0.036082781445344686, + "grad_norm": 1.5531644821166992, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6345, + "step": 5146 + }, + { + "epoch": 0.036300147598629894, + "grad_norm": 1.201889991760254, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6414, + "step": 5177 + }, + { + "epoch": 0.0365175137519151, + "grad_norm": 1.4365577697753906, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6364, + "step": 5208 + }, + { + "epoch": 0.03673487990520031, + "grad_norm": 1.210980772972107, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6353, + "step": 5239 + }, + { + "epoch": 0.036952246058485516, + "grad_norm": 1.379381775856018, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6293, + "step": 5270 + }, + { + "epoch": 0.03716961221177073, + "grad_norm": 1.2263178825378418, + "learning_rate": 4.793722210363262e-05, + "loss": 0.629, + "step": 5301 + }, + { + "epoch": 0.03738697836505594, + "grad_norm": 1.2448405027389526, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6474, + "step": 5332 + }, + { + "epoch": 0.037604344518341146, + "grad_norm": 1.257132887840271, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6426, + "step": 5363 + }, + { + "epoch": 0.03782171067162635, + "grad_norm": 1.3763643503189087, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6396, + "step": 5394 + }, + { + "epoch": 0.03803907682491156, + "grad_norm": 1.3164068460464478, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6343, + "step": 5425 + }, + { + "epoch": 0.038256442978196775, + "grad_norm": 1.2034872770309448, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.629, + "step": 5456 + }, + { + "epoch": 0.03847380913148198, + "grad_norm": 1.3156630992889404, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6403, + "step": 5487 + }, + { + "epoch": 0.03869117528476719, + "grad_norm": 1.0719150304794312, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6318, + "step": 5518 + }, + { + "epoch": 0.0389085414380524, + "grad_norm": 1.3054882287979126, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6419, + "step": 5549 + }, + { + "epoch": 0.039125907591337605, + "grad_norm": 1.2081729173660278, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6461, + "step": 5580 + }, + { + "epoch": 0.03934327374462282, + "grad_norm": 1.1728904247283936, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6473, + "step": 5611 + }, + { + "epoch": 0.03956063989790803, + "grad_norm": 1.2552399635314941, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.6373, + "step": 5642 + }, + { + "epoch": 0.039778006051193235, + "grad_norm": 1.299212098121643, + "learning_rate": 4.751783684659e-05, + "loss": 0.6234, + "step": 5673 + }, + { + "epoch": 0.03999537220447844, + "grad_norm": 2.0746827125549316, + "learning_rate": 4.748122674890348e-05, + "loss": 0.6264, + "step": 5704 + }, + { + "epoch": 0.04021273835776365, + "grad_norm": 1.3712407350540161, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6252, + "step": 5735 + }, + { + "epoch": 0.040430104511048864, + "grad_norm": 1.2094186544418335, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6265, + "step": 5766 + }, + { + "epoch": 0.04064747066433407, + "grad_norm": 1.2487757205963135, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6305, + "step": 5797 + }, + { + "epoch": 0.04086483681761928, + "grad_norm": 1.5805151462554932, + "learning_rate": 4.733225355658999e-05, + "loss": 0.631, + "step": 5828 + }, + { + "epoch": 0.04108220297090449, + "grad_norm": 1.2615118026733398, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.636, + "step": 5859 + }, + { + "epoch": 0.041299569124189695, + "grad_norm": 1.6970707178115845, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6271, + "step": 5890 + }, + { + "epoch": 0.04151693527747491, + "grad_norm": 1.4279624223709106, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6267, + "step": 5921 + }, + { + "epoch": 0.041734301430760117, + "grad_norm": 1.3471580743789673, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6289, + "step": 5952 + }, + { + "epoch": 0.041951667584045324, + "grad_norm": 1.5088621377944946, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6263, + "step": 5983 + }, + { + "epoch": 0.04216903373733053, + "grad_norm": 1.3061436414718628, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6266, + "step": 6014 + }, + { + "epoch": 0.04238639989061574, + "grad_norm": 1.1800014972686768, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6259, + "step": 6045 + }, + { + "epoch": 0.042603766043900954, + "grad_norm": 1.195177435874939, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6251, + "step": 6076 + }, + { + "epoch": 0.04282113219718616, + "grad_norm": 1.1905118227005005, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6217, + "step": 6107 + }, + { + "epoch": 0.04303849835047137, + "grad_norm": 1.2270928621292114, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6352, + "step": 6138 + }, + { + "epoch": 0.043255864503756576, + "grad_norm": 1.209226369857788, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6249, + "step": 6169 + }, + { + "epoch": 0.043473230657041784, + "grad_norm": 1.1949187517166138, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6371, + "step": 6200 + }, + { + "epoch": 0.043690596810327, + "grad_norm": 1.2346535921096802, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6177, + "step": 6231 + }, + { + "epoch": 0.043907962963612206, + "grad_norm": 1.2187124490737915, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6268, + "step": 6262 + }, + { + "epoch": 0.04412532911689741, + "grad_norm": 1.2187339067459106, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6261, + "step": 6293 + }, + { + "epoch": 0.04434269527018262, + "grad_norm": 1.320764422416687, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6212, + "step": 6324 + }, + { + "epoch": 0.04456006142346783, + "grad_norm": 1.3396878242492676, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.6241, + "step": 6355 + }, + { + "epoch": 0.04477742757675304, + "grad_norm": 1.2472412586212158, + "learning_rate": 4.661238099862658e-05, + "loss": 0.6241, + "step": 6386 + }, + { + "epoch": 0.04499479373003825, + "grad_norm": 1.3773880004882812, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6245, + "step": 6417 + }, + { + "epoch": 0.04521215988332346, + "grad_norm": 1.1602933406829834, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.6221, + "step": 6448 + }, + { + "epoch": 0.045429526036608665, + "grad_norm": 1.1719809770584106, + "learning_rate": 4.648464661063478e-05, + "loss": 0.629, + "step": 6479 + }, + { + "epoch": 0.04564689218989387, + "grad_norm": 1.1973191499710083, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.6239, + "step": 6510 + }, + { + "epoch": 0.04586425834317909, + "grad_norm": 1.1805142164230347, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.6242, + "step": 6541 + }, + { + "epoch": 0.046081624496464295, + "grad_norm": 1.3194178342819214, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6316, + "step": 6572 + }, + { + "epoch": 0.0462989906497495, + "grad_norm": 1.1395046710968018, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6207, + "step": 6603 + }, + { + "epoch": 0.04651635680303471, + "grad_norm": 1.5031641721725464, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6218, + "step": 6634 + }, + { + "epoch": 0.04673372295631992, + "grad_norm": 1.0985206365585327, + "learning_rate": 4.622264489304762e-05, + "loss": 0.6198, + "step": 6665 + }, + { + "epoch": 0.04695108910960513, + "grad_norm": 1.2820847034454346, + "learning_rate": 4.617813681048434e-05, + "loss": 0.626, + "step": 6696 + }, + { + "epoch": 0.04716845526289034, + "grad_norm": 1.4487061500549316, + "learning_rate": 4.61333897355256e-05, + "loss": 0.622, + "step": 6727 + }, + { + "epoch": 0.04738582141617555, + "grad_norm": 1.1577301025390625, + "learning_rate": 4.608840417313604e-05, + "loss": 0.6244, + "step": 6758 + }, + { + "epoch": 0.047603187569460755, + "grad_norm": 1.1363381147384644, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6206, + "step": 6789 + }, + { + "epoch": 0.04782055372274597, + "grad_norm": 1.22281813621521, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6231, + "step": 6820 + }, + { + "epoch": 0.048037919876031177, + "grad_norm": 1.156031847000122, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6223, + "step": 6851 + }, + { + "epoch": 0.048255286029316384, + "grad_norm": 1.1688473224639893, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6293, + "step": 6882 + }, + { + "epoch": 0.04847265218260159, + "grad_norm": 1.4265236854553223, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6109, + "step": 6913 + }, + { + "epoch": 0.0486900183358868, + "grad_norm": 1.1242969036102295, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6256, + "step": 6944 + }, + { + "epoch": 0.048907384489172014, + "grad_norm": 1.3943792581558228, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6093, + "step": 6975 + }, + { + "epoch": 0.04912475064245722, + "grad_norm": 1.0761889219284058, + "learning_rate": 4.571999560773736e-05, + "loss": 0.6207, + "step": 7006 + }, + { + "epoch": 0.04934211679574243, + "grad_norm": 1.2784191370010376, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6227, + "step": 7037 + }, + { + "epoch": 0.049559482949027636, + "grad_norm": 1.4215577840805054, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6148, + "step": 7068 + }, + { + "epoch": 0.049776849102312844, + "grad_norm": 1.1120682954788208, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.6192, + "step": 7099 + }, + { + "epoch": 0.04999421525559806, + "grad_norm": 1.1718186140060425, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6179, + "step": 7130 + }, + { + "epoch": 0.050211581408883266, + "grad_norm": 1.5078628063201904, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6245, + "step": 7161 + }, + { + "epoch": 0.05042894756216847, + "grad_norm": 1.1740144491195679, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6242, + "step": 7192 + }, + { + "epoch": 0.05064631371545368, + "grad_norm": 1.2450133562088013, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6194, + "step": 7223 + }, + { + "epoch": 0.05086367986873889, + "grad_norm": 1.181887149810791, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6188, + "step": 7254 + }, + { + "epoch": 0.0510810460220241, + "grad_norm": 1.172691822052002, + "learning_rate": 4.528766329570536e-05, + "loss": 0.6191, + "step": 7285 + }, + { + "epoch": 0.05129841217530931, + "grad_norm": 1.148863673210144, + "learning_rate": 4.523847434837447e-05, + "loss": 0.6234, + "step": 7316 + }, + { + "epoch": 0.05151577832859452, + "grad_norm": 1.2533507347106934, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6166, + "step": 7347 + }, + { + "epoch": 0.051733144481879725, + "grad_norm": 1.1888121366500854, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6176, + "step": 7378 + }, + { + "epoch": 0.05195051063516493, + "grad_norm": 1.0766541957855225, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6137, + "step": 7409 + }, + { + "epoch": 0.05216787678845015, + "grad_norm": 1.2277822494506836, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6231, + "step": 7440 + }, + { + "epoch": 0.052385242941735355, + "grad_norm": 1.0808931589126587, + "learning_rate": 4.498911497712155e-05, + "loss": 0.6209, + "step": 7471 + }, + { + "epoch": 0.05260260909502056, + "grad_norm": 1.0840133428573608, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6144, + "step": 7502 + }, + { + "epoch": 0.05281997524830577, + "grad_norm": 1.5881050825119019, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6114, + "step": 7533 + }, + { + "epoch": 0.05303734140159098, + "grad_norm": 1.1180490255355835, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.6134, + "step": 7564 + }, + { + "epoch": 0.05325470755487619, + "grad_norm": 1.1469063758850098, + "learning_rate": 4.478556380141218e-05, + "loss": 0.6122, + "step": 7595 + }, + { + "epoch": 0.0534720737081614, + "grad_norm": 1.2977004051208496, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.615, + "step": 7626 + }, + { + "epoch": 0.05368943986144661, + "grad_norm": 1.0556434392929077, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.6187, + "step": 7657 + }, + { + "epoch": 0.053906806014731815, + "grad_norm": 1.101298451423645, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6123, + "step": 7688 + }, + { + "epoch": 0.05412417216801702, + "grad_norm": 1.262608528137207, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6149, + "step": 7719 + }, + { + "epoch": 0.05434153832130224, + "grad_norm": 1.1554538011550903, + "learning_rate": 4.452610552959183e-05, + "loss": 0.6168, + "step": 7750 + }, + { + "epoch": 0.054558904474587444, + "grad_norm": 1.1653157472610474, + "learning_rate": 4.447355047201428e-05, + "loss": 0.615, + "step": 7781 + }, + { + "epoch": 0.05477627062787265, + "grad_norm": 1.0511231422424316, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.6215, + "step": 7812 + }, + { + "epoch": 0.05499363678115786, + "grad_norm": 1.3468266725540161, + "learning_rate": 4.436778168330484e-05, + "loss": 0.6221, + "step": 7843 + }, + { + "epoch": 0.05521100293444307, + "grad_norm": 1.3238797187805176, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.611, + "step": 7874 + }, + { + "epoch": 0.05542836908772828, + "grad_norm": 1.1107763051986694, + "learning_rate": 4.42611386459262e-05, + "loss": 0.6119, + "step": 7905 + }, + { + "epoch": 0.05564573524101349, + "grad_norm": 1.3667259216308594, + "learning_rate": 4.420749078676133e-05, + "loss": 0.6134, + "step": 7936 + }, + { + "epoch": 0.055863101394298696, + "grad_norm": 1.0336949825286865, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.613, + "step": 7967 + }, + { + "epoch": 0.056080467547583904, + "grad_norm": 1.0911316871643066, + "learning_rate": 4.409954541451762e-05, + "loss": 0.6107, + "step": 7998 + }, + { + "epoch": 0.05629783370086911, + "grad_norm": 1.2516382932662964, + "learning_rate": 4.404524911958764e-05, + "loss": 0.6138, + "step": 8029 + }, + { + "epoch": 0.056515199854154326, + "grad_norm": 1.1678277254104614, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6162, + "step": 8060 + }, + { + "epoch": 0.05673256600743953, + "grad_norm": 1.4072996377944946, + "learning_rate": 4.393601237573607e-05, + "loss": 0.614, + "step": 8091 + }, + { + "epoch": 0.05694993216072474, + "grad_norm": 1.2312837839126587, + "learning_rate": 4.388107315953628e-05, + "loss": 0.6203, + "step": 8122 + }, + { + "epoch": 0.05716729831400995, + "grad_norm": 1.1720649003982544, + "learning_rate": 4.382592087299212e-05, + "loss": 0.619, + "step": 8153 + }, + { + "epoch": 0.057384664467295156, + "grad_norm": 1.0711950063705444, + "learning_rate": 4.377055613848964e-05, + "loss": 0.6124, + "step": 8184 + }, + { + "epoch": 0.05760203062058037, + "grad_norm": 1.1664263010025024, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6073, + "step": 8215 + }, + { + "epoch": 0.05781939677386558, + "grad_norm": 0.9937831163406372, + "learning_rate": 4.365919182713416e-05, + "loss": 0.6101, + "step": 8246 + }, + { + "epoch": 0.058036762927150785, + "grad_norm": 1.0545841455459595, + "learning_rate": 4.360319350701226e-05, + "loss": 0.6089, + "step": 8277 + }, + { + "epoch": 0.05825412908043599, + "grad_norm": 1.120007038116455, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6078, + "step": 8308 + }, + { + "epoch": 0.0584714952337212, + "grad_norm": 1.1087831258773804, + "learning_rate": 4.349056769754021e-05, + "loss": 0.601, + "step": 8339 + }, + { + "epoch": 0.058688861387006415, + "grad_norm": 1.1370675563812256, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.6103, + "step": 8370 + }, + { + "epoch": 0.05890622754029162, + "grad_norm": 1.182654857635498, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.6115, + "step": 8401 + }, + { + "epoch": 0.05912359369357683, + "grad_norm": 1.2606432437896729, + "learning_rate": 4.332006561018488e-05, + "loss": 0.6139, + "step": 8432 + }, + { + "epoch": 0.05934095984686204, + "grad_norm": 1.1618047952651978, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.6053, + "step": 8463 + }, + { + "epoch": 0.059558326000147245, + "grad_norm": 1.1281821727752686, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.6084, + "step": 8494 + }, + { + "epoch": 0.05977569215343246, + "grad_norm": 1.15248703956604, + "learning_rate": 4.314770288177384e-05, + "loss": 0.6128, + "step": 8525 + }, + { + "epoch": 0.05999305830671767, + "grad_norm": 1.2170960903167725, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6134, + "step": 8556 + }, + { + "epoch": 0.060210424460002875, + "grad_norm": 1.085260033607483, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.6212, + "step": 8587 + }, + { + "epoch": 0.06042779061328808, + "grad_norm": 1.2243624925613403, + "learning_rate": 4.297349701798505e-05, + "loss": 0.605, + "step": 8618 + }, + { + "epoch": 0.06064515676657329, + "grad_norm": 1.124769687652588, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.6074, + "step": 8649 + }, + { + "epoch": 0.060862522919858504, + "grad_norm": 1.893545150756836, + "learning_rate": 4.285634454093198e-05, + "loss": 0.6055, + "step": 8680 + }, + { + "epoch": 0.06107988907314371, + "grad_norm": 1.2186870574951172, + "learning_rate": 4.279746571169086e-05, + "loss": 0.6121, + "step": 8711 + }, + { + "epoch": 0.06129725522642892, + "grad_norm": 1.0006957054138184, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.6079, + "step": 8742 + }, + { + "epoch": 0.06151462137971413, + "grad_norm": 1.0913995504379272, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.6107, + "step": 8773 + }, + { + "epoch": 0.061731987532999334, + "grad_norm": 1.4647611379623413, + "learning_rate": 4.261962684116106e-05, + "loss": 0.6188, + "step": 8804 + }, + { + "epoch": 0.06194935368628455, + "grad_norm": 1.0805052518844604, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.6137, + "step": 8835 + }, + { + "epoch": 0.062166719839569756, + "grad_norm": 1.0512675046920776, + "learning_rate": 4.250007230372134e-05, + "loss": 0.6052, + "step": 8866 + }, + { + "epoch": 0.062384085992854964, + "grad_norm": 1.1758863925933838, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.6148, + "step": 8897 + }, + { + "epoch": 0.06260145214614017, + "grad_norm": 1.0526351928710938, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.5973, + "step": 8928 + }, + { + "epoch": 0.06281881829942539, + "grad_norm": 1.077563762664795, + "learning_rate": 4.231926105340768e-05, + "loss": 0.6114, + "step": 8959 + }, + { + "epoch": 0.06303618445271059, + "grad_norm": 1.0116938352584839, + "learning_rate": 4.225859883654776e-05, + "loss": 0.6146, + "step": 8990 + }, + { + "epoch": 0.0632535506059958, + "grad_norm": 1.1967271566390991, + "learning_rate": 4.219774185874569e-05, + "loss": 0.5972, + "step": 9021 + }, + { + "epoch": 0.06347091675928102, + "grad_norm": 1.1610004901885986, + "learning_rate": 4.213669080676418e-05, + "loss": 0.6081, + "step": 9052 + }, + { + "epoch": 0.06368828291256622, + "grad_norm": 1.0253360271453857, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.6118, + "step": 9083 + }, + { + "epoch": 0.06390564906585143, + "grad_norm": 1.140599012374878, + "learning_rate": 4.201400923825648e-05, + "loss": 0.6091, + "step": 9114 + }, + { + "epoch": 0.06412301521913663, + "grad_norm": 1.0821545124053955, + "learning_rate": 4.195238010617511e-05, + "loss": 0.6028, + "step": 9145 + }, + { + "epoch": 0.06434038137242185, + "grad_norm": 1.2161924839019775, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6113, + "step": 9176 + }, + { + "epoch": 0.06455774752570706, + "grad_norm": 1.128986120223999, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.6125, + "step": 9207 + }, + { + "epoch": 0.06477511367899226, + "grad_norm": 1.0566622018814087, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.6055, + "step": 9238 + }, + { + "epoch": 0.06499247983227747, + "grad_norm": 1.0849272012710571, + "learning_rate": 4.170395751189495e-05, + "loss": 0.6059, + "step": 9269 + }, + { + "epoch": 0.06520984598556268, + "grad_norm": 1.2344416379928589, + "learning_rate": 4.164137885110921e-05, + "loss": 0.6081, + "step": 9300 + }, + { + "epoch": 0.06542721213884789, + "grad_norm": 1.0132557153701782, + "learning_rate": 4.157861239462495e-05, + "loss": 0.6103, + "step": 9331 + }, + { + "epoch": 0.0656445782921331, + "grad_norm": 1.0176981687545776, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.6086, + "step": 9362 + }, + { + "epoch": 0.0658619444454183, + "grad_norm": 1.2429553270339966, + "learning_rate": 4.145251892991588e-05, + "loss": 0.6091, + "step": 9393 + }, + { + "epoch": 0.06607931059870352, + "grad_norm": 1.1481480598449707, + "learning_rate": 4.138919334463868e-05, + "loss": 0.6048, + "step": 9424 + }, + { + "epoch": 0.06629667675198872, + "grad_norm": 0.9623891711235046, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.5995, + "step": 9455 + }, + { + "epoch": 0.06651404290527393, + "grad_norm": 1.2031786441802979, + "learning_rate": 4.126198804133398e-05, + "loss": 0.6008, + "step": 9486 + }, + { + "epoch": 0.06673140905855915, + "grad_norm": 1.3027530908584595, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.5993, + "step": 9517 + }, + { + "epoch": 0.06694877521184435, + "grad_norm": 1.1712346076965332, + "learning_rate": 4.113404868280107e-05, + "loss": 0.5996, + "step": 9548 + }, + { + "epoch": 0.06716614136512956, + "grad_norm": 1.1397625207901, + "learning_rate": 4.106980553625457e-05, + "loss": 0.5979, + "step": 9579 + }, + { + "epoch": 0.06738350751841476, + "grad_norm": 1.122983694076538, + "learning_rate": 4.100538104413674e-05, + "loss": 0.599, + "step": 9610 + }, + { + "epoch": 0.06760087367169998, + "grad_norm": 1.0782618522644043, + "learning_rate": 4.09407759334692e-05, + "loss": 0.6005, + "step": 9641 + }, + { + "epoch": 0.0678182398249852, + "grad_norm": 1.0563392639160156, + "learning_rate": 4.087599093331186e-05, + "loss": 0.6105, + "step": 9672 + }, + { + "epoch": 0.0680356059782704, + "grad_norm": 1.0684717893600464, + "learning_rate": 4.081102677475462e-05, + "loss": 0.6092, + "step": 9703 + }, + { + "epoch": 0.06825297213155561, + "grad_norm": 1.152811884880066, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.6035, + "step": 9734 + }, + { + "epoch": 0.06847033828484081, + "grad_norm": 1.1413112878799438, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.6018, + "step": 9765 + }, + { + "epoch": 0.06868770443812602, + "grad_norm": 0.981604814529419, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.6035, + "step": 9796 + }, + { + "epoch": 0.06890507059141124, + "grad_norm": 1.0713250637054443, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.597, + "step": 9827 + }, + { + "epoch": 0.06912243674469644, + "grad_norm": 0.949456512928009, + "learning_rate": 4.048354433517794e-05, + "loss": 0.6063, + "step": 9858 + }, + { + "epoch": 0.06933980289798165, + "grad_norm": 1.0702073574066162, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.5963, + "step": 9889 + }, + { + "epoch": 0.06955716905126685, + "grad_norm": 0.9974942803382874, + "learning_rate": 4.035132306369438e-05, + "loss": 0.611, + "step": 9920 + }, + { + "epoch": 0.06977453520455207, + "grad_norm": 1.0833642482757568, + "learning_rate": 4.028495219804555e-05, + "loss": 0.6014, + "step": 9951 + }, + { + "epoch": 0.06999190135783728, + "grad_norm": 1.181322693824768, + "learning_rate": 4.021840884378864e-05, + "loss": 0.599, + "step": 9982 + }, + { + "epoch": 0.07020926751112248, + "grad_norm": 1.0088189840316772, + "learning_rate": 4.015169375185633e-05, + "loss": 0.5955, + "step": 10013 + }, + { + "epoch": 0.0704266336644077, + "grad_norm": 1.0920222997665405, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.5985, + "step": 10044 + }, + { + "epoch": 0.0706439998176929, + "grad_norm": 1.0450975894927979, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.6069, + "step": 10075 + }, + { + "epoch": 0.07086136597097811, + "grad_norm": 1.1613439321517944, + "learning_rate": 3.995052558835377e-05, + "loss": 0.5992, + "step": 10106 + }, + { + "epoch": 0.07107873212426333, + "grad_norm": 1.2720811367034912, + "learning_rate": 3.988313109368017e-05, + "loss": 0.6012, + "step": 10137 + }, + { + "epoch": 0.07129609827754853, + "grad_norm": 1.0315334796905518, + "learning_rate": 3.981556864489504e-05, + "loss": 0.6149, + "step": 10168 + }, + { + "epoch": 0.07151346443083374, + "grad_norm": 1.3320891857147217, + "learning_rate": 3.974783900443142e-05, + "loss": 0.6013, + "step": 10199 + }, + { + "epoch": 0.07173083058411894, + "grad_norm": 1.7277195453643799, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.5983, + "step": 10230 + }, + { + "epoch": 0.07194819673740416, + "grad_norm": 1.1056753396987915, + "learning_rate": 3.961188120762596e-05, + "loss": 0.5952, + "step": 10261 + }, + { + "epoch": 0.07216556289068937, + "grad_norm": 1.3461558818817139, + "learning_rate": 3.954365458554938e-05, + "loss": 0.5975, + "step": 10292 + }, + { + "epoch": 0.07238292904397457, + "grad_norm": 1.1405422687530518, + "learning_rate": 3.947526384030751e-05, + "loss": 0.602, + "step": 10323 + }, + { + "epoch": 0.07260029519725979, + "grad_norm": 1.2912482023239136, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.6014, + "step": 10354 + }, + { + "epoch": 0.07281766135054499, + "grad_norm": 1.1548583507537842, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.6053, + "step": 10385 + }, + { + "epoch": 0.0730350275038302, + "grad_norm": 1.0558302402496338, + "learning_rate": 3.926911459260109e-05, + "loss": 0.6062, + "step": 10416 + }, + { + "epoch": 0.07325239365711542, + "grad_norm": 1.1658024787902832, + "learning_rate": 3.920007509089102e-05, + "loss": 0.6016, + "step": 10447 + }, + { + "epoch": 0.07346975981040062, + "grad_norm": 1.1093552112579346, + "learning_rate": 3.913087534326357e-05, + "loss": 0.6025, + "step": 10478 + }, + { + "epoch": 0.07368712596368583, + "grad_norm": 1.0908467769622803, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.5954, + "step": 10509 + }, + { + "epoch": 0.07390449211697103, + "grad_norm": 1.0856796503067017, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.5983, + "step": 10540 + }, + { + "epoch": 0.07412185827025625, + "grad_norm": 1.0330291986465454, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.5955, + "step": 10571 + }, + { + "epoch": 0.07433922442354146, + "grad_norm": 0.9688258171081543, + "learning_rate": 3.885248953871491e-05, + "loss": 0.6001, + "step": 10602 + }, + { + "epoch": 0.07455659057682666, + "grad_norm": 1.1808383464813232, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.5996, + "step": 10633 + }, + { + "epoch": 0.07477395673011188, + "grad_norm": 1.1267834901809692, + "learning_rate": 3.871235554965218e-05, + "loss": 0.5962, + "step": 10664 + }, + { + "epoch": 0.07499132288339708, + "grad_norm": 1.1001136302947998, + "learning_rate": 3.864205604623078e-05, + "loss": 0.6031, + "step": 10695 + }, + { + "epoch": 0.07520868903668229, + "grad_norm": 1.009621024131775, + "learning_rate": 3.857160259406107e-05, + "loss": 0.5976, + "step": 10726 + }, + { + "epoch": 0.0754260551899675, + "grad_norm": 0.933811366558075, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.604, + "step": 10757 + }, + { + "epoch": 0.0756434213432527, + "grad_norm": 1.2187680006027222, + "learning_rate": 3.843023702543556e-05, + "loss": 0.5983, + "step": 10788 + }, + { + "epoch": 0.07586078749653792, + "grad_norm": 1.0286976099014282, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.596, + "step": 10819 + }, + { + "epoch": 0.07607815364982312, + "grad_norm": 1.0752719640731812, + "learning_rate": 3.828826522492255e-05, + "loss": 0.596, + "step": 10850 + }, + { + "epoch": 0.07629551980310834, + "grad_norm": 1.1018482446670532, + "learning_rate": 3.821705398930713e-05, + "loss": 0.603, + "step": 10881 + }, + { + "epoch": 0.07651288595639355, + "grad_norm": 0.980965793132782, + "learning_rate": 3.814569360103385e-05, + "loss": 0.6021, + "step": 10912 + }, + { + "epoch": 0.07673025210967875, + "grad_norm": 0.9588106870651245, + "learning_rate": 3.807418486539499e-05, + "loss": 0.6017, + "step": 10943 + }, + { + "epoch": 0.07694761826296397, + "grad_norm": 1.2048044204711914, + "learning_rate": 3.80025285893569e-05, + "loss": 0.6049, + "step": 10974 + }, + { + "epoch": 0.07716498441624917, + "grad_norm": 1.0373165607452393, + "learning_rate": 3.793072558155093e-05, + "loss": 0.5914, + "step": 11005 + }, + { + "epoch": 0.07738235056953438, + "grad_norm": 1.0112730264663696, + "learning_rate": 3.785877665226426e-05, + "loss": 0.5949, + "step": 11036 + }, + { + "epoch": 0.0775997167228196, + "grad_norm": 1.0219500064849854, + "learning_rate": 3.778668261343079e-05, + "loss": 0.5918, + "step": 11067 + }, + { + "epoch": 0.0778170828761048, + "grad_norm": 1.1104828119277954, + "learning_rate": 3.771444427862192e-05, + "loss": 0.6029, + "step": 11098 + }, + { + "epoch": 0.07803444902939001, + "grad_norm": 0.9852742552757263, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.5839, + "step": 11129 + }, + { + "epoch": 0.07825181518267521, + "grad_norm": 1.0593205690383911, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.5966, + "step": 11160 + }, + { + "epoch": 0.07846918133596043, + "grad_norm": 1.2858383655548096, + "learning_rate": 3.749687165842753e-05, + "loss": 0.5939, + "step": 11191 + }, + { + "epoch": 0.07868654748924564, + "grad_norm": 1.2006022930145264, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.5965, + "step": 11222 + }, + { + "epoch": 0.07890391364253084, + "grad_norm": 0.9741569757461548, + "learning_rate": 3.735111675341645e-05, + "loss": 0.5971, + "step": 11253 + }, + { + "epoch": 0.07912127979581605, + "grad_norm": 1.0724074840545654, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.5901, + "step": 11284 + }, + { + "epoch": 0.07933864594910126, + "grad_norm": 1.102207064628601, + "learning_rate": 3.720480432728287e-05, + "loss": 0.602, + "step": 11315 + }, + { + "epoch": 0.07955601210238647, + "grad_norm": 1.0976966619491577, + "learning_rate": 3.71314411067092e-05, + "loss": 0.6002, + "step": 11346 + }, + { + "epoch": 0.07977337825567168, + "grad_norm": 1.0534948110580444, + "learning_rate": 3.70579409844715e-05, + "loss": 0.5801, + "step": 11377 + }, + { + "epoch": 0.07999074440895688, + "grad_norm": 1.1666125059127808, + "learning_rate": 3.698430479000865e-05, + "loss": 0.5992, + "step": 11408 + }, + { + "epoch": 0.0802081105622421, + "grad_norm": 1.0316112041473389, + "learning_rate": 3.691053335429509e-05, + "loss": 0.5942, + "step": 11439 + }, + { + "epoch": 0.0804254767155273, + "grad_norm": 1.0085538625717163, + "learning_rate": 3.683662750983147e-05, + "loss": 0.5978, + "step": 11470 + }, + { + "epoch": 0.08064284286881251, + "grad_norm": 1.0875978469848633, + "learning_rate": 3.676258809063518e-05, + "loss": 0.5949, + "step": 11501 + }, + { + "epoch": 0.08086020902209773, + "grad_norm": 1.1905012130737305, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.6018, + "step": 11532 + }, + { + "epoch": 0.08107757517538293, + "grad_norm": 1.1717733144760132, + "learning_rate": 3.661411187164166e-05, + "loss": 0.5986, + "step": 11563 + }, + { + "epoch": 0.08129494132866814, + "grad_norm": 1.053768515586853, + "learning_rate": 3.65396767473784e-05, + "loss": 0.5831, + "step": 11594 + }, + { + "epoch": 0.08151230748195334, + "grad_norm": 1.095425009727478, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.5967, + "step": 11625 + }, + { + "epoch": 0.08172967363523856, + "grad_norm": 1.005953311920166, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.5834, + "step": 11656 + }, + { + "epoch": 0.08194703978852377, + "grad_norm": 1.0991450548171997, + "learning_rate": 3.63155933997859e-05, + "loss": 0.5951, + "step": 11687 + }, + { + "epoch": 0.08216440594180897, + "grad_norm": 1.1069071292877197, + "learning_rate": 3.624064243537758e-05, + "loss": 0.5989, + "step": 11718 + }, + { + "epoch": 0.08238177209509419, + "grad_norm": 1.099542498588562, + "learning_rate": 3.616556462184716e-05, + "loss": 0.5949, + "step": 11749 + }, + { + "epoch": 0.08259913824837939, + "grad_norm": 1.0950508117675781, + "learning_rate": 3.609036080643755e-05, + "loss": 0.5911, + "step": 11780 + }, + { + "epoch": 0.0828165044016646, + "grad_norm": 0.9489970803260803, + "learning_rate": 3.60150318378136e-05, + "loss": 0.5922, + "step": 11811 + }, + { + "epoch": 0.08303387055494982, + "grad_norm": 0.9877674579620361, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.5927, + "step": 11842 + }, + { + "epoch": 0.08325123670823502, + "grad_norm": 1.060177206993103, + "learning_rate": 3.586400184263408e-05, + "loss": 0.5899, + "step": 11873 + }, + { + "epoch": 0.08346860286152023, + "grad_norm": 1.0531491041183472, + "learning_rate": 3.578830252043148e-05, + "loss": 0.6004, + "step": 11904 + }, + { + "epoch": 0.08368596901480543, + "grad_norm": 0.9791742563247681, + "learning_rate": 3.571248145370125e-05, + "loss": 0.5926, + "step": 11935 + }, + { + "epoch": 0.08390333516809065, + "grad_norm": 1.0492030382156372, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.5914, + "step": 11966 + }, + { + "epoch": 0.08412070132137586, + "grad_norm": 1.2035881280899048, + "learning_rate": 3.556047751054378e-05, + "loss": 0.589, + "step": 11997 + }, + { + "epoch": 0.08433806747466106, + "grad_norm": 1.0384137630462646, + "learning_rate": 3.548429634946039e-05, + "loss": 0.5894, + "step": 12028 + }, + { + "epoch": 0.08455543362794628, + "grad_norm": 1.1555323600769043, + "learning_rate": 3.540799687451768e-05, + "loss": 0.5899, + "step": 12059 + }, + { + "epoch": 0.08477279978123148, + "grad_norm": 1.2114530801773071, + "learning_rate": 3.533157994674485e-05, + "loss": 0.5888, + "step": 12090 + }, + { + "epoch": 0.08499016593451669, + "grad_norm": 1.145213007926941, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.5887, + "step": 12121 + }, + { + "epoch": 0.08520753208780191, + "grad_norm": 1.0313454866409302, + "learning_rate": 3.517839718344311e-05, + "loss": 0.598, + "step": 12152 + }, + { + "epoch": 0.08542489824108711, + "grad_norm": 1.018702745437622, + "learning_rate": 3.510163307656086e-05, + "loss": 0.5931, + "step": 12183 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.006624315386364e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12208/training_args.bin b/checkpoint-12208/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d8a57652df56990298f489cbbc4d12b0a5d66dac --- /dev/null +++ b/checkpoint-12208/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff6a84b6c2a69c365aedcd04f176d62d0993a049ac50b28b1fc7fee54f4267a8 +size 5304 diff --git a/checkpoint-15260/config.json b/checkpoint-15260/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-15260/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-15260/generation_config.json b/checkpoint-15260/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-15260/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-15260/model-00001-of-00007.safetensors b/checkpoint-15260/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3cd642a89d4911197b0a576a793e88eae3912057 --- /dev/null +++ b/checkpoint-15260/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4cb026f0a5829683c10c1335673e1436a45abc9b1455d3a942d36da85b766d6 +size 4886466168 diff --git a/checkpoint-15260/model-00002-of-00007.safetensors b/checkpoint-15260/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-15260/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-15260/model-00003-of-00007.safetensors b/checkpoint-15260/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-15260/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-15260/model-00004-of-00007.safetensors b/checkpoint-15260/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-15260/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-15260/model-00005-of-00007.safetensors b/checkpoint-15260/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-15260/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-15260/model-00006-of-00007.safetensors b/checkpoint-15260/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c4c0d8cf5778a83677f6d800ad9e6752ee833331 --- /dev/null +++ b/checkpoint-15260/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17020e4642a219332982712a64e7d359e75890d10bd4f46cc75489e5dda3c81a +size 4999813120 diff --git a/checkpoint-15260/model-00007-of-00007.safetensors b/checkpoint-15260/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..28068ea59c7d3bd6314951c168e8bb6a586bb7e9 --- /dev/null +++ b/checkpoint-15260/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56927410b72fca2e6df4a7cf12fc42d5aeffb51d50e287009620cbb282635c2d +size 2571158184 diff --git a/checkpoint-15260/model.safetensors.index.json b/checkpoint-15260/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-15260/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-15260/optimizer.pt b/checkpoint-15260/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..179e380417c2530730be1c90c3cd160a659849e5 --- /dev/null +++ b/checkpoint-15260/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bfc58974f01b39e88901516dd69714dccbc66c4658216919664749de0901d2b +size 15385036334 diff --git a/checkpoint-15260/rng_state.pth b/checkpoint-15260/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-15260/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-15260/scheduler.pt b/checkpoint-15260/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..38f65fadce527ecb01fb7d7fd7ef901b9b082813 --- /dev/null +++ b/checkpoint-15260/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ce335347efaebf038b8915bc0a8d2d587a4a1aa08ad0e015b4bc7cc4fba634e +size 1064 diff --git a/checkpoint-15260/trainer_state.json b/checkpoint-15260/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..98b8d15013df04e02fdcb189554203eb7beff1d4 --- /dev/null +++ b/checkpoint-15260/trainer_state.json @@ -0,0 +1,3477 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.10700024190749317, + "eval_steps": 500, + "global_step": 15260, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021736615328520894, + "grad_norm": 5.709163665771484, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.9228, + "step": 31 + }, + { + "epoch": 0.0004347323065704179, + "grad_norm": 3.9758756160736084, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8276, + "step": 62 + }, + { + "epoch": 0.0006520984598556268, + "grad_norm": 3.721677303314209, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7897, + "step": 93 + }, + { + "epoch": 0.0008694646131408358, + "grad_norm": 3.384953498840332, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7641, + "step": 124 + }, + { + "epoch": 0.0010868307664260446, + "grad_norm": 3.19750714302063, + "learning_rate": 5.078636959370905e-06, + "loss": 0.758, + "step": 155 + }, + { + "epoch": 0.0013041969197112536, + "grad_norm": 3.49003267288208, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7644, + "step": 186 + }, + { + "epoch": 0.0015215630729964625, + "grad_norm": 3.323367118835449, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7606, + "step": 217 + }, + { + "epoch": 0.0017389292262816715, + "grad_norm": 3.465822219848633, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7505, + "step": 248 + }, + { + "epoch": 0.0019562953795668804, + "grad_norm": 2.7967450618743896, + "learning_rate": 9.141546526867629e-06, + "loss": 0.7593, + "step": 279 + }, + { + "epoch": 0.002173661532852089, + "grad_norm": 3.5493738651275635, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7514, + "step": 310 + }, + { + "epoch": 0.0023910276861372984, + "grad_norm": 3.514606237411499, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7497, + "step": 341 + }, + { + "epoch": 0.0026083938394225073, + "grad_norm": 3.0246028900146484, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7473, + "step": 372 + }, + { + "epoch": 0.002825759992707716, + "grad_norm": 3.01147723197937, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.739, + "step": 403 + }, + { + "epoch": 0.003043126145992925, + "grad_norm": 3.566333293914795, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7447, + "step": 434 + }, + { + "epoch": 0.003260492299278134, + "grad_norm": 3.8698947429656982, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7447, + "step": 465 + }, + { + "epoch": 0.003477858452563343, + "grad_norm": 2.567028760910034, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7496, + "step": 496 + }, + { + "epoch": 0.003695224605848552, + "grad_norm": 2.8029377460479736, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.7324, + "step": 527 + }, + { + "epoch": 0.003912590759133761, + "grad_norm": 2.862530469894409, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7371, + "step": 558 + }, + { + "epoch": 0.00412995691241897, + "grad_norm": 2.9063901901245117, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.7398, + "step": 589 + }, + { + "epoch": 0.004347323065704178, + "grad_norm": 3.187814235687256, + "learning_rate": 2.031454783748362e-05, + "loss": 0.7362, + "step": 620 + }, + { + "epoch": 0.004564689218989388, + "grad_norm": 2.6158969402313232, + "learning_rate": 2.13302752293578e-05, + "loss": 0.7405, + "step": 651 + }, + { + "epoch": 0.004782055372274597, + "grad_norm": 2.702423334121704, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7313, + "step": 682 + }, + { + "epoch": 0.004999421525559805, + "grad_norm": 2.88313889503479, + "learning_rate": 2.336173001310616e-05, + "loss": 0.7337, + "step": 713 + }, + { + "epoch": 0.0052167876788450146, + "grad_norm": 2.8978841304779053, + "learning_rate": 2.437745740498034e-05, + "loss": 0.73, + "step": 744 + }, + { + "epoch": 0.005434153832130223, + "grad_norm": 2.791414976119995, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.7297, + "step": 775 + }, + { + "epoch": 0.005651519985415432, + "grad_norm": 2.432821035385132, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7326, + "step": 806 + }, + { + "epoch": 0.0058688861387006415, + "grad_norm": 2.430279493331909, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.7333, + "step": 837 + }, + { + "epoch": 0.00608625229198585, + "grad_norm": 2.5513761043548584, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.7296, + "step": 868 + }, + { + "epoch": 0.006303618445271059, + "grad_norm": 2.478562116622925, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.7303, + "step": 899 + }, + { + "epoch": 0.006520984598556268, + "grad_norm": 2.5496723651885986, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.7303, + "step": 930 + }, + { + "epoch": 0.006738350751841477, + "grad_norm": 2.114060640335083, + "learning_rate": 3.148754914809961e-05, + "loss": 0.7184, + "step": 961 + }, + { + "epoch": 0.006955716905126686, + "grad_norm": 2.325977325439453, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.7268, + "step": 992 + }, + { + "epoch": 0.0071730830584118945, + "grad_norm": 2.0549662113189697, + "learning_rate": 3.351900393184797e-05, + "loss": 0.724, + "step": 1023 + }, + { + "epoch": 0.007390449211697104, + "grad_norm": 1.913522481918335, + "learning_rate": 3.453473132372215e-05, + "loss": 0.7206, + "step": 1054 + }, + { + "epoch": 0.007607815364982313, + "grad_norm": 2.0651443004608154, + "learning_rate": 3.555045871559633e-05, + "loss": 0.7239, + "step": 1085 + }, + { + "epoch": 0.007825181518267521, + "grad_norm": 2.2482309341430664, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.728, + "step": 1116 + }, + { + "epoch": 0.00804254767155273, + "grad_norm": 2.349695920944214, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.721, + "step": 1147 + }, + { + "epoch": 0.00825991382483794, + "grad_norm": 2.218843460083008, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.7248, + "step": 1178 + }, + { + "epoch": 0.00847727997812315, + "grad_norm": 2.0839340686798096, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.7233, + "step": 1209 + }, + { + "epoch": 0.008694646131408357, + "grad_norm": 2.0757343769073486, + "learning_rate": 4.062909567496724e-05, + "loss": 0.7166, + "step": 1240 + }, + { + "epoch": 0.008912012284693566, + "grad_norm": 2.052342653274536, + "learning_rate": 4.164482306684142e-05, + "loss": 0.7256, + "step": 1271 + }, + { + "epoch": 0.009129378437978775, + "grad_norm": 2.1202704906463623, + "learning_rate": 4.26605504587156e-05, + "loss": 0.7131, + "step": 1302 + }, + { + "epoch": 0.009346744591263985, + "grad_norm": 2.1945958137512207, + "learning_rate": 4.367627785058978e-05, + "loss": 0.708, + "step": 1333 + }, + { + "epoch": 0.009564110744549194, + "grad_norm": 2.106307029724121, + "learning_rate": 4.469200524246396e-05, + "loss": 0.7139, + "step": 1364 + }, + { + "epoch": 0.009781476897834401, + "grad_norm": 2.3779594898223877, + "learning_rate": 4.570773263433814e-05, + "loss": 0.7205, + "step": 1395 + }, + { + "epoch": 0.00999884305111961, + "grad_norm": 2.001551866531372, + "learning_rate": 4.672346002621232e-05, + "loss": 0.7162, + "step": 1426 + }, + { + "epoch": 0.01021620920440482, + "grad_norm": 2.6994752883911133, + "learning_rate": 4.77391874180865e-05, + "loss": 0.7215, + "step": 1457 + }, + { + "epoch": 0.010433575357690029, + "grad_norm": 2.071122884750366, + "learning_rate": 4.875491480996068e-05, + "loss": 0.7095, + "step": 1488 + }, + { + "epoch": 0.010650941510975238, + "grad_norm": 2.3666610717773438, + "learning_rate": 4.977064220183487e-05, + "loss": 0.7089, + "step": 1519 + }, + { + "epoch": 0.010868307664260446, + "grad_norm": 2.130204677581787, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.7043, + "step": 1550 + }, + { + "epoch": 0.011085673817545655, + "grad_norm": 1.7413716316223145, + "learning_rate": 4.999955597496219e-05, + "loss": 0.7062, + "step": 1581 + }, + { + "epoch": 0.011303039970830864, + "grad_norm": 1.7087843418121338, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.7175, + "step": 1612 + }, + { + "epoch": 0.011520406124116074, + "grad_norm": 2.574871301651001, + "learning_rate": 4.999799067923527e-05, + "loss": 0.7076, + "step": 1643 + }, + { + "epoch": 0.011737772277401283, + "grad_norm": 2.0530433654785156, + "learning_rate": 4.999678487776908e-05, + "loss": 0.708, + "step": 1674 + }, + { + "epoch": 0.011955138430686492, + "grad_norm": 1.9184463024139404, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.7011, + "step": 1705 + }, + { + "epoch": 0.0121725045839717, + "grad_norm": 1.768115520477295, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6995, + "step": 1736 + }, + { + "epoch": 0.012389870737256909, + "grad_norm": 1.6692063808441162, + "learning_rate": 4.999147503179668e-05, + "loss": 0.7019, + "step": 1767 + }, + { + "epoch": 0.012607236890542118, + "grad_norm": 1.803944706916809, + "learning_rate": 4.998914100252672e-05, + "loss": 0.703, + "step": 1798 + }, + { + "epoch": 0.012824603043827328, + "grad_norm": 1.8957322835922241, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6979, + "step": 1829 + }, + { + "epoch": 0.013041969197112537, + "grad_norm": 1.544054627418518, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.7012, + "step": 1860 + }, + { + "epoch": 0.013259335350397744, + "grad_norm": 1.7066351175308228, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6949, + "step": 1891 + }, + { + "epoch": 0.013476701503682954, + "grad_norm": 1.7045214176177979, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.6871, + "step": 1922 + }, + { + "epoch": 0.013694067656968163, + "grad_norm": 2.0414485931396484, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6944, + "step": 1953 + }, + { + "epoch": 0.013911433810253372, + "grad_norm": 1.9210485219955444, + "learning_rate": 4.99692159912661e-05, + "loss": 0.7013, + "step": 1984 + }, + { + "epoch": 0.014128799963538581, + "grad_norm": 1.8733997344970703, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6991, + "step": 2015 + }, + { + "epoch": 0.014346166116823789, + "grad_norm": 1.7118934392929077, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6873, + "step": 2046 + }, + { + "epoch": 0.014563532270108998, + "grad_norm": 1.889208197593689, + "learning_rate": 4.995544899210594e-05, + "loss": 0.692, + "step": 2077 + }, + { + "epoch": 0.014780898423394207, + "grad_norm": 2.0081522464752197, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.6823, + "step": 2108 + }, + { + "epoch": 0.014998264576679417, + "grad_norm": 1.7950328588485718, + "learning_rate": 4.994486281210429e-05, + "loss": 0.686, + "step": 2139 + }, + { + "epoch": 0.015215630729964626, + "grad_norm": 1.758333444595337, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6827, + "step": 2170 + }, + { + "epoch": 0.015432996883249834, + "grad_norm": 1.496063470840454, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6896, + "step": 2201 + }, + { + "epoch": 0.015650363036535043, + "grad_norm": 1.6103190183639526, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6865, + "step": 2232 + }, + { + "epoch": 0.015867729189820254, + "grad_norm": 1.7069604396820068, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6836, + "step": 2263 + }, + { + "epoch": 0.01608509534310546, + "grad_norm": 1.8261148929595947, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.692, + "step": 2294 + }, + { + "epoch": 0.01630246149639067, + "grad_norm": 1.7522642612457275, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6832, + "step": 2325 + }, + { + "epoch": 0.01651982764967588, + "grad_norm": 1.6746042966842651, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6829, + "step": 2356 + }, + { + "epoch": 0.016737193802961087, + "grad_norm": 1.6785473823547363, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6842, + "step": 2387 + }, + { + "epoch": 0.0169545599562463, + "grad_norm": 1.936916470527649, + "learning_rate": 4.988329891293693e-05, + "loss": 0.6798, + "step": 2418 + }, + { + "epoch": 0.017171926109531506, + "grad_norm": 1.5104129314422607, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6743, + "step": 2449 + }, + { + "epoch": 0.017389292262816713, + "grad_norm": 1.3770678043365479, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.6874, + "step": 2480 + }, + { + "epoch": 0.017606658416101924, + "grad_norm": 1.4900861978530884, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.679, + "step": 2511 + }, + { + "epoch": 0.017824024569387132, + "grad_norm": 1.5600172281265259, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6719, + "step": 2542 + }, + { + "epoch": 0.018041390722672343, + "grad_norm": 1.660237431526184, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6773, + "step": 2573 + }, + { + "epoch": 0.01825875687595755, + "grad_norm": 1.4618791341781616, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6767, + "step": 2604 + }, + { + "epoch": 0.018476123029242758, + "grad_norm": 1.5567563772201538, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6706, + "step": 2635 + }, + { + "epoch": 0.01869348918252797, + "grad_norm": 1.4879790544509888, + "learning_rate": 4.980947995086024e-05, + "loss": 0.676, + "step": 2666 + }, + { + "epoch": 0.018910855335813177, + "grad_norm": 1.5527766942977905, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6734, + "step": 2697 + }, + { + "epoch": 0.019128221489098388, + "grad_norm": 1.7922642230987549, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6813, + "step": 2728 + }, + { + "epoch": 0.019345587642383595, + "grad_norm": 1.6453475952148438, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6749, + "step": 2759 + }, + { + "epoch": 0.019562953795668803, + "grad_norm": 1.4713648557662964, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6761, + "step": 2790 + }, + { + "epoch": 0.019780319948954014, + "grad_norm": 1.5513399839401245, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6739, + "step": 2821 + }, + { + "epoch": 0.01999768610223922, + "grad_norm": 1.431796908378601, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6639, + "step": 2852 + }, + { + "epoch": 0.020215052255524432, + "grad_norm": 1.4959752559661865, + "learning_rate": 4.973018858007122e-05, + "loss": 0.662, + "step": 2883 + }, + { + "epoch": 0.02043241840880964, + "grad_norm": 1.4675205945968628, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6659, + "step": 2914 + }, + { + "epoch": 0.020649784562094847, + "grad_norm": 1.7111692428588867, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6732, + "step": 2945 + }, + { + "epoch": 0.020867150715380058, + "grad_norm": 1.542748212814331, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6666, + "step": 2976 + }, + { + "epoch": 0.021084516868665266, + "grad_norm": 1.5013272762298584, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.667, + "step": 3007 + }, + { + "epoch": 0.021301883021950477, + "grad_norm": 1.57589852809906, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6668, + "step": 3038 + }, + { + "epoch": 0.021519249175235684, + "grad_norm": 1.612727403640747, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6653, + "step": 3069 + }, + { + "epoch": 0.021736615328520892, + "grad_norm": 1.5110771656036377, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6694, + "step": 3100 + }, + { + "epoch": 0.021953981481806103, + "grad_norm": 1.5836044549942017, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6634, + "step": 3131 + }, + { + "epoch": 0.02217134763509131, + "grad_norm": 1.5767654180526733, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6633, + "step": 3162 + }, + { + "epoch": 0.02238871378837652, + "grad_norm": 1.6058984994888306, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6646, + "step": 3193 + }, + { + "epoch": 0.02260607994166173, + "grad_norm": 1.4564005136489868, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.663, + "step": 3224 + }, + { + "epoch": 0.022823446094946936, + "grad_norm": 1.5274450778961182, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6554, + "step": 3255 + }, + { + "epoch": 0.023040812248232147, + "grad_norm": 1.524122953414917, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6666, + "step": 3286 + }, + { + "epoch": 0.023258178401517355, + "grad_norm": 1.5017430782318115, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6661, + "step": 3317 + }, + { + "epoch": 0.023475544554802566, + "grad_norm": 1.4247208833694458, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6563, + "step": 3348 + }, + { + "epoch": 0.023692910708087773, + "grad_norm": 1.3451093435287476, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6555, + "step": 3379 + }, + { + "epoch": 0.023910276861372984, + "grad_norm": 1.418442726135254, + "learning_rate": 4.948079823064559e-05, + "loss": 0.6587, + "step": 3410 + }, + { + "epoch": 0.024127643014658192, + "grad_norm": 1.5460575819015503, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6602, + "step": 3441 + }, + { + "epoch": 0.0243450091679434, + "grad_norm": 1.4053966999053955, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6578, + "step": 3472 + }, + { + "epoch": 0.02456237532122861, + "grad_norm": 1.420806646347046, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6544, + "step": 3503 + }, + { + "epoch": 0.024779741474513818, + "grad_norm": 1.3220508098602295, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6642, + "step": 3534 + }, + { + "epoch": 0.02499710762779903, + "grad_norm": 1.3819468021392822, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6534, + "step": 3565 + }, + { + "epoch": 0.025214473781084237, + "grad_norm": 1.4046236276626587, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6547, + "step": 3596 + }, + { + "epoch": 0.025431839934369444, + "grad_norm": 1.965153455734253, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6582, + "step": 3627 + }, + { + "epoch": 0.025649206087654655, + "grad_norm": 1.6758291721343994, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6536, + "step": 3658 + }, + { + "epoch": 0.025866572240939863, + "grad_norm": 1.4467246532440186, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6564, + "step": 3689 + }, + { + "epoch": 0.026083938394225074, + "grad_norm": 1.5305490493774414, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6626, + "step": 3720 + }, + { + "epoch": 0.02630130454751028, + "grad_norm": 1.3826133012771606, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6624, + "step": 3751 + }, + { + "epoch": 0.02651867070079549, + "grad_norm": 1.4920278787612915, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6417, + "step": 3782 + }, + { + "epoch": 0.0267360368540807, + "grad_norm": 14.533783912658691, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.647, + "step": 3813 + }, + { + "epoch": 0.026953403007365907, + "grad_norm": 1.4659409523010254, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6514, + "step": 3844 + }, + { + "epoch": 0.02717076916065112, + "grad_norm": 1.4820138216018677, + "learning_rate": 4.919443027766935e-05, + "loss": 0.6477, + "step": 3875 + }, + { + "epoch": 0.027388135313936326, + "grad_norm": 1.2543880939483643, + "learning_rate": 4.91731463569418e-05, + "loss": 0.6505, + "step": 3906 + }, + { + "epoch": 0.027605501467221533, + "grad_norm": 1.640926718711853, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6488, + "step": 3937 + }, + { + "epoch": 0.027822867620506744, + "grad_norm": 1.3990615606307983, + "learning_rate": 4.912976038673786e-05, + "loss": 0.658, + "step": 3968 + }, + { + "epoch": 0.028040233773791952, + "grad_norm": 1.2064067125320435, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6487, + "step": 3999 + }, + { + "epoch": 0.028257599927077163, + "grad_norm": 1.5103769302368164, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6569, + "step": 4030 + }, + { + "epoch": 0.02847496608036237, + "grad_norm": 1.5314087867736816, + "learning_rate": 4.906263980464644e-05, + "loss": 0.6638, + "step": 4061 + }, + { + "epoch": 0.028692332233647578, + "grad_norm": 1.4619168043136597, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6505, + "step": 4092 + }, + { + "epoch": 0.02890969838693279, + "grad_norm": 1.3979246616363525, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6505, + "step": 4123 + }, + { + "epoch": 0.029127064540217996, + "grad_norm": 1.3866580724716187, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6461, + "step": 4154 + }, + { + "epoch": 0.029344430693503207, + "grad_norm": 1.4662801027297974, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6448, + "step": 4185 + }, + { + "epoch": 0.029561796846788415, + "grad_norm": 1.3171806335449219, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6525, + "step": 4216 + }, + { + "epoch": 0.029779163000073623, + "grad_norm": 1.3289718627929688, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6445, + "step": 4247 + }, + { + "epoch": 0.029996529153358834, + "grad_norm": 1.3527250289916992, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6449, + "step": 4278 + }, + { + "epoch": 0.03021389530664404, + "grad_norm": 1.314674735069275, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6559, + "step": 4309 + }, + { + "epoch": 0.030431261459929252, + "grad_norm": 1.2237507104873657, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6468, + "step": 4340 + }, + { + "epoch": 0.03064862761321446, + "grad_norm": 1.423965573310852, + "learning_rate": 4.882129447892753e-05, + "loss": 0.6465, + "step": 4371 + }, + { + "epoch": 0.030865993766499667, + "grad_norm": 1.271371603012085, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6471, + "step": 4402 + }, + { + "epoch": 0.031083359919784878, + "grad_norm": 1.5492364168167114, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6444, + "step": 4433 + }, + { + "epoch": 0.031300726073070086, + "grad_norm": 1.5139328241348267, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6512, + "step": 4464 + }, + { + "epoch": 0.03151809222635529, + "grad_norm": 1.4510358572006226, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6485, + "step": 4495 + }, + { + "epoch": 0.03173545837964051, + "grad_norm": 1.3228867053985596, + "learning_rate": 4.869052379269719e-05, + "loss": 0.6416, + "step": 4526 + }, + { + "epoch": 0.031952824532925715, + "grad_norm": 1.2731959819793701, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6477, + "step": 4557 + }, + { + "epoch": 0.03217019068621092, + "grad_norm": 1.2540090084075928, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6412, + "step": 4588 + }, + { + "epoch": 0.03238755683949613, + "grad_norm": 1.662154197692871, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6464, + "step": 4619 + }, + { + "epoch": 0.03260492299278134, + "grad_norm": 1.5419702529907227, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6421, + "step": 4650 + }, + { + "epoch": 0.03282228914606655, + "grad_norm": 1.6409112215042114, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6468, + "step": 4681 + }, + { + "epoch": 0.03303965529935176, + "grad_norm": 1.1416597366333008, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6396, + "step": 4712 + }, + { + "epoch": 0.03325702145263697, + "grad_norm": 1.215846061706543, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6517, + "step": 4743 + }, + { + "epoch": 0.033474387605922175, + "grad_norm": 1.3075084686279297, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6389, + "step": 4774 + }, + { + "epoch": 0.03369175375920738, + "grad_norm": 1.6089972257614136, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6371, + "step": 4805 + }, + { + "epoch": 0.0339091199124926, + "grad_norm": 1.3927685022354126, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6355, + "step": 4836 + }, + { + "epoch": 0.034126486065777804, + "grad_norm": 1.197952389717102, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6415, + "step": 4867 + }, + { + "epoch": 0.03434385221906301, + "grad_norm": 1.2738877534866333, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6442, + "step": 4898 + }, + { + "epoch": 0.03456121837234822, + "grad_norm": 1.5164271593093872, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6406, + "step": 4929 + }, + { + "epoch": 0.03477858452563343, + "grad_norm": 1.242473840713501, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6441, + "step": 4960 + }, + { + "epoch": 0.03499595067891864, + "grad_norm": 1.2919869422912598, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6487, + "step": 4991 + }, + { + "epoch": 0.03521331683220385, + "grad_norm": 2.6493895053863525, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6398, + "step": 5022 + }, + { + "epoch": 0.035430682985489056, + "grad_norm": 1.2134305238723755, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6398, + "step": 5053 + }, + { + "epoch": 0.035648049138774264, + "grad_norm": 1.1468703746795654, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.633, + "step": 5084 + }, + { + "epoch": 0.03586541529205947, + "grad_norm": 1.3923726081848145, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6295, + "step": 5115 + }, + { + "epoch": 0.036082781445344686, + "grad_norm": 1.5531644821166992, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6345, + "step": 5146 + }, + { + "epoch": 0.036300147598629894, + "grad_norm": 1.201889991760254, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6414, + "step": 5177 + }, + { + "epoch": 0.0365175137519151, + "grad_norm": 1.4365577697753906, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6364, + "step": 5208 + }, + { + "epoch": 0.03673487990520031, + "grad_norm": 1.210980772972107, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6353, + "step": 5239 + }, + { + "epoch": 0.036952246058485516, + "grad_norm": 1.379381775856018, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6293, + "step": 5270 + }, + { + "epoch": 0.03716961221177073, + "grad_norm": 1.2263178825378418, + "learning_rate": 4.793722210363262e-05, + "loss": 0.629, + "step": 5301 + }, + { + "epoch": 0.03738697836505594, + "grad_norm": 1.2448405027389526, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6474, + "step": 5332 + }, + { + "epoch": 0.037604344518341146, + "grad_norm": 1.257132887840271, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6426, + "step": 5363 + }, + { + "epoch": 0.03782171067162635, + "grad_norm": 1.3763643503189087, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6396, + "step": 5394 + }, + { + "epoch": 0.03803907682491156, + "grad_norm": 1.3164068460464478, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6343, + "step": 5425 + }, + { + "epoch": 0.038256442978196775, + "grad_norm": 1.2034872770309448, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.629, + "step": 5456 + }, + { + "epoch": 0.03847380913148198, + "grad_norm": 1.3156630992889404, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6403, + "step": 5487 + }, + { + "epoch": 0.03869117528476719, + "grad_norm": 1.0719150304794312, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6318, + "step": 5518 + }, + { + "epoch": 0.0389085414380524, + "grad_norm": 1.3054882287979126, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6419, + "step": 5549 + }, + { + "epoch": 0.039125907591337605, + "grad_norm": 1.2081729173660278, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6461, + "step": 5580 + }, + { + "epoch": 0.03934327374462282, + "grad_norm": 1.1728904247283936, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6473, + "step": 5611 + }, + { + "epoch": 0.03956063989790803, + "grad_norm": 1.2552399635314941, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.6373, + "step": 5642 + }, + { + "epoch": 0.039778006051193235, + "grad_norm": 1.299212098121643, + "learning_rate": 4.751783684659e-05, + "loss": 0.6234, + "step": 5673 + }, + { + "epoch": 0.03999537220447844, + "grad_norm": 2.0746827125549316, + "learning_rate": 4.748122674890348e-05, + "loss": 0.6264, + "step": 5704 + }, + { + "epoch": 0.04021273835776365, + "grad_norm": 1.3712407350540161, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6252, + "step": 5735 + }, + { + "epoch": 0.040430104511048864, + "grad_norm": 1.2094186544418335, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6265, + "step": 5766 + }, + { + "epoch": 0.04064747066433407, + "grad_norm": 1.2487757205963135, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6305, + "step": 5797 + }, + { + "epoch": 0.04086483681761928, + "grad_norm": 1.5805151462554932, + "learning_rate": 4.733225355658999e-05, + "loss": 0.631, + "step": 5828 + }, + { + "epoch": 0.04108220297090449, + "grad_norm": 1.2615118026733398, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.636, + "step": 5859 + }, + { + "epoch": 0.041299569124189695, + "grad_norm": 1.6970707178115845, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6271, + "step": 5890 + }, + { + "epoch": 0.04151693527747491, + "grad_norm": 1.4279624223709106, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6267, + "step": 5921 + }, + { + "epoch": 0.041734301430760117, + "grad_norm": 1.3471580743789673, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6289, + "step": 5952 + }, + { + "epoch": 0.041951667584045324, + "grad_norm": 1.5088621377944946, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6263, + "step": 5983 + }, + { + "epoch": 0.04216903373733053, + "grad_norm": 1.3061436414718628, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6266, + "step": 6014 + }, + { + "epoch": 0.04238639989061574, + "grad_norm": 1.1800014972686768, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6259, + "step": 6045 + }, + { + "epoch": 0.042603766043900954, + "grad_norm": 1.195177435874939, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6251, + "step": 6076 + }, + { + "epoch": 0.04282113219718616, + "grad_norm": 1.1905118227005005, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6217, + "step": 6107 + }, + { + "epoch": 0.04303849835047137, + "grad_norm": 1.2270928621292114, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6352, + "step": 6138 + }, + { + "epoch": 0.043255864503756576, + "grad_norm": 1.209226369857788, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6249, + "step": 6169 + }, + { + "epoch": 0.043473230657041784, + "grad_norm": 1.1949187517166138, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6371, + "step": 6200 + }, + { + "epoch": 0.043690596810327, + "grad_norm": 1.2346535921096802, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6177, + "step": 6231 + }, + { + "epoch": 0.043907962963612206, + "grad_norm": 1.2187124490737915, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6268, + "step": 6262 + }, + { + "epoch": 0.04412532911689741, + "grad_norm": 1.2187339067459106, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6261, + "step": 6293 + }, + { + "epoch": 0.04434269527018262, + "grad_norm": 1.320764422416687, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6212, + "step": 6324 + }, + { + "epoch": 0.04456006142346783, + "grad_norm": 1.3396878242492676, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.6241, + "step": 6355 + }, + { + "epoch": 0.04477742757675304, + "grad_norm": 1.2472412586212158, + "learning_rate": 4.661238099862658e-05, + "loss": 0.6241, + "step": 6386 + }, + { + "epoch": 0.04499479373003825, + "grad_norm": 1.3773880004882812, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6245, + "step": 6417 + }, + { + "epoch": 0.04521215988332346, + "grad_norm": 1.1602933406829834, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.6221, + "step": 6448 + }, + { + "epoch": 0.045429526036608665, + "grad_norm": 1.1719809770584106, + "learning_rate": 4.648464661063478e-05, + "loss": 0.629, + "step": 6479 + }, + { + "epoch": 0.04564689218989387, + "grad_norm": 1.1973191499710083, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.6239, + "step": 6510 + }, + { + "epoch": 0.04586425834317909, + "grad_norm": 1.1805142164230347, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.6242, + "step": 6541 + }, + { + "epoch": 0.046081624496464295, + "grad_norm": 1.3194178342819214, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6316, + "step": 6572 + }, + { + "epoch": 0.0462989906497495, + "grad_norm": 1.1395046710968018, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6207, + "step": 6603 + }, + { + "epoch": 0.04651635680303471, + "grad_norm": 1.5031641721725464, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6218, + "step": 6634 + }, + { + "epoch": 0.04673372295631992, + "grad_norm": 1.0985206365585327, + "learning_rate": 4.622264489304762e-05, + "loss": 0.6198, + "step": 6665 + }, + { + "epoch": 0.04695108910960513, + "grad_norm": 1.2820847034454346, + "learning_rate": 4.617813681048434e-05, + "loss": 0.626, + "step": 6696 + }, + { + "epoch": 0.04716845526289034, + "grad_norm": 1.4487061500549316, + "learning_rate": 4.61333897355256e-05, + "loss": 0.622, + "step": 6727 + }, + { + "epoch": 0.04738582141617555, + "grad_norm": 1.1577301025390625, + "learning_rate": 4.608840417313604e-05, + "loss": 0.6244, + "step": 6758 + }, + { + "epoch": 0.047603187569460755, + "grad_norm": 1.1363381147384644, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6206, + "step": 6789 + }, + { + "epoch": 0.04782055372274597, + "grad_norm": 1.22281813621521, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6231, + "step": 6820 + }, + { + "epoch": 0.048037919876031177, + "grad_norm": 1.156031847000122, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6223, + "step": 6851 + }, + { + "epoch": 0.048255286029316384, + "grad_norm": 1.1688473224639893, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6293, + "step": 6882 + }, + { + "epoch": 0.04847265218260159, + "grad_norm": 1.4265236854553223, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6109, + "step": 6913 + }, + { + "epoch": 0.0486900183358868, + "grad_norm": 1.1242969036102295, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6256, + "step": 6944 + }, + { + "epoch": 0.048907384489172014, + "grad_norm": 1.3943792581558228, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6093, + "step": 6975 + }, + { + "epoch": 0.04912475064245722, + "grad_norm": 1.0761889219284058, + "learning_rate": 4.571999560773736e-05, + "loss": 0.6207, + "step": 7006 + }, + { + "epoch": 0.04934211679574243, + "grad_norm": 1.2784191370010376, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6227, + "step": 7037 + }, + { + "epoch": 0.049559482949027636, + "grad_norm": 1.4215577840805054, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6148, + "step": 7068 + }, + { + "epoch": 0.049776849102312844, + "grad_norm": 1.1120682954788208, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.6192, + "step": 7099 + }, + { + "epoch": 0.04999421525559806, + "grad_norm": 1.1718186140060425, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6179, + "step": 7130 + }, + { + "epoch": 0.050211581408883266, + "grad_norm": 1.5078628063201904, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6245, + "step": 7161 + }, + { + "epoch": 0.05042894756216847, + "grad_norm": 1.1740144491195679, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6242, + "step": 7192 + }, + { + "epoch": 0.05064631371545368, + "grad_norm": 1.2450133562088013, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6194, + "step": 7223 + }, + { + "epoch": 0.05086367986873889, + "grad_norm": 1.181887149810791, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6188, + "step": 7254 + }, + { + "epoch": 0.0510810460220241, + "grad_norm": 1.172691822052002, + "learning_rate": 4.528766329570536e-05, + "loss": 0.6191, + "step": 7285 + }, + { + "epoch": 0.05129841217530931, + "grad_norm": 1.148863673210144, + "learning_rate": 4.523847434837447e-05, + "loss": 0.6234, + "step": 7316 + }, + { + "epoch": 0.05151577832859452, + "grad_norm": 1.2533507347106934, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6166, + "step": 7347 + }, + { + "epoch": 0.051733144481879725, + "grad_norm": 1.1888121366500854, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6176, + "step": 7378 + }, + { + "epoch": 0.05195051063516493, + "grad_norm": 1.0766541957855225, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6137, + "step": 7409 + }, + { + "epoch": 0.05216787678845015, + "grad_norm": 1.2277822494506836, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6231, + "step": 7440 + }, + { + "epoch": 0.052385242941735355, + "grad_norm": 1.0808931589126587, + "learning_rate": 4.498911497712155e-05, + "loss": 0.6209, + "step": 7471 + }, + { + "epoch": 0.05260260909502056, + "grad_norm": 1.0840133428573608, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6144, + "step": 7502 + }, + { + "epoch": 0.05281997524830577, + "grad_norm": 1.5881050825119019, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6114, + "step": 7533 + }, + { + "epoch": 0.05303734140159098, + "grad_norm": 1.1180490255355835, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.6134, + "step": 7564 + }, + { + "epoch": 0.05325470755487619, + "grad_norm": 1.1469063758850098, + "learning_rate": 4.478556380141218e-05, + "loss": 0.6122, + "step": 7595 + }, + { + "epoch": 0.0534720737081614, + "grad_norm": 1.2977004051208496, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.615, + "step": 7626 + }, + { + "epoch": 0.05368943986144661, + "grad_norm": 1.0556434392929077, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.6187, + "step": 7657 + }, + { + "epoch": 0.053906806014731815, + "grad_norm": 1.101298451423645, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6123, + "step": 7688 + }, + { + "epoch": 0.05412417216801702, + "grad_norm": 1.262608528137207, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6149, + "step": 7719 + }, + { + "epoch": 0.05434153832130224, + "grad_norm": 1.1554538011550903, + "learning_rate": 4.452610552959183e-05, + "loss": 0.6168, + "step": 7750 + }, + { + "epoch": 0.054558904474587444, + "grad_norm": 1.1653157472610474, + "learning_rate": 4.447355047201428e-05, + "loss": 0.615, + "step": 7781 + }, + { + "epoch": 0.05477627062787265, + "grad_norm": 1.0511231422424316, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.6215, + "step": 7812 + }, + { + "epoch": 0.05499363678115786, + "grad_norm": 1.3468266725540161, + "learning_rate": 4.436778168330484e-05, + "loss": 0.6221, + "step": 7843 + }, + { + "epoch": 0.05521100293444307, + "grad_norm": 1.3238797187805176, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.611, + "step": 7874 + }, + { + "epoch": 0.05542836908772828, + "grad_norm": 1.1107763051986694, + "learning_rate": 4.42611386459262e-05, + "loss": 0.6119, + "step": 7905 + }, + { + "epoch": 0.05564573524101349, + "grad_norm": 1.3667259216308594, + "learning_rate": 4.420749078676133e-05, + "loss": 0.6134, + "step": 7936 + }, + { + "epoch": 0.055863101394298696, + "grad_norm": 1.0336949825286865, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.613, + "step": 7967 + }, + { + "epoch": 0.056080467547583904, + "grad_norm": 1.0911316871643066, + "learning_rate": 4.409954541451762e-05, + "loss": 0.6107, + "step": 7998 + }, + { + "epoch": 0.05629783370086911, + "grad_norm": 1.2516382932662964, + "learning_rate": 4.404524911958764e-05, + "loss": 0.6138, + "step": 8029 + }, + { + "epoch": 0.056515199854154326, + "grad_norm": 1.1678277254104614, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6162, + "step": 8060 + }, + { + "epoch": 0.05673256600743953, + "grad_norm": 1.4072996377944946, + "learning_rate": 4.393601237573607e-05, + "loss": 0.614, + "step": 8091 + }, + { + "epoch": 0.05694993216072474, + "grad_norm": 1.2312837839126587, + "learning_rate": 4.388107315953628e-05, + "loss": 0.6203, + "step": 8122 + }, + { + "epoch": 0.05716729831400995, + "grad_norm": 1.1720649003982544, + "learning_rate": 4.382592087299212e-05, + "loss": 0.619, + "step": 8153 + }, + { + "epoch": 0.057384664467295156, + "grad_norm": 1.0711950063705444, + "learning_rate": 4.377055613848964e-05, + "loss": 0.6124, + "step": 8184 + }, + { + "epoch": 0.05760203062058037, + "grad_norm": 1.1664263010025024, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6073, + "step": 8215 + }, + { + "epoch": 0.05781939677386558, + "grad_norm": 0.9937831163406372, + "learning_rate": 4.365919182713416e-05, + "loss": 0.6101, + "step": 8246 + }, + { + "epoch": 0.058036762927150785, + "grad_norm": 1.0545841455459595, + "learning_rate": 4.360319350701226e-05, + "loss": 0.6089, + "step": 8277 + }, + { + "epoch": 0.05825412908043599, + "grad_norm": 1.120007038116455, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6078, + "step": 8308 + }, + { + "epoch": 0.0584714952337212, + "grad_norm": 1.1087831258773804, + "learning_rate": 4.349056769754021e-05, + "loss": 0.601, + "step": 8339 + }, + { + "epoch": 0.058688861387006415, + "grad_norm": 1.1370675563812256, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.6103, + "step": 8370 + }, + { + "epoch": 0.05890622754029162, + "grad_norm": 1.182654857635498, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.6115, + "step": 8401 + }, + { + "epoch": 0.05912359369357683, + "grad_norm": 1.2606432437896729, + "learning_rate": 4.332006561018488e-05, + "loss": 0.6139, + "step": 8432 + }, + { + "epoch": 0.05934095984686204, + "grad_norm": 1.1618047952651978, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.6053, + "step": 8463 + }, + { + "epoch": 0.059558326000147245, + "grad_norm": 1.1281821727752686, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.6084, + "step": 8494 + }, + { + "epoch": 0.05977569215343246, + "grad_norm": 1.15248703956604, + "learning_rate": 4.314770288177384e-05, + "loss": 0.6128, + "step": 8525 + }, + { + "epoch": 0.05999305830671767, + "grad_norm": 1.2170960903167725, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6134, + "step": 8556 + }, + { + "epoch": 0.060210424460002875, + "grad_norm": 1.085260033607483, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.6212, + "step": 8587 + }, + { + "epoch": 0.06042779061328808, + "grad_norm": 1.2243624925613403, + "learning_rate": 4.297349701798505e-05, + "loss": 0.605, + "step": 8618 + }, + { + "epoch": 0.06064515676657329, + "grad_norm": 1.124769687652588, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.6074, + "step": 8649 + }, + { + "epoch": 0.060862522919858504, + "grad_norm": 1.893545150756836, + "learning_rate": 4.285634454093198e-05, + "loss": 0.6055, + "step": 8680 + }, + { + "epoch": 0.06107988907314371, + "grad_norm": 1.2186870574951172, + "learning_rate": 4.279746571169086e-05, + "loss": 0.6121, + "step": 8711 + }, + { + "epoch": 0.06129725522642892, + "grad_norm": 1.0006957054138184, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.6079, + "step": 8742 + }, + { + "epoch": 0.06151462137971413, + "grad_norm": 1.0913995504379272, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.6107, + "step": 8773 + }, + { + "epoch": 0.061731987532999334, + "grad_norm": 1.4647611379623413, + "learning_rate": 4.261962684116106e-05, + "loss": 0.6188, + "step": 8804 + }, + { + "epoch": 0.06194935368628455, + "grad_norm": 1.0805052518844604, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.6137, + "step": 8835 + }, + { + "epoch": 0.062166719839569756, + "grad_norm": 1.0512675046920776, + "learning_rate": 4.250007230372134e-05, + "loss": 0.6052, + "step": 8866 + }, + { + "epoch": 0.062384085992854964, + "grad_norm": 1.1758863925933838, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.6148, + "step": 8897 + }, + { + "epoch": 0.06260145214614017, + "grad_norm": 1.0526351928710938, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.5973, + "step": 8928 + }, + { + "epoch": 0.06281881829942539, + "grad_norm": 1.077563762664795, + "learning_rate": 4.231926105340768e-05, + "loss": 0.6114, + "step": 8959 + }, + { + "epoch": 0.06303618445271059, + "grad_norm": 1.0116938352584839, + "learning_rate": 4.225859883654776e-05, + "loss": 0.6146, + "step": 8990 + }, + { + "epoch": 0.0632535506059958, + "grad_norm": 1.1967271566390991, + "learning_rate": 4.219774185874569e-05, + "loss": 0.5972, + "step": 9021 + }, + { + "epoch": 0.06347091675928102, + "grad_norm": 1.1610004901885986, + "learning_rate": 4.213669080676418e-05, + "loss": 0.6081, + "step": 9052 + }, + { + "epoch": 0.06368828291256622, + "grad_norm": 1.0253360271453857, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.6118, + "step": 9083 + }, + { + "epoch": 0.06390564906585143, + "grad_norm": 1.140599012374878, + "learning_rate": 4.201400923825648e-05, + "loss": 0.6091, + "step": 9114 + }, + { + "epoch": 0.06412301521913663, + "grad_norm": 1.0821545124053955, + "learning_rate": 4.195238010617511e-05, + "loss": 0.6028, + "step": 9145 + }, + { + "epoch": 0.06434038137242185, + "grad_norm": 1.2161924839019775, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6113, + "step": 9176 + }, + { + "epoch": 0.06455774752570706, + "grad_norm": 1.128986120223999, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.6125, + "step": 9207 + }, + { + "epoch": 0.06477511367899226, + "grad_norm": 1.0566622018814087, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.6055, + "step": 9238 + }, + { + "epoch": 0.06499247983227747, + "grad_norm": 1.0849272012710571, + "learning_rate": 4.170395751189495e-05, + "loss": 0.6059, + "step": 9269 + }, + { + "epoch": 0.06520984598556268, + "grad_norm": 1.2344416379928589, + "learning_rate": 4.164137885110921e-05, + "loss": 0.6081, + "step": 9300 + }, + { + "epoch": 0.06542721213884789, + "grad_norm": 1.0132557153701782, + "learning_rate": 4.157861239462495e-05, + "loss": 0.6103, + "step": 9331 + }, + { + "epoch": 0.0656445782921331, + "grad_norm": 1.0176981687545776, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.6086, + "step": 9362 + }, + { + "epoch": 0.0658619444454183, + "grad_norm": 1.2429553270339966, + "learning_rate": 4.145251892991588e-05, + "loss": 0.6091, + "step": 9393 + }, + { + "epoch": 0.06607931059870352, + "grad_norm": 1.1481480598449707, + "learning_rate": 4.138919334463868e-05, + "loss": 0.6048, + "step": 9424 + }, + { + "epoch": 0.06629667675198872, + "grad_norm": 0.9623891711235046, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.5995, + "step": 9455 + }, + { + "epoch": 0.06651404290527393, + "grad_norm": 1.2031786441802979, + "learning_rate": 4.126198804133398e-05, + "loss": 0.6008, + "step": 9486 + }, + { + "epoch": 0.06673140905855915, + "grad_norm": 1.3027530908584595, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.5993, + "step": 9517 + }, + { + "epoch": 0.06694877521184435, + "grad_norm": 1.1712346076965332, + "learning_rate": 4.113404868280107e-05, + "loss": 0.5996, + "step": 9548 + }, + { + "epoch": 0.06716614136512956, + "grad_norm": 1.1397625207901, + "learning_rate": 4.106980553625457e-05, + "loss": 0.5979, + "step": 9579 + }, + { + "epoch": 0.06738350751841476, + "grad_norm": 1.122983694076538, + "learning_rate": 4.100538104413674e-05, + "loss": 0.599, + "step": 9610 + }, + { + "epoch": 0.06760087367169998, + "grad_norm": 1.0782618522644043, + "learning_rate": 4.09407759334692e-05, + "loss": 0.6005, + "step": 9641 + }, + { + "epoch": 0.0678182398249852, + "grad_norm": 1.0563392639160156, + "learning_rate": 4.087599093331186e-05, + "loss": 0.6105, + "step": 9672 + }, + { + "epoch": 0.0680356059782704, + "grad_norm": 1.0684717893600464, + "learning_rate": 4.081102677475462e-05, + "loss": 0.6092, + "step": 9703 + }, + { + "epoch": 0.06825297213155561, + "grad_norm": 1.152811884880066, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.6035, + "step": 9734 + }, + { + "epoch": 0.06847033828484081, + "grad_norm": 1.1413112878799438, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.6018, + "step": 9765 + }, + { + "epoch": 0.06868770443812602, + "grad_norm": 0.981604814529419, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.6035, + "step": 9796 + }, + { + "epoch": 0.06890507059141124, + "grad_norm": 1.0713250637054443, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.597, + "step": 9827 + }, + { + "epoch": 0.06912243674469644, + "grad_norm": 0.949456512928009, + "learning_rate": 4.048354433517794e-05, + "loss": 0.6063, + "step": 9858 + }, + { + "epoch": 0.06933980289798165, + "grad_norm": 1.0702073574066162, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.5963, + "step": 9889 + }, + { + "epoch": 0.06955716905126685, + "grad_norm": 0.9974942803382874, + "learning_rate": 4.035132306369438e-05, + "loss": 0.611, + "step": 9920 + }, + { + "epoch": 0.06977453520455207, + "grad_norm": 1.0833642482757568, + "learning_rate": 4.028495219804555e-05, + "loss": 0.6014, + "step": 9951 + }, + { + "epoch": 0.06999190135783728, + "grad_norm": 1.181322693824768, + "learning_rate": 4.021840884378864e-05, + "loss": 0.599, + "step": 9982 + }, + { + "epoch": 0.07020926751112248, + "grad_norm": 1.0088189840316772, + "learning_rate": 4.015169375185633e-05, + "loss": 0.5955, + "step": 10013 + }, + { + "epoch": 0.0704266336644077, + "grad_norm": 1.0920222997665405, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.5985, + "step": 10044 + }, + { + "epoch": 0.0706439998176929, + "grad_norm": 1.0450975894927979, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.6069, + "step": 10075 + }, + { + "epoch": 0.07086136597097811, + "grad_norm": 1.1613439321517944, + "learning_rate": 3.995052558835377e-05, + "loss": 0.5992, + "step": 10106 + }, + { + "epoch": 0.07107873212426333, + "grad_norm": 1.2720811367034912, + "learning_rate": 3.988313109368017e-05, + "loss": 0.6012, + "step": 10137 + }, + { + "epoch": 0.07129609827754853, + "grad_norm": 1.0315334796905518, + "learning_rate": 3.981556864489504e-05, + "loss": 0.6149, + "step": 10168 + }, + { + "epoch": 0.07151346443083374, + "grad_norm": 1.3320891857147217, + "learning_rate": 3.974783900443142e-05, + "loss": 0.6013, + "step": 10199 + }, + { + "epoch": 0.07173083058411894, + "grad_norm": 1.7277195453643799, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.5983, + "step": 10230 + }, + { + "epoch": 0.07194819673740416, + "grad_norm": 1.1056753396987915, + "learning_rate": 3.961188120762596e-05, + "loss": 0.5952, + "step": 10261 + }, + { + "epoch": 0.07216556289068937, + "grad_norm": 1.3461558818817139, + "learning_rate": 3.954365458554938e-05, + "loss": 0.5975, + "step": 10292 + }, + { + "epoch": 0.07238292904397457, + "grad_norm": 1.1405422687530518, + "learning_rate": 3.947526384030751e-05, + "loss": 0.602, + "step": 10323 + }, + { + "epoch": 0.07260029519725979, + "grad_norm": 1.2912482023239136, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.6014, + "step": 10354 + }, + { + "epoch": 0.07281766135054499, + "grad_norm": 1.1548583507537842, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.6053, + "step": 10385 + }, + { + "epoch": 0.0730350275038302, + "grad_norm": 1.0558302402496338, + "learning_rate": 3.926911459260109e-05, + "loss": 0.6062, + "step": 10416 + }, + { + "epoch": 0.07325239365711542, + "grad_norm": 1.1658024787902832, + "learning_rate": 3.920007509089102e-05, + "loss": 0.6016, + "step": 10447 + }, + { + "epoch": 0.07346975981040062, + "grad_norm": 1.1093552112579346, + "learning_rate": 3.913087534326357e-05, + "loss": 0.6025, + "step": 10478 + }, + { + "epoch": 0.07368712596368583, + "grad_norm": 1.0908467769622803, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.5954, + "step": 10509 + }, + { + "epoch": 0.07390449211697103, + "grad_norm": 1.0856796503067017, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.5983, + "step": 10540 + }, + { + "epoch": 0.07412185827025625, + "grad_norm": 1.0330291986465454, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.5955, + "step": 10571 + }, + { + "epoch": 0.07433922442354146, + "grad_norm": 0.9688258171081543, + "learning_rate": 3.885248953871491e-05, + "loss": 0.6001, + "step": 10602 + }, + { + "epoch": 0.07455659057682666, + "grad_norm": 1.1808383464813232, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.5996, + "step": 10633 + }, + { + "epoch": 0.07477395673011188, + "grad_norm": 1.1267834901809692, + "learning_rate": 3.871235554965218e-05, + "loss": 0.5962, + "step": 10664 + }, + { + "epoch": 0.07499132288339708, + "grad_norm": 1.1001136302947998, + "learning_rate": 3.864205604623078e-05, + "loss": 0.6031, + "step": 10695 + }, + { + "epoch": 0.07520868903668229, + "grad_norm": 1.009621024131775, + "learning_rate": 3.857160259406107e-05, + "loss": 0.5976, + "step": 10726 + }, + { + "epoch": 0.0754260551899675, + "grad_norm": 0.933811366558075, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.604, + "step": 10757 + }, + { + "epoch": 0.0756434213432527, + "grad_norm": 1.2187680006027222, + "learning_rate": 3.843023702543556e-05, + "loss": 0.5983, + "step": 10788 + }, + { + "epoch": 0.07586078749653792, + "grad_norm": 1.0286976099014282, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.596, + "step": 10819 + }, + { + "epoch": 0.07607815364982312, + "grad_norm": 1.0752719640731812, + "learning_rate": 3.828826522492255e-05, + "loss": 0.596, + "step": 10850 + }, + { + "epoch": 0.07629551980310834, + "grad_norm": 1.1018482446670532, + "learning_rate": 3.821705398930713e-05, + "loss": 0.603, + "step": 10881 + }, + { + "epoch": 0.07651288595639355, + "grad_norm": 0.980965793132782, + "learning_rate": 3.814569360103385e-05, + "loss": 0.6021, + "step": 10912 + }, + { + "epoch": 0.07673025210967875, + "grad_norm": 0.9588106870651245, + "learning_rate": 3.807418486539499e-05, + "loss": 0.6017, + "step": 10943 + }, + { + "epoch": 0.07694761826296397, + "grad_norm": 1.2048044204711914, + "learning_rate": 3.80025285893569e-05, + "loss": 0.6049, + "step": 10974 + }, + { + "epoch": 0.07716498441624917, + "grad_norm": 1.0373165607452393, + "learning_rate": 3.793072558155093e-05, + "loss": 0.5914, + "step": 11005 + }, + { + "epoch": 0.07738235056953438, + "grad_norm": 1.0112730264663696, + "learning_rate": 3.785877665226426e-05, + "loss": 0.5949, + "step": 11036 + }, + { + "epoch": 0.0775997167228196, + "grad_norm": 1.0219500064849854, + "learning_rate": 3.778668261343079e-05, + "loss": 0.5918, + "step": 11067 + }, + { + "epoch": 0.0778170828761048, + "grad_norm": 1.1104828119277954, + "learning_rate": 3.771444427862192e-05, + "loss": 0.6029, + "step": 11098 + }, + { + "epoch": 0.07803444902939001, + "grad_norm": 0.9852742552757263, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.5839, + "step": 11129 + }, + { + "epoch": 0.07825181518267521, + "grad_norm": 1.0593205690383911, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.5966, + "step": 11160 + }, + { + "epoch": 0.07846918133596043, + "grad_norm": 1.2858383655548096, + "learning_rate": 3.749687165842753e-05, + "loss": 0.5939, + "step": 11191 + }, + { + "epoch": 0.07868654748924564, + "grad_norm": 1.2006022930145264, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.5965, + "step": 11222 + }, + { + "epoch": 0.07890391364253084, + "grad_norm": 0.9741569757461548, + "learning_rate": 3.735111675341645e-05, + "loss": 0.5971, + "step": 11253 + }, + { + "epoch": 0.07912127979581605, + "grad_norm": 1.0724074840545654, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.5901, + "step": 11284 + }, + { + "epoch": 0.07933864594910126, + "grad_norm": 1.102207064628601, + "learning_rate": 3.720480432728287e-05, + "loss": 0.602, + "step": 11315 + }, + { + "epoch": 0.07955601210238647, + "grad_norm": 1.0976966619491577, + "learning_rate": 3.71314411067092e-05, + "loss": 0.6002, + "step": 11346 + }, + { + "epoch": 0.07977337825567168, + "grad_norm": 1.0534948110580444, + "learning_rate": 3.70579409844715e-05, + "loss": 0.5801, + "step": 11377 + }, + { + "epoch": 0.07999074440895688, + "grad_norm": 1.1666125059127808, + "learning_rate": 3.698430479000865e-05, + "loss": 0.5992, + "step": 11408 + }, + { + "epoch": 0.0802081105622421, + "grad_norm": 1.0316112041473389, + "learning_rate": 3.691053335429509e-05, + "loss": 0.5942, + "step": 11439 + }, + { + "epoch": 0.0804254767155273, + "grad_norm": 1.0085538625717163, + "learning_rate": 3.683662750983147e-05, + "loss": 0.5978, + "step": 11470 + }, + { + "epoch": 0.08064284286881251, + "grad_norm": 1.0875978469848633, + "learning_rate": 3.676258809063518e-05, + "loss": 0.5949, + "step": 11501 + }, + { + "epoch": 0.08086020902209773, + "grad_norm": 1.1905012130737305, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.6018, + "step": 11532 + }, + { + "epoch": 0.08107757517538293, + "grad_norm": 1.1717733144760132, + "learning_rate": 3.661411187164166e-05, + "loss": 0.5986, + "step": 11563 + }, + { + "epoch": 0.08129494132866814, + "grad_norm": 1.053768515586853, + "learning_rate": 3.65396767473784e-05, + "loss": 0.5831, + "step": 11594 + }, + { + "epoch": 0.08151230748195334, + "grad_norm": 1.095425009727478, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.5967, + "step": 11625 + }, + { + "epoch": 0.08172967363523856, + "grad_norm": 1.005953311920166, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.5834, + "step": 11656 + }, + { + "epoch": 0.08194703978852377, + "grad_norm": 1.0991450548171997, + "learning_rate": 3.63155933997859e-05, + "loss": 0.5951, + "step": 11687 + }, + { + "epoch": 0.08216440594180897, + "grad_norm": 1.1069071292877197, + "learning_rate": 3.624064243537758e-05, + "loss": 0.5989, + "step": 11718 + }, + { + "epoch": 0.08238177209509419, + "grad_norm": 1.099542498588562, + "learning_rate": 3.616556462184716e-05, + "loss": 0.5949, + "step": 11749 + }, + { + "epoch": 0.08259913824837939, + "grad_norm": 1.0950508117675781, + "learning_rate": 3.609036080643755e-05, + "loss": 0.5911, + "step": 11780 + }, + { + "epoch": 0.0828165044016646, + "grad_norm": 0.9489970803260803, + "learning_rate": 3.60150318378136e-05, + "loss": 0.5922, + "step": 11811 + }, + { + "epoch": 0.08303387055494982, + "grad_norm": 0.9877674579620361, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.5927, + "step": 11842 + }, + { + "epoch": 0.08325123670823502, + "grad_norm": 1.060177206993103, + "learning_rate": 3.586400184263408e-05, + "loss": 0.5899, + "step": 11873 + }, + { + "epoch": 0.08346860286152023, + "grad_norm": 1.0531491041183472, + "learning_rate": 3.578830252043148e-05, + "loss": 0.6004, + "step": 11904 + }, + { + "epoch": 0.08368596901480543, + "grad_norm": 0.9791742563247681, + "learning_rate": 3.571248145370125e-05, + "loss": 0.5926, + "step": 11935 + }, + { + "epoch": 0.08390333516809065, + "grad_norm": 1.0492030382156372, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.5914, + "step": 11966 + }, + { + "epoch": 0.08412070132137586, + "grad_norm": 1.2035881280899048, + "learning_rate": 3.556047751054378e-05, + "loss": 0.589, + "step": 11997 + }, + { + "epoch": 0.08433806747466106, + "grad_norm": 1.0384137630462646, + "learning_rate": 3.548429634946039e-05, + "loss": 0.5894, + "step": 12028 + }, + { + "epoch": 0.08455543362794628, + "grad_norm": 1.1555323600769043, + "learning_rate": 3.540799687451768e-05, + "loss": 0.5899, + "step": 12059 + }, + { + "epoch": 0.08477279978123148, + "grad_norm": 1.2114530801773071, + "learning_rate": 3.533157994674485e-05, + "loss": 0.5888, + "step": 12090 + }, + { + "epoch": 0.08499016593451669, + "grad_norm": 1.145213007926941, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.5887, + "step": 12121 + }, + { + "epoch": 0.08520753208780191, + "grad_norm": 1.0313454866409302, + "learning_rate": 3.517839718344311e-05, + "loss": 0.598, + "step": 12152 + }, + { + "epoch": 0.08542489824108711, + "grad_norm": 1.018702745437622, + "learning_rate": 3.510163307656086e-05, + "loss": 0.5931, + "step": 12183 + }, + { + "epoch": 0.08564226439437232, + "grad_norm": 1.0365880727767944, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.5954, + "step": 12214 + }, + { + "epoch": 0.08585963054765752, + "grad_norm": 1.0658811330795288, + "learning_rate": 3.494776374368643e-05, + "loss": 0.5953, + "step": 12245 + }, + { + "epoch": 0.08607699670094274, + "grad_norm": 1.046661615371704, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.5786, + "step": 12276 + }, + { + "epoch": 0.08629436285422795, + "grad_norm": 1.0220295190811157, + "learning_rate": 3.479344537543164e-05, + "loss": 0.5879, + "step": 12307 + }, + { + "epoch": 0.08651172900751315, + "grad_norm": 1.0286898612976074, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.5882, + "step": 12338 + }, + { + "epoch": 0.08672909516079837, + "grad_norm": 1.060483694076538, + "learning_rate": 3.463868493762412e-05, + "loss": 0.5887, + "step": 12369 + }, + { + "epoch": 0.08694646131408357, + "grad_norm": 1.264788269996643, + "learning_rate": 3.456114112492418e-05, + "loss": 0.5895, + "step": 12400 + }, + { + "epoch": 0.08716382746736878, + "grad_norm": 0.9751125574111938, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.5846, + "step": 12431 + }, + { + "epoch": 0.087381193620654, + "grad_norm": 1.1198588609695435, + "learning_rate": 3.440573068727905e-05, + "loss": 0.5878, + "step": 12462 + }, + { + "epoch": 0.0875985597739392, + "grad_norm": 1.0425771474838257, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.5903, + "step": 12493 + }, + { + "epoch": 0.08781592592722441, + "grad_norm": 0.9820422530174255, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.5885, + "step": 12524 + }, + { + "epoch": 0.08803329208050961, + "grad_norm": 1.0234076976776123, + "learning_rate": 3.417182116258899e-05, + "loss": 0.5967, + "step": 12555 + }, + { + "epoch": 0.08825065823379483, + "grad_norm": 0.9766228795051575, + "learning_rate": 3.409364314116074e-05, + "loss": 0.5923, + "step": 12586 + }, + { + "epoch": 0.08846802438708004, + "grad_norm": 0.9033060073852539, + "learning_rate": 3.401536249920559e-05, + "loss": 0.5892, + "step": 12617 + }, + { + "epoch": 0.08868539054036524, + "grad_norm": 1.1264934539794922, + "learning_rate": 3.393698012010998e-05, + "loss": 0.5917, + "step": 12648 + }, + { + "epoch": 0.08890275669365046, + "grad_norm": 1.0052556991577148, + "learning_rate": 3.385849688840839e-05, + "loss": 0.5885, + "step": 12679 + }, + { + "epoch": 0.08912012284693566, + "grad_norm": 1.0888159275054932, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.5885, + "step": 12710 + }, + { + "epoch": 0.08933748900022087, + "grad_norm": 1.0458447933197021, + "learning_rate": 3.370123141100578e-05, + "loss": 0.5923, + "step": 12741 + }, + { + "epoch": 0.08955485515350609, + "grad_norm": 1.055434226989746, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.5889, + "step": 12772 + }, + { + "epoch": 0.08977222130679129, + "grad_norm": 1.000533103942871, + "learning_rate": 3.35435731658559e-05, + "loss": 0.5833, + "step": 12803 + }, + { + "epoch": 0.0899895874600765, + "grad_norm": 1.0112768411636353, + "learning_rate": 3.346459897862552e-05, + "loss": 0.5886, + "step": 12834 + }, + { + "epoch": 0.0902069536133617, + "grad_norm": 0.9883559942245483, + "learning_rate": 3.338552926954613e-05, + "loss": 0.5886, + "step": 12865 + }, + { + "epoch": 0.09042431976664692, + "grad_norm": 1.0933095216751099, + "learning_rate": 3.330636493090868e-05, + "loss": 0.5873, + "step": 12896 + }, + { + "epoch": 0.09064168591993213, + "grad_norm": 1.1089582443237305, + "learning_rate": 3.322710685607193e-05, + "loss": 0.5845, + "step": 12927 + }, + { + "epoch": 0.09085905207321733, + "grad_norm": 1.0311237573623657, + "learning_rate": 3.314775593945251e-05, + "loss": 0.588, + "step": 12958 + }, + { + "epoch": 0.09107641822650255, + "grad_norm": 1.0495847463607788, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.5839, + "step": 12989 + }, + { + "epoch": 0.09129378437978775, + "grad_norm": 1.046921730041504, + "learning_rate": 3.298877916376047e-05, + "loss": 0.5877, + "step": 13020 + }, + { + "epoch": 0.09151115053307296, + "grad_norm": 0.9927343130111694, + "learning_rate": 3.290915509871915e-05, + "loss": 0.5898, + "step": 13051 + }, + { + "epoch": 0.09172851668635817, + "grad_norm": 0.9319870471954346, + "learning_rate": 3.282944177993753e-05, + "loss": 0.5886, + "step": 13082 + }, + { + "epoch": 0.09194588283964338, + "grad_norm": 1.0408544540405273, + "learning_rate": 3.274964010696957e-05, + "loss": 0.5946, + "step": 13113 + }, + { + "epoch": 0.09216324899292859, + "grad_norm": 1.0984251499176025, + "learning_rate": 3.266975098036629e-05, + "loss": 0.5813, + "step": 13144 + }, + { + "epoch": 0.09238061514621379, + "grad_norm": 1.0376449823379517, + "learning_rate": 3.258977530166562e-05, + "loss": 0.5855, + "step": 13175 + }, + { + "epoch": 0.092597981299499, + "grad_norm": 0.9627104997634888, + "learning_rate": 3.250971397338227e-05, + "loss": 0.5846, + "step": 13206 + }, + { + "epoch": 0.09281534745278422, + "grad_norm": 1.2040653228759766, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.5872, + "step": 13237 + }, + { + "epoch": 0.09303271360606942, + "grad_norm": 1.0091586112976074, + "learning_rate": 3.234933798294859e-05, + "loss": 0.5911, + "step": 13268 + }, + { + "epoch": 0.09325007975935463, + "grad_norm": 1.0941787958145142, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.5844, + "step": 13299 + }, + { + "epoch": 0.09346744591263983, + "grad_norm": 0.9720754027366638, + "learning_rate": 3.218863024832985e-05, + "loss": 0.5898, + "step": 13330 + }, + { + "epoch": 0.09368481206592505, + "grad_norm": 0.8847430944442749, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.5804, + "step": 13361 + }, + { + "epoch": 0.09390217821921026, + "grad_norm": 1.0070865154266357, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.5916, + "step": 13392 + }, + { + "epoch": 0.09411954437249546, + "grad_norm": 0.9538979530334473, + "learning_rate": 3.194696249871729e-05, + "loss": 0.5984, + "step": 13423 + }, + { + "epoch": 0.09433691052578068, + "grad_norm": 1.0414469242095947, + "learning_rate": 3.186624857814164e-05, + "loss": 0.5828, + "step": 13454 + }, + { + "epoch": 0.0945542766790659, + "grad_norm": 1.0353257656097412, + "learning_rate": 3.178545717288401e-05, + "loss": 0.5835, + "step": 13485 + }, + { + "epoch": 0.0947716428323511, + "grad_norm": 1.0454679727554321, + "learning_rate": 3.170458919466444e-05, + "loss": 0.5909, + "step": 13516 + }, + { + "epoch": 0.09498900898563631, + "grad_norm": 1.246871829032898, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.5829, + "step": 13547 + }, + { + "epoch": 0.09520637513892151, + "grad_norm": 1.0331645011901855, + "learning_rate": 3.154262717052985e-05, + "loss": 0.5919, + "step": 13578 + }, + { + "epoch": 0.09542374129220672, + "grad_norm": 0.8984120488166809, + "learning_rate": 3.146153495233426e-05, + "loss": 0.5853, + "step": 13609 + }, + { + "epoch": 0.09564110744549194, + "grad_norm": 1.0582929849624634, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.5828, + "step": 13640 + }, + { + "epoch": 0.09585847359877714, + "grad_norm": 1.151041865348816, + "learning_rate": 3.129913267924946e-05, + "loss": 0.582, + "step": 13671 + }, + { + "epoch": 0.09607583975206235, + "grad_norm": 0.9678122401237488, + "learning_rate": 3.121782445704782e-05, + "loss": 0.5865, + "step": 13702 + }, + { + "epoch": 0.09629320590534755, + "grad_norm": 1.033832311630249, + "learning_rate": 3.11364460675423e-05, + "loss": 0.5797, + "step": 13733 + }, + { + "epoch": 0.09651057205863277, + "grad_norm": 0.8917058706283569, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.5829, + "step": 13764 + }, + { + "epoch": 0.09672793821191798, + "grad_norm": 0.9396641850471497, + "learning_rate": 3.097348246077728e-05, + "loss": 0.5825, + "step": 13795 + }, + { + "epoch": 0.09694530436520318, + "grad_norm": 1.0871998071670532, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.5851, + "step": 13826 + }, + { + "epoch": 0.0971626705184884, + "grad_norm": 1.1403570175170898, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.5807, + "step": 13857 + }, + { + "epoch": 0.0973800366717736, + "grad_norm": 1.0246673822402954, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.5795, + "step": 13888 + }, + { + "epoch": 0.09759740282505881, + "grad_norm": 0.9431213140487671, + "learning_rate": 3.064675369851637e-05, + "loss": 0.5825, + "step": 13919 + }, + { + "epoch": 0.09781476897834403, + "grad_norm": 1.1028188467025757, + "learning_rate": 3.056490989455289e-05, + "loss": 0.5885, + "step": 13950 + }, + { + "epoch": 0.09803213513162923, + "grad_norm": 1.125482201576233, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.5849, + "step": 13981 + }, + { + "epoch": 0.09824950128491444, + "grad_norm": 0.9831985831260681, + "learning_rate": 3.040103481317539e-05, + "loss": 0.586, + "step": 14012 + }, + { + "epoch": 0.09846686743819964, + "grad_norm": 1.0038244724273682, + "learning_rate": 3.03190053850694e-05, + "loss": 0.5894, + "step": 14043 + }, + { + "epoch": 0.09868423359148486, + "grad_norm": 0.8766770362854004, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.5773, + "step": 14074 + }, + { + "epoch": 0.09890159974477007, + "grad_norm": 0.9358529448509216, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.5887, + "step": 14105 + }, + { + "epoch": 0.09911896589805527, + "grad_norm": 0.8988749384880066, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.5815, + "step": 14136 + }, + { + "epoch": 0.09933633205134049, + "grad_norm": 1.08226478099823, + "learning_rate": 2.999029669712431e-05, + "loss": 0.5944, + "step": 14167 + }, + { + "epoch": 0.09955369820462569, + "grad_norm": 1.046980619430542, + "learning_rate": 2.990797641805408e-05, + "loss": 0.5824, + "step": 14198 + }, + { + "epoch": 0.0997710643579109, + "grad_norm": 0.96205073595047, + "learning_rate": 2.982560075313704e-05, + "loss": 0.5847, + "step": 14229 + }, + { + "epoch": 0.09998843051119612, + "grad_norm": 0.9914370775222778, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.5824, + "step": 14260 + }, + { + "epoch": 0.10020579666448132, + "grad_norm": 0.922265887260437, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.5907, + "step": 14291 + }, + { + "epoch": 0.10042316281776653, + "grad_norm": 0.971159815788269, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.5806, + "step": 14322 + }, + { + "epoch": 0.10064052897105173, + "grad_norm": 1.1327799558639526, + "learning_rate": 2.949556283611942e-05, + "loss": 0.5812, + "step": 14353 + }, + { + "epoch": 0.10085789512433695, + "grad_norm": 1.1632689237594604, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.5813, + "step": 14384 + }, + { + "epoch": 0.10107526127762216, + "grad_norm": 0.9936057329177856, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.5803, + "step": 14415 + }, + { + "epoch": 0.10129262743090736, + "grad_norm": 1.1123740673065186, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.5767, + "step": 14446 + }, + { + "epoch": 0.10150999358419258, + "grad_norm": 0.908643364906311, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.5797, + "step": 14477 + }, + { + "epoch": 0.10172735973747778, + "grad_norm": 0.8761305809020996, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.589, + "step": 14508 + }, + { + "epoch": 0.10194472589076299, + "grad_norm": 1.0904477834701538, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.5796, + "step": 14539 + }, + { + "epoch": 0.1021620920440482, + "grad_norm": 1.060160756111145, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.5837, + "step": 14570 + }, + { + "epoch": 0.1023794581973334, + "grad_norm": 1.0514498949050903, + "learning_rate": 2.883311164593017e-05, + "loss": 0.5792, + "step": 14601 + }, + { + "epoch": 0.10259682435061862, + "grad_norm": 1.0152157545089722, + "learning_rate": 2.875010077160754e-05, + "loss": 0.574, + "step": 14632 + }, + { + "epoch": 0.10281419050390382, + "grad_norm": 0.974791944026947, + "learning_rate": 2.866704757790741e-05, + "loss": 0.5819, + "step": 14663 + }, + { + "epoch": 0.10303155665718904, + "grad_norm": 0.9271855354309082, + "learning_rate": 2.858395300207376e-05, + "loss": 0.5798, + "step": 14694 + }, + { + "epoch": 0.10324892281047425, + "grad_norm": 0.9933458566665649, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.5796, + "step": 14725 + }, + { + "epoch": 0.10346628896375945, + "grad_norm": 0.9245687127113342, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.5771, + "step": 14756 + }, + { + "epoch": 0.10368365511704467, + "grad_norm": 0.9483000040054321, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.5825, + "step": 14787 + }, + { + "epoch": 0.10390102127032987, + "grad_norm": 0.9054533839225769, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.5817, + "step": 14818 + }, + { + "epoch": 0.10411838742361508, + "grad_norm": 1.0200084447860718, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.5788, + "step": 14849 + }, + { + "epoch": 0.1043357535769003, + "grad_norm": 0.9364084005355835, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.5796, + "step": 14880 + }, + { + "epoch": 0.1045531197301855, + "grad_norm": 0.9658653736114502, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.5825, + "step": 14911 + }, + { + "epoch": 0.10477048588347071, + "grad_norm": 0.8612831234931946, + "learning_rate": 2.791781925709473e-05, + "loss": 0.5842, + "step": 14942 + }, + { + "epoch": 0.10498785203675591, + "grad_norm": 0.9673991799354553, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.5776, + "step": 14973 + }, + { + "epoch": 0.10520521819004112, + "grad_norm": 1.092144250869751, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.5785, + "step": 15004 + }, + { + "epoch": 0.10542258434332634, + "grad_norm": 1.0162559747695923, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.5821, + "step": 15035 + }, + { + "epoch": 0.10563995049661154, + "grad_norm": 0.9578939080238342, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.5781, + "step": 15066 + }, + { + "epoch": 0.10585731664989675, + "grad_norm": 0.9934547543525696, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.5826, + "step": 15097 + }, + { + "epoch": 0.10607468280318195, + "grad_norm": 0.9542795419692993, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.583, + "step": 15128 + }, + { + "epoch": 0.10629204895646717, + "grad_norm": 0.9436337947845459, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.5832, + "step": 15159 + }, + { + "epoch": 0.10650941510975238, + "grad_norm": 0.9200606942176819, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.5766, + "step": 15190 + }, + { + "epoch": 0.10672678126303758, + "grad_norm": 1.0009740591049194, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.5835, + "step": 15221 + }, + { + "epoch": 0.1069441474163228, + "grad_norm": 0.9597145915031433, + "learning_rate": 2.708224532974953e-05, + "loss": 0.5728, + "step": 15252 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1258280394232955e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-15260/training_args.bin b/checkpoint-15260/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d8a57652df56990298f489cbbc4d12b0a5d66dac --- /dev/null +++ b/checkpoint-15260/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff6a84b6c2a69c365aedcd04f176d62d0993a049ac50b28b1fc7fee54f4267a8 +size 5304 diff --git a/checkpoint-18312/config.json b/checkpoint-18312/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-18312/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-18312/generation_config.json b/checkpoint-18312/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-18312/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-18312/model-00001-of-00007.safetensors b/checkpoint-18312/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6ea4dba15e55791a37617330fef5639cc3ed8703 --- /dev/null +++ b/checkpoint-18312/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8586746032ca2a712be9adeb302f11125101dd345d36bea9bb4d92265761297b +size 4886466168 diff --git a/checkpoint-18312/model-00002-of-00007.safetensors b/checkpoint-18312/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-18312/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-18312/model-00003-of-00007.safetensors b/checkpoint-18312/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-18312/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-18312/model-00004-of-00007.safetensors b/checkpoint-18312/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-18312/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-18312/model-00005-of-00007.safetensors b/checkpoint-18312/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-18312/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-18312/model-00006-of-00007.safetensors b/checkpoint-18312/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ed261537337f024d0dd2f455915ea52ee638f91a --- /dev/null +++ b/checkpoint-18312/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9cbccce82be060349a3666ef22a62bb3f1fb4ee73d1523e8d20bfd2ee6d8e85 +size 4999813120 diff --git a/checkpoint-18312/model-00007-of-00007.safetensors b/checkpoint-18312/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3fae4109c9677a89e68ab2c4275e6961763d1452 --- /dev/null +++ b/checkpoint-18312/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d00be9546497a5a6726ba09faaf4c510b3b053bda4ace055d3bf3e8c28f945cf +size 2571158184 diff --git a/checkpoint-18312/model.safetensors.index.json b/checkpoint-18312/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-18312/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-18312/optimizer.pt b/checkpoint-18312/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2eefad1e88eeded123f92d6227cea01ba1b8e46e --- /dev/null +++ b/checkpoint-18312/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54b10c84d9a3ee7718da1e24de45ffdd81fdaa663bce8438623561ac18149314 +size 15385036334 diff --git a/checkpoint-18312/rng_state.pth b/checkpoint-18312/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-18312/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-18312/scheduler.pt b/checkpoint-18312/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..59a8b46d1ac64fc3cd4c673b6051786fee3ed26d --- /dev/null +++ b/checkpoint-18312/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0e65c3d6f29e706fd941a38280ce5628189a6998eac6d29abbeab00ad838d00 +size 1064 diff --git a/checkpoint-18312/trainer_state.json b/checkpoint-18312/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fbadfb7ffd50eb342b320ce814eba2413214850a --- /dev/null +++ b/checkpoint-18312/trainer_state.json @@ -0,0 +1,4163 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1284002902889918, + "eval_steps": 500, + "global_step": 18312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021736615328520894, + "grad_norm": 5.709163665771484, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.9228, + "step": 31 + }, + { + "epoch": 0.0004347323065704179, + "grad_norm": 3.9758756160736084, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8276, + "step": 62 + }, + { + "epoch": 0.0006520984598556268, + "grad_norm": 3.721677303314209, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7897, + "step": 93 + }, + { + "epoch": 0.0008694646131408358, + "grad_norm": 3.384953498840332, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7641, + "step": 124 + }, + { + "epoch": 0.0010868307664260446, + "grad_norm": 3.19750714302063, + "learning_rate": 5.078636959370905e-06, + "loss": 0.758, + "step": 155 + }, + { + "epoch": 0.0013041969197112536, + "grad_norm": 3.49003267288208, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7644, + "step": 186 + }, + { + "epoch": 0.0015215630729964625, + "grad_norm": 3.323367118835449, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7606, + "step": 217 + }, + { + "epoch": 0.0017389292262816715, + "grad_norm": 3.465822219848633, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7505, + "step": 248 + }, + { + "epoch": 0.0019562953795668804, + "grad_norm": 2.7967450618743896, + "learning_rate": 9.141546526867629e-06, + "loss": 0.7593, + "step": 279 + }, + { + "epoch": 0.002173661532852089, + "grad_norm": 3.5493738651275635, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7514, + "step": 310 + }, + { + "epoch": 0.0023910276861372984, + "grad_norm": 3.514606237411499, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7497, + "step": 341 + }, + { + "epoch": 0.0026083938394225073, + "grad_norm": 3.0246028900146484, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7473, + "step": 372 + }, + { + "epoch": 0.002825759992707716, + "grad_norm": 3.01147723197937, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.739, + "step": 403 + }, + { + "epoch": 0.003043126145992925, + "grad_norm": 3.566333293914795, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7447, + "step": 434 + }, + { + "epoch": 0.003260492299278134, + "grad_norm": 3.8698947429656982, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7447, + "step": 465 + }, + { + "epoch": 0.003477858452563343, + "grad_norm": 2.567028760910034, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7496, + "step": 496 + }, + { + "epoch": 0.003695224605848552, + "grad_norm": 2.8029377460479736, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.7324, + "step": 527 + }, + { + "epoch": 0.003912590759133761, + "grad_norm": 2.862530469894409, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7371, + "step": 558 + }, + { + "epoch": 0.00412995691241897, + "grad_norm": 2.9063901901245117, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.7398, + "step": 589 + }, + { + "epoch": 0.004347323065704178, + "grad_norm": 3.187814235687256, + "learning_rate": 2.031454783748362e-05, + "loss": 0.7362, + "step": 620 + }, + { + "epoch": 0.004564689218989388, + "grad_norm": 2.6158969402313232, + "learning_rate": 2.13302752293578e-05, + "loss": 0.7405, + "step": 651 + }, + { + "epoch": 0.004782055372274597, + "grad_norm": 2.702423334121704, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7313, + "step": 682 + }, + { + "epoch": 0.004999421525559805, + "grad_norm": 2.88313889503479, + "learning_rate": 2.336173001310616e-05, + "loss": 0.7337, + "step": 713 + }, + { + "epoch": 0.0052167876788450146, + "grad_norm": 2.8978841304779053, + "learning_rate": 2.437745740498034e-05, + "loss": 0.73, + "step": 744 + }, + { + "epoch": 0.005434153832130223, + "grad_norm": 2.791414976119995, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.7297, + "step": 775 + }, + { + "epoch": 0.005651519985415432, + "grad_norm": 2.432821035385132, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7326, + "step": 806 + }, + { + "epoch": 0.0058688861387006415, + "grad_norm": 2.430279493331909, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.7333, + "step": 837 + }, + { + "epoch": 0.00608625229198585, + "grad_norm": 2.5513761043548584, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.7296, + "step": 868 + }, + { + "epoch": 0.006303618445271059, + "grad_norm": 2.478562116622925, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.7303, + "step": 899 + }, + { + "epoch": 0.006520984598556268, + "grad_norm": 2.5496723651885986, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.7303, + "step": 930 + }, + { + "epoch": 0.006738350751841477, + "grad_norm": 2.114060640335083, + "learning_rate": 3.148754914809961e-05, + "loss": 0.7184, + "step": 961 + }, + { + "epoch": 0.006955716905126686, + "grad_norm": 2.325977325439453, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.7268, + "step": 992 + }, + { + "epoch": 0.0071730830584118945, + "grad_norm": 2.0549662113189697, + "learning_rate": 3.351900393184797e-05, + "loss": 0.724, + "step": 1023 + }, + { + "epoch": 0.007390449211697104, + "grad_norm": 1.913522481918335, + "learning_rate": 3.453473132372215e-05, + "loss": 0.7206, + "step": 1054 + }, + { + "epoch": 0.007607815364982313, + "grad_norm": 2.0651443004608154, + "learning_rate": 3.555045871559633e-05, + "loss": 0.7239, + "step": 1085 + }, + { + "epoch": 0.007825181518267521, + "grad_norm": 2.2482309341430664, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.728, + "step": 1116 + }, + { + "epoch": 0.00804254767155273, + "grad_norm": 2.349695920944214, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.721, + "step": 1147 + }, + { + "epoch": 0.00825991382483794, + "grad_norm": 2.218843460083008, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.7248, + "step": 1178 + }, + { + "epoch": 0.00847727997812315, + "grad_norm": 2.0839340686798096, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.7233, + "step": 1209 + }, + { + "epoch": 0.008694646131408357, + "grad_norm": 2.0757343769073486, + "learning_rate": 4.062909567496724e-05, + "loss": 0.7166, + "step": 1240 + }, + { + "epoch": 0.008912012284693566, + "grad_norm": 2.052342653274536, + "learning_rate": 4.164482306684142e-05, + "loss": 0.7256, + "step": 1271 + }, + { + "epoch": 0.009129378437978775, + "grad_norm": 2.1202704906463623, + "learning_rate": 4.26605504587156e-05, + "loss": 0.7131, + "step": 1302 + }, + { + "epoch": 0.009346744591263985, + "grad_norm": 2.1945958137512207, + "learning_rate": 4.367627785058978e-05, + "loss": 0.708, + "step": 1333 + }, + { + "epoch": 0.009564110744549194, + "grad_norm": 2.106307029724121, + "learning_rate": 4.469200524246396e-05, + "loss": 0.7139, + "step": 1364 + }, + { + "epoch": 0.009781476897834401, + "grad_norm": 2.3779594898223877, + "learning_rate": 4.570773263433814e-05, + "loss": 0.7205, + "step": 1395 + }, + { + "epoch": 0.00999884305111961, + "grad_norm": 2.001551866531372, + "learning_rate": 4.672346002621232e-05, + "loss": 0.7162, + "step": 1426 + }, + { + "epoch": 0.01021620920440482, + "grad_norm": 2.6994752883911133, + "learning_rate": 4.77391874180865e-05, + "loss": 0.7215, + "step": 1457 + }, + { + "epoch": 0.010433575357690029, + "grad_norm": 2.071122884750366, + "learning_rate": 4.875491480996068e-05, + "loss": 0.7095, + "step": 1488 + }, + { + "epoch": 0.010650941510975238, + "grad_norm": 2.3666610717773438, + "learning_rate": 4.977064220183487e-05, + "loss": 0.7089, + "step": 1519 + }, + { + "epoch": 0.010868307664260446, + "grad_norm": 2.130204677581787, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.7043, + "step": 1550 + }, + { + "epoch": 0.011085673817545655, + "grad_norm": 1.7413716316223145, + "learning_rate": 4.999955597496219e-05, + "loss": 0.7062, + "step": 1581 + }, + { + "epoch": 0.011303039970830864, + "grad_norm": 1.7087843418121338, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.7175, + "step": 1612 + }, + { + "epoch": 0.011520406124116074, + "grad_norm": 2.574871301651001, + "learning_rate": 4.999799067923527e-05, + "loss": 0.7076, + "step": 1643 + }, + { + "epoch": 0.011737772277401283, + "grad_norm": 2.0530433654785156, + "learning_rate": 4.999678487776908e-05, + "loss": 0.708, + "step": 1674 + }, + { + "epoch": 0.011955138430686492, + "grad_norm": 1.9184463024139404, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.7011, + "step": 1705 + }, + { + "epoch": 0.0121725045839717, + "grad_norm": 1.768115520477295, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6995, + "step": 1736 + }, + { + "epoch": 0.012389870737256909, + "grad_norm": 1.6692063808441162, + "learning_rate": 4.999147503179668e-05, + "loss": 0.7019, + "step": 1767 + }, + { + "epoch": 0.012607236890542118, + "grad_norm": 1.803944706916809, + "learning_rate": 4.998914100252672e-05, + "loss": 0.703, + "step": 1798 + }, + { + "epoch": 0.012824603043827328, + "grad_norm": 1.8957322835922241, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6979, + "step": 1829 + }, + { + "epoch": 0.013041969197112537, + "grad_norm": 1.544054627418518, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.7012, + "step": 1860 + }, + { + "epoch": 0.013259335350397744, + "grad_norm": 1.7066351175308228, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6949, + "step": 1891 + }, + { + "epoch": 0.013476701503682954, + "grad_norm": 1.7045214176177979, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.6871, + "step": 1922 + }, + { + "epoch": 0.013694067656968163, + "grad_norm": 2.0414485931396484, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6944, + "step": 1953 + }, + { + "epoch": 0.013911433810253372, + "grad_norm": 1.9210485219955444, + "learning_rate": 4.99692159912661e-05, + "loss": 0.7013, + "step": 1984 + }, + { + "epoch": 0.014128799963538581, + "grad_norm": 1.8733997344970703, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6991, + "step": 2015 + }, + { + "epoch": 0.014346166116823789, + "grad_norm": 1.7118934392929077, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6873, + "step": 2046 + }, + { + "epoch": 0.014563532270108998, + "grad_norm": 1.889208197593689, + "learning_rate": 4.995544899210594e-05, + "loss": 0.692, + "step": 2077 + }, + { + "epoch": 0.014780898423394207, + "grad_norm": 2.0081522464752197, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.6823, + "step": 2108 + }, + { + "epoch": 0.014998264576679417, + "grad_norm": 1.7950328588485718, + "learning_rate": 4.994486281210429e-05, + "loss": 0.686, + "step": 2139 + }, + { + "epoch": 0.015215630729964626, + "grad_norm": 1.758333444595337, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6827, + "step": 2170 + }, + { + "epoch": 0.015432996883249834, + "grad_norm": 1.496063470840454, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6896, + "step": 2201 + }, + { + "epoch": 0.015650363036535043, + "grad_norm": 1.6103190183639526, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6865, + "step": 2232 + }, + { + "epoch": 0.015867729189820254, + "grad_norm": 1.7069604396820068, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6836, + "step": 2263 + }, + { + "epoch": 0.01608509534310546, + "grad_norm": 1.8261148929595947, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.692, + "step": 2294 + }, + { + "epoch": 0.01630246149639067, + "grad_norm": 1.7522642612457275, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6832, + "step": 2325 + }, + { + "epoch": 0.01651982764967588, + "grad_norm": 1.6746042966842651, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6829, + "step": 2356 + }, + { + "epoch": 0.016737193802961087, + "grad_norm": 1.6785473823547363, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6842, + "step": 2387 + }, + { + "epoch": 0.0169545599562463, + "grad_norm": 1.936916470527649, + "learning_rate": 4.988329891293693e-05, + "loss": 0.6798, + "step": 2418 + }, + { + "epoch": 0.017171926109531506, + "grad_norm": 1.5104129314422607, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6743, + "step": 2449 + }, + { + "epoch": 0.017389292262816713, + "grad_norm": 1.3770678043365479, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.6874, + "step": 2480 + }, + { + "epoch": 0.017606658416101924, + "grad_norm": 1.4900861978530884, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.679, + "step": 2511 + }, + { + "epoch": 0.017824024569387132, + "grad_norm": 1.5600172281265259, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6719, + "step": 2542 + }, + { + "epoch": 0.018041390722672343, + "grad_norm": 1.660237431526184, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6773, + "step": 2573 + }, + { + "epoch": 0.01825875687595755, + "grad_norm": 1.4618791341781616, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6767, + "step": 2604 + }, + { + "epoch": 0.018476123029242758, + "grad_norm": 1.5567563772201538, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6706, + "step": 2635 + }, + { + "epoch": 0.01869348918252797, + "grad_norm": 1.4879790544509888, + "learning_rate": 4.980947995086024e-05, + "loss": 0.676, + "step": 2666 + }, + { + "epoch": 0.018910855335813177, + "grad_norm": 1.5527766942977905, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6734, + "step": 2697 + }, + { + "epoch": 0.019128221489098388, + "grad_norm": 1.7922642230987549, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6813, + "step": 2728 + }, + { + "epoch": 0.019345587642383595, + "grad_norm": 1.6453475952148438, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6749, + "step": 2759 + }, + { + "epoch": 0.019562953795668803, + "grad_norm": 1.4713648557662964, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6761, + "step": 2790 + }, + { + "epoch": 0.019780319948954014, + "grad_norm": 1.5513399839401245, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6739, + "step": 2821 + }, + { + "epoch": 0.01999768610223922, + "grad_norm": 1.431796908378601, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6639, + "step": 2852 + }, + { + "epoch": 0.020215052255524432, + "grad_norm": 1.4959752559661865, + "learning_rate": 4.973018858007122e-05, + "loss": 0.662, + "step": 2883 + }, + { + "epoch": 0.02043241840880964, + "grad_norm": 1.4675205945968628, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6659, + "step": 2914 + }, + { + "epoch": 0.020649784562094847, + "grad_norm": 1.7111692428588867, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6732, + "step": 2945 + }, + { + "epoch": 0.020867150715380058, + "grad_norm": 1.542748212814331, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6666, + "step": 2976 + }, + { + "epoch": 0.021084516868665266, + "grad_norm": 1.5013272762298584, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.667, + "step": 3007 + }, + { + "epoch": 0.021301883021950477, + "grad_norm": 1.57589852809906, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6668, + "step": 3038 + }, + { + "epoch": 0.021519249175235684, + "grad_norm": 1.612727403640747, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6653, + "step": 3069 + }, + { + "epoch": 0.021736615328520892, + "grad_norm": 1.5110771656036377, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6694, + "step": 3100 + }, + { + "epoch": 0.021953981481806103, + "grad_norm": 1.5836044549942017, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6634, + "step": 3131 + }, + { + "epoch": 0.02217134763509131, + "grad_norm": 1.5767654180526733, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6633, + "step": 3162 + }, + { + "epoch": 0.02238871378837652, + "grad_norm": 1.6058984994888306, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6646, + "step": 3193 + }, + { + "epoch": 0.02260607994166173, + "grad_norm": 1.4564005136489868, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.663, + "step": 3224 + }, + { + "epoch": 0.022823446094946936, + "grad_norm": 1.5274450778961182, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6554, + "step": 3255 + }, + { + "epoch": 0.023040812248232147, + "grad_norm": 1.524122953414917, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6666, + "step": 3286 + }, + { + "epoch": 0.023258178401517355, + "grad_norm": 1.5017430782318115, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6661, + "step": 3317 + }, + { + "epoch": 0.023475544554802566, + "grad_norm": 1.4247208833694458, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6563, + "step": 3348 + }, + { + "epoch": 0.023692910708087773, + "grad_norm": 1.3451093435287476, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6555, + "step": 3379 + }, + { + "epoch": 0.023910276861372984, + "grad_norm": 1.418442726135254, + "learning_rate": 4.948079823064559e-05, + "loss": 0.6587, + "step": 3410 + }, + { + "epoch": 0.024127643014658192, + "grad_norm": 1.5460575819015503, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6602, + "step": 3441 + }, + { + "epoch": 0.0243450091679434, + "grad_norm": 1.4053966999053955, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6578, + "step": 3472 + }, + { + "epoch": 0.02456237532122861, + "grad_norm": 1.420806646347046, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6544, + "step": 3503 + }, + { + "epoch": 0.024779741474513818, + "grad_norm": 1.3220508098602295, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6642, + "step": 3534 + }, + { + "epoch": 0.02499710762779903, + "grad_norm": 1.3819468021392822, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6534, + "step": 3565 + }, + { + "epoch": 0.025214473781084237, + "grad_norm": 1.4046236276626587, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6547, + "step": 3596 + }, + { + "epoch": 0.025431839934369444, + "grad_norm": 1.965153455734253, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6582, + "step": 3627 + }, + { + "epoch": 0.025649206087654655, + "grad_norm": 1.6758291721343994, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6536, + "step": 3658 + }, + { + "epoch": 0.025866572240939863, + "grad_norm": 1.4467246532440186, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6564, + "step": 3689 + }, + { + "epoch": 0.026083938394225074, + "grad_norm": 1.5305490493774414, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6626, + "step": 3720 + }, + { + "epoch": 0.02630130454751028, + "grad_norm": 1.3826133012771606, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6624, + "step": 3751 + }, + { + "epoch": 0.02651867070079549, + "grad_norm": 1.4920278787612915, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6417, + "step": 3782 + }, + { + "epoch": 0.0267360368540807, + "grad_norm": 14.533783912658691, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.647, + "step": 3813 + }, + { + "epoch": 0.026953403007365907, + "grad_norm": 1.4659409523010254, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6514, + "step": 3844 + }, + { + "epoch": 0.02717076916065112, + "grad_norm": 1.4820138216018677, + "learning_rate": 4.919443027766935e-05, + "loss": 0.6477, + "step": 3875 + }, + { + "epoch": 0.027388135313936326, + "grad_norm": 1.2543880939483643, + "learning_rate": 4.91731463569418e-05, + "loss": 0.6505, + "step": 3906 + }, + { + "epoch": 0.027605501467221533, + "grad_norm": 1.640926718711853, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6488, + "step": 3937 + }, + { + "epoch": 0.027822867620506744, + "grad_norm": 1.3990615606307983, + "learning_rate": 4.912976038673786e-05, + "loss": 0.658, + "step": 3968 + }, + { + "epoch": 0.028040233773791952, + "grad_norm": 1.2064067125320435, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6487, + "step": 3999 + }, + { + "epoch": 0.028257599927077163, + "grad_norm": 1.5103769302368164, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6569, + "step": 4030 + }, + { + "epoch": 0.02847496608036237, + "grad_norm": 1.5314087867736816, + "learning_rate": 4.906263980464644e-05, + "loss": 0.6638, + "step": 4061 + }, + { + "epoch": 0.028692332233647578, + "grad_norm": 1.4619168043136597, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6505, + "step": 4092 + }, + { + "epoch": 0.02890969838693279, + "grad_norm": 1.3979246616363525, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6505, + "step": 4123 + }, + { + "epoch": 0.029127064540217996, + "grad_norm": 1.3866580724716187, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6461, + "step": 4154 + }, + { + "epoch": 0.029344430693503207, + "grad_norm": 1.4662801027297974, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6448, + "step": 4185 + }, + { + "epoch": 0.029561796846788415, + "grad_norm": 1.3171806335449219, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6525, + "step": 4216 + }, + { + "epoch": 0.029779163000073623, + "grad_norm": 1.3289718627929688, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6445, + "step": 4247 + }, + { + "epoch": 0.029996529153358834, + "grad_norm": 1.3527250289916992, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6449, + "step": 4278 + }, + { + "epoch": 0.03021389530664404, + "grad_norm": 1.314674735069275, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6559, + "step": 4309 + }, + { + "epoch": 0.030431261459929252, + "grad_norm": 1.2237507104873657, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6468, + "step": 4340 + }, + { + "epoch": 0.03064862761321446, + "grad_norm": 1.423965573310852, + "learning_rate": 4.882129447892753e-05, + "loss": 0.6465, + "step": 4371 + }, + { + "epoch": 0.030865993766499667, + "grad_norm": 1.271371603012085, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6471, + "step": 4402 + }, + { + "epoch": 0.031083359919784878, + "grad_norm": 1.5492364168167114, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6444, + "step": 4433 + }, + { + "epoch": 0.031300726073070086, + "grad_norm": 1.5139328241348267, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6512, + "step": 4464 + }, + { + "epoch": 0.03151809222635529, + "grad_norm": 1.4510358572006226, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6485, + "step": 4495 + }, + { + "epoch": 0.03173545837964051, + "grad_norm": 1.3228867053985596, + "learning_rate": 4.869052379269719e-05, + "loss": 0.6416, + "step": 4526 + }, + { + "epoch": 0.031952824532925715, + "grad_norm": 1.2731959819793701, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6477, + "step": 4557 + }, + { + "epoch": 0.03217019068621092, + "grad_norm": 1.2540090084075928, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6412, + "step": 4588 + }, + { + "epoch": 0.03238755683949613, + "grad_norm": 1.662154197692871, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6464, + "step": 4619 + }, + { + "epoch": 0.03260492299278134, + "grad_norm": 1.5419702529907227, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6421, + "step": 4650 + }, + { + "epoch": 0.03282228914606655, + "grad_norm": 1.6409112215042114, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6468, + "step": 4681 + }, + { + "epoch": 0.03303965529935176, + "grad_norm": 1.1416597366333008, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6396, + "step": 4712 + }, + { + "epoch": 0.03325702145263697, + "grad_norm": 1.215846061706543, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6517, + "step": 4743 + }, + { + "epoch": 0.033474387605922175, + "grad_norm": 1.3075084686279297, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6389, + "step": 4774 + }, + { + "epoch": 0.03369175375920738, + "grad_norm": 1.6089972257614136, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6371, + "step": 4805 + }, + { + "epoch": 0.0339091199124926, + "grad_norm": 1.3927685022354126, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6355, + "step": 4836 + }, + { + "epoch": 0.034126486065777804, + "grad_norm": 1.197952389717102, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6415, + "step": 4867 + }, + { + "epoch": 0.03434385221906301, + "grad_norm": 1.2738877534866333, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6442, + "step": 4898 + }, + { + "epoch": 0.03456121837234822, + "grad_norm": 1.5164271593093872, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6406, + "step": 4929 + }, + { + "epoch": 0.03477858452563343, + "grad_norm": 1.242473840713501, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6441, + "step": 4960 + }, + { + "epoch": 0.03499595067891864, + "grad_norm": 1.2919869422912598, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6487, + "step": 4991 + }, + { + "epoch": 0.03521331683220385, + "grad_norm": 2.6493895053863525, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6398, + "step": 5022 + }, + { + "epoch": 0.035430682985489056, + "grad_norm": 1.2134305238723755, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6398, + "step": 5053 + }, + { + "epoch": 0.035648049138774264, + "grad_norm": 1.1468703746795654, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.633, + "step": 5084 + }, + { + "epoch": 0.03586541529205947, + "grad_norm": 1.3923726081848145, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6295, + "step": 5115 + }, + { + "epoch": 0.036082781445344686, + "grad_norm": 1.5531644821166992, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6345, + "step": 5146 + }, + { + "epoch": 0.036300147598629894, + "grad_norm": 1.201889991760254, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6414, + "step": 5177 + }, + { + "epoch": 0.0365175137519151, + "grad_norm": 1.4365577697753906, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6364, + "step": 5208 + }, + { + "epoch": 0.03673487990520031, + "grad_norm": 1.210980772972107, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6353, + "step": 5239 + }, + { + "epoch": 0.036952246058485516, + "grad_norm": 1.379381775856018, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6293, + "step": 5270 + }, + { + "epoch": 0.03716961221177073, + "grad_norm": 1.2263178825378418, + "learning_rate": 4.793722210363262e-05, + "loss": 0.629, + "step": 5301 + }, + { + "epoch": 0.03738697836505594, + "grad_norm": 1.2448405027389526, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6474, + "step": 5332 + }, + { + "epoch": 0.037604344518341146, + "grad_norm": 1.257132887840271, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6426, + "step": 5363 + }, + { + "epoch": 0.03782171067162635, + "grad_norm": 1.3763643503189087, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6396, + "step": 5394 + }, + { + "epoch": 0.03803907682491156, + "grad_norm": 1.3164068460464478, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6343, + "step": 5425 + }, + { + "epoch": 0.038256442978196775, + "grad_norm": 1.2034872770309448, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.629, + "step": 5456 + }, + { + "epoch": 0.03847380913148198, + "grad_norm": 1.3156630992889404, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6403, + "step": 5487 + }, + { + "epoch": 0.03869117528476719, + "grad_norm": 1.0719150304794312, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6318, + "step": 5518 + }, + { + "epoch": 0.0389085414380524, + "grad_norm": 1.3054882287979126, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6419, + "step": 5549 + }, + { + "epoch": 0.039125907591337605, + "grad_norm": 1.2081729173660278, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6461, + "step": 5580 + }, + { + "epoch": 0.03934327374462282, + "grad_norm": 1.1728904247283936, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6473, + "step": 5611 + }, + { + "epoch": 0.03956063989790803, + "grad_norm": 1.2552399635314941, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.6373, + "step": 5642 + }, + { + "epoch": 0.039778006051193235, + "grad_norm": 1.299212098121643, + "learning_rate": 4.751783684659e-05, + "loss": 0.6234, + "step": 5673 + }, + { + "epoch": 0.03999537220447844, + "grad_norm": 2.0746827125549316, + "learning_rate": 4.748122674890348e-05, + "loss": 0.6264, + "step": 5704 + }, + { + "epoch": 0.04021273835776365, + "grad_norm": 1.3712407350540161, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6252, + "step": 5735 + }, + { + "epoch": 0.040430104511048864, + "grad_norm": 1.2094186544418335, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6265, + "step": 5766 + }, + { + "epoch": 0.04064747066433407, + "grad_norm": 1.2487757205963135, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6305, + "step": 5797 + }, + { + "epoch": 0.04086483681761928, + "grad_norm": 1.5805151462554932, + "learning_rate": 4.733225355658999e-05, + "loss": 0.631, + "step": 5828 + }, + { + "epoch": 0.04108220297090449, + "grad_norm": 1.2615118026733398, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.636, + "step": 5859 + }, + { + "epoch": 0.041299569124189695, + "grad_norm": 1.6970707178115845, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6271, + "step": 5890 + }, + { + "epoch": 0.04151693527747491, + "grad_norm": 1.4279624223709106, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6267, + "step": 5921 + }, + { + "epoch": 0.041734301430760117, + "grad_norm": 1.3471580743789673, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6289, + "step": 5952 + }, + { + "epoch": 0.041951667584045324, + "grad_norm": 1.5088621377944946, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6263, + "step": 5983 + }, + { + "epoch": 0.04216903373733053, + "grad_norm": 1.3061436414718628, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6266, + "step": 6014 + }, + { + "epoch": 0.04238639989061574, + "grad_norm": 1.1800014972686768, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6259, + "step": 6045 + }, + { + "epoch": 0.042603766043900954, + "grad_norm": 1.195177435874939, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6251, + "step": 6076 + }, + { + "epoch": 0.04282113219718616, + "grad_norm": 1.1905118227005005, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6217, + "step": 6107 + }, + { + "epoch": 0.04303849835047137, + "grad_norm": 1.2270928621292114, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6352, + "step": 6138 + }, + { + "epoch": 0.043255864503756576, + "grad_norm": 1.209226369857788, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6249, + "step": 6169 + }, + { + "epoch": 0.043473230657041784, + "grad_norm": 1.1949187517166138, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6371, + "step": 6200 + }, + { + "epoch": 0.043690596810327, + "grad_norm": 1.2346535921096802, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6177, + "step": 6231 + }, + { + "epoch": 0.043907962963612206, + "grad_norm": 1.2187124490737915, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6268, + "step": 6262 + }, + { + "epoch": 0.04412532911689741, + "grad_norm": 1.2187339067459106, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6261, + "step": 6293 + }, + { + "epoch": 0.04434269527018262, + "grad_norm": 1.320764422416687, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6212, + "step": 6324 + }, + { + "epoch": 0.04456006142346783, + "grad_norm": 1.3396878242492676, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.6241, + "step": 6355 + }, + { + "epoch": 0.04477742757675304, + "grad_norm": 1.2472412586212158, + "learning_rate": 4.661238099862658e-05, + "loss": 0.6241, + "step": 6386 + }, + { + "epoch": 0.04499479373003825, + "grad_norm": 1.3773880004882812, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6245, + "step": 6417 + }, + { + "epoch": 0.04521215988332346, + "grad_norm": 1.1602933406829834, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.6221, + "step": 6448 + }, + { + "epoch": 0.045429526036608665, + "grad_norm": 1.1719809770584106, + "learning_rate": 4.648464661063478e-05, + "loss": 0.629, + "step": 6479 + }, + { + "epoch": 0.04564689218989387, + "grad_norm": 1.1973191499710083, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.6239, + "step": 6510 + }, + { + "epoch": 0.04586425834317909, + "grad_norm": 1.1805142164230347, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.6242, + "step": 6541 + }, + { + "epoch": 0.046081624496464295, + "grad_norm": 1.3194178342819214, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6316, + "step": 6572 + }, + { + "epoch": 0.0462989906497495, + "grad_norm": 1.1395046710968018, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6207, + "step": 6603 + }, + { + "epoch": 0.04651635680303471, + "grad_norm": 1.5031641721725464, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6218, + "step": 6634 + }, + { + "epoch": 0.04673372295631992, + "grad_norm": 1.0985206365585327, + "learning_rate": 4.622264489304762e-05, + "loss": 0.6198, + "step": 6665 + }, + { + "epoch": 0.04695108910960513, + "grad_norm": 1.2820847034454346, + "learning_rate": 4.617813681048434e-05, + "loss": 0.626, + "step": 6696 + }, + { + "epoch": 0.04716845526289034, + "grad_norm": 1.4487061500549316, + "learning_rate": 4.61333897355256e-05, + "loss": 0.622, + "step": 6727 + }, + { + "epoch": 0.04738582141617555, + "grad_norm": 1.1577301025390625, + "learning_rate": 4.608840417313604e-05, + "loss": 0.6244, + "step": 6758 + }, + { + "epoch": 0.047603187569460755, + "grad_norm": 1.1363381147384644, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6206, + "step": 6789 + }, + { + "epoch": 0.04782055372274597, + "grad_norm": 1.22281813621521, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6231, + "step": 6820 + }, + { + "epoch": 0.048037919876031177, + "grad_norm": 1.156031847000122, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6223, + "step": 6851 + }, + { + "epoch": 0.048255286029316384, + "grad_norm": 1.1688473224639893, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6293, + "step": 6882 + }, + { + "epoch": 0.04847265218260159, + "grad_norm": 1.4265236854553223, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6109, + "step": 6913 + }, + { + "epoch": 0.0486900183358868, + "grad_norm": 1.1242969036102295, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6256, + "step": 6944 + }, + { + "epoch": 0.048907384489172014, + "grad_norm": 1.3943792581558228, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6093, + "step": 6975 + }, + { + "epoch": 0.04912475064245722, + "grad_norm": 1.0761889219284058, + "learning_rate": 4.571999560773736e-05, + "loss": 0.6207, + "step": 7006 + }, + { + "epoch": 0.04934211679574243, + "grad_norm": 1.2784191370010376, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6227, + "step": 7037 + }, + { + "epoch": 0.049559482949027636, + "grad_norm": 1.4215577840805054, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6148, + "step": 7068 + }, + { + "epoch": 0.049776849102312844, + "grad_norm": 1.1120682954788208, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.6192, + "step": 7099 + }, + { + "epoch": 0.04999421525559806, + "grad_norm": 1.1718186140060425, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6179, + "step": 7130 + }, + { + "epoch": 0.050211581408883266, + "grad_norm": 1.5078628063201904, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6245, + "step": 7161 + }, + { + "epoch": 0.05042894756216847, + "grad_norm": 1.1740144491195679, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6242, + "step": 7192 + }, + { + "epoch": 0.05064631371545368, + "grad_norm": 1.2450133562088013, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6194, + "step": 7223 + }, + { + "epoch": 0.05086367986873889, + "grad_norm": 1.181887149810791, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6188, + "step": 7254 + }, + { + "epoch": 0.0510810460220241, + "grad_norm": 1.172691822052002, + "learning_rate": 4.528766329570536e-05, + "loss": 0.6191, + "step": 7285 + }, + { + "epoch": 0.05129841217530931, + "grad_norm": 1.148863673210144, + "learning_rate": 4.523847434837447e-05, + "loss": 0.6234, + "step": 7316 + }, + { + "epoch": 0.05151577832859452, + "grad_norm": 1.2533507347106934, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6166, + "step": 7347 + }, + { + "epoch": 0.051733144481879725, + "grad_norm": 1.1888121366500854, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6176, + "step": 7378 + }, + { + "epoch": 0.05195051063516493, + "grad_norm": 1.0766541957855225, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6137, + "step": 7409 + }, + { + "epoch": 0.05216787678845015, + "grad_norm": 1.2277822494506836, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6231, + "step": 7440 + }, + { + "epoch": 0.052385242941735355, + "grad_norm": 1.0808931589126587, + "learning_rate": 4.498911497712155e-05, + "loss": 0.6209, + "step": 7471 + }, + { + "epoch": 0.05260260909502056, + "grad_norm": 1.0840133428573608, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6144, + "step": 7502 + }, + { + "epoch": 0.05281997524830577, + "grad_norm": 1.5881050825119019, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6114, + "step": 7533 + }, + { + "epoch": 0.05303734140159098, + "grad_norm": 1.1180490255355835, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.6134, + "step": 7564 + }, + { + "epoch": 0.05325470755487619, + "grad_norm": 1.1469063758850098, + "learning_rate": 4.478556380141218e-05, + "loss": 0.6122, + "step": 7595 + }, + { + "epoch": 0.0534720737081614, + "grad_norm": 1.2977004051208496, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.615, + "step": 7626 + }, + { + "epoch": 0.05368943986144661, + "grad_norm": 1.0556434392929077, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.6187, + "step": 7657 + }, + { + "epoch": 0.053906806014731815, + "grad_norm": 1.101298451423645, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6123, + "step": 7688 + }, + { + "epoch": 0.05412417216801702, + "grad_norm": 1.262608528137207, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6149, + "step": 7719 + }, + { + "epoch": 0.05434153832130224, + "grad_norm": 1.1554538011550903, + "learning_rate": 4.452610552959183e-05, + "loss": 0.6168, + "step": 7750 + }, + { + "epoch": 0.054558904474587444, + "grad_norm": 1.1653157472610474, + "learning_rate": 4.447355047201428e-05, + "loss": 0.615, + "step": 7781 + }, + { + "epoch": 0.05477627062787265, + "grad_norm": 1.0511231422424316, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.6215, + "step": 7812 + }, + { + "epoch": 0.05499363678115786, + "grad_norm": 1.3468266725540161, + "learning_rate": 4.436778168330484e-05, + "loss": 0.6221, + "step": 7843 + }, + { + "epoch": 0.05521100293444307, + "grad_norm": 1.3238797187805176, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.611, + "step": 7874 + }, + { + "epoch": 0.05542836908772828, + "grad_norm": 1.1107763051986694, + "learning_rate": 4.42611386459262e-05, + "loss": 0.6119, + "step": 7905 + }, + { + "epoch": 0.05564573524101349, + "grad_norm": 1.3667259216308594, + "learning_rate": 4.420749078676133e-05, + "loss": 0.6134, + "step": 7936 + }, + { + "epoch": 0.055863101394298696, + "grad_norm": 1.0336949825286865, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.613, + "step": 7967 + }, + { + "epoch": 0.056080467547583904, + "grad_norm": 1.0911316871643066, + "learning_rate": 4.409954541451762e-05, + "loss": 0.6107, + "step": 7998 + }, + { + "epoch": 0.05629783370086911, + "grad_norm": 1.2516382932662964, + "learning_rate": 4.404524911958764e-05, + "loss": 0.6138, + "step": 8029 + }, + { + "epoch": 0.056515199854154326, + "grad_norm": 1.1678277254104614, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6162, + "step": 8060 + }, + { + "epoch": 0.05673256600743953, + "grad_norm": 1.4072996377944946, + "learning_rate": 4.393601237573607e-05, + "loss": 0.614, + "step": 8091 + }, + { + "epoch": 0.05694993216072474, + "grad_norm": 1.2312837839126587, + "learning_rate": 4.388107315953628e-05, + "loss": 0.6203, + "step": 8122 + }, + { + "epoch": 0.05716729831400995, + "grad_norm": 1.1720649003982544, + "learning_rate": 4.382592087299212e-05, + "loss": 0.619, + "step": 8153 + }, + { + "epoch": 0.057384664467295156, + "grad_norm": 1.0711950063705444, + "learning_rate": 4.377055613848964e-05, + "loss": 0.6124, + "step": 8184 + }, + { + "epoch": 0.05760203062058037, + "grad_norm": 1.1664263010025024, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6073, + "step": 8215 + }, + { + "epoch": 0.05781939677386558, + "grad_norm": 0.9937831163406372, + "learning_rate": 4.365919182713416e-05, + "loss": 0.6101, + "step": 8246 + }, + { + "epoch": 0.058036762927150785, + "grad_norm": 1.0545841455459595, + "learning_rate": 4.360319350701226e-05, + "loss": 0.6089, + "step": 8277 + }, + { + "epoch": 0.05825412908043599, + "grad_norm": 1.120007038116455, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6078, + "step": 8308 + }, + { + "epoch": 0.0584714952337212, + "grad_norm": 1.1087831258773804, + "learning_rate": 4.349056769754021e-05, + "loss": 0.601, + "step": 8339 + }, + { + "epoch": 0.058688861387006415, + "grad_norm": 1.1370675563812256, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.6103, + "step": 8370 + }, + { + "epoch": 0.05890622754029162, + "grad_norm": 1.182654857635498, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.6115, + "step": 8401 + }, + { + "epoch": 0.05912359369357683, + "grad_norm": 1.2606432437896729, + "learning_rate": 4.332006561018488e-05, + "loss": 0.6139, + "step": 8432 + }, + { + "epoch": 0.05934095984686204, + "grad_norm": 1.1618047952651978, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.6053, + "step": 8463 + }, + { + "epoch": 0.059558326000147245, + "grad_norm": 1.1281821727752686, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.6084, + "step": 8494 + }, + { + "epoch": 0.05977569215343246, + "grad_norm": 1.15248703956604, + "learning_rate": 4.314770288177384e-05, + "loss": 0.6128, + "step": 8525 + }, + { + "epoch": 0.05999305830671767, + "grad_norm": 1.2170960903167725, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6134, + "step": 8556 + }, + { + "epoch": 0.060210424460002875, + "grad_norm": 1.085260033607483, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.6212, + "step": 8587 + }, + { + "epoch": 0.06042779061328808, + "grad_norm": 1.2243624925613403, + "learning_rate": 4.297349701798505e-05, + "loss": 0.605, + "step": 8618 + }, + { + "epoch": 0.06064515676657329, + "grad_norm": 1.124769687652588, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.6074, + "step": 8649 + }, + { + "epoch": 0.060862522919858504, + "grad_norm": 1.893545150756836, + "learning_rate": 4.285634454093198e-05, + "loss": 0.6055, + "step": 8680 + }, + { + "epoch": 0.06107988907314371, + "grad_norm": 1.2186870574951172, + "learning_rate": 4.279746571169086e-05, + "loss": 0.6121, + "step": 8711 + }, + { + "epoch": 0.06129725522642892, + "grad_norm": 1.0006957054138184, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.6079, + "step": 8742 + }, + { + "epoch": 0.06151462137971413, + "grad_norm": 1.0913995504379272, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.6107, + "step": 8773 + }, + { + "epoch": 0.061731987532999334, + "grad_norm": 1.4647611379623413, + "learning_rate": 4.261962684116106e-05, + "loss": 0.6188, + "step": 8804 + }, + { + "epoch": 0.06194935368628455, + "grad_norm": 1.0805052518844604, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.6137, + "step": 8835 + }, + { + "epoch": 0.062166719839569756, + "grad_norm": 1.0512675046920776, + "learning_rate": 4.250007230372134e-05, + "loss": 0.6052, + "step": 8866 + }, + { + "epoch": 0.062384085992854964, + "grad_norm": 1.1758863925933838, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.6148, + "step": 8897 + }, + { + "epoch": 0.06260145214614017, + "grad_norm": 1.0526351928710938, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.5973, + "step": 8928 + }, + { + "epoch": 0.06281881829942539, + "grad_norm": 1.077563762664795, + "learning_rate": 4.231926105340768e-05, + "loss": 0.6114, + "step": 8959 + }, + { + "epoch": 0.06303618445271059, + "grad_norm": 1.0116938352584839, + "learning_rate": 4.225859883654776e-05, + "loss": 0.6146, + "step": 8990 + }, + { + "epoch": 0.0632535506059958, + "grad_norm": 1.1967271566390991, + "learning_rate": 4.219774185874569e-05, + "loss": 0.5972, + "step": 9021 + }, + { + "epoch": 0.06347091675928102, + "grad_norm": 1.1610004901885986, + "learning_rate": 4.213669080676418e-05, + "loss": 0.6081, + "step": 9052 + }, + { + "epoch": 0.06368828291256622, + "grad_norm": 1.0253360271453857, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.6118, + "step": 9083 + }, + { + "epoch": 0.06390564906585143, + "grad_norm": 1.140599012374878, + "learning_rate": 4.201400923825648e-05, + "loss": 0.6091, + "step": 9114 + }, + { + "epoch": 0.06412301521913663, + "grad_norm": 1.0821545124053955, + "learning_rate": 4.195238010617511e-05, + "loss": 0.6028, + "step": 9145 + }, + { + "epoch": 0.06434038137242185, + "grad_norm": 1.2161924839019775, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6113, + "step": 9176 + }, + { + "epoch": 0.06455774752570706, + "grad_norm": 1.128986120223999, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.6125, + "step": 9207 + }, + { + "epoch": 0.06477511367899226, + "grad_norm": 1.0566622018814087, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.6055, + "step": 9238 + }, + { + "epoch": 0.06499247983227747, + "grad_norm": 1.0849272012710571, + "learning_rate": 4.170395751189495e-05, + "loss": 0.6059, + "step": 9269 + }, + { + "epoch": 0.06520984598556268, + "grad_norm": 1.2344416379928589, + "learning_rate": 4.164137885110921e-05, + "loss": 0.6081, + "step": 9300 + }, + { + "epoch": 0.06542721213884789, + "grad_norm": 1.0132557153701782, + "learning_rate": 4.157861239462495e-05, + "loss": 0.6103, + "step": 9331 + }, + { + "epoch": 0.0656445782921331, + "grad_norm": 1.0176981687545776, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.6086, + "step": 9362 + }, + { + "epoch": 0.0658619444454183, + "grad_norm": 1.2429553270339966, + "learning_rate": 4.145251892991588e-05, + "loss": 0.6091, + "step": 9393 + }, + { + "epoch": 0.06607931059870352, + "grad_norm": 1.1481480598449707, + "learning_rate": 4.138919334463868e-05, + "loss": 0.6048, + "step": 9424 + }, + { + "epoch": 0.06629667675198872, + "grad_norm": 0.9623891711235046, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.5995, + "step": 9455 + }, + { + "epoch": 0.06651404290527393, + "grad_norm": 1.2031786441802979, + "learning_rate": 4.126198804133398e-05, + "loss": 0.6008, + "step": 9486 + }, + { + "epoch": 0.06673140905855915, + "grad_norm": 1.3027530908584595, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.5993, + "step": 9517 + }, + { + "epoch": 0.06694877521184435, + "grad_norm": 1.1712346076965332, + "learning_rate": 4.113404868280107e-05, + "loss": 0.5996, + "step": 9548 + }, + { + "epoch": 0.06716614136512956, + "grad_norm": 1.1397625207901, + "learning_rate": 4.106980553625457e-05, + "loss": 0.5979, + "step": 9579 + }, + { + "epoch": 0.06738350751841476, + "grad_norm": 1.122983694076538, + "learning_rate": 4.100538104413674e-05, + "loss": 0.599, + "step": 9610 + }, + { + "epoch": 0.06760087367169998, + "grad_norm": 1.0782618522644043, + "learning_rate": 4.09407759334692e-05, + "loss": 0.6005, + "step": 9641 + }, + { + "epoch": 0.0678182398249852, + "grad_norm": 1.0563392639160156, + "learning_rate": 4.087599093331186e-05, + "loss": 0.6105, + "step": 9672 + }, + { + "epoch": 0.0680356059782704, + "grad_norm": 1.0684717893600464, + "learning_rate": 4.081102677475462e-05, + "loss": 0.6092, + "step": 9703 + }, + { + "epoch": 0.06825297213155561, + "grad_norm": 1.152811884880066, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.6035, + "step": 9734 + }, + { + "epoch": 0.06847033828484081, + "grad_norm": 1.1413112878799438, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.6018, + "step": 9765 + }, + { + "epoch": 0.06868770443812602, + "grad_norm": 0.981604814529419, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.6035, + "step": 9796 + }, + { + "epoch": 0.06890507059141124, + "grad_norm": 1.0713250637054443, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.597, + "step": 9827 + }, + { + "epoch": 0.06912243674469644, + "grad_norm": 0.949456512928009, + "learning_rate": 4.048354433517794e-05, + "loss": 0.6063, + "step": 9858 + }, + { + "epoch": 0.06933980289798165, + "grad_norm": 1.0702073574066162, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.5963, + "step": 9889 + }, + { + "epoch": 0.06955716905126685, + "grad_norm": 0.9974942803382874, + "learning_rate": 4.035132306369438e-05, + "loss": 0.611, + "step": 9920 + }, + { + "epoch": 0.06977453520455207, + "grad_norm": 1.0833642482757568, + "learning_rate": 4.028495219804555e-05, + "loss": 0.6014, + "step": 9951 + }, + { + "epoch": 0.06999190135783728, + "grad_norm": 1.181322693824768, + "learning_rate": 4.021840884378864e-05, + "loss": 0.599, + "step": 9982 + }, + { + "epoch": 0.07020926751112248, + "grad_norm": 1.0088189840316772, + "learning_rate": 4.015169375185633e-05, + "loss": 0.5955, + "step": 10013 + }, + { + "epoch": 0.0704266336644077, + "grad_norm": 1.0920222997665405, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.5985, + "step": 10044 + }, + { + "epoch": 0.0706439998176929, + "grad_norm": 1.0450975894927979, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.6069, + "step": 10075 + }, + { + "epoch": 0.07086136597097811, + "grad_norm": 1.1613439321517944, + "learning_rate": 3.995052558835377e-05, + "loss": 0.5992, + "step": 10106 + }, + { + "epoch": 0.07107873212426333, + "grad_norm": 1.2720811367034912, + "learning_rate": 3.988313109368017e-05, + "loss": 0.6012, + "step": 10137 + }, + { + "epoch": 0.07129609827754853, + "grad_norm": 1.0315334796905518, + "learning_rate": 3.981556864489504e-05, + "loss": 0.6149, + "step": 10168 + }, + { + "epoch": 0.07151346443083374, + "grad_norm": 1.3320891857147217, + "learning_rate": 3.974783900443142e-05, + "loss": 0.6013, + "step": 10199 + }, + { + "epoch": 0.07173083058411894, + "grad_norm": 1.7277195453643799, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.5983, + "step": 10230 + }, + { + "epoch": 0.07194819673740416, + "grad_norm": 1.1056753396987915, + "learning_rate": 3.961188120762596e-05, + "loss": 0.5952, + "step": 10261 + }, + { + "epoch": 0.07216556289068937, + "grad_norm": 1.3461558818817139, + "learning_rate": 3.954365458554938e-05, + "loss": 0.5975, + "step": 10292 + }, + { + "epoch": 0.07238292904397457, + "grad_norm": 1.1405422687530518, + "learning_rate": 3.947526384030751e-05, + "loss": 0.602, + "step": 10323 + }, + { + "epoch": 0.07260029519725979, + "grad_norm": 1.2912482023239136, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.6014, + "step": 10354 + }, + { + "epoch": 0.07281766135054499, + "grad_norm": 1.1548583507537842, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.6053, + "step": 10385 + }, + { + "epoch": 0.0730350275038302, + "grad_norm": 1.0558302402496338, + "learning_rate": 3.926911459260109e-05, + "loss": 0.6062, + "step": 10416 + }, + { + "epoch": 0.07325239365711542, + "grad_norm": 1.1658024787902832, + "learning_rate": 3.920007509089102e-05, + "loss": 0.6016, + "step": 10447 + }, + { + "epoch": 0.07346975981040062, + "grad_norm": 1.1093552112579346, + "learning_rate": 3.913087534326357e-05, + "loss": 0.6025, + "step": 10478 + }, + { + "epoch": 0.07368712596368583, + "grad_norm": 1.0908467769622803, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.5954, + "step": 10509 + }, + { + "epoch": 0.07390449211697103, + "grad_norm": 1.0856796503067017, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.5983, + "step": 10540 + }, + { + "epoch": 0.07412185827025625, + "grad_norm": 1.0330291986465454, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.5955, + "step": 10571 + }, + { + "epoch": 0.07433922442354146, + "grad_norm": 0.9688258171081543, + "learning_rate": 3.885248953871491e-05, + "loss": 0.6001, + "step": 10602 + }, + { + "epoch": 0.07455659057682666, + "grad_norm": 1.1808383464813232, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.5996, + "step": 10633 + }, + { + "epoch": 0.07477395673011188, + "grad_norm": 1.1267834901809692, + "learning_rate": 3.871235554965218e-05, + "loss": 0.5962, + "step": 10664 + }, + { + "epoch": 0.07499132288339708, + "grad_norm": 1.1001136302947998, + "learning_rate": 3.864205604623078e-05, + "loss": 0.6031, + "step": 10695 + }, + { + "epoch": 0.07520868903668229, + "grad_norm": 1.009621024131775, + "learning_rate": 3.857160259406107e-05, + "loss": 0.5976, + "step": 10726 + }, + { + "epoch": 0.0754260551899675, + "grad_norm": 0.933811366558075, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.604, + "step": 10757 + }, + { + "epoch": 0.0756434213432527, + "grad_norm": 1.2187680006027222, + "learning_rate": 3.843023702543556e-05, + "loss": 0.5983, + "step": 10788 + }, + { + "epoch": 0.07586078749653792, + "grad_norm": 1.0286976099014282, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.596, + "step": 10819 + }, + { + "epoch": 0.07607815364982312, + "grad_norm": 1.0752719640731812, + "learning_rate": 3.828826522492255e-05, + "loss": 0.596, + "step": 10850 + }, + { + "epoch": 0.07629551980310834, + "grad_norm": 1.1018482446670532, + "learning_rate": 3.821705398930713e-05, + "loss": 0.603, + "step": 10881 + }, + { + "epoch": 0.07651288595639355, + "grad_norm": 0.980965793132782, + "learning_rate": 3.814569360103385e-05, + "loss": 0.6021, + "step": 10912 + }, + { + "epoch": 0.07673025210967875, + "grad_norm": 0.9588106870651245, + "learning_rate": 3.807418486539499e-05, + "loss": 0.6017, + "step": 10943 + }, + { + "epoch": 0.07694761826296397, + "grad_norm": 1.2048044204711914, + "learning_rate": 3.80025285893569e-05, + "loss": 0.6049, + "step": 10974 + }, + { + "epoch": 0.07716498441624917, + "grad_norm": 1.0373165607452393, + "learning_rate": 3.793072558155093e-05, + "loss": 0.5914, + "step": 11005 + }, + { + "epoch": 0.07738235056953438, + "grad_norm": 1.0112730264663696, + "learning_rate": 3.785877665226426e-05, + "loss": 0.5949, + "step": 11036 + }, + { + "epoch": 0.0775997167228196, + "grad_norm": 1.0219500064849854, + "learning_rate": 3.778668261343079e-05, + "loss": 0.5918, + "step": 11067 + }, + { + "epoch": 0.0778170828761048, + "grad_norm": 1.1104828119277954, + "learning_rate": 3.771444427862192e-05, + "loss": 0.6029, + "step": 11098 + }, + { + "epoch": 0.07803444902939001, + "grad_norm": 0.9852742552757263, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.5839, + "step": 11129 + }, + { + "epoch": 0.07825181518267521, + "grad_norm": 1.0593205690383911, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.5966, + "step": 11160 + }, + { + "epoch": 0.07846918133596043, + "grad_norm": 1.2858383655548096, + "learning_rate": 3.749687165842753e-05, + "loss": 0.5939, + "step": 11191 + }, + { + "epoch": 0.07868654748924564, + "grad_norm": 1.2006022930145264, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.5965, + "step": 11222 + }, + { + "epoch": 0.07890391364253084, + "grad_norm": 0.9741569757461548, + "learning_rate": 3.735111675341645e-05, + "loss": 0.5971, + "step": 11253 + }, + { + "epoch": 0.07912127979581605, + "grad_norm": 1.0724074840545654, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.5901, + "step": 11284 + }, + { + "epoch": 0.07933864594910126, + "grad_norm": 1.102207064628601, + "learning_rate": 3.720480432728287e-05, + "loss": 0.602, + "step": 11315 + }, + { + "epoch": 0.07955601210238647, + "grad_norm": 1.0976966619491577, + "learning_rate": 3.71314411067092e-05, + "loss": 0.6002, + "step": 11346 + }, + { + "epoch": 0.07977337825567168, + "grad_norm": 1.0534948110580444, + "learning_rate": 3.70579409844715e-05, + "loss": 0.5801, + "step": 11377 + }, + { + "epoch": 0.07999074440895688, + "grad_norm": 1.1666125059127808, + "learning_rate": 3.698430479000865e-05, + "loss": 0.5992, + "step": 11408 + }, + { + "epoch": 0.0802081105622421, + "grad_norm": 1.0316112041473389, + "learning_rate": 3.691053335429509e-05, + "loss": 0.5942, + "step": 11439 + }, + { + "epoch": 0.0804254767155273, + "grad_norm": 1.0085538625717163, + "learning_rate": 3.683662750983147e-05, + "loss": 0.5978, + "step": 11470 + }, + { + "epoch": 0.08064284286881251, + "grad_norm": 1.0875978469848633, + "learning_rate": 3.676258809063518e-05, + "loss": 0.5949, + "step": 11501 + }, + { + "epoch": 0.08086020902209773, + "grad_norm": 1.1905012130737305, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.6018, + "step": 11532 + }, + { + "epoch": 0.08107757517538293, + "grad_norm": 1.1717733144760132, + "learning_rate": 3.661411187164166e-05, + "loss": 0.5986, + "step": 11563 + }, + { + "epoch": 0.08129494132866814, + "grad_norm": 1.053768515586853, + "learning_rate": 3.65396767473784e-05, + "loss": 0.5831, + "step": 11594 + }, + { + "epoch": 0.08151230748195334, + "grad_norm": 1.095425009727478, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.5967, + "step": 11625 + }, + { + "epoch": 0.08172967363523856, + "grad_norm": 1.005953311920166, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.5834, + "step": 11656 + }, + { + "epoch": 0.08194703978852377, + "grad_norm": 1.0991450548171997, + "learning_rate": 3.63155933997859e-05, + "loss": 0.5951, + "step": 11687 + }, + { + "epoch": 0.08216440594180897, + "grad_norm": 1.1069071292877197, + "learning_rate": 3.624064243537758e-05, + "loss": 0.5989, + "step": 11718 + }, + { + "epoch": 0.08238177209509419, + "grad_norm": 1.099542498588562, + "learning_rate": 3.616556462184716e-05, + "loss": 0.5949, + "step": 11749 + }, + { + "epoch": 0.08259913824837939, + "grad_norm": 1.0950508117675781, + "learning_rate": 3.609036080643755e-05, + "loss": 0.5911, + "step": 11780 + }, + { + "epoch": 0.0828165044016646, + "grad_norm": 0.9489970803260803, + "learning_rate": 3.60150318378136e-05, + "loss": 0.5922, + "step": 11811 + }, + { + "epoch": 0.08303387055494982, + "grad_norm": 0.9877674579620361, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.5927, + "step": 11842 + }, + { + "epoch": 0.08325123670823502, + "grad_norm": 1.060177206993103, + "learning_rate": 3.586400184263408e-05, + "loss": 0.5899, + "step": 11873 + }, + { + "epoch": 0.08346860286152023, + "grad_norm": 1.0531491041183472, + "learning_rate": 3.578830252043148e-05, + "loss": 0.6004, + "step": 11904 + }, + { + "epoch": 0.08368596901480543, + "grad_norm": 0.9791742563247681, + "learning_rate": 3.571248145370125e-05, + "loss": 0.5926, + "step": 11935 + }, + { + "epoch": 0.08390333516809065, + "grad_norm": 1.0492030382156372, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.5914, + "step": 11966 + }, + { + "epoch": 0.08412070132137586, + "grad_norm": 1.2035881280899048, + "learning_rate": 3.556047751054378e-05, + "loss": 0.589, + "step": 11997 + }, + { + "epoch": 0.08433806747466106, + "grad_norm": 1.0384137630462646, + "learning_rate": 3.548429634946039e-05, + "loss": 0.5894, + "step": 12028 + }, + { + "epoch": 0.08455543362794628, + "grad_norm": 1.1555323600769043, + "learning_rate": 3.540799687451768e-05, + "loss": 0.5899, + "step": 12059 + }, + { + "epoch": 0.08477279978123148, + "grad_norm": 1.2114530801773071, + "learning_rate": 3.533157994674485e-05, + "loss": 0.5888, + "step": 12090 + }, + { + "epoch": 0.08499016593451669, + "grad_norm": 1.145213007926941, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.5887, + "step": 12121 + }, + { + "epoch": 0.08520753208780191, + "grad_norm": 1.0313454866409302, + "learning_rate": 3.517839718344311e-05, + "loss": 0.598, + "step": 12152 + }, + { + "epoch": 0.08542489824108711, + "grad_norm": 1.018702745437622, + "learning_rate": 3.510163307656086e-05, + "loss": 0.5931, + "step": 12183 + }, + { + "epoch": 0.08564226439437232, + "grad_norm": 1.0365880727767944, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.5954, + "step": 12214 + }, + { + "epoch": 0.08585963054765752, + "grad_norm": 1.0658811330795288, + "learning_rate": 3.494776374368643e-05, + "loss": 0.5953, + "step": 12245 + }, + { + "epoch": 0.08607699670094274, + "grad_norm": 1.046661615371704, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.5786, + "step": 12276 + }, + { + "epoch": 0.08629436285422795, + "grad_norm": 1.0220295190811157, + "learning_rate": 3.479344537543164e-05, + "loss": 0.5879, + "step": 12307 + }, + { + "epoch": 0.08651172900751315, + "grad_norm": 1.0286898612976074, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.5882, + "step": 12338 + }, + { + "epoch": 0.08672909516079837, + "grad_norm": 1.060483694076538, + "learning_rate": 3.463868493762412e-05, + "loss": 0.5887, + "step": 12369 + }, + { + "epoch": 0.08694646131408357, + "grad_norm": 1.264788269996643, + "learning_rate": 3.456114112492418e-05, + "loss": 0.5895, + "step": 12400 + }, + { + "epoch": 0.08716382746736878, + "grad_norm": 0.9751125574111938, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.5846, + "step": 12431 + }, + { + "epoch": 0.087381193620654, + "grad_norm": 1.1198588609695435, + "learning_rate": 3.440573068727905e-05, + "loss": 0.5878, + "step": 12462 + }, + { + "epoch": 0.0875985597739392, + "grad_norm": 1.0425771474838257, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.5903, + "step": 12493 + }, + { + "epoch": 0.08781592592722441, + "grad_norm": 0.9820422530174255, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.5885, + "step": 12524 + }, + { + "epoch": 0.08803329208050961, + "grad_norm": 1.0234076976776123, + "learning_rate": 3.417182116258899e-05, + "loss": 0.5967, + "step": 12555 + }, + { + "epoch": 0.08825065823379483, + "grad_norm": 0.9766228795051575, + "learning_rate": 3.409364314116074e-05, + "loss": 0.5923, + "step": 12586 + }, + { + "epoch": 0.08846802438708004, + "grad_norm": 0.9033060073852539, + "learning_rate": 3.401536249920559e-05, + "loss": 0.5892, + "step": 12617 + }, + { + "epoch": 0.08868539054036524, + "grad_norm": 1.1264934539794922, + "learning_rate": 3.393698012010998e-05, + "loss": 0.5917, + "step": 12648 + }, + { + "epoch": 0.08890275669365046, + "grad_norm": 1.0052556991577148, + "learning_rate": 3.385849688840839e-05, + "loss": 0.5885, + "step": 12679 + }, + { + "epoch": 0.08912012284693566, + "grad_norm": 1.0888159275054932, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.5885, + "step": 12710 + }, + { + "epoch": 0.08933748900022087, + "grad_norm": 1.0458447933197021, + "learning_rate": 3.370123141100578e-05, + "loss": 0.5923, + "step": 12741 + }, + { + "epoch": 0.08955485515350609, + "grad_norm": 1.055434226989746, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.5889, + "step": 12772 + }, + { + "epoch": 0.08977222130679129, + "grad_norm": 1.000533103942871, + "learning_rate": 3.35435731658559e-05, + "loss": 0.5833, + "step": 12803 + }, + { + "epoch": 0.0899895874600765, + "grad_norm": 1.0112768411636353, + "learning_rate": 3.346459897862552e-05, + "loss": 0.5886, + "step": 12834 + }, + { + "epoch": 0.0902069536133617, + "grad_norm": 0.9883559942245483, + "learning_rate": 3.338552926954613e-05, + "loss": 0.5886, + "step": 12865 + }, + { + "epoch": 0.09042431976664692, + "grad_norm": 1.0933095216751099, + "learning_rate": 3.330636493090868e-05, + "loss": 0.5873, + "step": 12896 + }, + { + "epoch": 0.09064168591993213, + "grad_norm": 1.1089582443237305, + "learning_rate": 3.322710685607193e-05, + "loss": 0.5845, + "step": 12927 + }, + { + "epoch": 0.09085905207321733, + "grad_norm": 1.0311237573623657, + "learning_rate": 3.314775593945251e-05, + "loss": 0.588, + "step": 12958 + }, + { + "epoch": 0.09107641822650255, + "grad_norm": 1.0495847463607788, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.5839, + "step": 12989 + }, + { + "epoch": 0.09129378437978775, + "grad_norm": 1.046921730041504, + "learning_rate": 3.298877916376047e-05, + "loss": 0.5877, + "step": 13020 + }, + { + "epoch": 0.09151115053307296, + "grad_norm": 0.9927343130111694, + "learning_rate": 3.290915509871915e-05, + "loss": 0.5898, + "step": 13051 + }, + { + "epoch": 0.09172851668635817, + "grad_norm": 0.9319870471954346, + "learning_rate": 3.282944177993753e-05, + "loss": 0.5886, + "step": 13082 + }, + { + "epoch": 0.09194588283964338, + "grad_norm": 1.0408544540405273, + "learning_rate": 3.274964010696957e-05, + "loss": 0.5946, + "step": 13113 + }, + { + "epoch": 0.09216324899292859, + "grad_norm": 1.0984251499176025, + "learning_rate": 3.266975098036629e-05, + "loss": 0.5813, + "step": 13144 + }, + { + "epoch": 0.09238061514621379, + "grad_norm": 1.0376449823379517, + "learning_rate": 3.258977530166562e-05, + "loss": 0.5855, + "step": 13175 + }, + { + "epoch": 0.092597981299499, + "grad_norm": 0.9627104997634888, + "learning_rate": 3.250971397338227e-05, + "loss": 0.5846, + "step": 13206 + }, + { + "epoch": 0.09281534745278422, + "grad_norm": 1.2040653228759766, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.5872, + "step": 13237 + }, + { + "epoch": 0.09303271360606942, + "grad_norm": 1.0091586112976074, + "learning_rate": 3.234933798294859e-05, + "loss": 0.5911, + "step": 13268 + }, + { + "epoch": 0.09325007975935463, + "grad_norm": 1.0941787958145142, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.5844, + "step": 13299 + }, + { + "epoch": 0.09346744591263983, + "grad_norm": 0.9720754027366638, + "learning_rate": 3.218863024832985e-05, + "loss": 0.5898, + "step": 13330 + }, + { + "epoch": 0.09368481206592505, + "grad_norm": 0.8847430944442749, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.5804, + "step": 13361 + }, + { + "epoch": 0.09390217821921026, + "grad_norm": 1.0070865154266357, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.5916, + "step": 13392 + }, + { + "epoch": 0.09411954437249546, + "grad_norm": 0.9538979530334473, + "learning_rate": 3.194696249871729e-05, + "loss": 0.5984, + "step": 13423 + }, + { + "epoch": 0.09433691052578068, + "grad_norm": 1.0414469242095947, + "learning_rate": 3.186624857814164e-05, + "loss": 0.5828, + "step": 13454 + }, + { + "epoch": 0.0945542766790659, + "grad_norm": 1.0353257656097412, + "learning_rate": 3.178545717288401e-05, + "loss": 0.5835, + "step": 13485 + }, + { + "epoch": 0.0947716428323511, + "grad_norm": 1.0454679727554321, + "learning_rate": 3.170458919466444e-05, + "loss": 0.5909, + "step": 13516 + }, + { + "epoch": 0.09498900898563631, + "grad_norm": 1.246871829032898, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.5829, + "step": 13547 + }, + { + "epoch": 0.09520637513892151, + "grad_norm": 1.0331645011901855, + "learning_rate": 3.154262717052985e-05, + "loss": 0.5919, + "step": 13578 + }, + { + "epoch": 0.09542374129220672, + "grad_norm": 0.8984120488166809, + "learning_rate": 3.146153495233426e-05, + "loss": 0.5853, + "step": 13609 + }, + { + "epoch": 0.09564110744549194, + "grad_norm": 1.0582929849624634, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.5828, + "step": 13640 + }, + { + "epoch": 0.09585847359877714, + "grad_norm": 1.151041865348816, + "learning_rate": 3.129913267924946e-05, + "loss": 0.582, + "step": 13671 + }, + { + "epoch": 0.09607583975206235, + "grad_norm": 0.9678122401237488, + "learning_rate": 3.121782445704782e-05, + "loss": 0.5865, + "step": 13702 + }, + { + "epoch": 0.09629320590534755, + "grad_norm": 1.033832311630249, + "learning_rate": 3.11364460675423e-05, + "loss": 0.5797, + "step": 13733 + }, + { + "epoch": 0.09651057205863277, + "grad_norm": 0.8917058706283569, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.5829, + "step": 13764 + }, + { + "epoch": 0.09672793821191798, + "grad_norm": 0.9396641850471497, + "learning_rate": 3.097348246077728e-05, + "loss": 0.5825, + "step": 13795 + }, + { + "epoch": 0.09694530436520318, + "grad_norm": 1.0871998071670532, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.5851, + "step": 13826 + }, + { + "epoch": 0.0971626705184884, + "grad_norm": 1.1403570175170898, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.5807, + "step": 13857 + }, + { + "epoch": 0.0973800366717736, + "grad_norm": 1.0246673822402954, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.5795, + "step": 13888 + }, + { + "epoch": 0.09759740282505881, + "grad_norm": 0.9431213140487671, + "learning_rate": 3.064675369851637e-05, + "loss": 0.5825, + "step": 13919 + }, + { + "epoch": 0.09781476897834403, + "grad_norm": 1.1028188467025757, + "learning_rate": 3.056490989455289e-05, + "loss": 0.5885, + "step": 13950 + }, + { + "epoch": 0.09803213513162923, + "grad_norm": 1.125482201576233, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.5849, + "step": 13981 + }, + { + "epoch": 0.09824950128491444, + "grad_norm": 0.9831985831260681, + "learning_rate": 3.040103481317539e-05, + "loss": 0.586, + "step": 14012 + }, + { + "epoch": 0.09846686743819964, + "grad_norm": 1.0038244724273682, + "learning_rate": 3.03190053850694e-05, + "loss": 0.5894, + "step": 14043 + }, + { + "epoch": 0.09868423359148486, + "grad_norm": 0.8766770362854004, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.5773, + "step": 14074 + }, + { + "epoch": 0.09890159974477007, + "grad_norm": 0.9358529448509216, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.5887, + "step": 14105 + }, + { + "epoch": 0.09911896589805527, + "grad_norm": 0.8988749384880066, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.5815, + "step": 14136 + }, + { + "epoch": 0.09933633205134049, + "grad_norm": 1.08226478099823, + "learning_rate": 2.999029669712431e-05, + "loss": 0.5944, + "step": 14167 + }, + { + "epoch": 0.09955369820462569, + "grad_norm": 1.046980619430542, + "learning_rate": 2.990797641805408e-05, + "loss": 0.5824, + "step": 14198 + }, + { + "epoch": 0.0997710643579109, + "grad_norm": 0.96205073595047, + "learning_rate": 2.982560075313704e-05, + "loss": 0.5847, + "step": 14229 + }, + { + "epoch": 0.09998843051119612, + "grad_norm": 0.9914370775222778, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.5824, + "step": 14260 + }, + { + "epoch": 0.10020579666448132, + "grad_norm": 0.922265887260437, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.5907, + "step": 14291 + }, + { + "epoch": 0.10042316281776653, + "grad_norm": 0.971159815788269, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.5806, + "step": 14322 + }, + { + "epoch": 0.10064052897105173, + "grad_norm": 1.1327799558639526, + "learning_rate": 2.949556283611942e-05, + "loss": 0.5812, + "step": 14353 + }, + { + "epoch": 0.10085789512433695, + "grad_norm": 1.1632689237594604, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.5813, + "step": 14384 + }, + { + "epoch": 0.10107526127762216, + "grad_norm": 0.9936057329177856, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.5803, + "step": 14415 + }, + { + "epoch": 0.10129262743090736, + "grad_norm": 1.1123740673065186, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.5767, + "step": 14446 + }, + { + "epoch": 0.10150999358419258, + "grad_norm": 0.908643364906311, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.5797, + "step": 14477 + }, + { + "epoch": 0.10172735973747778, + "grad_norm": 0.8761305809020996, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.589, + "step": 14508 + }, + { + "epoch": 0.10194472589076299, + "grad_norm": 1.0904477834701538, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.5796, + "step": 14539 + }, + { + "epoch": 0.1021620920440482, + "grad_norm": 1.060160756111145, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.5837, + "step": 14570 + }, + { + "epoch": 0.1023794581973334, + "grad_norm": 1.0514498949050903, + "learning_rate": 2.883311164593017e-05, + "loss": 0.5792, + "step": 14601 + }, + { + "epoch": 0.10259682435061862, + "grad_norm": 1.0152157545089722, + "learning_rate": 2.875010077160754e-05, + "loss": 0.574, + "step": 14632 + }, + { + "epoch": 0.10281419050390382, + "grad_norm": 0.974791944026947, + "learning_rate": 2.866704757790741e-05, + "loss": 0.5819, + "step": 14663 + }, + { + "epoch": 0.10303155665718904, + "grad_norm": 0.9271855354309082, + "learning_rate": 2.858395300207376e-05, + "loss": 0.5798, + "step": 14694 + }, + { + "epoch": 0.10324892281047425, + "grad_norm": 0.9933458566665649, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.5796, + "step": 14725 + }, + { + "epoch": 0.10346628896375945, + "grad_norm": 0.9245687127113342, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.5771, + "step": 14756 + }, + { + "epoch": 0.10368365511704467, + "grad_norm": 0.9483000040054321, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.5825, + "step": 14787 + }, + { + "epoch": 0.10390102127032987, + "grad_norm": 0.9054533839225769, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.5817, + "step": 14818 + }, + { + "epoch": 0.10411838742361508, + "grad_norm": 1.0200084447860718, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.5788, + "step": 14849 + }, + { + "epoch": 0.1043357535769003, + "grad_norm": 0.9364084005355835, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.5796, + "step": 14880 + }, + { + "epoch": 0.1045531197301855, + "grad_norm": 0.9658653736114502, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.5825, + "step": 14911 + }, + { + "epoch": 0.10477048588347071, + "grad_norm": 0.8612831234931946, + "learning_rate": 2.791781925709473e-05, + "loss": 0.5842, + "step": 14942 + }, + { + "epoch": 0.10498785203675591, + "grad_norm": 0.9673991799354553, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.5776, + "step": 14973 + }, + { + "epoch": 0.10520521819004112, + "grad_norm": 1.092144250869751, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.5785, + "step": 15004 + }, + { + "epoch": 0.10542258434332634, + "grad_norm": 1.0162559747695923, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.5821, + "step": 15035 + }, + { + "epoch": 0.10563995049661154, + "grad_norm": 0.9578939080238342, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.5781, + "step": 15066 + }, + { + "epoch": 0.10585731664989675, + "grad_norm": 0.9934547543525696, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.5826, + "step": 15097 + }, + { + "epoch": 0.10607468280318195, + "grad_norm": 0.9542795419692993, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.583, + "step": 15128 + }, + { + "epoch": 0.10629204895646717, + "grad_norm": 0.9436337947845459, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.5832, + "step": 15159 + }, + { + "epoch": 0.10650941510975238, + "grad_norm": 0.9200606942176819, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.5766, + "step": 15190 + }, + { + "epoch": 0.10672678126303758, + "grad_norm": 1.0009740591049194, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.5835, + "step": 15221 + }, + { + "epoch": 0.1069441474163228, + "grad_norm": 0.9597145915031433, + "learning_rate": 2.708224532974953e-05, + "loss": 0.5728, + "step": 15252 + }, + { + "epoch": 0.107161513569608, + "grad_norm": 1.248867154121399, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.5822, + "step": 15283 + }, + { + "epoch": 0.10737887972289321, + "grad_norm": 0.8899833559989929, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.5843, + "step": 15314 + }, + { + "epoch": 0.10759624587617843, + "grad_norm": 1.0085718631744385, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.5811, + "step": 15345 + }, + { + "epoch": 0.10781361202946363, + "grad_norm": 0.9277573227882385, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.5774, + "step": 15376 + }, + { + "epoch": 0.10803097818274884, + "grad_norm": 1.199010968208313, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.5735, + "step": 15407 + }, + { + "epoch": 0.10824834433603404, + "grad_norm": 0.9361419081687927, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.5834, + "step": 15438 + }, + { + "epoch": 0.10846571048931926, + "grad_norm": 1.05440092086792, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.5721, + "step": 15469 + }, + { + "epoch": 0.10868307664260447, + "grad_norm": 1.0973948240280151, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.5838, + "step": 15500 + }, + { + "epoch": 0.10890044279588967, + "grad_norm": 0.9417588710784912, + "learning_rate": 2.632819298478939e-05, + "loss": 0.5786, + "step": 15531 + }, + { + "epoch": 0.10911780894917489, + "grad_norm": 0.8763983845710754, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.5766, + "step": 15562 + }, + { + "epoch": 0.10933517510246009, + "grad_norm": 1.0105509757995605, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.5896, + "step": 15593 + }, + { + "epoch": 0.1095525412557453, + "grad_norm": 0.9172413349151611, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.5788, + "step": 15624 + }, + { + "epoch": 0.10976990740903052, + "grad_norm": 0.9640869498252869, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.5791, + "step": 15655 + }, + { + "epoch": 0.10998727356231572, + "grad_norm": 1.0987275838851929, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.581, + "step": 15686 + }, + { + "epoch": 0.11020463971560093, + "grad_norm": 1.0418893098831177, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.569, + "step": 15717 + }, + { + "epoch": 0.11042200586888613, + "grad_norm": 1.0216400623321533, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.5834, + "step": 15748 + }, + { + "epoch": 0.11063937202217135, + "grad_norm": 1.0211747884750366, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.5733, + "step": 15779 + }, + { + "epoch": 0.11085673817545656, + "grad_norm": 0.9743130207061768, + "learning_rate": 2.557292666450159e-05, + "loss": 0.5789, + "step": 15810 + }, + { + "epoch": 0.11107410432874176, + "grad_norm": 1.1765626668930054, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.57, + "step": 15841 + }, + { + "epoch": 0.11129147048202698, + "grad_norm": 0.9354963898658752, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.5894, + "step": 15872 + }, + { + "epoch": 0.11150883663531218, + "grad_norm": 0.8743797540664673, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.5716, + "step": 15903 + }, + { + "epoch": 0.11172620278859739, + "grad_norm": 1.1076644659042358, + "learning_rate": 2.52370435981567e-05, + "loss": 0.5787, + "step": 15934 + }, + { + "epoch": 0.1119435689418826, + "grad_norm": 0.9156807065010071, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.5764, + "step": 15965 + }, + { + "epoch": 0.11216093509516781, + "grad_norm": 1.0239089727401733, + "learning_rate": 2.506908222441045e-05, + "loss": 0.5748, + "step": 15996 + }, + { + "epoch": 0.11237830124845302, + "grad_norm": 1.5095417499542236, + "learning_rate": 2.498509989430187e-05, + "loss": 0.5815, + "step": 16027 + }, + { + "epoch": 0.11259566740173822, + "grad_norm": 0.9298838973045349, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.5742, + "step": 16058 + }, + { + "epoch": 0.11281303355502344, + "grad_norm": 0.8956617712974548, + "learning_rate": 2.481713668624899e-05, + "loss": 0.5856, + "step": 16089 + }, + { + "epoch": 0.11303039970830865, + "grad_norm": 0.950095534324646, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.5786, + "step": 16120 + }, + { + "epoch": 0.11324776586159385, + "grad_norm": 1.0230988264083862, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.5771, + "step": 16151 + }, + { + "epoch": 0.11346513201487907, + "grad_norm": 1.1018470525741577, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.57, + "step": 16182 + }, + { + "epoch": 0.11368249816816427, + "grad_norm": 0.9700168371200562, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.5689, + "step": 16213 + }, + { + "epoch": 0.11389986432144948, + "grad_norm": 0.9069929718971252, + "learning_rate": 2.439728136286796e-05, + "loss": 0.5719, + "step": 16244 + }, + { + "epoch": 0.1141172304747347, + "grad_norm": 0.9254815578460693, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.5704, + "step": 16275 + }, + { + "epoch": 0.1143345966280199, + "grad_norm": 0.9150753021240234, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.5754, + "step": 16306 + }, + { + "epoch": 0.11455196278130511, + "grad_norm": 1.003201961517334, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.5746, + "step": 16337 + }, + { + "epoch": 0.11476932893459031, + "grad_norm": 1.1016685962677002, + "learning_rate": 2.406151384602059e-05, + "loss": 0.5763, + "step": 16368 + }, + { + "epoch": 0.11498669508787553, + "grad_norm": 1.0079994201660156, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.5721, + "step": 16399 + }, + { + "epoch": 0.11520406124116074, + "grad_norm": 0.989470899105072, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.5795, + "step": 16430 + }, + { + "epoch": 0.11542142739444594, + "grad_norm": 1.039035439491272, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.5737, + "step": 16461 + }, + { + "epoch": 0.11563879354773116, + "grad_norm": 0.8659546971321106, + "learning_rate": 2.372591577780202e-05, + "loss": 0.5711, + "step": 16492 + }, + { + "epoch": 0.11585615970101636, + "grad_norm": 0.9558688998222351, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.5763, + "step": 16523 + }, + { + "epoch": 0.11607352585430157, + "grad_norm": 1.0017194747924805, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.5772, + "step": 16554 + }, + { + "epoch": 0.11629089200758679, + "grad_norm": 1.0045812129974365, + "learning_rate": 2.347436487983929e-05, + "loss": 0.5744, + "step": 16585 + }, + { + "epoch": 0.11650825816087199, + "grad_norm": 0.8719637393951416, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.5681, + "step": 16616 + }, + { + "epoch": 0.1167256243141572, + "grad_norm": 0.9029743075370789, + "learning_rate": 2.330674878704035e-05, + "loss": 0.5807, + "step": 16647 + }, + { + "epoch": 0.1169429904674424, + "grad_norm": 0.9439691305160522, + "learning_rate": 2.322296892997561e-05, + "loss": 0.5751, + "step": 16678 + }, + { + "epoch": 0.11716035662072762, + "grad_norm": 0.900688648223877, + "learning_rate": 2.313920912646497e-05, + "loss": 0.5772, + "step": 16709 + }, + { + "epoch": 0.11737772277401283, + "grad_norm": 0.8884438872337341, + "learning_rate": 2.305547032172643e-05, + "loss": 0.5758, + "step": 16740 + }, + { + "epoch": 0.11759508892729803, + "grad_norm": 0.9252585172653198, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.5705, + "step": 16771 + }, + { + "epoch": 0.11781245508058324, + "grad_norm": 0.9447957873344421, + "learning_rate": 2.288805948824212e-05, + "loss": 0.566, + "step": 16802 + }, + { + "epoch": 0.11802982123386845, + "grad_norm": 0.9666566252708435, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.5745, + "step": 16833 + }, + { + "epoch": 0.11824718738715366, + "grad_norm": 0.9459251761436462, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.5775, + "step": 16864 + }, + { + "epoch": 0.11846455354043887, + "grad_norm": 0.8863123059272766, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.5716, + "step": 16895 + }, + { + "epoch": 0.11868191969372408, + "grad_norm": 0.9847676753997803, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.5763, + "step": 16926 + }, + { + "epoch": 0.11889928584700929, + "grad_norm": 1.1111658811569214, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.5752, + "step": 16957 + }, + { + "epoch": 0.11911665200029449, + "grad_norm": 1.0046541690826416, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.574, + "step": 16988 + }, + { + "epoch": 0.1193340181535797, + "grad_norm": 0.8580814599990845, + "learning_rate": 2.230292185905114e-05, + "loss": 0.572, + "step": 17019 + }, + { + "epoch": 0.11955138430686492, + "grad_norm": 0.9188304543495178, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.571, + "step": 17050 + }, + { + "epoch": 0.11976875046015012, + "grad_norm": 0.9079185724258423, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.5792, + "step": 17081 + }, + { + "epoch": 0.11998611661343533, + "grad_norm": 0.9194979071617126, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.5753, + "step": 17112 + }, + { + "epoch": 0.12020348276672053, + "grad_norm": 0.8398452997207642, + "learning_rate": 2.196920634473666e-05, + "loss": 0.5653, + "step": 17143 + }, + { + "epoch": 0.12042084892000575, + "grad_norm": 0.9888772368431091, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.5747, + "step": 17174 + }, + { + "epoch": 0.12063821507329096, + "grad_norm": 0.9137700796127319, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.5728, + "step": 17205 + }, + { + "epoch": 0.12085558122657616, + "grad_norm": 1.058064579963684, + "learning_rate": 2.171927553439363e-05, + "loss": 0.5717, + "step": 17236 + }, + { + "epoch": 0.12107294737986138, + "grad_norm": 0.9835705757141113, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.5725, + "step": 17267 + }, + { + "epoch": 0.12129031353314658, + "grad_norm": 0.918863832950592, + "learning_rate": 2.155283853988844e-05, + "loss": 0.5676, + "step": 17298 + }, + { + "epoch": 0.1215076796864318, + "grad_norm": 0.9384900331497192, + "learning_rate": 2.146967792431106e-05, + "loss": 0.5768, + "step": 17329 + }, + { + "epoch": 0.12172504583971701, + "grad_norm": 1.060088038444519, + "learning_rate": 2.138655714793849e-05, + "loss": 0.5642, + "step": 17360 + }, + { + "epoch": 0.12194241199300221, + "grad_norm": 0.999266505241394, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.5669, + "step": 17391 + }, + { + "epoch": 0.12215977814628742, + "grad_norm": 0.8633130788803101, + "learning_rate": 2.122043886437421e-05, + "loss": 0.5661, + "step": 17422 + }, + { + "epoch": 0.12237714429957262, + "grad_norm": 0.9396159052848816, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.5743, + "step": 17453 + }, + { + "epoch": 0.12259451045285784, + "grad_norm": 0.9990928173065186, + "learning_rate": 2.105449118766347e-05, + "loss": 0.5707, + "step": 17484 + }, + { + "epoch": 0.12281187660614305, + "grad_norm": 0.9732767939567566, + "learning_rate": 2.097158366805287e-05, + "loss": 0.571, + "step": 17515 + }, + { + "epoch": 0.12302924275942825, + "grad_norm": 20.362672805786133, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.5586, + "step": 17546 + }, + { + "epoch": 0.12324660891271347, + "grad_norm": 0.976889431476593, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.5693, + "step": 17577 + }, + { + "epoch": 0.12346397506599867, + "grad_norm": 0.907172679901123, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.5714, + "step": 17608 + }, + { + "epoch": 0.12368134121928388, + "grad_norm": 0.8816654086112976, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.5669, + "step": 17639 + }, + { + "epoch": 0.1238987073725691, + "grad_norm": 0.9616197943687439, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.5739, + "step": 17670 + }, + { + "epoch": 0.1241160735258543, + "grad_norm": 0.9188937544822693, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.5689, + "step": 17701 + }, + { + "epoch": 0.12433343967913951, + "grad_norm": 0.9845620393753052, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.5716, + "step": 17732 + }, + { + "epoch": 0.12455080583242471, + "grad_norm": 0.8922098278999329, + "learning_rate": 2.031003855589343e-05, + "loss": 0.5648, + "step": 17763 + }, + { + "epoch": 0.12476817198570993, + "grad_norm": 0.9715010523796082, + "learning_rate": 2.022757379528727e-05, + "loss": 0.5664, + "step": 17794 + }, + { + "epoch": 0.12498553813899514, + "grad_norm": 1.0769156217575073, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.5689, + "step": 17825 + }, + { + "epoch": 0.12520290429228034, + "grad_norm": 0.9304386973381042, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.5772, + "step": 17856 + }, + { + "epoch": 0.12542027044556556, + "grad_norm": 1.0523558855056763, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.5686, + "step": 17887 + }, + { + "epoch": 0.12563763659885077, + "grad_norm": 1.029064655303955, + "learning_rate": 1.989826261153015e-05, + "loss": 0.5629, + "step": 17918 + }, + { + "epoch": 0.12585500275213599, + "grad_norm": 1.0367600917816162, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.5682, + "step": 17949 + }, + { + "epoch": 0.12607236890542117, + "grad_norm": 1.047844648361206, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.571, + "step": 17980 + }, + { + "epoch": 0.1262897350587064, + "grad_norm": 0.9374393820762634, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.5731, + "step": 18011 + }, + { + "epoch": 0.1265071012119916, + "grad_norm": 1.0163381099700928, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.5769, + "step": 18042 + }, + { + "epoch": 0.12672446736527682, + "grad_norm": 0.9243590235710144, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.5697, + "step": 18073 + }, + { + "epoch": 0.12694183351856203, + "grad_norm": 1.0359089374542236, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.5639, + "step": 18104 + }, + { + "epoch": 0.12715919967184722, + "grad_norm": 0.841151773929596, + "learning_rate": 1.932422022132275e-05, + "loss": 0.5792, + "step": 18135 + }, + { + "epoch": 0.12737656582513243, + "grad_norm": 1.0070539712905884, + "learning_rate": 1.924246297341414e-05, + "loss": 0.5669, + "step": 18166 + }, + { + "epoch": 0.12759393197841765, + "grad_norm": 0.9453309178352356, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.5683, + "step": 18197 + }, + { + "epoch": 0.12781129813170286, + "grad_norm": 0.9628680348396301, + "learning_rate": 1.907914431855156e-05, + "loss": 0.5711, + "step": 18228 + }, + { + "epoch": 0.12802866428498808, + "grad_norm": 0.9396767616271973, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.5709, + "step": 18259 + }, + { + "epoch": 0.12824603043827326, + "grad_norm": 0.9093485474586487, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.5689, + "step": 18290 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3509936473079546e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-18312/training_args.bin b/checkpoint-18312/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d8a57652df56990298f489cbbc4d12b0a5d66dac --- /dev/null +++ b/checkpoint-18312/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff6a84b6c2a69c365aedcd04f176d62d0993a049ac50b28b1fc7fee54f4267a8 +size 5304 diff --git a/checkpoint-21364/config.json b/checkpoint-21364/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-21364/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-21364/generation_config.json b/checkpoint-21364/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-21364/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-21364/model-00001-of-00007.safetensors b/checkpoint-21364/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dee8da5141dc6bd487565adcc3cdd520ae95dda9 --- /dev/null +++ b/checkpoint-21364/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37b1b9af313cbf3bba45242b7bd452818f6129154bab8f6aeaf72b930b9a830b +size 4886466168 diff --git a/checkpoint-21364/model-00002-of-00007.safetensors b/checkpoint-21364/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-21364/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-21364/model-00003-of-00007.safetensors b/checkpoint-21364/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-21364/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-21364/model-00004-of-00007.safetensors b/checkpoint-21364/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-21364/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-21364/model-00005-of-00007.safetensors b/checkpoint-21364/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-21364/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-21364/model-00006-of-00007.safetensors b/checkpoint-21364/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6a4bd7704c532173a5723d6c585343af9680ce9b --- /dev/null +++ b/checkpoint-21364/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6347964d96ffcac95772d52f5ff15fbbfe66279665289e33f443d0975808bc99 +size 4999813120 diff --git a/checkpoint-21364/model-00007-of-00007.safetensors b/checkpoint-21364/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b874a31d262d827bd14bc613394280d9eeb56ab --- /dev/null +++ b/checkpoint-21364/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f73b4bf5362ec0a9f4965664596814a8f0d1eda1b657d78790cd8fc0dddb970 +size 2571158184 diff --git a/checkpoint-21364/model.safetensors.index.json b/checkpoint-21364/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-21364/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-21364/optimizer.pt b/checkpoint-21364/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a64b49fde77e411cbf2907b7910b3db3da668f50 --- /dev/null +++ b/checkpoint-21364/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e500b806a530688b3a49109e17f92361a8459c2add5230508a07a7274647b8ed +size 15385036334 diff --git a/checkpoint-21364/rng_state.pth b/checkpoint-21364/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-21364/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-21364/scheduler.pt b/checkpoint-21364/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..17783d26dc88c55a75e7564f8dcbad9eacfa9913 --- /dev/null +++ b/checkpoint-21364/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2827eb82750c76bd3279b469098a24605426f9a47a96b155384bcef2e3f4fe20 +size 1064 diff --git a/checkpoint-21364/trainer_state.json b/checkpoint-21364/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1805dfacd5da08a6349f10522e3e8d2fbb779d79 --- /dev/null +++ b/checkpoint-21364/trainer_state.json @@ -0,0 +1,4856 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.14980033867049045, + "eval_steps": 500, + "global_step": 21364, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021736615328520894, + "grad_norm": 5.709163665771484, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.9228, + "step": 31 + }, + { + "epoch": 0.0004347323065704179, + "grad_norm": 3.9758756160736084, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8276, + "step": 62 + }, + { + "epoch": 0.0006520984598556268, + "grad_norm": 3.721677303314209, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7897, + "step": 93 + }, + { + "epoch": 0.0008694646131408358, + "grad_norm": 3.384953498840332, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7641, + "step": 124 + }, + { + "epoch": 0.0010868307664260446, + "grad_norm": 3.19750714302063, + "learning_rate": 5.078636959370905e-06, + "loss": 0.758, + "step": 155 + }, + { + "epoch": 0.0013041969197112536, + "grad_norm": 3.49003267288208, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7644, + "step": 186 + }, + { + "epoch": 0.0015215630729964625, + "grad_norm": 3.323367118835449, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7606, + "step": 217 + }, + { + "epoch": 0.0017389292262816715, + "grad_norm": 3.465822219848633, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7505, + "step": 248 + }, + { + "epoch": 0.0019562953795668804, + "grad_norm": 2.7967450618743896, + "learning_rate": 9.141546526867629e-06, + "loss": 0.7593, + "step": 279 + }, + { + "epoch": 0.002173661532852089, + "grad_norm": 3.5493738651275635, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7514, + "step": 310 + }, + { + "epoch": 0.0023910276861372984, + "grad_norm": 3.514606237411499, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7497, + "step": 341 + }, + { + "epoch": 0.0026083938394225073, + "grad_norm": 3.0246028900146484, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7473, + "step": 372 + }, + { + "epoch": 0.002825759992707716, + "grad_norm": 3.01147723197937, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.739, + "step": 403 + }, + { + "epoch": 0.003043126145992925, + "grad_norm": 3.566333293914795, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7447, + "step": 434 + }, + { + "epoch": 0.003260492299278134, + "grad_norm": 3.8698947429656982, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7447, + "step": 465 + }, + { + "epoch": 0.003477858452563343, + "grad_norm": 2.567028760910034, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7496, + "step": 496 + }, + { + "epoch": 0.003695224605848552, + "grad_norm": 2.8029377460479736, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.7324, + "step": 527 + }, + { + "epoch": 0.003912590759133761, + "grad_norm": 2.862530469894409, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7371, + "step": 558 + }, + { + "epoch": 0.00412995691241897, + "grad_norm": 2.9063901901245117, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.7398, + "step": 589 + }, + { + "epoch": 0.004347323065704178, + "grad_norm": 3.187814235687256, + "learning_rate": 2.031454783748362e-05, + "loss": 0.7362, + "step": 620 + }, + { + "epoch": 0.004564689218989388, + "grad_norm": 2.6158969402313232, + "learning_rate": 2.13302752293578e-05, + "loss": 0.7405, + "step": 651 + }, + { + "epoch": 0.004782055372274597, + "grad_norm": 2.702423334121704, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7313, + "step": 682 + }, + { + "epoch": 0.004999421525559805, + "grad_norm": 2.88313889503479, + "learning_rate": 2.336173001310616e-05, + "loss": 0.7337, + "step": 713 + }, + { + "epoch": 0.0052167876788450146, + "grad_norm": 2.8978841304779053, + "learning_rate": 2.437745740498034e-05, + "loss": 0.73, + "step": 744 + }, + { + "epoch": 0.005434153832130223, + "grad_norm": 2.791414976119995, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.7297, + "step": 775 + }, + { + "epoch": 0.005651519985415432, + "grad_norm": 2.432821035385132, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7326, + "step": 806 + }, + { + "epoch": 0.0058688861387006415, + "grad_norm": 2.430279493331909, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.7333, + "step": 837 + }, + { + "epoch": 0.00608625229198585, + "grad_norm": 2.5513761043548584, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.7296, + "step": 868 + }, + { + "epoch": 0.006303618445271059, + "grad_norm": 2.478562116622925, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.7303, + "step": 899 + }, + { + "epoch": 0.006520984598556268, + "grad_norm": 2.5496723651885986, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.7303, + "step": 930 + }, + { + "epoch": 0.006738350751841477, + "grad_norm": 2.114060640335083, + "learning_rate": 3.148754914809961e-05, + "loss": 0.7184, + "step": 961 + }, + { + "epoch": 0.006955716905126686, + "grad_norm": 2.325977325439453, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.7268, + "step": 992 + }, + { + "epoch": 0.0071730830584118945, + "grad_norm": 2.0549662113189697, + "learning_rate": 3.351900393184797e-05, + "loss": 0.724, + "step": 1023 + }, + { + "epoch": 0.007390449211697104, + "grad_norm": 1.913522481918335, + "learning_rate": 3.453473132372215e-05, + "loss": 0.7206, + "step": 1054 + }, + { + "epoch": 0.007607815364982313, + "grad_norm": 2.0651443004608154, + "learning_rate": 3.555045871559633e-05, + "loss": 0.7239, + "step": 1085 + }, + { + "epoch": 0.007825181518267521, + "grad_norm": 2.2482309341430664, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.728, + "step": 1116 + }, + { + "epoch": 0.00804254767155273, + "grad_norm": 2.349695920944214, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.721, + "step": 1147 + }, + { + "epoch": 0.00825991382483794, + "grad_norm": 2.218843460083008, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.7248, + "step": 1178 + }, + { + "epoch": 0.00847727997812315, + "grad_norm": 2.0839340686798096, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.7233, + "step": 1209 + }, + { + "epoch": 0.008694646131408357, + "grad_norm": 2.0757343769073486, + "learning_rate": 4.062909567496724e-05, + "loss": 0.7166, + "step": 1240 + }, + { + "epoch": 0.008912012284693566, + "grad_norm": 2.052342653274536, + "learning_rate": 4.164482306684142e-05, + "loss": 0.7256, + "step": 1271 + }, + { + "epoch": 0.009129378437978775, + "grad_norm": 2.1202704906463623, + "learning_rate": 4.26605504587156e-05, + "loss": 0.7131, + "step": 1302 + }, + { + "epoch": 0.009346744591263985, + "grad_norm": 2.1945958137512207, + "learning_rate": 4.367627785058978e-05, + "loss": 0.708, + "step": 1333 + }, + { + "epoch": 0.009564110744549194, + "grad_norm": 2.106307029724121, + "learning_rate": 4.469200524246396e-05, + "loss": 0.7139, + "step": 1364 + }, + { + "epoch": 0.009781476897834401, + "grad_norm": 2.3779594898223877, + "learning_rate": 4.570773263433814e-05, + "loss": 0.7205, + "step": 1395 + }, + { + "epoch": 0.00999884305111961, + "grad_norm": 2.001551866531372, + "learning_rate": 4.672346002621232e-05, + "loss": 0.7162, + "step": 1426 + }, + { + "epoch": 0.01021620920440482, + "grad_norm": 2.6994752883911133, + "learning_rate": 4.77391874180865e-05, + "loss": 0.7215, + "step": 1457 + }, + { + "epoch": 0.010433575357690029, + "grad_norm": 2.071122884750366, + "learning_rate": 4.875491480996068e-05, + "loss": 0.7095, + "step": 1488 + }, + { + "epoch": 0.010650941510975238, + "grad_norm": 2.3666610717773438, + "learning_rate": 4.977064220183487e-05, + "loss": 0.7089, + "step": 1519 + }, + { + "epoch": 0.010868307664260446, + "grad_norm": 2.130204677581787, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.7043, + "step": 1550 + }, + { + "epoch": 0.011085673817545655, + "grad_norm": 1.7413716316223145, + "learning_rate": 4.999955597496219e-05, + "loss": 0.7062, + "step": 1581 + }, + { + "epoch": 0.011303039970830864, + "grad_norm": 1.7087843418121338, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.7175, + "step": 1612 + }, + { + "epoch": 0.011520406124116074, + "grad_norm": 2.574871301651001, + "learning_rate": 4.999799067923527e-05, + "loss": 0.7076, + "step": 1643 + }, + { + "epoch": 0.011737772277401283, + "grad_norm": 2.0530433654785156, + "learning_rate": 4.999678487776908e-05, + "loss": 0.708, + "step": 1674 + }, + { + "epoch": 0.011955138430686492, + "grad_norm": 1.9184463024139404, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.7011, + "step": 1705 + }, + { + "epoch": 0.0121725045839717, + "grad_norm": 1.768115520477295, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6995, + "step": 1736 + }, + { + "epoch": 0.012389870737256909, + "grad_norm": 1.6692063808441162, + "learning_rate": 4.999147503179668e-05, + "loss": 0.7019, + "step": 1767 + }, + { + "epoch": 0.012607236890542118, + "grad_norm": 1.803944706916809, + "learning_rate": 4.998914100252672e-05, + "loss": 0.703, + "step": 1798 + }, + { + "epoch": 0.012824603043827328, + "grad_norm": 1.8957322835922241, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6979, + "step": 1829 + }, + { + "epoch": 0.013041969197112537, + "grad_norm": 1.544054627418518, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.7012, + "step": 1860 + }, + { + "epoch": 0.013259335350397744, + "grad_norm": 1.7066351175308228, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6949, + "step": 1891 + }, + { + "epoch": 0.013476701503682954, + "grad_norm": 1.7045214176177979, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.6871, + "step": 1922 + }, + { + "epoch": 0.013694067656968163, + "grad_norm": 2.0414485931396484, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6944, + "step": 1953 + }, + { + "epoch": 0.013911433810253372, + "grad_norm": 1.9210485219955444, + "learning_rate": 4.99692159912661e-05, + "loss": 0.7013, + "step": 1984 + }, + { + "epoch": 0.014128799963538581, + "grad_norm": 1.8733997344970703, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6991, + "step": 2015 + }, + { + "epoch": 0.014346166116823789, + "grad_norm": 1.7118934392929077, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6873, + "step": 2046 + }, + { + "epoch": 0.014563532270108998, + "grad_norm": 1.889208197593689, + "learning_rate": 4.995544899210594e-05, + "loss": 0.692, + "step": 2077 + }, + { + "epoch": 0.014780898423394207, + "grad_norm": 2.0081522464752197, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.6823, + "step": 2108 + }, + { + "epoch": 0.014998264576679417, + "grad_norm": 1.7950328588485718, + "learning_rate": 4.994486281210429e-05, + "loss": 0.686, + "step": 2139 + }, + { + "epoch": 0.015215630729964626, + "grad_norm": 1.758333444595337, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6827, + "step": 2170 + }, + { + "epoch": 0.015432996883249834, + "grad_norm": 1.496063470840454, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6896, + "step": 2201 + }, + { + "epoch": 0.015650363036535043, + "grad_norm": 1.6103190183639526, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6865, + "step": 2232 + }, + { + "epoch": 0.015867729189820254, + "grad_norm": 1.7069604396820068, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6836, + "step": 2263 + }, + { + "epoch": 0.01608509534310546, + "grad_norm": 1.8261148929595947, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.692, + "step": 2294 + }, + { + "epoch": 0.01630246149639067, + "grad_norm": 1.7522642612457275, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6832, + "step": 2325 + }, + { + "epoch": 0.01651982764967588, + "grad_norm": 1.6746042966842651, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6829, + "step": 2356 + }, + { + "epoch": 0.016737193802961087, + "grad_norm": 1.6785473823547363, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6842, + "step": 2387 + }, + { + "epoch": 0.0169545599562463, + "grad_norm": 1.936916470527649, + "learning_rate": 4.988329891293693e-05, + "loss": 0.6798, + "step": 2418 + }, + { + "epoch": 0.017171926109531506, + "grad_norm": 1.5104129314422607, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6743, + "step": 2449 + }, + { + "epoch": 0.017389292262816713, + "grad_norm": 1.3770678043365479, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.6874, + "step": 2480 + }, + { + "epoch": 0.017606658416101924, + "grad_norm": 1.4900861978530884, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.679, + "step": 2511 + }, + { + "epoch": 0.017824024569387132, + "grad_norm": 1.5600172281265259, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6719, + "step": 2542 + }, + { + "epoch": 0.018041390722672343, + "grad_norm": 1.660237431526184, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6773, + "step": 2573 + }, + { + "epoch": 0.01825875687595755, + "grad_norm": 1.4618791341781616, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6767, + "step": 2604 + }, + { + "epoch": 0.018476123029242758, + "grad_norm": 1.5567563772201538, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6706, + "step": 2635 + }, + { + "epoch": 0.01869348918252797, + "grad_norm": 1.4879790544509888, + "learning_rate": 4.980947995086024e-05, + "loss": 0.676, + "step": 2666 + }, + { + "epoch": 0.018910855335813177, + "grad_norm": 1.5527766942977905, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6734, + "step": 2697 + }, + { + "epoch": 0.019128221489098388, + "grad_norm": 1.7922642230987549, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6813, + "step": 2728 + }, + { + "epoch": 0.019345587642383595, + "grad_norm": 1.6453475952148438, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6749, + "step": 2759 + }, + { + "epoch": 0.019562953795668803, + "grad_norm": 1.4713648557662964, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6761, + "step": 2790 + }, + { + "epoch": 0.019780319948954014, + "grad_norm": 1.5513399839401245, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6739, + "step": 2821 + }, + { + "epoch": 0.01999768610223922, + "grad_norm": 1.431796908378601, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6639, + "step": 2852 + }, + { + "epoch": 0.020215052255524432, + "grad_norm": 1.4959752559661865, + "learning_rate": 4.973018858007122e-05, + "loss": 0.662, + "step": 2883 + }, + { + "epoch": 0.02043241840880964, + "grad_norm": 1.4675205945968628, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6659, + "step": 2914 + }, + { + "epoch": 0.020649784562094847, + "grad_norm": 1.7111692428588867, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6732, + "step": 2945 + }, + { + "epoch": 0.020867150715380058, + "grad_norm": 1.542748212814331, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6666, + "step": 2976 + }, + { + "epoch": 0.021084516868665266, + "grad_norm": 1.5013272762298584, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.667, + "step": 3007 + }, + { + "epoch": 0.021301883021950477, + "grad_norm": 1.57589852809906, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6668, + "step": 3038 + }, + { + "epoch": 0.021519249175235684, + "grad_norm": 1.612727403640747, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6653, + "step": 3069 + }, + { + "epoch": 0.021736615328520892, + "grad_norm": 1.5110771656036377, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6694, + "step": 3100 + }, + { + "epoch": 0.021953981481806103, + "grad_norm": 1.5836044549942017, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6634, + "step": 3131 + }, + { + "epoch": 0.02217134763509131, + "grad_norm": 1.5767654180526733, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6633, + "step": 3162 + }, + { + "epoch": 0.02238871378837652, + "grad_norm": 1.6058984994888306, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6646, + "step": 3193 + }, + { + "epoch": 0.02260607994166173, + "grad_norm": 1.4564005136489868, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.663, + "step": 3224 + }, + { + "epoch": 0.022823446094946936, + "grad_norm": 1.5274450778961182, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6554, + "step": 3255 + }, + { + "epoch": 0.023040812248232147, + "grad_norm": 1.524122953414917, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6666, + "step": 3286 + }, + { + "epoch": 0.023258178401517355, + "grad_norm": 1.5017430782318115, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6661, + "step": 3317 + }, + { + "epoch": 0.023475544554802566, + "grad_norm": 1.4247208833694458, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6563, + "step": 3348 + }, + { + "epoch": 0.023692910708087773, + "grad_norm": 1.3451093435287476, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6555, + "step": 3379 + }, + { + "epoch": 0.023910276861372984, + "grad_norm": 1.418442726135254, + "learning_rate": 4.948079823064559e-05, + "loss": 0.6587, + "step": 3410 + }, + { + "epoch": 0.024127643014658192, + "grad_norm": 1.5460575819015503, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6602, + "step": 3441 + }, + { + "epoch": 0.0243450091679434, + "grad_norm": 1.4053966999053955, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6578, + "step": 3472 + }, + { + "epoch": 0.02456237532122861, + "grad_norm": 1.420806646347046, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6544, + "step": 3503 + }, + { + "epoch": 0.024779741474513818, + "grad_norm": 1.3220508098602295, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6642, + "step": 3534 + }, + { + "epoch": 0.02499710762779903, + "grad_norm": 1.3819468021392822, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6534, + "step": 3565 + }, + { + "epoch": 0.025214473781084237, + "grad_norm": 1.4046236276626587, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6547, + "step": 3596 + }, + { + "epoch": 0.025431839934369444, + "grad_norm": 1.965153455734253, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6582, + "step": 3627 + }, + { + "epoch": 0.025649206087654655, + "grad_norm": 1.6758291721343994, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6536, + "step": 3658 + }, + { + "epoch": 0.025866572240939863, + "grad_norm": 1.4467246532440186, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6564, + "step": 3689 + }, + { + "epoch": 0.026083938394225074, + "grad_norm": 1.5305490493774414, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6626, + "step": 3720 + }, + { + "epoch": 0.02630130454751028, + "grad_norm": 1.3826133012771606, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6624, + "step": 3751 + }, + { + "epoch": 0.02651867070079549, + "grad_norm": 1.4920278787612915, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6417, + "step": 3782 + }, + { + "epoch": 0.0267360368540807, + "grad_norm": 14.533783912658691, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.647, + "step": 3813 + }, + { + "epoch": 0.026953403007365907, + "grad_norm": 1.4659409523010254, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6514, + "step": 3844 + }, + { + "epoch": 0.02717076916065112, + "grad_norm": 1.4820138216018677, + "learning_rate": 4.919443027766935e-05, + "loss": 0.6477, + "step": 3875 + }, + { + "epoch": 0.027388135313936326, + "grad_norm": 1.2543880939483643, + "learning_rate": 4.91731463569418e-05, + "loss": 0.6505, + "step": 3906 + }, + { + "epoch": 0.027605501467221533, + "grad_norm": 1.640926718711853, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6488, + "step": 3937 + }, + { + "epoch": 0.027822867620506744, + "grad_norm": 1.3990615606307983, + "learning_rate": 4.912976038673786e-05, + "loss": 0.658, + "step": 3968 + }, + { + "epoch": 0.028040233773791952, + "grad_norm": 1.2064067125320435, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6487, + "step": 3999 + }, + { + "epoch": 0.028257599927077163, + "grad_norm": 1.5103769302368164, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6569, + "step": 4030 + }, + { + "epoch": 0.02847496608036237, + "grad_norm": 1.5314087867736816, + "learning_rate": 4.906263980464644e-05, + "loss": 0.6638, + "step": 4061 + }, + { + "epoch": 0.028692332233647578, + "grad_norm": 1.4619168043136597, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6505, + "step": 4092 + }, + { + "epoch": 0.02890969838693279, + "grad_norm": 1.3979246616363525, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6505, + "step": 4123 + }, + { + "epoch": 0.029127064540217996, + "grad_norm": 1.3866580724716187, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6461, + "step": 4154 + }, + { + "epoch": 0.029344430693503207, + "grad_norm": 1.4662801027297974, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6448, + "step": 4185 + }, + { + "epoch": 0.029561796846788415, + "grad_norm": 1.3171806335449219, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6525, + "step": 4216 + }, + { + "epoch": 0.029779163000073623, + "grad_norm": 1.3289718627929688, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6445, + "step": 4247 + }, + { + "epoch": 0.029996529153358834, + "grad_norm": 1.3527250289916992, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6449, + "step": 4278 + }, + { + "epoch": 0.03021389530664404, + "grad_norm": 1.314674735069275, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6559, + "step": 4309 + }, + { + "epoch": 0.030431261459929252, + "grad_norm": 1.2237507104873657, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6468, + "step": 4340 + }, + { + "epoch": 0.03064862761321446, + "grad_norm": 1.423965573310852, + "learning_rate": 4.882129447892753e-05, + "loss": 0.6465, + "step": 4371 + }, + { + "epoch": 0.030865993766499667, + "grad_norm": 1.271371603012085, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6471, + "step": 4402 + }, + { + "epoch": 0.031083359919784878, + "grad_norm": 1.5492364168167114, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6444, + "step": 4433 + }, + { + "epoch": 0.031300726073070086, + "grad_norm": 1.5139328241348267, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6512, + "step": 4464 + }, + { + "epoch": 0.03151809222635529, + "grad_norm": 1.4510358572006226, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6485, + "step": 4495 + }, + { + "epoch": 0.03173545837964051, + "grad_norm": 1.3228867053985596, + "learning_rate": 4.869052379269719e-05, + "loss": 0.6416, + "step": 4526 + }, + { + "epoch": 0.031952824532925715, + "grad_norm": 1.2731959819793701, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6477, + "step": 4557 + }, + { + "epoch": 0.03217019068621092, + "grad_norm": 1.2540090084075928, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6412, + "step": 4588 + }, + { + "epoch": 0.03238755683949613, + "grad_norm": 1.662154197692871, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6464, + "step": 4619 + }, + { + "epoch": 0.03260492299278134, + "grad_norm": 1.5419702529907227, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6421, + "step": 4650 + }, + { + "epoch": 0.03282228914606655, + "grad_norm": 1.6409112215042114, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6468, + "step": 4681 + }, + { + "epoch": 0.03303965529935176, + "grad_norm": 1.1416597366333008, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6396, + "step": 4712 + }, + { + "epoch": 0.03325702145263697, + "grad_norm": 1.215846061706543, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6517, + "step": 4743 + }, + { + "epoch": 0.033474387605922175, + "grad_norm": 1.3075084686279297, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6389, + "step": 4774 + }, + { + "epoch": 0.03369175375920738, + "grad_norm": 1.6089972257614136, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6371, + "step": 4805 + }, + { + "epoch": 0.0339091199124926, + "grad_norm": 1.3927685022354126, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6355, + "step": 4836 + }, + { + "epoch": 0.034126486065777804, + "grad_norm": 1.197952389717102, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6415, + "step": 4867 + }, + { + "epoch": 0.03434385221906301, + "grad_norm": 1.2738877534866333, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6442, + "step": 4898 + }, + { + "epoch": 0.03456121837234822, + "grad_norm": 1.5164271593093872, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6406, + "step": 4929 + }, + { + "epoch": 0.03477858452563343, + "grad_norm": 1.242473840713501, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6441, + "step": 4960 + }, + { + "epoch": 0.03499595067891864, + "grad_norm": 1.2919869422912598, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6487, + "step": 4991 + }, + { + "epoch": 0.03521331683220385, + "grad_norm": 2.6493895053863525, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6398, + "step": 5022 + }, + { + "epoch": 0.035430682985489056, + "grad_norm": 1.2134305238723755, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6398, + "step": 5053 + }, + { + "epoch": 0.035648049138774264, + "grad_norm": 1.1468703746795654, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.633, + "step": 5084 + }, + { + "epoch": 0.03586541529205947, + "grad_norm": 1.3923726081848145, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6295, + "step": 5115 + }, + { + "epoch": 0.036082781445344686, + "grad_norm": 1.5531644821166992, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6345, + "step": 5146 + }, + { + "epoch": 0.036300147598629894, + "grad_norm": 1.201889991760254, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6414, + "step": 5177 + }, + { + "epoch": 0.0365175137519151, + "grad_norm": 1.4365577697753906, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6364, + "step": 5208 + }, + { + "epoch": 0.03673487990520031, + "grad_norm": 1.210980772972107, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6353, + "step": 5239 + }, + { + "epoch": 0.036952246058485516, + "grad_norm": 1.379381775856018, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6293, + "step": 5270 + }, + { + "epoch": 0.03716961221177073, + "grad_norm": 1.2263178825378418, + "learning_rate": 4.793722210363262e-05, + "loss": 0.629, + "step": 5301 + }, + { + "epoch": 0.03738697836505594, + "grad_norm": 1.2448405027389526, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6474, + "step": 5332 + }, + { + "epoch": 0.037604344518341146, + "grad_norm": 1.257132887840271, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6426, + "step": 5363 + }, + { + "epoch": 0.03782171067162635, + "grad_norm": 1.3763643503189087, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6396, + "step": 5394 + }, + { + "epoch": 0.03803907682491156, + "grad_norm": 1.3164068460464478, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6343, + "step": 5425 + }, + { + "epoch": 0.038256442978196775, + "grad_norm": 1.2034872770309448, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.629, + "step": 5456 + }, + { + "epoch": 0.03847380913148198, + "grad_norm": 1.3156630992889404, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6403, + "step": 5487 + }, + { + "epoch": 0.03869117528476719, + "grad_norm": 1.0719150304794312, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6318, + "step": 5518 + }, + { + "epoch": 0.0389085414380524, + "grad_norm": 1.3054882287979126, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6419, + "step": 5549 + }, + { + "epoch": 0.039125907591337605, + "grad_norm": 1.2081729173660278, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6461, + "step": 5580 + }, + { + "epoch": 0.03934327374462282, + "grad_norm": 1.1728904247283936, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6473, + "step": 5611 + }, + { + "epoch": 0.03956063989790803, + "grad_norm": 1.2552399635314941, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.6373, + "step": 5642 + }, + { + "epoch": 0.039778006051193235, + "grad_norm": 1.299212098121643, + "learning_rate": 4.751783684659e-05, + "loss": 0.6234, + "step": 5673 + }, + { + "epoch": 0.03999537220447844, + "grad_norm": 2.0746827125549316, + "learning_rate": 4.748122674890348e-05, + "loss": 0.6264, + "step": 5704 + }, + { + "epoch": 0.04021273835776365, + "grad_norm": 1.3712407350540161, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6252, + "step": 5735 + }, + { + "epoch": 0.040430104511048864, + "grad_norm": 1.2094186544418335, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6265, + "step": 5766 + }, + { + "epoch": 0.04064747066433407, + "grad_norm": 1.2487757205963135, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6305, + "step": 5797 + }, + { + "epoch": 0.04086483681761928, + "grad_norm": 1.5805151462554932, + "learning_rate": 4.733225355658999e-05, + "loss": 0.631, + "step": 5828 + }, + { + "epoch": 0.04108220297090449, + "grad_norm": 1.2615118026733398, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.636, + "step": 5859 + }, + { + "epoch": 0.041299569124189695, + "grad_norm": 1.6970707178115845, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6271, + "step": 5890 + }, + { + "epoch": 0.04151693527747491, + "grad_norm": 1.4279624223709106, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6267, + "step": 5921 + }, + { + "epoch": 0.041734301430760117, + "grad_norm": 1.3471580743789673, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6289, + "step": 5952 + }, + { + "epoch": 0.041951667584045324, + "grad_norm": 1.5088621377944946, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6263, + "step": 5983 + }, + { + "epoch": 0.04216903373733053, + "grad_norm": 1.3061436414718628, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6266, + "step": 6014 + }, + { + "epoch": 0.04238639989061574, + "grad_norm": 1.1800014972686768, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6259, + "step": 6045 + }, + { + "epoch": 0.042603766043900954, + "grad_norm": 1.195177435874939, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6251, + "step": 6076 + }, + { + "epoch": 0.04282113219718616, + "grad_norm": 1.1905118227005005, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6217, + "step": 6107 + }, + { + "epoch": 0.04303849835047137, + "grad_norm": 1.2270928621292114, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6352, + "step": 6138 + }, + { + "epoch": 0.043255864503756576, + "grad_norm": 1.209226369857788, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6249, + "step": 6169 + }, + { + "epoch": 0.043473230657041784, + "grad_norm": 1.1949187517166138, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6371, + "step": 6200 + }, + { + "epoch": 0.043690596810327, + "grad_norm": 1.2346535921096802, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6177, + "step": 6231 + }, + { + "epoch": 0.043907962963612206, + "grad_norm": 1.2187124490737915, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6268, + "step": 6262 + }, + { + "epoch": 0.04412532911689741, + "grad_norm": 1.2187339067459106, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6261, + "step": 6293 + }, + { + "epoch": 0.04434269527018262, + "grad_norm": 1.320764422416687, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6212, + "step": 6324 + }, + { + "epoch": 0.04456006142346783, + "grad_norm": 1.3396878242492676, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.6241, + "step": 6355 + }, + { + "epoch": 0.04477742757675304, + "grad_norm": 1.2472412586212158, + "learning_rate": 4.661238099862658e-05, + "loss": 0.6241, + "step": 6386 + }, + { + "epoch": 0.04499479373003825, + "grad_norm": 1.3773880004882812, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6245, + "step": 6417 + }, + { + "epoch": 0.04521215988332346, + "grad_norm": 1.1602933406829834, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.6221, + "step": 6448 + }, + { + "epoch": 0.045429526036608665, + "grad_norm": 1.1719809770584106, + "learning_rate": 4.648464661063478e-05, + "loss": 0.629, + "step": 6479 + }, + { + "epoch": 0.04564689218989387, + "grad_norm": 1.1973191499710083, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.6239, + "step": 6510 + }, + { + "epoch": 0.04586425834317909, + "grad_norm": 1.1805142164230347, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.6242, + "step": 6541 + }, + { + "epoch": 0.046081624496464295, + "grad_norm": 1.3194178342819214, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6316, + "step": 6572 + }, + { + "epoch": 0.0462989906497495, + "grad_norm": 1.1395046710968018, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6207, + "step": 6603 + }, + { + "epoch": 0.04651635680303471, + "grad_norm": 1.5031641721725464, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6218, + "step": 6634 + }, + { + "epoch": 0.04673372295631992, + "grad_norm": 1.0985206365585327, + "learning_rate": 4.622264489304762e-05, + "loss": 0.6198, + "step": 6665 + }, + { + "epoch": 0.04695108910960513, + "grad_norm": 1.2820847034454346, + "learning_rate": 4.617813681048434e-05, + "loss": 0.626, + "step": 6696 + }, + { + "epoch": 0.04716845526289034, + "grad_norm": 1.4487061500549316, + "learning_rate": 4.61333897355256e-05, + "loss": 0.622, + "step": 6727 + }, + { + "epoch": 0.04738582141617555, + "grad_norm": 1.1577301025390625, + "learning_rate": 4.608840417313604e-05, + "loss": 0.6244, + "step": 6758 + }, + { + "epoch": 0.047603187569460755, + "grad_norm": 1.1363381147384644, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6206, + "step": 6789 + }, + { + "epoch": 0.04782055372274597, + "grad_norm": 1.22281813621521, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6231, + "step": 6820 + }, + { + "epoch": 0.048037919876031177, + "grad_norm": 1.156031847000122, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6223, + "step": 6851 + }, + { + "epoch": 0.048255286029316384, + "grad_norm": 1.1688473224639893, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6293, + "step": 6882 + }, + { + "epoch": 0.04847265218260159, + "grad_norm": 1.4265236854553223, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6109, + "step": 6913 + }, + { + "epoch": 0.0486900183358868, + "grad_norm": 1.1242969036102295, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6256, + "step": 6944 + }, + { + "epoch": 0.048907384489172014, + "grad_norm": 1.3943792581558228, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6093, + "step": 6975 + }, + { + "epoch": 0.04912475064245722, + "grad_norm": 1.0761889219284058, + "learning_rate": 4.571999560773736e-05, + "loss": 0.6207, + "step": 7006 + }, + { + "epoch": 0.04934211679574243, + "grad_norm": 1.2784191370010376, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6227, + "step": 7037 + }, + { + "epoch": 0.049559482949027636, + "grad_norm": 1.4215577840805054, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6148, + "step": 7068 + }, + { + "epoch": 0.049776849102312844, + "grad_norm": 1.1120682954788208, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.6192, + "step": 7099 + }, + { + "epoch": 0.04999421525559806, + "grad_norm": 1.1718186140060425, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6179, + "step": 7130 + }, + { + "epoch": 0.050211581408883266, + "grad_norm": 1.5078628063201904, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6245, + "step": 7161 + }, + { + "epoch": 0.05042894756216847, + "grad_norm": 1.1740144491195679, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6242, + "step": 7192 + }, + { + "epoch": 0.05064631371545368, + "grad_norm": 1.2450133562088013, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6194, + "step": 7223 + }, + { + "epoch": 0.05086367986873889, + "grad_norm": 1.181887149810791, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6188, + "step": 7254 + }, + { + "epoch": 0.0510810460220241, + "grad_norm": 1.172691822052002, + "learning_rate": 4.528766329570536e-05, + "loss": 0.6191, + "step": 7285 + }, + { + "epoch": 0.05129841217530931, + "grad_norm": 1.148863673210144, + "learning_rate": 4.523847434837447e-05, + "loss": 0.6234, + "step": 7316 + }, + { + "epoch": 0.05151577832859452, + "grad_norm": 1.2533507347106934, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6166, + "step": 7347 + }, + { + "epoch": 0.051733144481879725, + "grad_norm": 1.1888121366500854, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6176, + "step": 7378 + }, + { + "epoch": 0.05195051063516493, + "grad_norm": 1.0766541957855225, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6137, + "step": 7409 + }, + { + "epoch": 0.05216787678845015, + "grad_norm": 1.2277822494506836, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6231, + "step": 7440 + }, + { + "epoch": 0.052385242941735355, + "grad_norm": 1.0808931589126587, + "learning_rate": 4.498911497712155e-05, + "loss": 0.6209, + "step": 7471 + }, + { + "epoch": 0.05260260909502056, + "grad_norm": 1.0840133428573608, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6144, + "step": 7502 + }, + { + "epoch": 0.05281997524830577, + "grad_norm": 1.5881050825119019, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6114, + "step": 7533 + }, + { + "epoch": 0.05303734140159098, + "grad_norm": 1.1180490255355835, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.6134, + "step": 7564 + }, + { + "epoch": 0.05325470755487619, + "grad_norm": 1.1469063758850098, + "learning_rate": 4.478556380141218e-05, + "loss": 0.6122, + "step": 7595 + }, + { + "epoch": 0.0534720737081614, + "grad_norm": 1.2977004051208496, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.615, + "step": 7626 + }, + { + "epoch": 0.05368943986144661, + "grad_norm": 1.0556434392929077, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.6187, + "step": 7657 + }, + { + "epoch": 0.053906806014731815, + "grad_norm": 1.101298451423645, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6123, + "step": 7688 + }, + { + "epoch": 0.05412417216801702, + "grad_norm": 1.262608528137207, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6149, + "step": 7719 + }, + { + "epoch": 0.05434153832130224, + "grad_norm": 1.1554538011550903, + "learning_rate": 4.452610552959183e-05, + "loss": 0.6168, + "step": 7750 + }, + { + "epoch": 0.054558904474587444, + "grad_norm": 1.1653157472610474, + "learning_rate": 4.447355047201428e-05, + "loss": 0.615, + "step": 7781 + }, + { + "epoch": 0.05477627062787265, + "grad_norm": 1.0511231422424316, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.6215, + "step": 7812 + }, + { + "epoch": 0.05499363678115786, + "grad_norm": 1.3468266725540161, + "learning_rate": 4.436778168330484e-05, + "loss": 0.6221, + "step": 7843 + }, + { + "epoch": 0.05521100293444307, + "grad_norm": 1.3238797187805176, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.611, + "step": 7874 + }, + { + "epoch": 0.05542836908772828, + "grad_norm": 1.1107763051986694, + "learning_rate": 4.42611386459262e-05, + "loss": 0.6119, + "step": 7905 + }, + { + "epoch": 0.05564573524101349, + "grad_norm": 1.3667259216308594, + "learning_rate": 4.420749078676133e-05, + "loss": 0.6134, + "step": 7936 + }, + { + "epoch": 0.055863101394298696, + "grad_norm": 1.0336949825286865, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.613, + "step": 7967 + }, + { + "epoch": 0.056080467547583904, + "grad_norm": 1.0911316871643066, + "learning_rate": 4.409954541451762e-05, + "loss": 0.6107, + "step": 7998 + }, + { + "epoch": 0.05629783370086911, + "grad_norm": 1.2516382932662964, + "learning_rate": 4.404524911958764e-05, + "loss": 0.6138, + "step": 8029 + }, + { + "epoch": 0.056515199854154326, + "grad_norm": 1.1678277254104614, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6162, + "step": 8060 + }, + { + "epoch": 0.05673256600743953, + "grad_norm": 1.4072996377944946, + "learning_rate": 4.393601237573607e-05, + "loss": 0.614, + "step": 8091 + }, + { + "epoch": 0.05694993216072474, + "grad_norm": 1.2312837839126587, + "learning_rate": 4.388107315953628e-05, + "loss": 0.6203, + "step": 8122 + }, + { + "epoch": 0.05716729831400995, + "grad_norm": 1.1720649003982544, + "learning_rate": 4.382592087299212e-05, + "loss": 0.619, + "step": 8153 + }, + { + "epoch": 0.057384664467295156, + "grad_norm": 1.0711950063705444, + "learning_rate": 4.377055613848964e-05, + "loss": 0.6124, + "step": 8184 + }, + { + "epoch": 0.05760203062058037, + "grad_norm": 1.1664263010025024, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6073, + "step": 8215 + }, + { + "epoch": 0.05781939677386558, + "grad_norm": 0.9937831163406372, + "learning_rate": 4.365919182713416e-05, + "loss": 0.6101, + "step": 8246 + }, + { + "epoch": 0.058036762927150785, + "grad_norm": 1.0545841455459595, + "learning_rate": 4.360319350701226e-05, + "loss": 0.6089, + "step": 8277 + }, + { + "epoch": 0.05825412908043599, + "grad_norm": 1.120007038116455, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6078, + "step": 8308 + }, + { + "epoch": 0.0584714952337212, + "grad_norm": 1.1087831258773804, + "learning_rate": 4.349056769754021e-05, + "loss": 0.601, + "step": 8339 + }, + { + "epoch": 0.058688861387006415, + "grad_norm": 1.1370675563812256, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.6103, + "step": 8370 + }, + { + "epoch": 0.05890622754029162, + "grad_norm": 1.182654857635498, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.6115, + "step": 8401 + }, + { + "epoch": 0.05912359369357683, + "grad_norm": 1.2606432437896729, + "learning_rate": 4.332006561018488e-05, + "loss": 0.6139, + "step": 8432 + }, + { + "epoch": 0.05934095984686204, + "grad_norm": 1.1618047952651978, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.6053, + "step": 8463 + }, + { + "epoch": 0.059558326000147245, + "grad_norm": 1.1281821727752686, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.6084, + "step": 8494 + }, + { + "epoch": 0.05977569215343246, + "grad_norm": 1.15248703956604, + "learning_rate": 4.314770288177384e-05, + "loss": 0.6128, + "step": 8525 + }, + { + "epoch": 0.05999305830671767, + "grad_norm": 1.2170960903167725, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6134, + "step": 8556 + }, + { + "epoch": 0.060210424460002875, + "grad_norm": 1.085260033607483, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.6212, + "step": 8587 + }, + { + "epoch": 0.06042779061328808, + "grad_norm": 1.2243624925613403, + "learning_rate": 4.297349701798505e-05, + "loss": 0.605, + "step": 8618 + }, + { + "epoch": 0.06064515676657329, + "grad_norm": 1.124769687652588, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.6074, + "step": 8649 + }, + { + "epoch": 0.060862522919858504, + "grad_norm": 1.893545150756836, + "learning_rate": 4.285634454093198e-05, + "loss": 0.6055, + "step": 8680 + }, + { + "epoch": 0.06107988907314371, + "grad_norm": 1.2186870574951172, + "learning_rate": 4.279746571169086e-05, + "loss": 0.6121, + "step": 8711 + }, + { + "epoch": 0.06129725522642892, + "grad_norm": 1.0006957054138184, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.6079, + "step": 8742 + }, + { + "epoch": 0.06151462137971413, + "grad_norm": 1.0913995504379272, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.6107, + "step": 8773 + }, + { + "epoch": 0.061731987532999334, + "grad_norm": 1.4647611379623413, + "learning_rate": 4.261962684116106e-05, + "loss": 0.6188, + "step": 8804 + }, + { + "epoch": 0.06194935368628455, + "grad_norm": 1.0805052518844604, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.6137, + "step": 8835 + }, + { + "epoch": 0.062166719839569756, + "grad_norm": 1.0512675046920776, + "learning_rate": 4.250007230372134e-05, + "loss": 0.6052, + "step": 8866 + }, + { + "epoch": 0.062384085992854964, + "grad_norm": 1.1758863925933838, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.6148, + "step": 8897 + }, + { + "epoch": 0.06260145214614017, + "grad_norm": 1.0526351928710938, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.5973, + "step": 8928 + }, + { + "epoch": 0.06281881829942539, + "grad_norm": 1.077563762664795, + "learning_rate": 4.231926105340768e-05, + "loss": 0.6114, + "step": 8959 + }, + { + "epoch": 0.06303618445271059, + "grad_norm": 1.0116938352584839, + "learning_rate": 4.225859883654776e-05, + "loss": 0.6146, + "step": 8990 + }, + { + "epoch": 0.0632535506059958, + "grad_norm": 1.1967271566390991, + "learning_rate": 4.219774185874569e-05, + "loss": 0.5972, + "step": 9021 + }, + { + "epoch": 0.06347091675928102, + "grad_norm": 1.1610004901885986, + "learning_rate": 4.213669080676418e-05, + "loss": 0.6081, + "step": 9052 + }, + { + "epoch": 0.06368828291256622, + "grad_norm": 1.0253360271453857, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.6118, + "step": 9083 + }, + { + "epoch": 0.06390564906585143, + "grad_norm": 1.140599012374878, + "learning_rate": 4.201400923825648e-05, + "loss": 0.6091, + "step": 9114 + }, + { + "epoch": 0.06412301521913663, + "grad_norm": 1.0821545124053955, + "learning_rate": 4.195238010617511e-05, + "loss": 0.6028, + "step": 9145 + }, + { + "epoch": 0.06434038137242185, + "grad_norm": 1.2161924839019775, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6113, + "step": 9176 + }, + { + "epoch": 0.06455774752570706, + "grad_norm": 1.128986120223999, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.6125, + "step": 9207 + }, + { + "epoch": 0.06477511367899226, + "grad_norm": 1.0566622018814087, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.6055, + "step": 9238 + }, + { + "epoch": 0.06499247983227747, + "grad_norm": 1.0849272012710571, + "learning_rate": 4.170395751189495e-05, + "loss": 0.6059, + "step": 9269 + }, + { + "epoch": 0.06520984598556268, + "grad_norm": 1.2344416379928589, + "learning_rate": 4.164137885110921e-05, + "loss": 0.6081, + "step": 9300 + }, + { + "epoch": 0.06542721213884789, + "grad_norm": 1.0132557153701782, + "learning_rate": 4.157861239462495e-05, + "loss": 0.6103, + "step": 9331 + }, + { + "epoch": 0.0656445782921331, + "grad_norm": 1.0176981687545776, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.6086, + "step": 9362 + }, + { + "epoch": 0.0658619444454183, + "grad_norm": 1.2429553270339966, + "learning_rate": 4.145251892991588e-05, + "loss": 0.6091, + "step": 9393 + }, + { + "epoch": 0.06607931059870352, + "grad_norm": 1.1481480598449707, + "learning_rate": 4.138919334463868e-05, + "loss": 0.6048, + "step": 9424 + }, + { + "epoch": 0.06629667675198872, + "grad_norm": 0.9623891711235046, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.5995, + "step": 9455 + }, + { + "epoch": 0.06651404290527393, + "grad_norm": 1.2031786441802979, + "learning_rate": 4.126198804133398e-05, + "loss": 0.6008, + "step": 9486 + }, + { + "epoch": 0.06673140905855915, + "grad_norm": 1.3027530908584595, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.5993, + "step": 9517 + }, + { + "epoch": 0.06694877521184435, + "grad_norm": 1.1712346076965332, + "learning_rate": 4.113404868280107e-05, + "loss": 0.5996, + "step": 9548 + }, + { + "epoch": 0.06716614136512956, + "grad_norm": 1.1397625207901, + "learning_rate": 4.106980553625457e-05, + "loss": 0.5979, + "step": 9579 + }, + { + "epoch": 0.06738350751841476, + "grad_norm": 1.122983694076538, + "learning_rate": 4.100538104413674e-05, + "loss": 0.599, + "step": 9610 + }, + { + "epoch": 0.06760087367169998, + "grad_norm": 1.0782618522644043, + "learning_rate": 4.09407759334692e-05, + "loss": 0.6005, + "step": 9641 + }, + { + "epoch": 0.0678182398249852, + "grad_norm": 1.0563392639160156, + "learning_rate": 4.087599093331186e-05, + "loss": 0.6105, + "step": 9672 + }, + { + "epoch": 0.0680356059782704, + "grad_norm": 1.0684717893600464, + "learning_rate": 4.081102677475462e-05, + "loss": 0.6092, + "step": 9703 + }, + { + "epoch": 0.06825297213155561, + "grad_norm": 1.152811884880066, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.6035, + "step": 9734 + }, + { + "epoch": 0.06847033828484081, + "grad_norm": 1.1413112878799438, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.6018, + "step": 9765 + }, + { + "epoch": 0.06868770443812602, + "grad_norm": 0.981604814529419, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.6035, + "step": 9796 + }, + { + "epoch": 0.06890507059141124, + "grad_norm": 1.0713250637054443, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.597, + "step": 9827 + }, + { + "epoch": 0.06912243674469644, + "grad_norm": 0.949456512928009, + "learning_rate": 4.048354433517794e-05, + "loss": 0.6063, + "step": 9858 + }, + { + "epoch": 0.06933980289798165, + "grad_norm": 1.0702073574066162, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.5963, + "step": 9889 + }, + { + "epoch": 0.06955716905126685, + "grad_norm": 0.9974942803382874, + "learning_rate": 4.035132306369438e-05, + "loss": 0.611, + "step": 9920 + }, + { + "epoch": 0.06977453520455207, + "grad_norm": 1.0833642482757568, + "learning_rate": 4.028495219804555e-05, + "loss": 0.6014, + "step": 9951 + }, + { + "epoch": 0.06999190135783728, + "grad_norm": 1.181322693824768, + "learning_rate": 4.021840884378864e-05, + "loss": 0.599, + "step": 9982 + }, + { + "epoch": 0.07020926751112248, + "grad_norm": 1.0088189840316772, + "learning_rate": 4.015169375185633e-05, + "loss": 0.5955, + "step": 10013 + }, + { + "epoch": 0.0704266336644077, + "grad_norm": 1.0920222997665405, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.5985, + "step": 10044 + }, + { + "epoch": 0.0706439998176929, + "grad_norm": 1.0450975894927979, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.6069, + "step": 10075 + }, + { + "epoch": 0.07086136597097811, + "grad_norm": 1.1613439321517944, + "learning_rate": 3.995052558835377e-05, + "loss": 0.5992, + "step": 10106 + }, + { + "epoch": 0.07107873212426333, + "grad_norm": 1.2720811367034912, + "learning_rate": 3.988313109368017e-05, + "loss": 0.6012, + "step": 10137 + }, + { + "epoch": 0.07129609827754853, + "grad_norm": 1.0315334796905518, + "learning_rate": 3.981556864489504e-05, + "loss": 0.6149, + "step": 10168 + }, + { + "epoch": 0.07151346443083374, + "grad_norm": 1.3320891857147217, + "learning_rate": 3.974783900443142e-05, + "loss": 0.6013, + "step": 10199 + }, + { + "epoch": 0.07173083058411894, + "grad_norm": 1.7277195453643799, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.5983, + "step": 10230 + }, + { + "epoch": 0.07194819673740416, + "grad_norm": 1.1056753396987915, + "learning_rate": 3.961188120762596e-05, + "loss": 0.5952, + "step": 10261 + }, + { + "epoch": 0.07216556289068937, + "grad_norm": 1.3461558818817139, + "learning_rate": 3.954365458554938e-05, + "loss": 0.5975, + "step": 10292 + }, + { + "epoch": 0.07238292904397457, + "grad_norm": 1.1405422687530518, + "learning_rate": 3.947526384030751e-05, + "loss": 0.602, + "step": 10323 + }, + { + "epoch": 0.07260029519725979, + "grad_norm": 1.2912482023239136, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.6014, + "step": 10354 + }, + { + "epoch": 0.07281766135054499, + "grad_norm": 1.1548583507537842, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.6053, + "step": 10385 + }, + { + "epoch": 0.0730350275038302, + "grad_norm": 1.0558302402496338, + "learning_rate": 3.926911459260109e-05, + "loss": 0.6062, + "step": 10416 + }, + { + "epoch": 0.07325239365711542, + "grad_norm": 1.1658024787902832, + "learning_rate": 3.920007509089102e-05, + "loss": 0.6016, + "step": 10447 + }, + { + "epoch": 0.07346975981040062, + "grad_norm": 1.1093552112579346, + "learning_rate": 3.913087534326357e-05, + "loss": 0.6025, + "step": 10478 + }, + { + "epoch": 0.07368712596368583, + "grad_norm": 1.0908467769622803, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.5954, + "step": 10509 + }, + { + "epoch": 0.07390449211697103, + "grad_norm": 1.0856796503067017, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.5983, + "step": 10540 + }, + { + "epoch": 0.07412185827025625, + "grad_norm": 1.0330291986465454, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.5955, + "step": 10571 + }, + { + "epoch": 0.07433922442354146, + "grad_norm": 0.9688258171081543, + "learning_rate": 3.885248953871491e-05, + "loss": 0.6001, + "step": 10602 + }, + { + "epoch": 0.07455659057682666, + "grad_norm": 1.1808383464813232, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.5996, + "step": 10633 + }, + { + "epoch": 0.07477395673011188, + "grad_norm": 1.1267834901809692, + "learning_rate": 3.871235554965218e-05, + "loss": 0.5962, + "step": 10664 + }, + { + "epoch": 0.07499132288339708, + "grad_norm": 1.1001136302947998, + "learning_rate": 3.864205604623078e-05, + "loss": 0.6031, + "step": 10695 + }, + { + "epoch": 0.07520868903668229, + "grad_norm": 1.009621024131775, + "learning_rate": 3.857160259406107e-05, + "loss": 0.5976, + "step": 10726 + }, + { + "epoch": 0.0754260551899675, + "grad_norm": 0.933811366558075, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.604, + "step": 10757 + }, + { + "epoch": 0.0756434213432527, + "grad_norm": 1.2187680006027222, + "learning_rate": 3.843023702543556e-05, + "loss": 0.5983, + "step": 10788 + }, + { + "epoch": 0.07586078749653792, + "grad_norm": 1.0286976099014282, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.596, + "step": 10819 + }, + { + "epoch": 0.07607815364982312, + "grad_norm": 1.0752719640731812, + "learning_rate": 3.828826522492255e-05, + "loss": 0.596, + "step": 10850 + }, + { + "epoch": 0.07629551980310834, + "grad_norm": 1.1018482446670532, + "learning_rate": 3.821705398930713e-05, + "loss": 0.603, + "step": 10881 + }, + { + "epoch": 0.07651288595639355, + "grad_norm": 0.980965793132782, + "learning_rate": 3.814569360103385e-05, + "loss": 0.6021, + "step": 10912 + }, + { + "epoch": 0.07673025210967875, + "grad_norm": 0.9588106870651245, + "learning_rate": 3.807418486539499e-05, + "loss": 0.6017, + "step": 10943 + }, + { + "epoch": 0.07694761826296397, + "grad_norm": 1.2048044204711914, + "learning_rate": 3.80025285893569e-05, + "loss": 0.6049, + "step": 10974 + }, + { + "epoch": 0.07716498441624917, + "grad_norm": 1.0373165607452393, + "learning_rate": 3.793072558155093e-05, + "loss": 0.5914, + "step": 11005 + }, + { + "epoch": 0.07738235056953438, + "grad_norm": 1.0112730264663696, + "learning_rate": 3.785877665226426e-05, + "loss": 0.5949, + "step": 11036 + }, + { + "epoch": 0.0775997167228196, + "grad_norm": 1.0219500064849854, + "learning_rate": 3.778668261343079e-05, + "loss": 0.5918, + "step": 11067 + }, + { + "epoch": 0.0778170828761048, + "grad_norm": 1.1104828119277954, + "learning_rate": 3.771444427862192e-05, + "loss": 0.6029, + "step": 11098 + }, + { + "epoch": 0.07803444902939001, + "grad_norm": 0.9852742552757263, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.5839, + "step": 11129 + }, + { + "epoch": 0.07825181518267521, + "grad_norm": 1.0593205690383911, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.5966, + "step": 11160 + }, + { + "epoch": 0.07846918133596043, + "grad_norm": 1.2858383655548096, + "learning_rate": 3.749687165842753e-05, + "loss": 0.5939, + "step": 11191 + }, + { + "epoch": 0.07868654748924564, + "grad_norm": 1.2006022930145264, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.5965, + "step": 11222 + }, + { + "epoch": 0.07890391364253084, + "grad_norm": 0.9741569757461548, + "learning_rate": 3.735111675341645e-05, + "loss": 0.5971, + "step": 11253 + }, + { + "epoch": 0.07912127979581605, + "grad_norm": 1.0724074840545654, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.5901, + "step": 11284 + }, + { + "epoch": 0.07933864594910126, + "grad_norm": 1.102207064628601, + "learning_rate": 3.720480432728287e-05, + "loss": 0.602, + "step": 11315 + }, + { + "epoch": 0.07955601210238647, + "grad_norm": 1.0976966619491577, + "learning_rate": 3.71314411067092e-05, + "loss": 0.6002, + "step": 11346 + }, + { + "epoch": 0.07977337825567168, + "grad_norm": 1.0534948110580444, + "learning_rate": 3.70579409844715e-05, + "loss": 0.5801, + "step": 11377 + }, + { + "epoch": 0.07999074440895688, + "grad_norm": 1.1666125059127808, + "learning_rate": 3.698430479000865e-05, + "loss": 0.5992, + "step": 11408 + }, + { + "epoch": 0.0802081105622421, + "grad_norm": 1.0316112041473389, + "learning_rate": 3.691053335429509e-05, + "loss": 0.5942, + "step": 11439 + }, + { + "epoch": 0.0804254767155273, + "grad_norm": 1.0085538625717163, + "learning_rate": 3.683662750983147e-05, + "loss": 0.5978, + "step": 11470 + }, + { + "epoch": 0.08064284286881251, + "grad_norm": 1.0875978469848633, + "learning_rate": 3.676258809063518e-05, + "loss": 0.5949, + "step": 11501 + }, + { + "epoch": 0.08086020902209773, + "grad_norm": 1.1905012130737305, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.6018, + "step": 11532 + }, + { + "epoch": 0.08107757517538293, + "grad_norm": 1.1717733144760132, + "learning_rate": 3.661411187164166e-05, + "loss": 0.5986, + "step": 11563 + }, + { + "epoch": 0.08129494132866814, + "grad_norm": 1.053768515586853, + "learning_rate": 3.65396767473784e-05, + "loss": 0.5831, + "step": 11594 + }, + { + "epoch": 0.08151230748195334, + "grad_norm": 1.095425009727478, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.5967, + "step": 11625 + }, + { + "epoch": 0.08172967363523856, + "grad_norm": 1.005953311920166, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.5834, + "step": 11656 + }, + { + "epoch": 0.08194703978852377, + "grad_norm": 1.0991450548171997, + "learning_rate": 3.63155933997859e-05, + "loss": 0.5951, + "step": 11687 + }, + { + "epoch": 0.08216440594180897, + "grad_norm": 1.1069071292877197, + "learning_rate": 3.624064243537758e-05, + "loss": 0.5989, + "step": 11718 + }, + { + "epoch": 0.08238177209509419, + "grad_norm": 1.099542498588562, + "learning_rate": 3.616556462184716e-05, + "loss": 0.5949, + "step": 11749 + }, + { + "epoch": 0.08259913824837939, + "grad_norm": 1.0950508117675781, + "learning_rate": 3.609036080643755e-05, + "loss": 0.5911, + "step": 11780 + }, + { + "epoch": 0.0828165044016646, + "grad_norm": 0.9489970803260803, + "learning_rate": 3.60150318378136e-05, + "loss": 0.5922, + "step": 11811 + }, + { + "epoch": 0.08303387055494982, + "grad_norm": 0.9877674579620361, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.5927, + "step": 11842 + }, + { + "epoch": 0.08325123670823502, + "grad_norm": 1.060177206993103, + "learning_rate": 3.586400184263408e-05, + "loss": 0.5899, + "step": 11873 + }, + { + "epoch": 0.08346860286152023, + "grad_norm": 1.0531491041183472, + "learning_rate": 3.578830252043148e-05, + "loss": 0.6004, + "step": 11904 + }, + { + "epoch": 0.08368596901480543, + "grad_norm": 0.9791742563247681, + "learning_rate": 3.571248145370125e-05, + "loss": 0.5926, + "step": 11935 + }, + { + "epoch": 0.08390333516809065, + "grad_norm": 1.0492030382156372, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.5914, + "step": 11966 + }, + { + "epoch": 0.08412070132137586, + "grad_norm": 1.2035881280899048, + "learning_rate": 3.556047751054378e-05, + "loss": 0.589, + "step": 11997 + }, + { + "epoch": 0.08433806747466106, + "grad_norm": 1.0384137630462646, + "learning_rate": 3.548429634946039e-05, + "loss": 0.5894, + "step": 12028 + }, + { + "epoch": 0.08455543362794628, + "grad_norm": 1.1555323600769043, + "learning_rate": 3.540799687451768e-05, + "loss": 0.5899, + "step": 12059 + }, + { + "epoch": 0.08477279978123148, + "grad_norm": 1.2114530801773071, + "learning_rate": 3.533157994674485e-05, + "loss": 0.5888, + "step": 12090 + }, + { + "epoch": 0.08499016593451669, + "grad_norm": 1.145213007926941, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.5887, + "step": 12121 + }, + { + "epoch": 0.08520753208780191, + "grad_norm": 1.0313454866409302, + "learning_rate": 3.517839718344311e-05, + "loss": 0.598, + "step": 12152 + }, + { + "epoch": 0.08542489824108711, + "grad_norm": 1.018702745437622, + "learning_rate": 3.510163307656086e-05, + "loss": 0.5931, + "step": 12183 + }, + { + "epoch": 0.08564226439437232, + "grad_norm": 1.0365880727767944, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.5954, + "step": 12214 + }, + { + "epoch": 0.08585963054765752, + "grad_norm": 1.0658811330795288, + "learning_rate": 3.494776374368643e-05, + "loss": 0.5953, + "step": 12245 + }, + { + "epoch": 0.08607699670094274, + "grad_norm": 1.046661615371704, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.5786, + "step": 12276 + }, + { + "epoch": 0.08629436285422795, + "grad_norm": 1.0220295190811157, + "learning_rate": 3.479344537543164e-05, + "loss": 0.5879, + "step": 12307 + }, + { + "epoch": 0.08651172900751315, + "grad_norm": 1.0286898612976074, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.5882, + "step": 12338 + }, + { + "epoch": 0.08672909516079837, + "grad_norm": 1.060483694076538, + "learning_rate": 3.463868493762412e-05, + "loss": 0.5887, + "step": 12369 + }, + { + "epoch": 0.08694646131408357, + "grad_norm": 1.264788269996643, + "learning_rate": 3.456114112492418e-05, + "loss": 0.5895, + "step": 12400 + }, + { + "epoch": 0.08716382746736878, + "grad_norm": 0.9751125574111938, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.5846, + "step": 12431 + }, + { + "epoch": 0.087381193620654, + "grad_norm": 1.1198588609695435, + "learning_rate": 3.440573068727905e-05, + "loss": 0.5878, + "step": 12462 + }, + { + "epoch": 0.0875985597739392, + "grad_norm": 1.0425771474838257, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.5903, + "step": 12493 + }, + { + "epoch": 0.08781592592722441, + "grad_norm": 0.9820422530174255, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.5885, + "step": 12524 + }, + { + "epoch": 0.08803329208050961, + "grad_norm": 1.0234076976776123, + "learning_rate": 3.417182116258899e-05, + "loss": 0.5967, + "step": 12555 + }, + { + "epoch": 0.08825065823379483, + "grad_norm": 0.9766228795051575, + "learning_rate": 3.409364314116074e-05, + "loss": 0.5923, + "step": 12586 + }, + { + "epoch": 0.08846802438708004, + "grad_norm": 0.9033060073852539, + "learning_rate": 3.401536249920559e-05, + "loss": 0.5892, + "step": 12617 + }, + { + "epoch": 0.08868539054036524, + "grad_norm": 1.1264934539794922, + "learning_rate": 3.393698012010998e-05, + "loss": 0.5917, + "step": 12648 + }, + { + "epoch": 0.08890275669365046, + "grad_norm": 1.0052556991577148, + "learning_rate": 3.385849688840839e-05, + "loss": 0.5885, + "step": 12679 + }, + { + "epoch": 0.08912012284693566, + "grad_norm": 1.0888159275054932, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.5885, + "step": 12710 + }, + { + "epoch": 0.08933748900022087, + "grad_norm": 1.0458447933197021, + "learning_rate": 3.370123141100578e-05, + "loss": 0.5923, + "step": 12741 + }, + { + "epoch": 0.08955485515350609, + "grad_norm": 1.055434226989746, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.5889, + "step": 12772 + }, + { + "epoch": 0.08977222130679129, + "grad_norm": 1.000533103942871, + "learning_rate": 3.35435731658559e-05, + "loss": 0.5833, + "step": 12803 + }, + { + "epoch": 0.0899895874600765, + "grad_norm": 1.0112768411636353, + "learning_rate": 3.346459897862552e-05, + "loss": 0.5886, + "step": 12834 + }, + { + "epoch": 0.0902069536133617, + "grad_norm": 0.9883559942245483, + "learning_rate": 3.338552926954613e-05, + "loss": 0.5886, + "step": 12865 + }, + { + "epoch": 0.09042431976664692, + "grad_norm": 1.0933095216751099, + "learning_rate": 3.330636493090868e-05, + "loss": 0.5873, + "step": 12896 + }, + { + "epoch": 0.09064168591993213, + "grad_norm": 1.1089582443237305, + "learning_rate": 3.322710685607193e-05, + "loss": 0.5845, + "step": 12927 + }, + { + "epoch": 0.09085905207321733, + "grad_norm": 1.0311237573623657, + "learning_rate": 3.314775593945251e-05, + "loss": 0.588, + "step": 12958 + }, + { + "epoch": 0.09107641822650255, + "grad_norm": 1.0495847463607788, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.5839, + "step": 12989 + }, + { + "epoch": 0.09129378437978775, + "grad_norm": 1.046921730041504, + "learning_rate": 3.298877916376047e-05, + "loss": 0.5877, + "step": 13020 + }, + { + "epoch": 0.09151115053307296, + "grad_norm": 0.9927343130111694, + "learning_rate": 3.290915509871915e-05, + "loss": 0.5898, + "step": 13051 + }, + { + "epoch": 0.09172851668635817, + "grad_norm": 0.9319870471954346, + "learning_rate": 3.282944177993753e-05, + "loss": 0.5886, + "step": 13082 + }, + { + "epoch": 0.09194588283964338, + "grad_norm": 1.0408544540405273, + "learning_rate": 3.274964010696957e-05, + "loss": 0.5946, + "step": 13113 + }, + { + "epoch": 0.09216324899292859, + "grad_norm": 1.0984251499176025, + "learning_rate": 3.266975098036629e-05, + "loss": 0.5813, + "step": 13144 + }, + { + "epoch": 0.09238061514621379, + "grad_norm": 1.0376449823379517, + "learning_rate": 3.258977530166562e-05, + "loss": 0.5855, + "step": 13175 + }, + { + "epoch": 0.092597981299499, + "grad_norm": 0.9627104997634888, + "learning_rate": 3.250971397338227e-05, + "loss": 0.5846, + "step": 13206 + }, + { + "epoch": 0.09281534745278422, + "grad_norm": 1.2040653228759766, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.5872, + "step": 13237 + }, + { + "epoch": 0.09303271360606942, + "grad_norm": 1.0091586112976074, + "learning_rate": 3.234933798294859e-05, + "loss": 0.5911, + "step": 13268 + }, + { + "epoch": 0.09325007975935463, + "grad_norm": 1.0941787958145142, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.5844, + "step": 13299 + }, + { + "epoch": 0.09346744591263983, + "grad_norm": 0.9720754027366638, + "learning_rate": 3.218863024832985e-05, + "loss": 0.5898, + "step": 13330 + }, + { + "epoch": 0.09368481206592505, + "grad_norm": 0.8847430944442749, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.5804, + "step": 13361 + }, + { + "epoch": 0.09390217821921026, + "grad_norm": 1.0070865154266357, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.5916, + "step": 13392 + }, + { + "epoch": 0.09411954437249546, + "grad_norm": 0.9538979530334473, + "learning_rate": 3.194696249871729e-05, + "loss": 0.5984, + "step": 13423 + }, + { + "epoch": 0.09433691052578068, + "grad_norm": 1.0414469242095947, + "learning_rate": 3.186624857814164e-05, + "loss": 0.5828, + "step": 13454 + }, + { + "epoch": 0.0945542766790659, + "grad_norm": 1.0353257656097412, + "learning_rate": 3.178545717288401e-05, + "loss": 0.5835, + "step": 13485 + }, + { + "epoch": 0.0947716428323511, + "grad_norm": 1.0454679727554321, + "learning_rate": 3.170458919466444e-05, + "loss": 0.5909, + "step": 13516 + }, + { + "epoch": 0.09498900898563631, + "grad_norm": 1.246871829032898, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.5829, + "step": 13547 + }, + { + "epoch": 0.09520637513892151, + "grad_norm": 1.0331645011901855, + "learning_rate": 3.154262717052985e-05, + "loss": 0.5919, + "step": 13578 + }, + { + "epoch": 0.09542374129220672, + "grad_norm": 0.8984120488166809, + "learning_rate": 3.146153495233426e-05, + "loss": 0.5853, + "step": 13609 + }, + { + "epoch": 0.09564110744549194, + "grad_norm": 1.0582929849624634, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.5828, + "step": 13640 + }, + { + "epoch": 0.09585847359877714, + "grad_norm": 1.151041865348816, + "learning_rate": 3.129913267924946e-05, + "loss": 0.582, + "step": 13671 + }, + { + "epoch": 0.09607583975206235, + "grad_norm": 0.9678122401237488, + "learning_rate": 3.121782445704782e-05, + "loss": 0.5865, + "step": 13702 + }, + { + "epoch": 0.09629320590534755, + "grad_norm": 1.033832311630249, + "learning_rate": 3.11364460675423e-05, + "loss": 0.5797, + "step": 13733 + }, + { + "epoch": 0.09651057205863277, + "grad_norm": 0.8917058706283569, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.5829, + "step": 13764 + }, + { + "epoch": 0.09672793821191798, + "grad_norm": 0.9396641850471497, + "learning_rate": 3.097348246077728e-05, + "loss": 0.5825, + "step": 13795 + }, + { + "epoch": 0.09694530436520318, + "grad_norm": 1.0871998071670532, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.5851, + "step": 13826 + }, + { + "epoch": 0.0971626705184884, + "grad_norm": 1.1403570175170898, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.5807, + "step": 13857 + }, + { + "epoch": 0.0973800366717736, + "grad_norm": 1.0246673822402954, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.5795, + "step": 13888 + }, + { + "epoch": 0.09759740282505881, + "grad_norm": 0.9431213140487671, + "learning_rate": 3.064675369851637e-05, + "loss": 0.5825, + "step": 13919 + }, + { + "epoch": 0.09781476897834403, + "grad_norm": 1.1028188467025757, + "learning_rate": 3.056490989455289e-05, + "loss": 0.5885, + "step": 13950 + }, + { + "epoch": 0.09803213513162923, + "grad_norm": 1.125482201576233, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.5849, + "step": 13981 + }, + { + "epoch": 0.09824950128491444, + "grad_norm": 0.9831985831260681, + "learning_rate": 3.040103481317539e-05, + "loss": 0.586, + "step": 14012 + }, + { + "epoch": 0.09846686743819964, + "grad_norm": 1.0038244724273682, + "learning_rate": 3.03190053850694e-05, + "loss": 0.5894, + "step": 14043 + }, + { + "epoch": 0.09868423359148486, + "grad_norm": 0.8766770362854004, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.5773, + "step": 14074 + }, + { + "epoch": 0.09890159974477007, + "grad_norm": 0.9358529448509216, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.5887, + "step": 14105 + }, + { + "epoch": 0.09911896589805527, + "grad_norm": 0.8988749384880066, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.5815, + "step": 14136 + }, + { + "epoch": 0.09933633205134049, + "grad_norm": 1.08226478099823, + "learning_rate": 2.999029669712431e-05, + "loss": 0.5944, + "step": 14167 + }, + { + "epoch": 0.09955369820462569, + "grad_norm": 1.046980619430542, + "learning_rate": 2.990797641805408e-05, + "loss": 0.5824, + "step": 14198 + }, + { + "epoch": 0.0997710643579109, + "grad_norm": 0.96205073595047, + "learning_rate": 2.982560075313704e-05, + "loss": 0.5847, + "step": 14229 + }, + { + "epoch": 0.09998843051119612, + "grad_norm": 0.9914370775222778, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.5824, + "step": 14260 + }, + { + "epoch": 0.10020579666448132, + "grad_norm": 0.922265887260437, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.5907, + "step": 14291 + }, + { + "epoch": 0.10042316281776653, + "grad_norm": 0.971159815788269, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.5806, + "step": 14322 + }, + { + "epoch": 0.10064052897105173, + "grad_norm": 1.1327799558639526, + "learning_rate": 2.949556283611942e-05, + "loss": 0.5812, + "step": 14353 + }, + { + "epoch": 0.10085789512433695, + "grad_norm": 1.1632689237594604, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.5813, + "step": 14384 + }, + { + "epoch": 0.10107526127762216, + "grad_norm": 0.9936057329177856, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.5803, + "step": 14415 + }, + { + "epoch": 0.10129262743090736, + "grad_norm": 1.1123740673065186, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.5767, + "step": 14446 + }, + { + "epoch": 0.10150999358419258, + "grad_norm": 0.908643364906311, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.5797, + "step": 14477 + }, + { + "epoch": 0.10172735973747778, + "grad_norm": 0.8761305809020996, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.589, + "step": 14508 + }, + { + "epoch": 0.10194472589076299, + "grad_norm": 1.0904477834701538, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.5796, + "step": 14539 + }, + { + "epoch": 0.1021620920440482, + "grad_norm": 1.060160756111145, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.5837, + "step": 14570 + }, + { + "epoch": 0.1023794581973334, + "grad_norm": 1.0514498949050903, + "learning_rate": 2.883311164593017e-05, + "loss": 0.5792, + "step": 14601 + }, + { + "epoch": 0.10259682435061862, + "grad_norm": 1.0152157545089722, + "learning_rate": 2.875010077160754e-05, + "loss": 0.574, + "step": 14632 + }, + { + "epoch": 0.10281419050390382, + "grad_norm": 0.974791944026947, + "learning_rate": 2.866704757790741e-05, + "loss": 0.5819, + "step": 14663 + }, + { + "epoch": 0.10303155665718904, + "grad_norm": 0.9271855354309082, + "learning_rate": 2.858395300207376e-05, + "loss": 0.5798, + "step": 14694 + }, + { + "epoch": 0.10324892281047425, + "grad_norm": 0.9933458566665649, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.5796, + "step": 14725 + }, + { + "epoch": 0.10346628896375945, + "grad_norm": 0.9245687127113342, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.5771, + "step": 14756 + }, + { + "epoch": 0.10368365511704467, + "grad_norm": 0.9483000040054321, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.5825, + "step": 14787 + }, + { + "epoch": 0.10390102127032987, + "grad_norm": 0.9054533839225769, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.5817, + "step": 14818 + }, + { + "epoch": 0.10411838742361508, + "grad_norm": 1.0200084447860718, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.5788, + "step": 14849 + }, + { + "epoch": 0.1043357535769003, + "grad_norm": 0.9364084005355835, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.5796, + "step": 14880 + }, + { + "epoch": 0.1045531197301855, + "grad_norm": 0.9658653736114502, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.5825, + "step": 14911 + }, + { + "epoch": 0.10477048588347071, + "grad_norm": 0.8612831234931946, + "learning_rate": 2.791781925709473e-05, + "loss": 0.5842, + "step": 14942 + }, + { + "epoch": 0.10498785203675591, + "grad_norm": 0.9673991799354553, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.5776, + "step": 14973 + }, + { + "epoch": 0.10520521819004112, + "grad_norm": 1.092144250869751, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.5785, + "step": 15004 + }, + { + "epoch": 0.10542258434332634, + "grad_norm": 1.0162559747695923, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.5821, + "step": 15035 + }, + { + "epoch": 0.10563995049661154, + "grad_norm": 0.9578939080238342, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.5781, + "step": 15066 + }, + { + "epoch": 0.10585731664989675, + "grad_norm": 0.9934547543525696, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.5826, + "step": 15097 + }, + { + "epoch": 0.10607468280318195, + "grad_norm": 0.9542795419692993, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.583, + "step": 15128 + }, + { + "epoch": 0.10629204895646717, + "grad_norm": 0.9436337947845459, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.5832, + "step": 15159 + }, + { + "epoch": 0.10650941510975238, + "grad_norm": 0.9200606942176819, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.5766, + "step": 15190 + }, + { + "epoch": 0.10672678126303758, + "grad_norm": 1.0009740591049194, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.5835, + "step": 15221 + }, + { + "epoch": 0.1069441474163228, + "grad_norm": 0.9597145915031433, + "learning_rate": 2.708224532974953e-05, + "loss": 0.5728, + "step": 15252 + }, + { + "epoch": 0.107161513569608, + "grad_norm": 1.248867154121399, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.5822, + "step": 15283 + }, + { + "epoch": 0.10737887972289321, + "grad_norm": 0.8899833559989929, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.5843, + "step": 15314 + }, + { + "epoch": 0.10759624587617843, + "grad_norm": 1.0085718631744385, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.5811, + "step": 15345 + }, + { + "epoch": 0.10781361202946363, + "grad_norm": 0.9277573227882385, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.5774, + "step": 15376 + }, + { + "epoch": 0.10803097818274884, + "grad_norm": 1.199010968208313, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.5735, + "step": 15407 + }, + { + "epoch": 0.10824834433603404, + "grad_norm": 0.9361419081687927, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.5834, + "step": 15438 + }, + { + "epoch": 0.10846571048931926, + "grad_norm": 1.05440092086792, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.5721, + "step": 15469 + }, + { + "epoch": 0.10868307664260447, + "grad_norm": 1.0973948240280151, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.5838, + "step": 15500 + }, + { + "epoch": 0.10890044279588967, + "grad_norm": 0.9417588710784912, + "learning_rate": 2.632819298478939e-05, + "loss": 0.5786, + "step": 15531 + }, + { + "epoch": 0.10911780894917489, + "grad_norm": 0.8763983845710754, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.5766, + "step": 15562 + }, + { + "epoch": 0.10933517510246009, + "grad_norm": 1.0105509757995605, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.5896, + "step": 15593 + }, + { + "epoch": 0.1095525412557453, + "grad_norm": 0.9172413349151611, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.5788, + "step": 15624 + }, + { + "epoch": 0.10976990740903052, + "grad_norm": 0.9640869498252869, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.5791, + "step": 15655 + }, + { + "epoch": 0.10998727356231572, + "grad_norm": 1.0987275838851929, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.581, + "step": 15686 + }, + { + "epoch": 0.11020463971560093, + "grad_norm": 1.0418893098831177, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.569, + "step": 15717 + }, + { + "epoch": 0.11042200586888613, + "grad_norm": 1.0216400623321533, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.5834, + "step": 15748 + }, + { + "epoch": 0.11063937202217135, + "grad_norm": 1.0211747884750366, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.5733, + "step": 15779 + }, + { + "epoch": 0.11085673817545656, + "grad_norm": 0.9743130207061768, + "learning_rate": 2.557292666450159e-05, + "loss": 0.5789, + "step": 15810 + }, + { + "epoch": 0.11107410432874176, + "grad_norm": 1.1765626668930054, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.57, + "step": 15841 + }, + { + "epoch": 0.11129147048202698, + "grad_norm": 0.9354963898658752, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.5894, + "step": 15872 + }, + { + "epoch": 0.11150883663531218, + "grad_norm": 0.8743797540664673, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.5716, + "step": 15903 + }, + { + "epoch": 0.11172620278859739, + "grad_norm": 1.1076644659042358, + "learning_rate": 2.52370435981567e-05, + "loss": 0.5787, + "step": 15934 + }, + { + "epoch": 0.1119435689418826, + "grad_norm": 0.9156807065010071, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.5764, + "step": 15965 + }, + { + "epoch": 0.11216093509516781, + "grad_norm": 1.0239089727401733, + "learning_rate": 2.506908222441045e-05, + "loss": 0.5748, + "step": 15996 + }, + { + "epoch": 0.11237830124845302, + "grad_norm": 1.5095417499542236, + "learning_rate": 2.498509989430187e-05, + "loss": 0.5815, + "step": 16027 + }, + { + "epoch": 0.11259566740173822, + "grad_norm": 0.9298838973045349, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.5742, + "step": 16058 + }, + { + "epoch": 0.11281303355502344, + "grad_norm": 0.8956617712974548, + "learning_rate": 2.481713668624899e-05, + "loss": 0.5856, + "step": 16089 + }, + { + "epoch": 0.11303039970830865, + "grad_norm": 0.950095534324646, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.5786, + "step": 16120 + }, + { + "epoch": 0.11324776586159385, + "grad_norm": 1.0230988264083862, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.5771, + "step": 16151 + }, + { + "epoch": 0.11346513201487907, + "grad_norm": 1.1018470525741577, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.57, + "step": 16182 + }, + { + "epoch": 0.11368249816816427, + "grad_norm": 0.9700168371200562, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.5689, + "step": 16213 + }, + { + "epoch": 0.11389986432144948, + "grad_norm": 0.9069929718971252, + "learning_rate": 2.439728136286796e-05, + "loss": 0.5719, + "step": 16244 + }, + { + "epoch": 0.1141172304747347, + "grad_norm": 0.9254815578460693, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.5704, + "step": 16275 + }, + { + "epoch": 0.1143345966280199, + "grad_norm": 0.9150753021240234, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.5754, + "step": 16306 + }, + { + "epoch": 0.11455196278130511, + "grad_norm": 1.003201961517334, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.5746, + "step": 16337 + }, + { + "epoch": 0.11476932893459031, + "grad_norm": 1.1016685962677002, + "learning_rate": 2.406151384602059e-05, + "loss": 0.5763, + "step": 16368 + }, + { + "epoch": 0.11498669508787553, + "grad_norm": 1.0079994201660156, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.5721, + "step": 16399 + }, + { + "epoch": 0.11520406124116074, + "grad_norm": 0.989470899105072, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.5795, + "step": 16430 + }, + { + "epoch": 0.11542142739444594, + "grad_norm": 1.039035439491272, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.5737, + "step": 16461 + }, + { + "epoch": 0.11563879354773116, + "grad_norm": 0.8659546971321106, + "learning_rate": 2.372591577780202e-05, + "loss": 0.5711, + "step": 16492 + }, + { + "epoch": 0.11585615970101636, + "grad_norm": 0.9558688998222351, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.5763, + "step": 16523 + }, + { + "epoch": 0.11607352585430157, + "grad_norm": 1.0017194747924805, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.5772, + "step": 16554 + }, + { + "epoch": 0.11629089200758679, + "grad_norm": 1.0045812129974365, + "learning_rate": 2.347436487983929e-05, + "loss": 0.5744, + "step": 16585 + }, + { + "epoch": 0.11650825816087199, + "grad_norm": 0.8719637393951416, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.5681, + "step": 16616 + }, + { + "epoch": 0.1167256243141572, + "grad_norm": 0.9029743075370789, + "learning_rate": 2.330674878704035e-05, + "loss": 0.5807, + "step": 16647 + }, + { + "epoch": 0.1169429904674424, + "grad_norm": 0.9439691305160522, + "learning_rate": 2.322296892997561e-05, + "loss": 0.5751, + "step": 16678 + }, + { + "epoch": 0.11716035662072762, + "grad_norm": 0.900688648223877, + "learning_rate": 2.313920912646497e-05, + "loss": 0.5772, + "step": 16709 + }, + { + "epoch": 0.11737772277401283, + "grad_norm": 0.8884438872337341, + "learning_rate": 2.305547032172643e-05, + "loss": 0.5758, + "step": 16740 + }, + { + "epoch": 0.11759508892729803, + "grad_norm": 0.9252585172653198, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.5705, + "step": 16771 + }, + { + "epoch": 0.11781245508058324, + "grad_norm": 0.9447957873344421, + "learning_rate": 2.288805948824212e-05, + "loss": 0.566, + "step": 16802 + }, + { + "epoch": 0.11802982123386845, + "grad_norm": 0.9666566252708435, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.5745, + "step": 16833 + }, + { + "epoch": 0.11824718738715366, + "grad_norm": 0.9459251761436462, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.5775, + "step": 16864 + }, + { + "epoch": 0.11846455354043887, + "grad_norm": 0.8863123059272766, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.5716, + "step": 16895 + }, + { + "epoch": 0.11868191969372408, + "grad_norm": 0.9847676753997803, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.5763, + "step": 16926 + }, + { + "epoch": 0.11889928584700929, + "grad_norm": 1.1111658811569214, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.5752, + "step": 16957 + }, + { + "epoch": 0.11911665200029449, + "grad_norm": 1.0046541690826416, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.574, + "step": 16988 + }, + { + "epoch": 0.1193340181535797, + "grad_norm": 0.8580814599990845, + "learning_rate": 2.230292185905114e-05, + "loss": 0.572, + "step": 17019 + }, + { + "epoch": 0.11955138430686492, + "grad_norm": 0.9188304543495178, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.571, + "step": 17050 + }, + { + "epoch": 0.11976875046015012, + "grad_norm": 0.9079185724258423, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.5792, + "step": 17081 + }, + { + "epoch": 0.11998611661343533, + "grad_norm": 0.9194979071617126, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.5753, + "step": 17112 + }, + { + "epoch": 0.12020348276672053, + "grad_norm": 0.8398452997207642, + "learning_rate": 2.196920634473666e-05, + "loss": 0.5653, + "step": 17143 + }, + { + "epoch": 0.12042084892000575, + "grad_norm": 0.9888772368431091, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.5747, + "step": 17174 + }, + { + "epoch": 0.12063821507329096, + "grad_norm": 0.9137700796127319, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.5728, + "step": 17205 + }, + { + "epoch": 0.12085558122657616, + "grad_norm": 1.058064579963684, + "learning_rate": 2.171927553439363e-05, + "loss": 0.5717, + "step": 17236 + }, + { + "epoch": 0.12107294737986138, + "grad_norm": 0.9835705757141113, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.5725, + "step": 17267 + }, + { + "epoch": 0.12129031353314658, + "grad_norm": 0.918863832950592, + "learning_rate": 2.155283853988844e-05, + "loss": 0.5676, + "step": 17298 + }, + { + "epoch": 0.1215076796864318, + "grad_norm": 0.9384900331497192, + "learning_rate": 2.146967792431106e-05, + "loss": 0.5768, + "step": 17329 + }, + { + "epoch": 0.12172504583971701, + "grad_norm": 1.060088038444519, + "learning_rate": 2.138655714793849e-05, + "loss": 0.5642, + "step": 17360 + }, + { + "epoch": 0.12194241199300221, + "grad_norm": 0.999266505241394, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.5669, + "step": 17391 + }, + { + "epoch": 0.12215977814628742, + "grad_norm": 0.8633130788803101, + "learning_rate": 2.122043886437421e-05, + "loss": 0.5661, + "step": 17422 + }, + { + "epoch": 0.12237714429957262, + "grad_norm": 0.9396159052848816, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.5743, + "step": 17453 + }, + { + "epoch": 0.12259451045285784, + "grad_norm": 0.9990928173065186, + "learning_rate": 2.105449118766347e-05, + "loss": 0.5707, + "step": 17484 + }, + { + "epoch": 0.12281187660614305, + "grad_norm": 0.9732767939567566, + "learning_rate": 2.097158366805287e-05, + "loss": 0.571, + "step": 17515 + }, + { + "epoch": 0.12302924275942825, + "grad_norm": 20.362672805786133, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.5586, + "step": 17546 + }, + { + "epoch": 0.12324660891271347, + "grad_norm": 0.976889431476593, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.5693, + "step": 17577 + }, + { + "epoch": 0.12346397506599867, + "grad_norm": 0.907172679901123, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.5714, + "step": 17608 + }, + { + "epoch": 0.12368134121928388, + "grad_norm": 0.8816654086112976, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.5669, + "step": 17639 + }, + { + "epoch": 0.1238987073725691, + "grad_norm": 0.9616197943687439, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.5739, + "step": 17670 + }, + { + "epoch": 0.1241160735258543, + "grad_norm": 0.9188937544822693, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.5689, + "step": 17701 + }, + { + "epoch": 0.12433343967913951, + "grad_norm": 0.9845620393753052, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.5716, + "step": 17732 + }, + { + "epoch": 0.12455080583242471, + "grad_norm": 0.8922098278999329, + "learning_rate": 2.031003855589343e-05, + "loss": 0.5648, + "step": 17763 + }, + { + "epoch": 0.12476817198570993, + "grad_norm": 0.9715010523796082, + "learning_rate": 2.022757379528727e-05, + "loss": 0.5664, + "step": 17794 + }, + { + "epoch": 0.12498553813899514, + "grad_norm": 1.0769156217575073, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.5689, + "step": 17825 + }, + { + "epoch": 0.12520290429228034, + "grad_norm": 0.9304386973381042, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.5772, + "step": 17856 + }, + { + "epoch": 0.12542027044556556, + "grad_norm": 1.0523558855056763, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.5686, + "step": 17887 + }, + { + "epoch": 0.12563763659885077, + "grad_norm": 1.029064655303955, + "learning_rate": 1.989826261153015e-05, + "loss": 0.5629, + "step": 17918 + }, + { + "epoch": 0.12585500275213599, + "grad_norm": 1.0367600917816162, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.5682, + "step": 17949 + }, + { + "epoch": 0.12607236890542117, + "grad_norm": 1.047844648361206, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.571, + "step": 17980 + }, + { + "epoch": 0.1262897350587064, + "grad_norm": 0.9374393820762634, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.5731, + "step": 18011 + }, + { + "epoch": 0.1265071012119916, + "grad_norm": 1.0163381099700928, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.5769, + "step": 18042 + }, + { + "epoch": 0.12672446736527682, + "grad_norm": 0.9243590235710144, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.5697, + "step": 18073 + }, + { + "epoch": 0.12694183351856203, + "grad_norm": 1.0359089374542236, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.5639, + "step": 18104 + }, + { + "epoch": 0.12715919967184722, + "grad_norm": 0.841151773929596, + "learning_rate": 1.932422022132275e-05, + "loss": 0.5792, + "step": 18135 + }, + { + "epoch": 0.12737656582513243, + "grad_norm": 1.0070539712905884, + "learning_rate": 1.924246297341414e-05, + "loss": 0.5669, + "step": 18166 + }, + { + "epoch": 0.12759393197841765, + "grad_norm": 0.9453309178352356, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.5683, + "step": 18197 + }, + { + "epoch": 0.12781129813170286, + "grad_norm": 0.9628680348396301, + "learning_rate": 1.907914431855156e-05, + "loss": 0.5711, + "step": 18228 + }, + { + "epoch": 0.12802866428498808, + "grad_norm": 0.9396767616271973, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.5709, + "step": 18259 + }, + { + "epoch": 0.12824603043827326, + "grad_norm": 0.9093485474586487, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.5689, + "step": 18290 + }, + { + "epoch": 0.12846339659155848, + "grad_norm": 0.8730084896087646, + "learning_rate": 1.883466975572098e-05, + "loss": 0.5744, + "step": 18321 + }, + { + "epoch": 0.1286807627448437, + "grad_norm": 0.9706755876541138, + "learning_rate": 1.8753316159214312e-05, + "loss": 0.5572, + "step": 18352 + }, + { + "epoch": 0.1288981288981289, + "grad_norm": 0.9472910165786743, + "learning_rate": 1.8672033055685766e-05, + "loss": 0.5726, + "step": 18383 + }, + { + "epoch": 0.12911549505141412, + "grad_norm": 0.9355587959289551, + "learning_rate": 1.8590821362404116e-05, + "loss": 0.5673, + "step": 18414 + }, + { + "epoch": 0.1293328612046993, + "grad_norm": 0.9303567409515381, + "learning_rate": 1.8509681995832294e-05, + "loss": 0.5616, + "step": 18445 + }, + { + "epoch": 0.12955022735798452, + "grad_norm": 0.9067112803459167, + "learning_rate": 1.8428615871617004e-05, + "loss": 0.5661, + "step": 18476 + }, + { + "epoch": 0.12976759351126974, + "grad_norm": 0.899079442024231, + "learning_rate": 1.8347623904578448e-05, + "loss": 0.5726, + "step": 18507 + }, + { + "epoch": 0.12998495966455495, + "grad_norm": 0.8478329181671143, + "learning_rate": 1.8266707008699975e-05, + "loss": 0.5633, + "step": 18538 + }, + { + "epoch": 0.13020232581784016, + "grad_norm": 0.910685122013092, + "learning_rate": 1.818586609711774e-05, + "loss": 0.5683, + "step": 18569 + }, + { + "epoch": 0.13041969197112535, + "grad_norm": 0.9179863333702087, + "learning_rate": 1.8105102082110462e-05, + "loss": 0.5753, + "step": 18600 + }, + { + "epoch": 0.13063705812441057, + "grad_norm": 0.9042870402336121, + "learning_rate": 1.8024415875089058e-05, + "loss": 0.5752, + "step": 18631 + }, + { + "epoch": 0.13085442427769578, + "grad_norm": 0.9494644999504089, + "learning_rate": 1.7943808386586407e-05, + "loss": 0.5635, + "step": 18662 + }, + { + "epoch": 0.131071790430981, + "grad_norm": 0.9707177877426147, + "learning_rate": 1.7863280526247073e-05, + "loss": 0.5623, + "step": 18693 + }, + { + "epoch": 0.1312891565842662, + "grad_norm": 0.9590293169021606, + "learning_rate": 1.7782833202817003e-05, + "loss": 0.5799, + "step": 18724 + }, + { + "epoch": 0.1315065227375514, + "grad_norm": 0.9343449473381042, + "learning_rate": 1.7702467324133327e-05, + "loss": 0.5757, + "step": 18755 + }, + { + "epoch": 0.1317238888908366, + "grad_norm": 0.9229467511177063, + "learning_rate": 1.7622183797114042e-05, + "loss": 0.5644, + "step": 18786 + }, + { + "epoch": 0.13194125504412182, + "grad_norm": 0.9312314987182617, + "learning_rate": 1.7541983527747838e-05, + "loss": 0.5693, + "step": 18817 + }, + { + "epoch": 0.13215862119740704, + "grad_norm": 0.8548254370689392, + "learning_rate": 1.746186742108387e-05, + "loss": 0.5713, + "step": 18848 + }, + { + "epoch": 0.13237598735069225, + "grad_norm": 1.0379942655563354, + "learning_rate": 1.73818363812215e-05, + "loss": 0.5696, + "step": 18879 + }, + { + "epoch": 0.13259335350397744, + "grad_norm": 1.0847291946411133, + "learning_rate": 1.7301891311300153e-05, + "loss": 0.5656, + "step": 18910 + }, + { + "epoch": 0.13281071965726265, + "grad_norm": 0.969327449798584, + "learning_rate": 1.7222033113489055e-05, + "loss": 0.5654, + "step": 18941 + }, + { + "epoch": 0.13302808581054787, + "grad_norm": 0.9928266406059265, + "learning_rate": 1.7142262688977127e-05, + "loss": 0.5635, + "step": 18972 + }, + { + "epoch": 0.13324545196383308, + "grad_norm": 0.8415375351905823, + "learning_rate": 1.7062580937962764e-05, + "loss": 0.572, + "step": 19003 + }, + { + "epoch": 0.1334628181171183, + "grad_norm": 0.9909110069274902, + "learning_rate": 1.698298875964369e-05, + "loss": 0.5727, + "step": 19034 + }, + { + "epoch": 0.13368018427040348, + "grad_norm": 1.0183087587356567, + "learning_rate": 1.690348705220684e-05, + "loss": 0.5767, + "step": 19065 + }, + { + "epoch": 0.1338975504236887, + "grad_norm": 0.9055935144424438, + "learning_rate": 1.6824076712818156e-05, + "loss": 0.5628, + "step": 19096 + }, + { + "epoch": 0.1341149165769739, + "grad_norm": 0.8832345008850098, + "learning_rate": 1.6744758637612533e-05, + "loss": 0.5688, + "step": 19127 + }, + { + "epoch": 0.13433228273025913, + "grad_norm": 1.1259726285934448, + "learning_rate": 1.6665533721683664e-05, + "loss": 0.5578, + "step": 19158 + }, + { + "epoch": 0.13454964888354434, + "grad_norm": 0.9167343378067017, + "learning_rate": 1.6586402859073974e-05, + "loss": 0.5636, + "step": 19189 + }, + { + "epoch": 0.13476701503682953, + "grad_norm": 0.9861068725585938, + "learning_rate": 1.6507366942764463e-05, + "loss": 0.5681, + "step": 19220 + }, + { + "epoch": 0.13498438119011474, + "grad_norm": 0.9800103306770325, + "learning_rate": 1.6428426864664732e-05, + "loss": 0.5689, + "step": 19251 + }, + { + "epoch": 0.13520174734339996, + "grad_norm": 0.9900636672973633, + "learning_rate": 1.6349583515602816e-05, + "loss": 0.5625, + "step": 19282 + }, + { + "epoch": 0.13541911349668517, + "grad_norm": 0.9756057858467102, + "learning_rate": 1.6270837785315208e-05, + "loss": 0.5634, + "step": 19313 + }, + { + "epoch": 0.1356364796499704, + "grad_norm": 0.9184322953224182, + "learning_rate": 1.619219056243676e-05, + "loss": 0.5713, + "step": 19344 + }, + { + "epoch": 0.13585384580325557, + "grad_norm": 1.003735065460205, + "learning_rate": 1.6113642734490698e-05, + "loss": 0.5626, + "step": 19375 + }, + { + "epoch": 0.1360712119565408, + "grad_norm": 0.8933300375938416, + "learning_rate": 1.6035195187878577e-05, + "loss": 0.5705, + "step": 19406 + }, + { + "epoch": 0.136288578109826, + "grad_norm": 0.997909426689148, + "learning_rate": 1.5956848807870305e-05, + "loss": 0.5645, + "step": 19437 + }, + { + "epoch": 0.13650594426311122, + "grad_norm": 0.9039232730865479, + "learning_rate": 1.587860447859413e-05, + "loss": 0.5551, + "step": 19468 + }, + { + "epoch": 0.13672331041639643, + "grad_norm": 0.9416874647140503, + "learning_rate": 1.5800463083026686e-05, + "loss": 0.5621, + "step": 19499 + }, + { + "epoch": 0.13694067656968162, + "grad_norm": 0.8743234872817993, + "learning_rate": 1.572242550298298e-05, + "loss": 0.5692, + "step": 19530 + }, + { + "epoch": 0.13715804272296683, + "grad_norm": 1.0159176588058472, + "learning_rate": 1.56444926191065e-05, + "loss": 0.5622, + "step": 19561 + }, + { + "epoch": 0.13737540887625205, + "grad_norm": 0.8633915781974792, + "learning_rate": 1.5566665310859257e-05, + "loss": 0.5688, + "step": 19592 + }, + { + "epoch": 0.13759277502953726, + "grad_norm": 0.9839888215065002, + "learning_rate": 1.5488944456511846e-05, + "loss": 0.5691, + "step": 19623 + }, + { + "epoch": 0.13781014118282248, + "grad_norm": 1.0715723037719727, + "learning_rate": 1.5411330933133546e-05, + "loss": 0.5582, + "step": 19654 + }, + { + "epoch": 0.13802750733610766, + "grad_norm": 1.029173493385315, + "learning_rate": 1.533382561658241e-05, + "loss": 0.5678, + "step": 19685 + }, + { + "epoch": 0.13824487348939288, + "grad_norm": 1.1011470556259155, + "learning_rate": 1.525642938149541e-05, + "loss": 0.5692, + "step": 19716 + }, + { + "epoch": 0.1384622396426781, + "grad_norm": 0.9993789196014404, + "learning_rate": 1.5179143101278536e-05, + "loss": 0.5645, + "step": 19747 + }, + { + "epoch": 0.1386796057959633, + "grad_norm": 1.0202093124389648, + "learning_rate": 1.5101967648096955e-05, + "loss": 0.5703, + "step": 19778 + }, + { + "epoch": 0.13889697194924852, + "grad_norm": 1.0126008987426758, + "learning_rate": 1.5024903892865172e-05, + "loss": 0.5639, + "step": 19809 + }, + { + "epoch": 0.1391143381025337, + "grad_norm": 1.0468281507492065, + "learning_rate": 1.4947952705237184e-05, + "loss": 0.5683, + "step": 19840 + }, + { + "epoch": 0.13933170425581892, + "grad_norm": 0.9329802393913269, + "learning_rate": 1.4871114953596682e-05, + "loss": 0.5597, + "step": 19871 + }, + { + "epoch": 0.13954907040910414, + "grad_norm": 0.891503632068634, + "learning_rate": 1.4794391505047256e-05, + "loss": 0.5728, + "step": 19902 + }, + { + "epoch": 0.13976643656238935, + "grad_norm": 0.9752770662307739, + "learning_rate": 1.4717783225402596e-05, + "loss": 0.564, + "step": 19933 + }, + { + "epoch": 0.13998380271567457, + "grad_norm": 0.8956452012062073, + "learning_rate": 1.4641290979176735e-05, + "loss": 0.5661, + "step": 19964 + }, + { + "epoch": 0.14020116886895975, + "grad_norm": 1.072753667831421, + "learning_rate": 1.4564915629574246e-05, + "loss": 0.562, + "step": 19995 + }, + { + "epoch": 0.14041853502224497, + "grad_norm": 0.8971157670021057, + "learning_rate": 1.4488658038480601e-05, + "loss": 0.5613, + "step": 20026 + }, + { + "epoch": 0.14063590117553018, + "grad_norm": 0.8919452428817749, + "learning_rate": 1.4412519066452323e-05, + "loss": 0.5659, + "step": 20057 + }, + { + "epoch": 0.1408532673288154, + "grad_norm": 0.9752078056335449, + "learning_rate": 1.4336499572707373e-05, + "loss": 0.5687, + "step": 20088 + }, + { + "epoch": 0.1410706334821006, + "grad_norm": 0.9520591497421265, + "learning_rate": 1.4260600415115433e-05, + "loss": 0.5673, + "step": 20119 + }, + { + "epoch": 0.1412879996353858, + "grad_norm": 0.8892295956611633, + "learning_rate": 1.4184822450188137e-05, + "loss": 0.5705, + "step": 20150 + }, + { + "epoch": 0.141505365788671, + "grad_norm": 0.9576200842857361, + "learning_rate": 1.410916653306954e-05, + "loss": 0.5667, + "step": 20181 + }, + { + "epoch": 0.14172273194195623, + "grad_norm": 0.9564182162284851, + "learning_rate": 1.403363351752639e-05, + "loss": 0.5595, + "step": 20212 + }, + { + "epoch": 0.14194009809524144, + "grad_norm": 0.9247251749038696, + "learning_rate": 1.3958224255938485e-05, + "loss": 0.5709, + "step": 20243 + }, + { + "epoch": 0.14215746424852665, + "grad_norm": 0.9523617625236511, + "learning_rate": 1.388293959928911e-05, + "loss": 0.5591, + "step": 20274 + }, + { + "epoch": 0.14237483040181184, + "grad_norm": 0.9751485586166382, + "learning_rate": 1.3807780397155379e-05, + "loss": 0.5678, + "step": 20305 + }, + { + "epoch": 0.14259219655509706, + "grad_norm": 1.0090728998184204, + "learning_rate": 1.3732747497698655e-05, + "loss": 0.5569, + "step": 20336 + }, + { + "epoch": 0.14280956270838227, + "grad_norm": 0.8991780281066895, + "learning_rate": 1.3657841747655038e-05, + "loss": 0.5611, + "step": 20367 + }, + { + "epoch": 0.14302692886166748, + "grad_norm": 0.8665379285812378, + "learning_rate": 1.3583063992325706e-05, + "loss": 0.5639, + "step": 20398 + }, + { + "epoch": 0.1432442950149527, + "grad_norm": 0.9348465204238892, + "learning_rate": 1.3508415075567496e-05, + "loss": 0.5582, + "step": 20429 + }, + { + "epoch": 0.1434616611682379, + "grad_norm": 0.8632979989051819, + "learning_rate": 1.343389583978327e-05, + "loss": 0.5672, + "step": 20460 + }, + { + "epoch": 0.1436790273215231, + "grad_norm": 0.9019519686698914, + "learning_rate": 1.3359507125912468e-05, + "loss": 0.5642, + "step": 20491 + }, + { + "epoch": 0.14389639347480832, + "grad_norm": 0.8994531035423279, + "learning_rate": 1.3285249773421627e-05, + "loss": 0.5663, + "step": 20522 + }, + { + "epoch": 0.14411375962809353, + "grad_norm": 0.9270524978637695, + "learning_rate": 1.3211124620294884e-05, + "loss": 0.5643, + "step": 20553 + }, + { + "epoch": 0.14433112578137874, + "grad_norm": 0.8957355618476868, + "learning_rate": 1.313713250302451e-05, + "loss": 0.5632, + "step": 20584 + }, + { + "epoch": 0.14454849193466393, + "grad_norm": 1.0234413146972656, + "learning_rate": 1.3063274256601479e-05, + "loss": 0.5647, + "step": 20615 + }, + { + "epoch": 0.14476585808794915, + "grad_norm": 0.8956789970397949, + "learning_rate": 1.2989550714506086e-05, + "loss": 0.5635, + "step": 20646 + }, + { + "epoch": 0.14498322424123436, + "grad_norm": 0.883823037147522, + "learning_rate": 1.291596270869846e-05, + "loss": 0.5702, + "step": 20677 + }, + { + "epoch": 0.14520059039451957, + "grad_norm": 0.8809013366699219, + "learning_rate": 1.284251106960927e-05, + "loss": 0.5641, + "step": 20708 + }, + { + "epoch": 0.1454179565478048, + "grad_norm": 0.9803751707077026, + "learning_rate": 1.2769196626130263e-05, + "loss": 0.5604, + "step": 20739 + }, + { + "epoch": 0.14563532270108998, + "grad_norm": 0.8637491464614868, + "learning_rate": 1.2696020205604969e-05, + "loss": 0.558, + "step": 20770 + }, + { + "epoch": 0.1458526888543752, + "grad_norm": 0.8922715187072754, + "learning_rate": 1.2622982633819359e-05, + "loss": 0.5619, + "step": 20801 + }, + { + "epoch": 0.1460700550076604, + "grad_norm": 0.9750674366950989, + "learning_rate": 1.2550084734992484e-05, + "loss": 0.5664, + "step": 20832 + }, + { + "epoch": 0.14628742116094562, + "grad_norm": 1.0473570823669434, + "learning_rate": 1.247732733176724e-05, + "loss": 0.5646, + "step": 20863 + }, + { + "epoch": 0.14650478731423083, + "grad_norm": 1.130385160446167, + "learning_rate": 1.2404711245201044e-05, + "loss": 0.5617, + "step": 20894 + }, + { + "epoch": 0.14672215346751602, + "grad_norm": 0.9984387755393982, + "learning_rate": 1.2332237294756535e-05, + "loss": 0.567, + "step": 20925 + }, + { + "epoch": 0.14693951962080123, + "grad_norm": 0.9383957982063293, + "learning_rate": 1.225990629829241e-05, + "loss": 0.5567, + "step": 20956 + }, + { + "epoch": 0.14715688577408645, + "grad_norm": 0.981935977935791, + "learning_rate": 1.2187719072054136e-05, + "loss": 0.5651, + "step": 20987 + }, + { + "epoch": 0.14737425192737166, + "grad_norm": 0.9774724841117859, + "learning_rate": 1.2115676430664735e-05, + "loss": 0.5652, + "step": 21018 + }, + { + "epoch": 0.14759161808065688, + "grad_norm": 0.9714674949645996, + "learning_rate": 1.2043779187115647e-05, + "loss": 0.561, + "step": 21049 + }, + { + "epoch": 0.14780898423394206, + "grad_norm": 0.8881489038467407, + "learning_rate": 1.1972028152757476e-05, + "loss": 0.5594, + "step": 21080 + }, + { + "epoch": 0.14802635038722728, + "grad_norm": 0.961926281452179, + "learning_rate": 1.1900424137290889e-05, + "loss": 0.5611, + "step": 21111 + }, + { + "epoch": 0.1482437165405125, + "grad_norm": 0.9101502895355225, + "learning_rate": 1.1828967948757482e-05, + "loss": 0.5628, + "step": 21142 + }, + { + "epoch": 0.1484610826937977, + "grad_norm": 0.9001050591468811, + "learning_rate": 1.175766039353062e-05, + "loss": 0.5576, + "step": 21173 + }, + { + "epoch": 0.14867844884708292, + "grad_norm": 0.9724435210227966, + "learning_rate": 1.1686502276306382e-05, + "loss": 0.568, + "step": 21204 + }, + { + "epoch": 0.1488958150003681, + "grad_norm": 0.825156569480896, + "learning_rate": 1.1615494400094445e-05, + "loss": 0.5634, + "step": 21235 + }, + { + "epoch": 0.14911318115365332, + "grad_norm": 0.9625114798545837, + "learning_rate": 1.1544637566209029e-05, + "loss": 0.5667, + "step": 21266 + }, + { + "epoch": 0.14933054730693854, + "grad_norm": 1.0243901014328003, + "learning_rate": 1.1473932574259886e-05, + "loss": 0.5629, + "step": 21297 + }, + { + "epoch": 0.14954791346022375, + "grad_norm": 0.9247808456420898, + "learning_rate": 1.1403380222143247e-05, + "loss": 0.563, + "step": 21328 + }, + { + "epoch": 0.14976527961350897, + "grad_norm": 0.8996061682701111, + "learning_rate": 1.1332981306032808e-05, + "loss": 0.5644, + "step": 21359 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.5761592551926137e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-21364/training_args.bin b/checkpoint-21364/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d8a57652df56990298f489cbbc4d12b0a5d66dac --- /dev/null +++ b/checkpoint-21364/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff6a84b6c2a69c365aedcd04f176d62d0993a049ac50b28b1fc7fee54f4267a8 +size 5304 diff --git a/checkpoint-24416/config.json b/checkpoint-24416/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-24416/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-24416/generation_config.json b/checkpoint-24416/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-24416/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-24416/model-00001-of-00007.safetensors b/checkpoint-24416/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..38647fb2f7753a002ac093e7941b588d272ace29 --- /dev/null +++ b/checkpoint-24416/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00ac09a819db4d41b3962a0e8a0617fd5fef4c30ba0fec279aef236934266448 +size 4886466168 diff --git a/checkpoint-24416/model-00002-of-00007.safetensors b/checkpoint-24416/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-24416/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-24416/model-00003-of-00007.safetensors b/checkpoint-24416/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-24416/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-24416/model-00004-of-00007.safetensors b/checkpoint-24416/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-24416/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-24416/model-00005-of-00007.safetensors b/checkpoint-24416/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-24416/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-24416/model-00006-of-00007.safetensors b/checkpoint-24416/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..306080971988fd34230c01531f2221aa7003f98b --- /dev/null +++ b/checkpoint-24416/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40b299a3c85f59a8ff2aef8ebc57b168a930faf7e86e4a9305244f4cc66096a9 +size 4999813120 diff --git a/checkpoint-24416/model-00007-of-00007.safetensors b/checkpoint-24416/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5018067077fca2785cbc200a4f9371dc8f306810 --- /dev/null +++ b/checkpoint-24416/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00a60f2ef8ab8ec4ae2a96f24426926b5c927dc62ce7e7b4f750357fc9a60f24 +size 2571158184 diff --git a/checkpoint-24416/model.safetensors.index.json b/checkpoint-24416/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-24416/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-24416/optimizer.pt b/checkpoint-24416/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f291b034942e3628c41adf8d15ff3168c000fb60 --- /dev/null +++ b/checkpoint-24416/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8b282b0d72cc7aaa734071dc0fc69de5eee1c883c21512a48dad823c16ddb81 +size 15385036334 diff --git a/checkpoint-24416/rng_state.pth b/checkpoint-24416/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-24416/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-24416/scheduler.pt b/checkpoint-24416/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c42ab2446b20c095538f06fcf92f01ac58007a07 --- /dev/null +++ b/checkpoint-24416/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:719f421c0e2563868e52a38d7c300a4ceee2dbf15648505f514dae6bb8a5e723 +size 1064 diff --git a/checkpoint-24416/trainer_state.json b/checkpoint-24416/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..049363ef1308e3a12948292e1d001eb6fd5f653b --- /dev/null +++ b/checkpoint-24416/trainer_state.json @@ -0,0 +1,5542 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1712003870519891, + "eval_steps": 500, + "global_step": 24416, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021736615328520894, + "grad_norm": 5.709163665771484, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.9228, + "step": 31 + }, + { + "epoch": 0.0004347323065704179, + "grad_norm": 3.9758756160736084, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8276, + "step": 62 + }, + { + "epoch": 0.0006520984598556268, + "grad_norm": 3.721677303314209, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7897, + "step": 93 + }, + { + "epoch": 0.0008694646131408358, + "grad_norm": 3.384953498840332, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7641, + "step": 124 + }, + { + "epoch": 0.0010868307664260446, + "grad_norm": 3.19750714302063, + "learning_rate": 5.078636959370905e-06, + "loss": 0.758, + "step": 155 + }, + { + "epoch": 0.0013041969197112536, + "grad_norm": 3.49003267288208, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7644, + "step": 186 + }, + { + "epoch": 0.0015215630729964625, + "grad_norm": 3.323367118835449, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7606, + "step": 217 + }, + { + "epoch": 0.0017389292262816715, + "grad_norm": 3.465822219848633, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7505, + "step": 248 + }, + { + "epoch": 0.0019562953795668804, + "grad_norm": 2.7967450618743896, + "learning_rate": 9.141546526867629e-06, + "loss": 0.7593, + "step": 279 + }, + { + "epoch": 0.002173661532852089, + "grad_norm": 3.5493738651275635, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7514, + "step": 310 + }, + { + "epoch": 0.0023910276861372984, + "grad_norm": 3.514606237411499, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7497, + "step": 341 + }, + { + "epoch": 0.0026083938394225073, + "grad_norm": 3.0246028900146484, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7473, + "step": 372 + }, + { + "epoch": 0.002825759992707716, + "grad_norm": 3.01147723197937, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.739, + "step": 403 + }, + { + "epoch": 0.003043126145992925, + "grad_norm": 3.566333293914795, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7447, + "step": 434 + }, + { + "epoch": 0.003260492299278134, + "grad_norm": 3.8698947429656982, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7447, + "step": 465 + }, + { + "epoch": 0.003477858452563343, + "grad_norm": 2.567028760910034, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7496, + "step": 496 + }, + { + "epoch": 0.003695224605848552, + "grad_norm": 2.8029377460479736, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.7324, + "step": 527 + }, + { + "epoch": 0.003912590759133761, + "grad_norm": 2.862530469894409, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7371, + "step": 558 + }, + { + "epoch": 0.00412995691241897, + "grad_norm": 2.9063901901245117, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.7398, + "step": 589 + }, + { + "epoch": 0.004347323065704178, + "grad_norm": 3.187814235687256, + "learning_rate": 2.031454783748362e-05, + "loss": 0.7362, + "step": 620 + }, + { + "epoch": 0.004564689218989388, + "grad_norm": 2.6158969402313232, + "learning_rate": 2.13302752293578e-05, + "loss": 0.7405, + "step": 651 + }, + { + "epoch": 0.004782055372274597, + "grad_norm": 2.702423334121704, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7313, + "step": 682 + }, + { + "epoch": 0.004999421525559805, + "grad_norm": 2.88313889503479, + "learning_rate": 2.336173001310616e-05, + "loss": 0.7337, + "step": 713 + }, + { + "epoch": 0.0052167876788450146, + "grad_norm": 2.8978841304779053, + "learning_rate": 2.437745740498034e-05, + "loss": 0.73, + "step": 744 + }, + { + "epoch": 0.005434153832130223, + "grad_norm": 2.791414976119995, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.7297, + "step": 775 + }, + { + "epoch": 0.005651519985415432, + "grad_norm": 2.432821035385132, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7326, + "step": 806 + }, + { + "epoch": 0.0058688861387006415, + "grad_norm": 2.430279493331909, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.7333, + "step": 837 + }, + { + "epoch": 0.00608625229198585, + "grad_norm": 2.5513761043548584, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.7296, + "step": 868 + }, + { + "epoch": 0.006303618445271059, + "grad_norm": 2.478562116622925, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.7303, + "step": 899 + }, + { + "epoch": 0.006520984598556268, + "grad_norm": 2.5496723651885986, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.7303, + "step": 930 + }, + { + "epoch": 0.006738350751841477, + "grad_norm": 2.114060640335083, + "learning_rate": 3.148754914809961e-05, + "loss": 0.7184, + "step": 961 + }, + { + "epoch": 0.006955716905126686, + "grad_norm": 2.325977325439453, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.7268, + "step": 992 + }, + { + "epoch": 0.0071730830584118945, + "grad_norm": 2.0549662113189697, + "learning_rate": 3.351900393184797e-05, + "loss": 0.724, + "step": 1023 + }, + { + "epoch": 0.007390449211697104, + "grad_norm": 1.913522481918335, + "learning_rate": 3.453473132372215e-05, + "loss": 0.7206, + "step": 1054 + }, + { + "epoch": 0.007607815364982313, + "grad_norm": 2.0651443004608154, + "learning_rate": 3.555045871559633e-05, + "loss": 0.7239, + "step": 1085 + }, + { + "epoch": 0.007825181518267521, + "grad_norm": 2.2482309341430664, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.728, + "step": 1116 + }, + { + "epoch": 0.00804254767155273, + "grad_norm": 2.349695920944214, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.721, + "step": 1147 + }, + { + "epoch": 0.00825991382483794, + "grad_norm": 2.218843460083008, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.7248, + "step": 1178 + }, + { + "epoch": 0.00847727997812315, + "grad_norm": 2.0839340686798096, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.7233, + "step": 1209 + }, + { + "epoch": 0.008694646131408357, + "grad_norm": 2.0757343769073486, + "learning_rate": 4.062909567496724e-05, + "loss": 0.7166, + "step": 1240 + }, + { + "epoch": 0.008912012284693566, + "grad_norm": 2.052342653274536, + "learning_rate": 4.164482306684142e-05, + "loss": 0.7256, + "step": 1271 + }, + { + "epoch": 0.009129378437978775, + "grad_norm": 2.1202704906463623, + "learning_rate": 4.26605504587156e-05, + "loss": 0.7131, + "step": 1302 + }, + { + "epoch": 0.009346744591263985, + "grad_norm": 2.1945958137512207, + "learning_rate": 4.367627785058978e-05, + "loss": 0.708, + "step": 1333 + }, + { + "epoch": 0.009564110744549194, + "grad_norm": 2.106307029724121, + "learning_rate": 4.469200524246396e-05, + "loss": 0.7139, + "step": 1364 + }, + { + "epoch": 0.009781476897834401, + "grad_norm": 2.3779594898223877, + "learning_rate": 4.570773263433814e-05, + "loss": 0.7205, + "step": 1395 + }, + { + "epoch": 0.00999884305111961, + "grad_norm": 2.001551866531372, + "learning_rate": 4.672346002621232e-05, + "loss": 0.7162, + "step": 1426 + }, + { + "epoch": 0.01021620920440482, + "grad_norm": 2.6994752883911133, + "learning_rate": 4.77391874180865e-05, + "loss": 0.7215, + "step": 1457 + }, + { + "epoch": 0.010433575357690029, + "grad_norm": 2.071122884750366, + "learning_rate": 4.875491480996068e-05, + "loss": 0.7095, + "step": 1488 + }, + { + "epoch": 0.010650941510975238, + "grad_norm": 2.3666610717773438, + "learning_rate": 4.977064220183487e-05, + "loss": 0.7089, + "step": 1519 + }, + { + "epoch": 0.010868307664260446, + "grad_norm": 2.130204677581787, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.7043, + "step": 1550 + }, + { + "epoch": 0.011085673817545655, + "grad_norm": 1.7413716316223145, + "learning_rate": 4.999955597496219e-05, + "loss": 0.7062, + "step": 1581 + }, + { + "epoch": 0.011303039970830864, + "grad_norm": 1.7087843418121338, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.7175, + "step": 1612 + }, + { + "epoch": 0.011520406124116074, + "grad_norm": 2.574871301651001, + "learning_rate": 4.999799067923527e-05, + "loss": 0.7076, + "step": 1643 + }, + { + "epoch": 0.011737772277401283, + "grad_norm": 2.0530433654785156, + "learning_rate": 4.999678487776908e-05, + "loss": 0.708, + "step": 1674 + }, + { + "epoch": 0.011955138430686492, + "grad_norm": 1.9184463024139404, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.7011, + "step": 1705 + }, + { + "epoch": 0.0121725045839717, + "grad_norm": 1.768115520477295, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6995, + "step": 1736 + }, + { + "epoch": 0.012389870737256909, + "grad_norm": 1.6692063808441162, + "learning_rate": 4.999147503179668e-05, + "loss": 0.7019, + "step": 1767 + }, + { + "epoch": 0.012607236890542118, + "grad_norm": 1.803944706916809, + "learning_rate": 4.998914100252672e-05, + "loss": 0.703, + "step": 1798 + }, + { + "epoch": 0.012824603043827328, + "grad_norm": 1.8957322835922241, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6979, + "step": 1829 + }, + { + "epoch": 0.013041969197112537, + "grad_norm": 1.544054627418518, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.7012, + "step": 1860 + }, + { + "epoch": 0.013259335350397744, + "grad_norm": 1.7066351175308228, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6949, + "step": 1891 + }, + { + "epoch": 0.013476701503682954, + "grad_norm": 1.7045214176177979, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.6871, + "step": 1922 + }, + { + "epoch": 0.013694067656968163, + "grad_norm": 2.0414485931396484, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6944, + "step": 1953 + }, + { + "epoch": 0.013911433810253372, + "grad_norm": 1.9210485219955444, + "learning_rate": 4.99692159912661e-05, + "loss": 0.7013, + "step": 1984 + }, + { + "epoch": 0.014128799963538581, + "grad_norm": 1.8733997344970703, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6991, + "step": 2015 + }, + { + "epoch": 0.014346166116823789, + "grad_norm": 1.7118934392929077, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6873, + "step": 2046 + }, + { + "epoch": 0.014563532270108998, + "grad_norm": 1.889208197593689, + "learning_rate": 4.995544899210594e-05, + "loss": 0.692, + "step": 2077 + }, + { + "epoch": 0.014780898423394207, + "grad_norm": 2.0081522464752197, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.6823, + "step": 2108 + }, + { + "epoch": 0.014998264576679417, + "grad_norm": 1.7950328588485718, + "learning_rate": 4.994486281210429e-05, + "loss": 0.686, + "step": 2139 + }, + { + "epoch": 0.015215630729964626, + "grad_norm": 1.758333444595337, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6827, + "step": 2170 + }, + { + "epoch": 0.015432996883249834, + "grad_norm": 1.496063470840454, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6896, + "step": 2201 + }, + { + "epoch": 0.015650363036535043, + "grad_norm": 1.6103190183639526, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6865, + "step": 2232 + }, + { + "epoch": 0.015867729189820254, + "grad_norm": 1.7069604396820068, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6836, + "step": 2263 + }, + { + "epoch": 0.01608509534310546, + "grad_norm": 1.8261148929595947, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.692, + "step": 2294 + }, + { + "epoch": 0.01630246149639067, + "grad_norm": 1.7522642612457275, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6832, + "step": 2325 + }, + { + "epoch": 0.01651982764967588, + "grad_norm": 1.6746042966842651, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6829, + "step": 2356 + }, + { + "epoch": 0.016737193802961087, + "grad_norm": 1.6785473823547363, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6842, + "step": 2387 + }, + { + "epoch": 0.0169545599562463, + "grad_norm": 1.936916470527649, + "learning_rate": 4.988329891293693e-05, + "loss": 0.6798, + "step": 2418 + }, + { + "epoch": 0.017171926109531506, + "grad_norm": 1.5104129314422607, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6743, + "step": 2449 + }, + { + "epoch": 0.017389292262816713, + "grad_norm": 1.3770678043365479, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.6874, + "step": 2480 + }, + { + "epoch": 0.017606658416101924, + "grad_norm": 1.4900861978530884, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.679, + "step": 2511 + }, + { + "epoch": 0.017824024569387132, + "grad_norm": 1.5600172281265259, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6719, + "step": 2542 + }, + { + "epoch": 0.018041390722672343, + "grad_norm": 1.660237431526184, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6773, + "step": 2573 + }, + { + "epoch": 0.01825875687595755, + "grad_norm": 1.4618791341781616, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6767, + "step": 2604 + }, + { + "epoch": 0.018476123029242758, + "grad_norm": 1.5567563772201538, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6706, + "step": 2635 + }, + { + "epoch": 0.01869348918252797, + "grad_norm": 1.4879790544509888, + "learning_rate": 4.980947995086024e-05, + "loss": 0.676, + "step": 2666 + }, + { + "epoch": 0.018910855335813177, + "grad_norm": 1.5527766942977905, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6734, + "step": 2697 + }, + { + "epoch": 0.019128221489098388, + "grad_norm": 1.7922642230987549, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6813, + "step": 2728 + }, + { + "epoch": 0.019345587642383595, + "grad_norm": 1.6453475952148438, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6749, + "step": 2759 + }, + { + "epoch": 0.019562953795668803, + "grad_norm": 1.4713648557662964, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6761, + "step": 2790 + }, + { + "epoch": 0.019780319948954014, + "grad_norm": 1.5513399839401245, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6739, + "step": 2821 + }, + { + "epoch": 0.01999768610223922, + "grad_norm": 1.431796908378601, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6639, + "step": 2852 + }, + { + "epoch": 0.020215052255524432, + "grad_norm": 1.4959752559661865, + "learning_rate": 4.973018858007122e-05, + "loss": 0.662, + "step": 2883 + }, + { + "epoch": 0.02043241840880964, + "grad_norm": 1.4675205945968628, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6659, + "step": 2914 + }, + { + "epoch": 0.020649784562094847, + "grad_norm": 1.7111692428588867, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6732, + "step": 2945 + }, + { + "epoch": 0.020867150715380058, + "grad_norm": 1.542748212814331, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6666, + "step": 2976 + }, + { + "epoch": 0.021084516868665266, + "grad_norm": 1.5013272762298584, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.667, + "step": 3007 + }, + { + "epoch": 0.021301883021950477, + "grad_norm": 1.57589852809906, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6668, + "step": 3038 + }, + { + "epoch": 0.021519249175235684, + "grad_norm": 1.612727403640747, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6653, + "step": 3069 + }, + { + "epoch": 0.021736615328520892, + "grad_norm": 1.5110771656036377, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6694, + "step": 3100 + }, + { + "epoch": 0.021953981481806103, + "grad_norm": 1.5836044549942017, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6634, + "step": 3131 + }, + { + "epoch": 0.02217134763509131, + "grad_norm": 1.5767654180526733, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6633, + "step": 3162 + }, + { + "epoch": 0.02238871378837652, + "grad_norm": 1.6058984994888306, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6646, + "step": 3193 + }, + { + "epoch": 0.02260607994166173, + "grad_norm": 1.4564005136489868, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.663, + "step": 3224 + }, + { + "epoch": 0.022823446094946936, + "grad_norm": 1.5274450778961182, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6554, + "step": 3255 + }, + { + "epoch": 0.023040812248232147, + "grad_norm": 1.524122953414917, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6666, + "step": 3286 + }, + { + "epoch": 0.023258178401517355, + "grad_norm": 1.5017430782318115, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6661, + "step": 3317 + }, + { + "epoch": 0.023475544554802566, + "grad_norm": 1.4247208833694458, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6563, + "step": 3348 + }, + { + "epoch": 0.023692910708087773, + "grad_norm": 1.3451093435287476, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6555, + "step": 3379 + }, + { + "epoch": 0.023910276861372984, + "grad_norm": 1.418442726135254, + "learning_rate": 4.948079823064559e-05, + "loss": 0.6587, + "step": 3410 + }, + { + "epoch": 0.024127643014658192, + "grad_norm": 1.5460575819015503, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6602, + "step": 3441 + }, + { + "epoch": 0.0243450091679434, + "grad_norm": 1.4053966999053955, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6578, + "step": 3472 + }, + { + "epoch": 0.02456237532122861, + "grad_norm": 1.420806646347046, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6544, + "step": 3503 + }, + { + "epoch": 0.024779741474513818, + "grad_norm": 1.3220508098602295, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6642, + "step": 3534 + }, + { + "epoch": 0.02499710762779903, + "grad_norm": 1.3819468021392822, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6534, + "step": 3565 + }, + { + "epoch": 0.025214473781084237, + "grad_norm": 1.4046236276626587, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6547, + "step": 3596 + }, + { + "epoch": 0.025431839934369444, + "grad_norm": 1.965153455734253, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6582, + "step": 3627 + }, + { + "epoch": 0.025649206087654655, + "grad_norm": 1.6758291721343994, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6536, + "step": 3658 + }, + { + "epoch": 0.025866572240939863, + "grad_norm": 1.4467246532440186, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6564, + "step": 3689 + }, + { + "epoch": 0.026083938394225074, + "grad_norm": 1.5305490493774414, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6626, + "step": 3720 + }, + { + "epoch": 0.02630130454751028, + "grad_norm": 1.3826133012771606, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6624, + "step": 3751 + }, + { + "epoch": 0.02651867070079549, + "grad_norm": 1.4920278787612915, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6417, + "step": 3782 + }, + { + "epoch": 0.0267360368540807, + "grad_norm": 14.533783912658691, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.647, + "step": 3813 + }, + { + "epoch": 0.026953403007365907, + "grad_norm": 1.4659409523010254, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6514, + "step": 3844 + }, + { + "epoch": 0.02717076916065112, + "grad_norm": 1.4820138216018677, + "learning_rate": 4.919443027766935e-05, + "loss": 0.6477, + "step": 3875 + }, + { + "epoch": 0.027388135313936326, + "grad_norm": 1.2543880939483643, + "learning_rate": 4.91731463569418e-05, + "loss": 0.6505, + "step": 3906 + }, + { + "epoch": 0.027605501467221533, + "grad_norm": 1.640926718711853, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6488, + "step": 3937 + }, + { + "epoch": 0.027822867620506744, + "grad_norm": 1.3990615606307983, + "learning_rate": 4.912976038673786e-05, + "loss": 0.658, + "step": 3968 + }, + { + "epoch": 0.028040233773791952, + "grad_norm": 1.2064067125320435, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6487, + "step": 3999 + }, + { + "epoch": 0.028257599927077163, + "grad_norm": 1.5103769302368164, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6569, + "step": 4030 + }, + { + "epoch": 0.02847496608036237, + "grad_norm": 1.5314087867736816, + "learning_rate": 4.906263980464644e-05, + "loss": 0.6638, + "step": 4061 + }, + { + "epoch": 0.028692332233647578, + "grad_norm": 1.4619168043136597, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6505, + "step": 4092 + }, + { + "epoch": 0.02890969838693279, + "grad_norm": 1.3979246616363525, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6505, + "step": 4123 + }, + { + "epoch": 0.029127064540217996, + "grad_norm": 1.3866580724716187, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6461, + "step": 4154 + }, + { + "epoch": 0.029344430693503207, + "grad_norm": 1.4662801027297974, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6448, + "step": 4185 + }, + { + "epoch": 0.029561796846788415, + "grad_norm": 1.3171806335449219, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6525, + "step": 4216 + }, + { + "epoch": 0.029779163000073623, + "grad_norm": 1.3289718627929688, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6445, + "step": 4247 + }, + { + "epoch": 0.029996529153358834, + "grad_norm": 1.3527250289916992, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6449, + "step": 4278 + }, + { + "epoch": 0.03021389530664404, + "grad_norm": 1.314674735069275, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6559, + "step": 4309 + }, + { + "epoch": 0.030431261459929252, + "grad_norm": 1.2237507104873657, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6468, + "step": 4340 + }, + { + "epoch": 0.03064862761321446, + "grad_norm": 1.423965573310852, + "learning_rate": 4.882129447892753e-05, + "loss": 0.6465, + "step": 4371 + }, + { + "epoch": 0.030865993766499667, + "grad_norm": 1.271371603012085, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6471, + "step": 4402 + }, + { + "epoch": 0.031083359919784878, + "grad_norm": 1.5492364168167114, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6444, + "step": 4433 + }, + { + "epoch": 0.031300726073070086, + "grad_norm": 1.5139328241348267, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6512, + "step": 4464 + }, + { + "epoch": 0.03151809222635529, + "grad_norm": 1.4510358572006226, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6485, + "step": 4495 + }, + { + "epoch": 0.03173545837964051, + "grad_norm": 1.3228867053985596, + "learning_rate": 4.869052379269719e-05, + "loss": 0.6416, + "step": 4526 + }, + { + "epoch": 0.031952824532925715, + "grad_norm": 1.2731959819793701, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6477, + "step": 4557 + }, + { + "epoch": 0.03217019068621092, + "grad_norm": 1.2540090084075928, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6412, + "step": 4588 + }, + { + "epoch": 0.03238755683949613, + "grad_norm": 1.662154197692871, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6464, + "step": 4619 + }, + { + "epoch": 0.03260492299278134, + "grad_norm": 1.5419702529907227, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6421, + "step": 4650 + }, + { + "epoch": 0.03282228914606655, + "grad_norm": 1.6409112215042114, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6468, + "step": 4681 + }, + { + "epoch": 0.03303965529935176, + "grad_norm": 1.1416597366333008, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6396, + "step": 4712 + }, + { + "epoch": 0.03325702145263697, + "grad_norm": 1.215846061706543, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6517, + "step": 4743 + }, + { + "epoch": 0.033474387605922175, + "grad_norm": 1.3075084686279297, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6389, + "step": 4774 + }, + { + "epoch": 0.03369175375920738, + "grad_norm": 1.6089972257614136, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6371, + "step": 4805 + }, + { + "epoch": 0.0339091199124926, + "grad_norm": 1.3927685022354126, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6355, + "step": 4836 + }, + { + "epoch": 0.034126486065777804, + "grad_norm": 1.197952389717102, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6415, + "step": 4867 + }, + { + "epoch": 0.03434385221906301, + "grad_norm": 1.2738877534866333, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6442, + "step": 4898 + }, + { + "epoch": 0.03456121837234822, + "grad_norm": 1.5164271593093872, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6406, + "step": 4929 + }, + { + "epoch": 0.03477858452563343, + "grad_norm": 1.242473840713501, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6441, + "step": 4960 + }, + { + "epoch": 0.03499595067891864, + "grad_norm": 1.2919869422912598, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6487, + "step": 4991 + }, + { + "epoch": 0.03521331683220385, + "grad_norm": 2.6493895053863525, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6398, + "step": 5022 + }, + { + "epoch": 0.035430682985489056, + "grad_norm": 1.2134305238723755, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6398, + "step": 5053 + }, + { + "epoch": 0.035648049138774264, + "grad_norm": 1.1468703746795654, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.633, + "step": 5084 + }, + { + "epoch": 0.03586541529205947, + "grad_norm": 1.3923726081848145, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6295, + "step": 5115 + }, + { + "epoch": 0.036082781445344686, + "grad_norm": 1.5531644821166992, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6345, + "step": 5146 + }, + { + "epoch": 0.036300147598629894, + "grad_norm": 1.201889991760254, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6414, + "step": 5177 + }, + { + "epoch": 0.0365175137519151, + "grad_norm": 1.4365577697753906, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6364, + "step": 5208 + }, + { + "epoch": 0.03673487990520031, + "grad_norm": 1.210980772972107, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6353, + "step": 5239 + }, + { + "epoch": 0.036952246058485516, + "grad_norm": 1.379381775856018, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6293, + "step": 5270 + }, + { + "epoch": 0.03716961221177073, + "grad_norm": 1.2263178825378418, + "learning_rate": 4.793722210363262e-05, + "loss": 0.629, + "step": 5301 + }, + { + "epoch": 0.03738697836505594, + "grad_norm": 1.2448405027389526, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6474, + "step": 5332 + }, + { + "epoch": 0.037604344518341146, + "grad_norm": 1.257132887840271, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6426, + "step": 5363 + }, + { + "epoch": 0.03782171067162635, + "grad_norm": 1.3763643503189087, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6396, + "step": 5394 + }, + { + "epoch": 0.03803907682491156, + "grad_norm": 1.3164068460464478, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6343, + "step": 5425 + }, + { + "epoch": 0.038256442978196775, + "grad_norm": 1.2034872770309448, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.629, + "step": 5456 + }, + { + "epoch": 0.03847380913148198, + "grad_norm": 1.3156630992889404, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6403, + "step": 5487 + }, + { + "epoch": 0.03869117528476719, + "grad_norm": 1.0719150304794312, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6318, + "step": 5518 + }, + { + "epoch": 0.0389085414380524, + "grad_norm": 1.3054882287979126, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6419, + "step": 5549 + }, + { + "epoch": 0.039125907591337605, + "grad_norm": 1.2081729173660278, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6461, + "step": 5580 + }, + { + "epoch": 0.03934327374462282, + "grad_norm": 1.1728904247283936, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6473, + "step": 5611 + }, + { + "epoch": 0.03956063989790803, + "grad_norm": 1.2552399635314941, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.6373, + "step": 5642 + }, + { + "epoch": 0.039778006051193235, + "grad_norm": 1.299212098121643, + "learning_rate": 4.751783684659e-05, + "loss": 0.6234, + "step": 5673 + }, + { + "epoch": 0.03999537220447844, + "grad_norm": 2.0746827125549316, + "learning_rate": 4.748122674890348e-05, + "loss": 0.6264, + "step": 5704 + }, + { + "epoch": 0.04021273835776365, + "grad_norm": 1.3712407350540161, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6252, + "step": 5735 + }, + { + "epoch": 0.040430104511048864, + "grad_norm": 1.2094186544418335, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6265, + "step": 5766 + }, + { + "epoch": 0.04064747066433407, + "grad_norm": 1.2487757205963135, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6305, + "step": 5797 + }, + { + "epoch": 0.04086483681761928, + "grad_norm": 1.5805151462554932, + "learning_rate": 4.733225355658999e-05, + "loss": 0.631, + "step": 5828 + }, + { + "epoch": 0.04108220297090449, + "grad_norm": 1.2615118026733398, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.636, + "step": 5859 + }, + { + "epoch": 0.041299569124189695, + "grad_norm": 1.6970707178115845, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6271, + "step": 5890 + }, + { + "epoch": 0.04151693527747491, + "grad_norm": 1.4279624223709106, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6267, + "step": 5921 + }, + { + "epoch": 0.041734301430760117, + "grad_norm": 1.3471580743789673, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6289, + "step": 5952 + }, + { + "epoch": 0.041951667584045324, + "grad_norm": 1.5088621377944946, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6263, + "step": 5983 + }, + { + "epoch": 0.04216903373733053, + "grad_norm": 1.3061436414718628, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6266, + "step": 6014 + }, + { + "epoch": 0.04238639989061574, + "grad_norm": 1.1800014972686768, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6259, + "step": 6045 + }, + { + "epoch": 0.042603766043900954, + "grad_norm": 1.195177435874939, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6251, + "step": 6076 + }, + { + "epoch": 0.04282113219718616, + "grad_norm": 1.1905118227005005, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6217, + "step": 6107 + }, + { + "epoch": 0.04303849835047137, + "grad_norm": 1.2270928621292114, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6352, + "step": 6138 + }, + { + "epoch": 0.043255864503756576, + "grad_norm": 1.209226369857788, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6249, + "step": 6169 + }, + { + "epoch": 0.043473230657041784, + "grad_norm": 1.1949187517166138, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6371, + "step": 6200 + }, + { + "epoch": 0.043690596810327, + "grad_norm": 1.2346535921096802, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6177, + "step": 6231 + }, + { + "epoch": 0.043907962963612206, + "grad_norm": 1.2187124490737915, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6268, + "step": 6262 + }, + { + "epoch": 0.04412532911689741, + "grad_norm": 1.2187339067459106, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6261, + "step": 6293 + }, + { + "epoch": 0.04434269527018262, + "grad_norm": 1.320764422416687, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6212, + "step": 6324 + }, + { + "epoch": 0.04456006142346783, + "grad_norm": 1.3396878242492676, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.6241, + "step": 6355 + }, + { + "epoch": 0.04477742757675304, + "grad_norm": 1.2472412586212158, + "learning_rate": 4.661238099862658e-05, + "loss": 0.6241, + "step": 6386 + }, + { + "epoch": 0.04499479373003825, + "grad_norm": 1.3773880004882812, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6245, + "step": 6417 + }, + { + "epoch": 0.04521215988332346, + "grad_norm": 1.1602933406829834, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.6221, + "step": 6448 + }, + { + "epoch": 0.045429526036608665, + "grad_norm": 1.1719809770584106, + "learning_rate": 4.648464661063478e-05, + "loss": 0.629, + "step": 6479 + }, + { + "epoch": 0.04564689218989387, + "grad_norm": 1.1973191499710083, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.6239, + "step": 6510 + }, + { + "epoch": 0.04586425834317909, + "grad_norm": 1.1805142164230347, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.6242, + "step": 6541 + }, + { + "epoch": 0.046081624496464295, + "grad_norm": 1.3194178342819214, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6316, + "step": 6572 + }, + { + "epoch": 0.0462989906497495, + "grad_norm": 1.1395046710968018, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6207, + "step": 6603 + }, + { + "epoch": 0.04651635680303471, + "grad_norm": 1.5031641721725464, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6218, + "step": 6634 + }, + { + "epoch": 0.04673372295631992, + "grad_norm": 1.0985206365585327, + "learning_rate": 4.622264489304762e-05, + "loss": 0.6198, + "step": 6665 + }, + { + "epoch": 0.04695108910960513, + "grad_norm": 1.2820847034454346, + "learning_rate": 4.617813681048434e-05, + "loss": 0.626, + "step": 6696 + }, + { + "epoch": 0.04716845526289034, + "grad_norm": 1.4487061500549316, + "learning_rate": 4.61333897355256e-05, + "loss": 0.622, + "step": 6727 + }, + { + "epoch": 0.04738582141617555, + "grad_norm": 1.1577301025390625, + "learning_rate": 4.608840417313604e-05, + "loss": 0.6244, + "step": 6758 + }, + { + "epoch": 0.047603187569460755, + "grad_norm": 1.1363381147384644, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6206, + "step": 6789 + }, + { + "epoch": 0.04782055372274597, + "grad_norm": 1.22281813621521, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6231, + "step": 6820 + }, + { + "epoch": 0.048037919876031177, + "grad_norm": 1.156031847000122, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6223, + "step": 6851 + }, + { + "epoch": 0.048255286029316384, + "grad_norm": 1.1688473224639893, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6293, + "step": 6882 + }, + { + "epoch": 0.04847265218260159, + "grad_norm": 1.4265236854553223, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6109, + "step": 6913 + }, + { + "epoch": 0.0486900183358868, + "grad_norm": 1.1242969036102295, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6256, + "step": 6944 + }, + { + "epoch": 0.048907384489172014, + "grad_norm": 1.3943792581558228, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6093, + "step": 6975 + }, + { + "epoch": 0.04912475064245722, + "grad_norm": 1.0761889219284058, + "learning_rate": 4.571999560773736e-05, + "loss": 0.6207, + "step": 7006 + }, + { + "epoch": 0.04934211679574243, + "grad_norm": 1.2784191370010376, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6227, + "step": 7037 + }, + { + "epoch": 0.049559482949027636, + "grad_norm": 1.4215577840805054, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6148, + "step": 7068 + }, + { + "epoch": 0.049776849102312844, + "grad_norm": 1.1120682954788208, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.6192, + "step": 7099 + }, + { + "epoch": 0.04999421525559806, + "grad_norm": 1.1718186140060425, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6179, + "step": 7130 + }, + { + "epoch": 0.050211581408883266, + "grad_norm": 1.5078628063201904, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6245, + "step": 7161 + }, + { + "epoch": 0.05042894756216847, + "grad_norm": 1.1740144491195679, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6242, + "step": 7192 + }, + { + "epoch": 0.05064631371545368, + "grad_norm": 1.2450133562088013, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6194, + "step": 7223 + }, + { + "epoch": 0.05086367986873889, + "grad_norm": 1.181887149810791, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6188, + "step": 7254 + }, + { + "epoch": 0.0510810460220241, + "grad_norm": 1.172691822052002, + "learning_rate": 4.528766329570536e-05, + "loss": 0.6191, + "step": 7285 + }, + { + "epoch": 0.05129841217530931, + "grad_norm": 1.148863673210144, + "learning_rate": 4.523847434837447e-05, + "loss": 0.6234, + "step": 7316 + }, + { + "epoch": 0.05151577832859452, + "grad_norm": 1.2533507347106934, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6166, + "step": 7347 + }, + { + "epoch": 0.051733144481879725, + "grad_norm": 1.1888121366500854, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6176, + "step": 7378 + }, + { + "epoch": 0.05195051063516493, + "grad_norm": 1.0766541957855225, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6137, + "step": 7409 + }, + { + "epoch": 0.05216787678845015, + "grad_norm": 1.2277822494506836, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6231, + "step": 7440 + }, + { + "epoch": 0.052385242941735355, + "grad_norm": 1.0808931589126587, + "learning_rate": 4.498911497712155e-05, + "loss": 0.6209, + "step": 7471 + }, + { + "epoch": 0.05260260909502056, + "grad_norm": 1.0840133428573608, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6144, + "step": 7502 + }, + { + "epoch": 0.05281997524830577, + "grad_norm": 1.5881050825119019, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6114, + "step": 7533 + }, + { + "epoch": 0.05303734140159098, + "grad_norm": 1.1180490255355835, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.6134, + "step": 7564 + }, + { + "epoch": 0.05325470755487619, + "grad_norm": 1.1469063758850098, + "learning_rate": 4.478556380141218e-05, + "loss": 0.6122, + "step": 7595 + }, + { + "epoch": 0.0534720737081614, + "grad_norm": 1.2977004051208496, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.615, + "step": 7626 + }, + { + "epoch": 0.05368943986144661, + "grad_norm": 1.0556434392929077, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.6187, + "step": 7657 + }, + { + "epoch": 0.053906806014731815, + "grad_norm": 1.101298451423645, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6123, + "step": 7688 + }, + { + "epoch": 0.05412417216801702, + "grad_norm": 1.262608528137207, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6149, + "step": 7719 + }, + { + "epoch": 0.05434153832130224, + "grad_norm": 1.1554538011550903, + "learning_rate": 4.452610552959183e-05, + "loss": 0.6168, + "step": 7750 + }, + { + "epoch": 0.054558904474587444, + "grad_norm": 1.1653157472610474, + "learning_rate": 4.447355047201428e-05, + "loss": 0.615, + "step": 7781 + }, + { + "epoch": 0.05477627062787265, + "grad_norm": 1.0511231422424316, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.6215, + "step": 7812 + }, + { + "epoch": 0.05499363678115786, + "grad_norm": 1.3468266725540161, + "learning_rate": 4.436778168330484e-05, + "loss": 0.6221, + "step": 7843 + }, + { + "epoch": 0.05521100293444307, + "grad_norm": 1.3238797187805176, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.611, + "step": 7874 + }, + { + "epoch": 0.05542836908772828, + "grad_norm": 1.1107763051986694, + "learning_rate": 4.42611386459262e-05, + "loss": 0.6119, + "step": 7905 + }, + { + "epoch": 0.05564573524101349, + "grad_norm": 1.3667259216308594, + "learning_rate": 4.420749078676133e-05, + "loss": 0.6134, + "step": 7936 + }, + { + "epoch": 0.055863101394298696, + "grad_norm": 1.0336949825286865, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.613, + "step": 7967 + }, + { + "epoch": 0.056080467547583904, + "grad_norm": 1.0911316871643066, + "learning_rate": 4.409954541451762e-05, + "loss": 0.6107, + "step": 7998 + }, + { + "epoch": 0.05629783370086911, + "grad_norm": 1.2516382932662964, + "learning_rate": 4.404524911958764e-05, + "loss": 0.6138, + "step": 8029 + }, + { + "epoch": 0.056515199854154326, + "grad_norm": 1.1678277254104614, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6162, + "step": 8060 + }, + { + "epoch": 0.05673256600743953, + "grad_norm": 1.4072996377944946, + "learning_rate": 4.393601237573607e-05, + "loss": 0.614, + "step": 8091 + }, + { + "epoch": 0.05694993216072474, + "grad_norm": 1.2312837839126587, + "learning_rate": 4.388107315953628e-05, + "loss": 0.6203, + "step": 8122 + }, + { + "epoch": 0.05716729831400995, + "grad_norm": 1.1720649003982544, + "learning_rate": 4.382592087299212e-05, + "loss": 0.619, + "step": 8153 + }, + { + "epoch": 0.057384664467295156, + "grad_norm": 1.0711950063705444, + "learning_rate": 4.377055613848964e-05, + "loss": 0.6124, + "step": 8184 + }, + { + "epoch": 0.05760203062058037, + "grad_norm": 1.1664263010025024, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6073, + "step": 8215 + }, + { + "epoch": 0.05781939677386558, + "grad_norm": 0.9937831163406372, + "learning_rate": 4.365919182713416e-05, + "loss": 0.6101, + "step": 8246 + }, + { + "epoch": 0.058036762927150785, + "grad_norm": 1.0545841455459595, + "learning_rate": 4.360319350701226e-05, + "loss": 0.6089, + "step": 8277 + }, + { + "epoch": 0.05825412908043599, + "grad_norm": 1.120007038116455, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6078, + "step": 8308 + }, + { + "epoch": 0.0584714952337212, + "grad_norm": 1.1087831258773804, + "learning_rate": 4.349056769754021e-05, + "loss": 0.601, + "step": 8339 + }, + { + "epoch": 0.058688861387006415, + "grad_norm": 1.1370675563812256, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.6103, + "step": 8370 + }, + { + "epoch": 0.05890622754029162, + "grad_norm": 1.182654857635498, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.6115, + "step": 8401 + }, + { + "epoch": 0.05912359369357683, + "grad_norm": 1.2606432437896729, + "learning_rate": 4.332006561018488e-05, + "loss": 0.6139, + "step": 8432 + }, + { + "epoch": 0.05934095984686204, + "grad_norm": 1.1618047952651978, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.6053, + "step": 8463 + }, + { + "epoch": 0.059558326000147245, + "grad_norm": 1.1281821727752686, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.6084, + "step": 8494 + }, + { + "epoch": 0.05977569215343246, + "grad_norm": 1.15248703956604, + "learning_rate": 4.314770288177384e-05, + "loss": 0.6128, + "step": 8525 + }, + { + "epoch": 0.05999305830671767, + "grad_norm": 1.2170960903167725, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6134, + "step": 8556 + }, + { + "epoch": 0.060210424460002875, + "grad_norm": 1.085260033607483, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.6212, + "step": 8587 + }, + { + "epoch": 0.06042779061328808, + "grad_norm": 1.2243624925613403, + "learning_rate": 4.297349701798505e-05, + "loss": 0.605, + "step": 8618 + }, + { + "epoch": 0.06064515676657329, + "grad_norm": 1.124769687652588, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.6074, + "step": 8649 + }, + { + "epoch": 0.060862522919858504, + "grad_norm": 1.893545150756836, + "learning_rate": 4.285634454093198e-05, + "loss": 0.6055, + "step": 8680 + }, + { + "epoch": 0.06107988907314371, + "grad_norm": 1.2186870574951172, + "learning_rate": 4.279746571169086e-05, + "loss": 0.6121, + "step": 8711 + }, + { + "epoch": 0.06129725522642892, + "grad_norm": 1.0006957054138184, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.6079, + "step": 8742 + }, + { + "epoch": 0.06151462137971413, + "grad_norm": 1.0913995504379272, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.6107, + "step": 8773 + }, + { + "epoch": 0.061731987532999334, + "grad_norm": 1.4647611379623413, + "learning_rate": 4.261962684116106e-05, + "loss": 0.6188, + "step": 8804 + }, + { + "epoch": 0.06194935368628455, + "grad_norm": 1.0805052518844604, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.6137, + "step": 8835 + }, + { + "epoch": 0.062166719839569756, + "grad_norm": 1.0512675046920776, + "learning_rate": 4.250007230372134e-05, + "loss": 0.6052, + "step": 8866 + }, + { + "epoch": 0.062384085992854964, + "grad_norm": 1.1758863925933838, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.6148, + "step": 8897 + }, + { + "epoch": 0.06260145214614017, + "grad_norm": 1.0526351928710938, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.5973, + "step": 8928 + }, + { + "epoch": 0.06281881829942539, + "grad_norm": 1.077563762664795, + "learning_rate": 4.231926105340768e-05, + "loss": 0.6114, + "step": 8959 + }, + { + "epoch": 0.06303618445271059, + "grad_norm": 1.0116938352584839, + "learning_rate": 4.225859883654776e-05, + "loss": 0.6146, + "step": 8990 + }, + { + "epoch": 0.0632535506059958, + "grad_norm": 1.1967271566390991, + "learning_rate": 4.219774185874569e-05, + "loss": 0.5972, + "step": 9021 + }, + { + "epoch": 0.06347091675928102, + "grad_norm": 1.1610004901885986, + "learning_rate": 4.213669080676418e-05, + "loss": 0.6081, + "step": 9052 + }, + { + "epoch": 0.06368828291256622, + "grad_norm": 1.0253360271453857, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.6118, + "step": 9083 + }, + { + "epoch": 0.06390564906585143, + "grad_norm": 1.140599012374878, + "learning_rate": 4.201400923825648e-05, + "loss": 0.6091, + "step": 9114 + }, + { + "epoch": 0.06412301521913663, + "grad_norm": 1.0821545124053955, + "learning_rate": 4.195238010617511e-05, + "loss": 0.6028, + "step": 9145 + }, + { + "epoch": 0.06434038137242185, + "grad_norm": 1.2161924839019775, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6113, + "step": 9176 + }, + { + "epoch": 0.06455774752570706, + "grad_norm": 1.128986120223999, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.6125, + "step": 9207 + }, + { + "epoch": 0.06477511367899226, + "grad_norm": 1.0566622018814087, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.6055, + "step": 9238 + }, + { + "epoch": 0.06499247983227747, + "grad_norm": 1.0849272012710571, + "learning_rate": 4.170395751189495e-05, + "loss": 0.6059, + "step": 9269 + }, + { + "epoch": 0.06520984598556268, + "grad_norm": 1.2344416379928589, + "learning_rate": 4.164137885110921e-05, + "loss": 0.6081, + "step": 9300 + }, + { + "epoch": 0.06542721213884789, + "grad_norm": 1.0132557153701782, + "learning_rate": 4.157861239462495e-05, + "loss": 0.6103, + "step": 9331 + }, + { + "epoch": 0.0656445782921331, + "grad_norm": 1.0176981687545776, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.6086, + "step": 9362 + }, + { + "epoch": 0.0658619444454183, + "grad_norm": 1.2429553270339966, + "learning_rate": 4.145251892991588e-05, + "loss": 0.6091, + "step": 9393 + }, + { + "epoch": 0.06607931059870352, + "grad_norm": 1.1481480598449707, + "learning_rate": 4.138919334463868e-05, + "loss": 0.6048, + "step": 9424 + }, + { + "epoch": 0.06629667675198872, + "grad_norm": 0.9623891711235046, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.5995, + "step": 9455 + }, + { + "epoch": 0.06651404290527393, + "grad_norm": 1.2031786441802979, + "learning_rate": 4.126198804133398e-05, + "loss": 0.6008, + "step": 9486 + }, + { + "epoch": 0.06673140905855915, + "grad_norm": 1.3027530908584595, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.5993, + "step": 9517 + }, + { + "epoch": 0.06694877521184435, + "grad_norm": 1.1712346076965332, + "learning_rate": 4.113404868280107e-05, + "loss": 0.5996, + "step": 9548 + }, + { + "epoch": 0.06716614136512956, + "grad_norm": 1.1397625207901, + "learning_rate": 4.106980553625457e-05, + "loss": 0.5979, + "step": 9579 + }, + { + "epoch": 0.06738350751841476, + "grad_norm": 1.122983694076538, + "learning_rate": 4.100538104413674e-05, + "loss": 0.599, + "step": 9610 + }, + { + "epoch": 0.06760087367169998, + "grad_norm": 1.0782618522644043, + "learning_rate": 4.09407759334692e-05, + "loss": 0.6005, + "step": 9641 + }, + { + "epoch": 0.0678182398249852, + "grad_norm": 1.0563392639160156, + "learning_rate": 4.087599093331186e-05, + "loss": 0.6105, + "step": 9672 + }, + { + "epoch": 0.0680356059782704, + "grad_norm": 1.0684717893600464, + "learning_rate": 4.081102677475462e-05, + "loss": 0.6092, + "step": 9703 + }, + { + "epoch": 0.06825297213155561, + "grad_norm": 1.152811884880066, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.6035, + "step": 9734 + }, + { + "epoch": 0.06847033828484081, + "grad_norm": 1.1413112878799438, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.6018, + "step": 9765 + }, + { + "epoch": 0.06868770443812602, + "grad_norm": 0.981604814529419, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.6035, + "step": 9796 + }, + { + "epoch": 0.06890507059141124, + "grad_norm": 1.0713250637054443, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.597, + "step": 9827 + }, + { + "epoch": 0.06912243674469644, + "grad_norm": 0.949456512928009, + "learning_rate": 4.048354433517794e-05, + "loss": 0.6063, + "step": 9858 + }, + { + "epoch": 0.06933980289798165, + "grad_norm": 1.0702073574066162, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.5963, + "step": 9889 + }, + { + "epoch": 0.06955716905126685, + "grad_norm": 0.9974942803382874, + "learning_rate": 4.035132306369438e-05, + "loss": 0.611, + "step": 9920 + }, + { + "epoch": 0.06977453520455207, + "grad_norm": 1.0833642482757568, + "learning_rate": 4.028495219804555e-05, + "loss": 0.6014, + "step": 9951 + }, + { + "epoch": 0.06999190135783728, + "grad_norm": 1.181322693824768, + "learning_rate": 4.021840884378864e-05, + "loss": 0.599, + "step": 9982 + }, + { + "epoch": 0.07020926751112248, + "grad_norm": 1.0088189840316772, + "learning_rate": 4.015169375185633e-05, + "loss": 0.5955, + "step": 10013 + }, + { + "epoch": 0.0704266336644077, + "grad_norm": 1.0920222997665405, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.5985, + "step": 10044 + }, + { + "epoch": 0.0706439998176929, + "grad_norm": 1.0450975894927979, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.6069, + "step": 10075 + }, + { + "epoch": 0.07086136597097811, + "grad_norm": 1.1613439321517944, + "learning_rate": 3.995052558835377e-05, + "loss": 0.5992, + "step": 10106 + }, + { + "epoch": 0.07107873212426333, + "grad_norm": 1.2720811367034912, + "learning_rate": 3.988313109368017e-05, + "loss": 0.6012, + "step": 10137 + }, + { + "epoch": 0.07129609827754853, + "grad_norm": 1.0315334796905518, + "learning_rate": 3.981556864489504e-05, + "loss": 0.6149, + "step": 10168 + }, + { + "epoch": 0.07151346443083374, + "grad_norm": 1.3320891857147217, + "learning_rate": 3.974783900443142e-05, + "loss": 0.6013, + "step": 10199 + }, + { + "epoch": 0.07173083058411894, + "grad_norm": 1.7277195453643799, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.5983, + "step": 10230 + }, + { + "epoch": 0.07194819673740416, + "grad_norm": 1.1056753396987915, + "learning_rate": 3.961188120762596e-05, + "loss": 0.5952, + "step": 10261 + }, + { + "epoch": 0.07216556289068937, + "grad_norm": 1.3461558818817139, + "learning_rate": 3.954365458554938e-05, + "loss": 0.5975, + "step": 10292 + }, + { + "epoch": 0.07238292904397457, + "grad_norm": 1.1405422687530518, + "learning_rate": 3.947526384030751e-05, + "loss": 0.602, + "step": 10323 + }, + { + "epoch": 0.07260029519725979, + "grad_norm": 1.2912482023239136, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.6014, + "step": 10354 + }, + { + "epoch": 0.07281766135054499, + "grad_norm": 1.1548583507537842, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.6053, + "step": 10385 + }, + { + "epoch": 0.0730350275038302, + "grad_norm": 1.0558302402496338, + "learning_rate": 3.926911459260109e-05, + "loss": 0.6062, + "step": 10416 + }, + { + "epoch": 0.07325239365711542, + "grad_norm": 1.1658024787902832, + "learning_rate": 3.920007509089102e-05, + "loss": 0.6016, + "step": 10447 + }, + { + "epoch": 0.07346975981040062, + "grad_norm": 1.1093552112579346, + "learning_rate": 3.913087534326357e-05, + "loss": 0.6025, + "step": 10478 + }, + { + "epoch": 0.07368712596368583, + "grad_norm": 1.0908467769622803, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.5954, + "step": 10509 + }, + { + "epoch": 0.07390449211697103, + "grad_norm": 1.0856796503067017, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.5983, + "step": 10540 + }, + { + "epoch": 0.07412185827025625, + "grad_norm": 1.0330291986465454, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.5955, + "step": 10571 + }, + { + "epoch": 0.07433922442354146, + "grad_norm": 0.9688258171081543, + "learning_rate": 3.885248953871491e-05, + "loss": 0.6001, + "step": 10602 + }, + { + "epoch": 0.07455659057682666, + "grad_norm": 1.1808383464813232, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.5996, + "step": 10633 + }, + { + "epoch": 0.07477395673011188, + "grad_norm": 1.1267834901809692, + "learning_rate": 3.871235554965218e-05, + "loss": 0.5962, + "step": 10664 + }, + { + "epoch": 0.07499132288339708, + "grad_norm": 1.1001136302947998, + "learning_rate": 3.864205604623078e-05, + "loss": 0.6031, + "step": 10695 + }, + { + "epoch": 0.07520868903668229, + "grad_norm": 1.009621024131775, + "learning_rate": 3.857160259406107e-05, + "loss": 0.5976, + "step": 10726 + }, + { + "epoch": 0.0754260551899675, + "grad_norm": 0.933811366558075, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.604, + "step": 10757 + }, + { + "epoch": 0.0756434213432527, + "grad_norm": 1.2187680006027222, + "learning_rate": 3.843023702543556e-05, + "loss": 0.5983, + "step": 10788 + }, + { + "epoch": 0.07586078749653792, + "grad_norm": 1.0286976099014282, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.596, + "step": 10819 + }, + { + "epoch": 0.07607815364982312, + "grad_norm": 1.0752719640731812, + "learning_rate": 3.828826522492255e-05, + "loss": 0.596, + "step": 10850 + }, + { + "epoch": 0.07629551980310834, + "grad_norm": 1.1018482446670532, + "learning_rate": 3.821705398930713e-05, + "loss": 0.603, + "step": 10881 + }, + { + "epoch": 0.07651288595639355, + "grad_norm": 0.980965793132782, + "learning_rate": 3.814569360103385e-05, + "loss": 0.6021, + "step": 10912 + }, + { + "epoch": 0.07673025210967875, + "grad_norm": 0.9588106870651245, + "learning_rate": 3.807418486539499e-05, + "loss": 0.6017, + "step": 10943 + }, + { + "epoch": 0.07694761826296397, + "grad_norm": 1.2048044204711914, + "learning_rate": 3.80025285893569e-05, + "loss": 0.6049, + "step": 10974 + }, + { + "epoch": 0.07716498441624917, + "grad_norm": 1.0373165607452393, + "learning_rate": 3.793072558155093e-05, + "loss": 0.5914, + "step": 11005 + }, + { + "epoch": 0.07738235056953438, + "grad_norm": 1.0112730264663696, + "learning_rate": 3.785877665226426e-05, + "loss": 0.5949, + "step": 11036 + }, + { + "epoch": 0.0775997167228196, + "grad_norm": 1.0219500064849854, + "learning_rate": 3.778668261343079e-05, + "loss": 0.5918, + "step": 11067 + }, + { + "epoch": 0.0778170828761048, + "grad_norm": 1.1104828119277954, + "learning_rate": 3.771444427862192e-05, + "loss": 0.6029, + "step": 11098 + }, + { + "epoch": 0.07803444902939001, + "grad_norm": 0.9852742552757263, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.5839, + "step": 11129 + }, + { + "epoch": 0.07825181518267521, + "grad_norm": 1.0593205690383911, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.5966, + "step": 11160 + }, + { + "epoch": 0.07846918133596043, + "grad_norm": 1.2858383655548096, + "learning_rate": 3.749687165842753e-05, + "loss": 0.5939, + "step": 11191 + }, + { + "epoch": 0.07868654748924564, + "grad_norm": 1.2006022930145264, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.5965, + "step": 11222 + }, + { + "epoch": 0.07890391364253084, + "grad_norm": 0.9741569757461548, + "learning_rate": 3.735111675341645e-05, + "loss": 0.5971, + "step": 11253 + }, + { + "epoch": 0.07912127979581605, + "grad_norm": 1.0724074840545654, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.5901, + "step": 11284 + }, + { + "epoch": 0.07933864594910126, + "grad_norm": 1.102207064628601, + "learning_rate": 3.720480432728287e-05, + "loss": 0.602, + "step": 11315 + }, + { + "epoch": 0.07955601210238647, + "grad_norm": 1.0976966619491577, + "learning_rate": 3.71314411067092e-05, + "loss": 0.6002, + "step": 11346 + }, + { + "epoch": 0.07977337825567168, + "grad_norm": 1.0534948110580444, + "learning_rate": 3.70579409844715e-05, + "loss": 0.5801, + "step": 11377 + }, + { + "epoch": 0.07999074440895688, + "grad_norm": 1.1666125059127808, + "learning_rate": 3.698430479000865e-05, + "loss": 0.5992, + "step": 11408 + }, + { + "epoch": 0.0802081105622421, + "grad_norm": 1.0316112041473389, + "learning_rate": 3.691053335429509e-05, + "loss": 0.5942, + "step": 11439 + }, + { + "epoch": 0.0804254767155273, + "grad_norm": 1.0085538625717163, + "learning_rate": 3.683662750983147e-05, + "loss": 0.5978, + "step": 11470 + }, + { + "epoch": 0.08064284286881251, + "grad_norm": 1.0875978469848633, + "learning_rate": 3.676258809063518e-05, + "loss": 0.5949, + "step": 11501 + }, + { + "epoch": 0.08086020902209773, + "grad_norm": 1.1905012130737305, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.6018, + "step": 11532 + }, + { + "epoch": 0.08107757517538293, + "grad_norm": 1.1717733144760132, + "learning_rate": 3.661411187164166e-05, + "loss": 0.5986, + "step": 11563 + }, + { + "epoch": 0.08129494132866814, + "grad_norm": 1.053768515586853, + "learning_rate": 3.65396767473784e-05, + "loss": 0.5831, + "step": 11594 + }, + { + "epoch": 0.08151230748195334, + "grad_norm": 1.095425009727478, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.5967, + "step": 11625 + }, + { + "epoch": 0.08172967363523856, + "grad_norm": 1.005953311920166, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.5834, + "step": 11656 + }, + { + "epoch": 0.08194703978852377, + "grad_norm": 1.0991450548171997, + "learning_rate": 3.63155933997859e-05, + "loss": 0.5951, + "step": 11687 + }, + { + "epoch": 0.08216440594180897, + "grad_norm": 1.1069071292877197, + "learning_rate": 3.624064243537758e-05, + "loss": 0.5989, + "step": 11718 + }, + { + "epoch": 0.08238177209509419, + "grad_norm": 1.099542498588562, + "learning_rate": 3.616556462184716e-05, + "loss": 0.5949, + "step": 11749 + }, + { + "epoch": 0.08259913824837939, + "grad_norm": 1.0950508117675781, + "learning_rate": 3.609036080643755e-05, + "loss": 0.5911, + "step": 11780 + }, + { + "epoch": 0.0828165044016646, + "grad_norm": 0.9489970803260803, + "learning_rate": 3.60150318378136e-05, + "loss": 0.5922, + "step": 11811 + }, + { + "epoch": 0.08303387055494982, + "grad_norm": 0.9877674579620361, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.5927, + "step": 11842 + }, + { + "epoch": 0.08325123670823502, + "grad_norm": 1.060177206993103, + "learning_rate": 3.586400184263408e-05, + "loss": 0.5899, + "step": 11873 + }, + { + "epoch": 0.08346860286152023, + "grad_norm": 1.0531491041183472, + "learning_rate": 3.578830252043148e-05, + "loss": 0.6004, + "step": 11904 + }, + { + "epoch": 0.08368596901480543, + "grad_norm": 0.9791742563247681, + "learning_rate": 3.571248145370125e-05, + "loss": 0.5926, + "step": 11935 + }, + { + "epoch": 0.08390333516809065, + "grad_norm": 1.0492030382156372, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.5914, + "step": 11966 + }, + { + "epoch": 0.08412070132137586, + "grad_norm": 1.2035881280899048, + "learning_rate": 3.556047751054378e-05, + "loss": 0.589, + "step": 11997 + }, + { + "epoch": 0.08433806747466106, + "grad_norm": 1.0384137630462646, + "learning_rate": 3.548429634946039e-05, + "loss": 0.5894, + "step": 12028 + }, + { + "epoch": 0.08455543362794628, + "grad_norm": 1.1555323600769043, + "learning_rate": 3.540799687451768e-05, + "loss": 0.5899, + "step": 12059 + }, + { + "epoch": 0.08477279978123148, + "grad_norm": 1.2114530801773071, + "learning_rate": 3.533157994674485e-05, + "loss": 0.5888, + "step": 12090 + }, + { + "epoch": 0.08499016593451669, + "grad_norm": 1.145213007926941, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.5887, + "step": 12121 + }, + { + "epoch": 0.08520753208780191, + "grad_norm": 1.0313454866409302, + "learning_rate": 3.517839718344311e-05, + "loss": 0.598, + "step": 12152 + }, + { + "epoch": 0.08542489824108711, + "grad_norm": 1.018702745437622, + "learning_rate": 3.510163307656086e-05, + "loss": 0.5931, + "step": 12183 + }, + { + "epoch": 0.08564226439437232, + "grad_norm": 1.0365880727767944, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.5954, + "step": 12214 + }, + { + "epoch": 0.08585963054765752, + "grad_norm": 1.0658811330795288, + "learning_rate": 3.494776374368643e-05, + "loss": 0.5953, + "step": 12245 + }, + { + "epoch": 0.08607699670094274, + "grad_norm": 1.046661615371704, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.5786, + "step": 12276 + }, + { + "epoch": 0.08629436285422795, + "grad_norm": 1.0220295190811157, + "learning_rate": 3.479344537543164e-05, + "loss": 0.5879, + "step": 12307 + }, + { + "epoch": 0.08651172900751315, + "grad_norm": 1.0286898612976074, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.5882, + "step": 12338 + }, + { + "epoch": 0.08672909516079837, + "grad_norm": 1.060483694076538, + "learning_rate": 3.463868493762412e-05, + "loss": 0.5887, + "step": 12369 + }, + { + "epoch": 0.08694646131408357, + "grad_norm": 1.264788269996643, + "learning_rate": 3.456114112492418e-05, + "loss": 0.5895, + "step": 12400 + }, + { + "epoch": 0.08716382746736878, + "grad_norm": 0.9751125574111938, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.5846, + "step": 12431 + }, + { + "epoch": 0.087381193620654, + "grad_norm": 1.1198588609695435, + "learning_rate": 3.440573068727905e-05, + "loss": 0.5878, + "step": 12462 + }, + { + "epoch": 0.0875985597739392, + "grad_norm": 1.0425771474838257, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.5903, + "step": 12493 + }, + { + "epoch": 0.08781592592722441, + "grad_norm": 0.9820422530174255, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.5885, + "step": 12524 + }, + { + "epoch": 0.08803329208050961, + "grad_norm": 1.0234076976776123, + "learning_rate": 3.417182116258899e-05, + "loss": 0.5967, + "step": 12555 + }, + { + "epoch": 0.08825065823379483, + "grad_norm": 0.9766228795051575, + "learning_rate": 3.409364314116074e-05, + "loss": 0.5923, + "step": 12586 + }, + { + "epoch": 0.08846802438708004, + "grad_norm": 0.9033060073852539, + "learning_rate": 3.401536249920559e-05, + "loss": 0.5892, + "step": 12617 + }, + { + "epoch": 0.08868539054036524, + "grad_norm": 1.1264934539794922, + "learning_rate": 3.393698012010998e-05, + "loss": 0.5917, + "step": 12648 + }, + { + "epoch": 0.08890275669365046, + "grad_norm": 1.0052556991577148, + "learning_rate": 3.385849688840839e-05, + "loss": 0.5885, + "step": 12679 + }, + { + "epoch": 0.08912012284693566, + "grad_norm": 1.0888159275054932, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.5885, + "step": 12710 + }, + { + "epoch": 0.08933748900022087, + "grad_norm": 1.0458447933197021, + "learning_rate": 3.370123141100578e-05, + "loss": 0.5923, + "step": 12741 + }, + { + "epoch": 0.08955485515350609, + "grad_norm": 1.055434226989746, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.5889, + "step": 12772 + }, + { + "epoch": 0.08977222130679129, + "grad_norm": 1.000533103942871, + "learning_rate": 3.35435731658559e-05, + "loss": 0.5833, + "step": 12803 + }, + { + "epoch": 0.0899895874600765, + "grad_norm": 1.0112768411636353, + "learning_rate": 3.346459897862552e-05, + "loss": 0.5886, + "step": 12834 + }, + { + "epoch": 0.0902069536133617, + "grad_norm": 0.9883559942245483, + "learning_rate": 3.338552926954613e-05, + "loss": 0.5886, + "step": 12865 + }, + { + "epoch": 0.09042431976664692, + "grad_norm": 1.0933095216751099, + "learning_rate": 3.330636493090868e-05, + "loss": 0.5873, + "step": 12896 + }, + { + "epoch": 0.09064168591993213, + "grad_norm": 1.1089582443237305, + "learning_rate": 3.322710685607193e-05, + "loss": 0.5845, + "step": 12927 + }, + { + "epoch": 0.09085905207321733, + "grad_norm": 1.0311237573623657, + "learning_rate": 3.314775593945251e-05, + "loss": 0.588, + "step": 12958 + }, + { + "epoch": 0.09107641822650255, + "grad_norm": 1.0495847463607788, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.5839, + "step": 12989 + }, + { + "epoch": 0.09129378437978775, + "grad_norm": 1.046921730041504, + "learning_rate": 3.298877916376047e-05, + "loss": 0.5877, + "step": 13020 + }, + { + "epoch": 0.09151115053307296, + "grad_norm": 0.9927343130111694, + "learning_rate": 3.290915509871915e-05, + "loss": 0.5898, + "step": 13051 + }, + { + "epoch": 0.09172851668635817, + "grad_norm": 0.9319870471954346, + "learning_rate": 3.282944177993753e-05, + "loss": 0.5886, + "step": 13082 + }, + { + "epoch": 0.09194588283964338, + "grad_norm": 1.0408544540405273, + "learning_rate": 3.274964010696957e-05, + "loss": 0.5946, + "step": 13113 + }, + { + "epoch": 0.09216324899292859, + "grad_norm": 1.0984251499176025, + "learning_rate": 3.266975098036629e-05, + "loss": 0.5813, + "step": 13144 + }, + { + "epoch": 0.09238061514621379, + "grad_norm": 1.0376449823379517, + "learning_rate": 3.258977530166562e-05, + "loss": 0.5855, + "step": 13175 + }, + { + "epoch": 0.092597981299499, + "grad_norm": 0.9627104997634888, + "learning_rate": 3.250971397338227e-05, + "loss": 0.5846, + "step": 13206 + }, + { + "epoch": 0.09281534745278422, + "grad_norm": 1.2040653228759766, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.5872, + "step": 13237 + }, + { + "epoch": 0.09303271360606942, + "grad_norm": 1.0091586112976074, + "learning_rate": 3.234933798294859e-05, + "loss": 0.5911, + "step": 13268 + }, + { + "epoch": 0.09325007975935463, + "grad_norm": 1.0941787958145142, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.5844, + "step": 13299 + }, + { + "epoch": 0.09346744591263983, + "grad_norm": 0.9720754027366638, + "learning_rate": 3.218863024832985e-05, + "loss": 0.5898, + "step": 13330 + }, + { + "epoch": 0.09368481206592505, + "grad_norm": 0.8847430944442749, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.5804, + "step": 13361 + }, + { + "epoch": 0.09390217821921026, + "grad_norm": 1.0070865154266357, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.5916, + "step": 13392 + }, + { + "epoch": 0.09411954437249546, + "grad_norm": 0.9538979530334473, + "learning_rate": 3.194696249871729e-05, + "loss": 0.5984, + "step": 13423 + }, + { + "epoch": 0.09433691052578068, + "grad_norm": 1.0414469242095947, + "learning_rate": 3.186624857814164e-05, + "loss": 0.5828, + "step": 13454 + }, + { + "epoch": 0.0945542766790659, + "grad_norm": 1.0353257656097412, + "learning_rate": 3.178545717288401e-05, + "loss": 0.5835, + "step": 13485 + }, + { + "epoch": 0.0947716428323511, + "grad_norm": 1.0454679727554321, + "learning_rate": 3.170458919466444e-05, + "loss": 0.5909, + "step": 13516 + }, + { + "epoch": 0.09498900898563631, + "grad_norm": 1.246871829032898, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.5829, + "step": 13547 + }, + { + "epoch": 0.09520637513892151, + "grad_norm": 1.0331645011901855, + "learning_rate": 3.154262717052985e-05, + "loss": 0.5919, + "step": 13578 + }, + { + "epoch": 0.09542374129220672, + "grad_norm": 0.8984120488166809, + "learning_rate": 3.146153495233426e-05, + "loss": 0.5853, + "step": 13609 + }, + { + "epoch": 0.09564110744549194, + "grad_norm": 1.0582929849624634, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.5828, + "step": 13640 + }, + { + "epoch": 0.09585847359877714, + "grad_norm": 1.151041865348816, + "learning_rate": 3.129913267924946e-05, + "loss": 0.582, + "step": 13671 + }, + { + "epoch": 0.09607583975206235, + "grad_norm": 0.9678122401237488, + "learning_rate": 3.121782445704782e-05, + "loss": 0.5865, + "step": 13702 + }, + { + "epoch": 0.09629320590534755, + "grad_norm": 1.033832311630249, + "learning_rate": 3.11364460675423e-05, + "loss": 0.5797, + "step": 13733 + }, + { + "epoch": 0.09651057205863277, + "grad_norm": 0.8917058706283569, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.5829, + "step": 13764 + }, + { + "epoch": 0.09672793821191798, + "grad_norm": 0.9396641850471497, + "learning_rate": 3.097348246077728e-05, + "loss": 0.5825, + "step": 13795 + }, + { + "epoch": 0.09694530436520318, + "grad_norm": 1.0871998071670532, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.5851, + "step": 13826 + }, + { + "epoch": 0.0971626705184884, + "grad_norm": 1.1403570175170898, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.5807, + "step": 13857 + }, + { + "epoch": 0.0973800366717736, + "grad_norm": 1.0246673822402954, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.5795, + "step": 13888 + }, + { + "epoch": 0.09759740282505881, + "grad_norm": 0.9431213140487671, + "learning_rate": 3.064675369851637e-05, + "loss": 0.5825, + "step": 13919 + }, + { + "epoch": 0.09781476897834403, + "grad_norm": 1.1028188467025757, + "learning_rate": 3.056490989455289e-05, + "loss": 0.5885, + "step": 13950 + }, + { + "epoch": 0.09803213513162923, + "grad_norm": 1.125482201576233, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.5849, + "step": 13981 + }, + { + "epoch": 0.09824950128491444, + "grad_norm": 0.9831985831260681, + "learning_rate": 3.040103481317539e-05, + "loss": 0.586, + "step": 14012 + }, + { + "epoch": 0.09846686743819964, + "grad_norm": 1.0038244724273682, + "learning_rate": 3.03190053850694e-05, + "loss": 0.5894, + "step": 14043 + }, + { + "epoch": 0.09868423359148486, + "grad_norm": 0.8766770362854004, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.5773, + "step": 14074 + }, + { + "epoch": 0.09890159974477007, + "grad_norm": 0.9358529448509216, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.5887, + "step": 14105 + }, + { + "epoch": 0.09911896589805527, + "grad_norm": 0.8988749384880066, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.5815, + "step": 14136 + }, + { + "epoch": 0.09933633205134049, + "grad_norm": 1.08226478099823, + "learning_rate": 2.999029669712431e-05, + "loss": 0.5944, + "step": 14167 + }, + { + "epoch": 0.09955369820462569, + "grad_norm": 1.046980619430542, + "learning_rate": 2.990797641805408e-05, + "loss": 0.5824, + "step": 14198 + }, + { + "epoch": 0.0997710643579109, + "grad_norm": 0.96205073595047, + "learning_rate": 2.982560075313704e-05, + "loss": 0.5847, + "step": 14229 + }, + { + "epoch": 0.09998843051119612, + "grad_norm": 0.9914370775222778, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.5824, + "step": 14260 + }, + { + "epoch": 0.10020579666448132, + "grad_norm": 0.922265887260437, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.5907, + "step": 14291 + }, + { + "epoch": 0.10042316281776653, + "grad_norm": 0.971159815788269, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.5806, + "step": 14322 + }, + { + "epoch": 0.10064052897105173, + "grad_norm": 1.1327799558639526, + "learning_rate": 2.949556283611942e-05, + "loss": 0.5812, + "step": 14353 + }, + { + "epoch": 0.10085789512433695, + "grad_norm": 1.1632689237594604, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.5813, + "step": 14384 + }, + { + "epoch": 0.10107526127762216, + "grad_norm": 0.9936057329177856, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.5803, + "step": 14415 + }, + { + "epoch": 0.10129262743090736, + "grad_norm": 1.1123740673065186, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.5767, + "step": 14446 + }, + { + "epoch": 0.10150999358419258, + "grad_norm": 0.908643364906311, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.5797, + "step": 14477 + }, + { + "epoch": 0.10172735973747778, + "grad_norm": 0.8761305809020996, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.589, + "step": 14508 + }, + { + "epoch": 0.10194472589076299, + "grad_norm": 1.0904477834701538, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.5796, + "step": 14539 + }, + { + "epoch": 0.1021620920440482, + "grad_norm": 1.060160756111145, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.5837, + "step": 14570 + }, + { + "epoch": 0.1023794581973334, + "grad_norm": 1.0514498949050903, + "learning_rate": 2.883311164593017e-05, + "loss": 0.5792, + "step": 14601 + }, + { + "epoch": 0.10259682435061862, + "grad_norm": 1.0152157545089722, + "learning_rate": 2.875010077160754e-05, + "loss": 0.574, + "step": 14632 + }, + { + "epoch": 0.10281419050390382, + "grad_norm": 0.974791944026947, + "learning_rate": 2.866704757790741e-05, + "loss": 0.5819, + "step": 14663 + }, + { + "epoch": 0.10303155665718904, + "grad_norm": 0.9271855354309082, + "learning_rate": 2.858395300207376e-05, + "loss": 0.5798, + "step": 14694 + }, + { + "epoch": 0.10324892281047425, + "grad_norm": 0.9933458566665649, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.5796, + "step": 14725 + }, + { + "epoch": 0.10346628896375945, + "grad_norm": 0.9245687127113342, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.5771, + "step": 14756 + }, + { + "epoch": 0.10368365511704467, + "grad_norm": 0.9483000040054321, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.5825, + "step": 14787 + }, + { + "epoch": 0.10390102127032987, + "grad_norm": 0.9054533839225769, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.5817, + "step": 14818 + }, + { + "epoch": 0.10411838742361508, + "grad_norm": 1.0200084447860718, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.5788, + "step": 14849 + }, + { + "epoch": 0.1043357535769003, + "grad_norm": 0.9364084005355835, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.5796, + "step": 14880 + }, + { + "epoch": 0.1045531197301855, + "grad_norm": 0.9658653736114502, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.5825, + "step": 14911 + }, + { + "epoch": 0.10477048588347071, + "grad_norm": 0.8612831234931946, + "learning_rate": 2.791781925709473e-05, + "loss": 0.5842, + "step": 14942 + }, + { + "epoch": 0.10498785203675591, + "grad_norm": 0.9673991799354553, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.5776, + "step": 14973 + }, + { + "epoch": 0.10520521819004112, + "grad_norm": 1.092144250869751, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.5785, + "step": 15004 + }, + { + "epoch": 0.10542258434332634, + "grad_norm": 1.0162559747695923, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.5821, + "step": 15035 + }, + { + "epoch": 0.10563995049661154, + "grad_norm": 0.9578939080238342, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.5781, + "step": 15066 + }, + { + "epoch": 0.10585731664989675, + "grad_norm": 0.9934547543525696, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.5826, + "step": 15097 + }, + { + "epoch": 0.10607468280318195, + "grad_norm": 0.9542795419692993, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.583, + "step": 15128 + }, + { + "epoch": 0.10629204895646717, + "grad_norm": 0.9436337947845459, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.5832, + "step": 15159 + }, + { + "epoch": 0.10650941510975238, + "grad_norm": 0.9200606942176819, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.5766, + "step": 15190 + }, + { + "epoch": 0.10672678126303758, + "grad_norm": 1.0009740591049194, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.5835, + "step": 15221 + }, + { + "epoch": 0.1069441474163228, + "grad_norm": 0.9597145915031433, + "learning_rate": 2.708224532974953e-05, + "loss": 0.5728, + "step": 15252 + }, + { + "epoch": 0.107161513569608, + "grad_norm": 1.248867154121399, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.5822, + "step": 15283 + }, + { + "epoch": 0.10737887972289321, + "grad_norm": 0.8899833559989929, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.5843, + "step": 15314 + }, + { + "epoch": 0.10759624587617843, + "grad_norm": 1.0085718631744385, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.5811, + "step": 15345 + }, + { + "epoch": 0.10781361202946363, + "grad_norm": 0.9277573227882385, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.5774, + "step": 15376 + }, + { + "epoch": 0.10803097818274884, + "grad_norm": 1.199010968208313, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.5735, + "step": 15407 + }, + { + "epoch": 0.10824834433603404, + "grad_norm": 0.9361419081687927, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.5834, + "step": 15438 + }, + { + "epoch": 0.10846571048931926, + "grad_norm": 1.05440092086792, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.5721, + "step": 15469 + }, + { + "epoch": 0.10868307664260447, + "grad_norm": 1.0973948240280151, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.5838, + "step": 15500 + }, + { + "epoch": 0.10890044279588967, + "grad_norm": 0.9417588710784912, + "learning_rate": 2.632819298478939e-05, + "loss": 0.5786, + "step": 15531 + }, + { + "epoch": 0.10911780894917489, + "grad_norm": 0.8763983845710754, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.5766, + "step": 15562 + }, + { + "epoch": 0.10933517510246009, + "grad_norm": 1.0105509757995605, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.5896, + "step": 15593 + }, + { + "epoch": 0.1095525412557453, + "grad_norm": 0.9172413349151611, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.5788, + "step": 15624 + }, + { + "epoch": 0.10976990740903052, + "grad_norm": 0.9640869498252869, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.5791, + "step": 15655 + }, + { + "epoch": 0.10998727356231572, + "grad_norm": 1.0987275838851929, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.581, + "step": 15686 + }, + { + "epoch": 0.11020463971560093, + "grad_norm": 1.0418893098831177, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.569, + "step": 15717 + }, + { + "epoch": 0.11042200586888613, + "grad_norm": 1.0216400623321533, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.5834, + "step": 15748 + }, + { + "epoch": 0.11063937202217135, + "grad_norm": 1.0211747884750366, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.5733, + "step": 15779 + }, + { + "epoch": 0.11085673817545656, + "grad_norm": 0.9743130207061768, + "learning_rate": 2.557292666450159e-05, + "loss": 0.5789, + "step": 15810 + }, + { + "epoch": 0.11107410432874176, + "grad_norm": 1.1765626668930054, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.57, + "step": 15841 + }, + { + "epoch": 0.11129147048202698, + "grad_norm": 0.9354963898658752, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.5894, + "step": 15872 + }, + { + "epoch": 0.11150883663531218, + "grad_norm": 0.8743797540664673, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.5716, + "step": 15903 + }, + { + "epoch": 0.11172620278859739, + "grad_norm": 1.1076644659042358, + "learning_rate": 2.52370435981567e-05, + "loss": 0.5787, + "step": 15934 + }, + { + "epoch": 0.1119435689418826, + "grad_norm": 0.9156807065010071, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.5764, + "step": 15965 + }, + { + "epoch": 0.11216093509516781, + "grad_norm": 1.0239089727401733, + "learning_rate": 2.506908222441045e-05, + "loss": 0.5748, + "step": 15996 + }, + { + "epoch": 0.11237830124845302, + "grad_norm": 1.5095417499542236, + "learning_rate": 2.498509989430187e-05, + "loss": 0.5815, + "step": 16027 + }, + { + "epoch": 0.11259566740173822, + "grad_norm": 0.9298838973045349, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.5742, + "step": 16058 + }, + { + "epoch": 0.11281303355502344, + "grad_norm": 0.8956617712974548, + "learning_rate": 2.481713668624899e-05, + "loss": 0.5856, + "step": 16089 + }, + { + "epoch": 0.11303039970830865, + "grad_norm": 0.950095534324646, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.5786, + "step": 16120 + }, + { + "epoch": 0.11324776586159385, + "grad_norm": 1.0230988264083862, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.5771, + "step": 16151 + }, + { + "epoch": 0.11346513201487907, + "grad_norm": 1.1018470525741577, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.57, + "step": 16182 + }, + { + "epoch": 0.11368249816816427, + "grad_norm": 0.9700168371200562, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.5689, + "step": 16213 + }, + { + "epoch": 0.11389986432144948, + "grad_norm": 0.9069929718971252, + "learning_rate": 2.439728136286796e-05, + "loss": 0.5719, + "step": 16244 + }, + { + "epoch": 0.1141172304747347, + "grad_norm": 0.9254815578460693, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.5704, + "step": 16275 + }, + { + "epoch": 0.1143345966280199, + "grad_norm": 0.9150753021240234, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.5754, + "step": 16306 + }, + { + "epoch": 0.11455196278130511, + "grad_norm": 1.003201961517334, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.5746, + "step": 16337 + }, + { + "epoch": 0.11476932893459031, + "grad_norm": 1.1016685962677002, + "learning_rate": 2.406151384602059e-05, + "loss": 0.5763, + "step": 16368 + }, + { + "epoch": 0.11498669508787553, + "grad_norm": 1.0079994201660156, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.5721, + "step": 16399 + }, + { + "epoch": 0.11520406124116074, + "grad_norm": 0.989470899105072, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.5795, + "step": 16430 + }, + { + "epoch": 0.11542142739444594, + "grad_norm": 1.039035439491272, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.5737, + "step": 16461 + }, + { + "epoch": 0.11563879354773116, + "grad_norm": 0.8659546971321106, + "learning_rate": 2.372591577780202e-05, + "loss": 0.5711, + "step": 16492 + }, + { + "epoch": 0.11585615970101636, + "grad_norm": 0.9558688998222351, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.5763, + "step": 16523 + }, + { + "epoch": 0.11607352585430157, + "grad_norm": 1.0017194747924805, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.5772, + "step": 16554 + }, + { + "epoch": 0.11629089200758679, + "grad_norm": 1.0045812129974365, + "learning_rate": 2.347436487983929e-05, + "loss": 0.5744, + "step": 16585 + }, + { + "epoch": 0.11650825816087199, + "grad_norm": 0.8719637393951416, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.5681, + "step": 16616 + }, + { + "epoch": 0.1167256243141572, + "grad_norm": 0.9029743075370789, + "learning_rate": 2.330674878704035e-05, + "loss": 0.5807, + "step": 16647 + }, + { + "epoch": 0.1169429904674424, + "grad_norm": 0.9439691305160522, + "learning_rate": 2.322296892997561e-05, + "loss": 0.5751, + "step": 16678 + }, + { + "epoch": 0.11716035662072762, + "grad_norm": 0.900688648223877, + "learning_rate": 2.313920912646497e-05, + "loss": 0.5772, + "step": 16709 + }, + { + "epoch": 0.11737772277401283, + "grad_norm": 0.8884438872337341, + "learning_rate": 2.305547032172643e-05, + "loss": 0.5758, + "step": 16740 + }, + { + "epoch": 0.11759508892729803, + "grad_norm": 0.9252585172653198, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.5705, + "step": 16771 + }, + { + "epoch": 0.11781245508058324, + "grad_norm": 0.9447957873344421, + "learning_rate": 2.288805948824212e-05, + "loss": 0.566, + "step": 16802 + }, + { + "epoch": 0.11802982123386845, + "grad_norm": 0.9666566252708435, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.5745, + "step": 16833 + }, + { + "epoch": 0.11824718738715366, + "grad_norm": 0.9459251761436462, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.5775, + "step": 16864 + }, + { + "epoch": 0.11846455354043887, + "grad_norm": 0.8863123059272766, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.5716, + "step": 16895 + }, + { + "epoch": 0.11868191969372408, + "grad_norm": 0.9847676753997803, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.5763, + "step": 16926 + }, + { + "epoch": 0.11889928584700929, + "grad_norm": 1.1111658811569214, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.5752, + "step": 16957 + }, + { + "epoch": 0.11911665200029449, + "grad_norm": 1.0046541690826416, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.574, + "step": 16988 + }, + { + "epoch": 0.1193340181535797, + "grad_norm": 0.8580814599990845, + "learning_rate": 2.230292185905114e-05, + "loss": 0.572, + "step": 17019 + }, + { + "epoch": 0.11955138430686492, + "grad_norm": 0.9188304543495178, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.571, + "step": 17050 + }, + { + "epoch": 0.11976875046015012, + "grad_norm": 0.9079185724258423, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.5792, + "step": 17081 + }, + { + "epoch": 0.11998611661343533, + "grad_norm": 0.9194979071617126, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.5753, + "step": 17112 + }, + { + "epoch": 0.12020348276672053, + "grad_norm": 0.8398452997207642, + "learning_rate": 2.196920634473666e-05, + "loss": 0.5653, + "step": 17143 + }, + { + "epoch": 0.12042084892000575, + "grad_norm": 0.9888772368431091, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.5747, + "step": 17174 + }, + { + "epoch": 0.12063821507329096, + "grad_norm": 0.9137700796127319, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.5728, + "step": 17205 + }, + { + "epoch": 0.12085558122657616, + "grad_norm": 1.058064579963684, + "learning_rate": 2.171927553439363e-05, + "loss": 0.5717, + "step": 17236 + }, + { + "epoch": 0.12107294737986138, + "grad_norm": 0.9835705757141113, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.5725, + "step": 17267 + }, + { + "epoch": 0.12129031353314658, + "grad_norm": 0.918863832950592, + "learning_rate": 2.155283853988844e-05, + "loss": 0.5676, + "step": 17298 + }, + { + "epoch": 0.1215076796864318, + "grad_norm": 0.9384900331497192, + "learning_rate": 2.146967792431106e-05, + "loss": 0.5768, + "step": 17329 + }, + { + "epoch": 0.12172504583971701, + "grad_norm": 1.060088038444519, + "learning_rate": 2.138655714793849e-05, + "loss": 0.5642, + "step": 17360 + }, + { + "epoch": 0.12194241199300221, + "grad_norm": 0.999266505241394, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.5669, + "step": 17391 + }, + { + "epoch": 0.12215977814628742, + "grad_norm": 0.8633130788803101, + "learning_rate": 2.122043886437421e-05, + "loss": 0.5661, + "step": 17422 + }, + { + "epoch": 0.12237714429957262, + "grad_norm": 0.9396159052848816, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.5743, + "step": 17453 + }, + { + "epoch": 0.12259451045285784, + "grad_norm": 0.9990928173065186, + "learning_rate": 2.105449118766347e-05, + "loss": 0.5707, + "step": 17484 + }, + { + "epoch": 0.12281187660614305, + "grad_norm": 0.9732767939567566, + "learning_rate": 2.097158366805287e-05, + "loss": 0.571, + "step": 17515 + }, + { + "epoch": 0.12302924275942825, + "grad_norm": 20.362672805786133, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.5586, + "step": 17546 + }, + { + "epoch": 0.12324660891271347, + "grad_norm": 0.976889431476593, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.5693, + "step": 17577 + }, + { + "epoch": 0.12346397506599867, + "grad_norm": 0.907172679901123, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.5714, + "step": 17608 + }, + { + "epoch": 0.12368134121928388, + "grad_norm": 0.8816654086112976, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.5669, + "step": 17639 + }, + { + "epoch": 0.1238987073725691, + "grad_norm": 0.9616197943687439, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.5739, + "step": 17670 + }, + { + "epoch": 0.1241160735258543, + "grad_norm": 0.9188937544822693, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.5689, + "step": 17701 + }, + { + "epoch": 0.12433343967913951, + "grad_norm": 0.9845620393753052, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.5716, + "step": 17732 + }, + { + "epoch": 0.12455080583242471, + "grad_norm": 0.8922098278999329, + "learning_rate": 2.031003855589343e-05, + "loss": 0.5648, + "step": 17763 + }, + { + "epoch": 0.12476817198570993, + "grad_norm": 0.9715010523796082, + "learning_rate": 2.022757379528727e-05, + "loss": 0.5664, + "step": 17794 + }, + { + "epoch": 0.12498553813899514, + "grad_norm": 1.0769156217575073, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.5689, + "step": 17825 + }, + { + "epoch": 0.12520290429228034, + "grad_norm": 0.9304386973381042, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.5772, + "step": 17856 + }, + { + "epoch": 0.12542027044556556, + "grad_norm": 1.0523558855056763, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.5686, + "step": 17887 + }, + { + "epoch": 0.12563763659885077, + "grad_norm": 1.029064655303955, + "learning_rate": 1.989826261153015e-05, + "loss": 0.5629, + "step": 17918 + }, + { + "epoch": 0.12585500275213599, + "grad_norm": 1.0367600917816162, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.5682, + "step": 17949 + }, + { + "epoch": 0.12607236890542117, + "grad_norm": 1.047844648361206, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.571, + "step": 17980 + }, + { + "epoch": 0.1262897350587064, + "grad_norm": 0.9374393820762634, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.5731, + "step": 18011 + }, + { + "epoch": 0.1265071012119916, + "grad_norm": 1.0163381099700928, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.5769, + "step": 18042 + }, + { + "epoch": 0.12672446736527682, + "grad_norm": 0.9243590235710144, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.5697, + "step": 18073 + }, + { + "epoch": 0.12694183351856203, + "grad_norm": 1.0359089374542236, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.5639, + "step": 18104 + }, + { + "epoch": 0.12715919967184722, + "grad_norm": 0.841151773929596, + "learning_rate": 1.932422022132275e-05, + "loss": 0.5792, + "step": 18135 + }, + { + "epoch": 0.12737656582513243, + "grad_norm": 1.0070539712905884, + "learning_rate": 1.924246297341414e-05, + "loss": 0.5669, + "step": 18166 + }, + { + "epoch": 0.12759393197841765, + "grad_norm": 0.9453309178352356, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.5683, + "step": 18197 + }, + { + "epoch": 0.12781129813170286, + "grad_norm": 0.9628680348396301, + "learning_rate": 1.907914431855156e-05, + "loss": 0.5711, + "step": 18228 + }, + { + "epoch": 0.12802866428498808, + "grad_norm": 0.9396767616271973, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.5709, + "step": 18259 + }, + { + "epoch": 0.12824603043827326, + "grad_norm": 0.9093485474586487, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.5689, + "step": 18290 + }, + { + "epoch": 0.12846339659155848, + "grad_norm": 0.8730084896087646, + "learning_rate": 1.883466975572098e-05, + "loss": 0.5744, + "step": 18321 + }, + { + "epoch": 0.1286807627448437, + "grad_norm": 0.9706755876541138, + "learning_rate": 1.8753316159214312e-05, + "loss": 0.5572, + "step": 18352 + }, + { + "epoch": 0.1288981288981289, + "grad_norm": 0.9472910165786743, + "learning_rate": 1.8672033055685766e-05, + "loss": 0.5726, + "step": 18383 + }, + { + "epoch": 0.12911549505141412, + "grad_norm": 0.9355587959289551, + "learning_rate": 1.8590821362404116e-05, + "loss": 0.5673, + "step": 18414 + }, + { + "epoch": 0.1293328612046993, + "grad_norm": 0.9303567409515381, + "learning_rate": 1.8509681995832294e-05, + "loss": 0.5616, + "step": 18445 + }, + { + "epoch": 0.12955022735798452, + "grad_norm": 0.9067112803459167, + "learning_rate": 1.8428615871617004e-05, + "loss": 0.5661, + "step": 18476 + }, + { + "epoch": 0.12976759351126974, + "grad_norm": 0.899079442024231, + "learning_rate": 1.8347623904578448e-05, + "loss": 0.5726, + "step": 18507 + }, + { + "epoch": 0.12998495966455495, + "grad_norm": 0.8478329181671143, + "learning_rate": 1.8266707008699975e-05, + "loss": 0.5633, + "step": 18538 + }, + { + "epoch": 0.13020232581784016, + "grad_norm": 0.910685122013092, + "learning_rate": 1.818586609711774e-05, + "loss": 0.5683, + "step": 18569 + }, + { + "epoch": 0.13041969197112535, + "grad_norm": 0.9179863333702087, + "learning_rate": 1.8105102082110462e-05, + "loss": 0.5753, + "step": 18600 + }, + { + "epoch": 0.13063705812441057, + "grad_norm": 0.9042870402336121, + "learning_rate": 1.8024415875089058e-05, + "loss": 0.5752, + "step": 18631 + }, + { + "epoch": 0.13085442427769578, + "grad_norm": 0.9494644999504089, + "learning_rate": 1.7943808386586407e-05, + "loss": 0.5635, + "step": 18662 + }, + { + "epoch": 0.131071790430981, + "grad_norm": 0.9707177877426147, + "learning_rate": 1.7863280526247073e-05, + "loss": 0.5623, + "step": 18693 + }, + { + "epoch": 0.1312891565842662, + "grad_norm": 0.9590293169021606, + "learning_rate": 1.7782833202817003e-05, + "loss": 0.5799, + "step": 18724 + }, + { + "epoch": 0.1315065227375514, + "grad_norm": 0.9343449473381042, + "learning_rate": 1.7702467324133327e-05, + "loss": 0.5757, + "step": 18755 + }, + { + "epoch": 0.1317238888908366, + "grad_norm": 0.9229467511177063, + "learning_rate": 1.7622183797114042e-05, + "loss": 0.5644, + "step": 18786 + }, + { + "epoch": 0.13194125504412182, + "grad_norm": 0.9312314987182617, + "learning_rate": 1.7541983527747838e-05, + "loss": 0.5693, + "step": 18817 + }, + { + "epoch": 0.13215862119740704, + "grad_norm": 0.8548254370689392, + "learning_rate": 1.746186742108387e-05, + "loss": 0.5713, + "step": 18848 + }, + { + "epoch": 0.13237598735069225, + "grad_norm": 1.0379942655563354, + "learning_rate": 1.73818363812215e-05, + "loss": 0.5696, + "step": 18879 + }, + { + "epoch": 0.13259335350397744, + "grad_norm": 1.0847291946411133, + "learning_rate": 1.7301891311300153e-05, + "loss": 0.5656, + "step": 18910 + }, + { + "epoch": 0.13281071965726265, + "grad_norm": 0.969327449798584, + "learning_rate": 1.7222033113489055e-05, + "loss": 0.5654, + "step": 18941 + }, + { + "epoch": 0.13302808581054787, + "grad_norm": 0.9928266406059265, + "learning_rate": 1.7142262688977127e-05, + "loss": 0.5635, + "step": 18972 + }, + { + "epoch": 0.13324545196383308, + "grad_norm": 0.8415375351905823, + "learning_rate": 1.7062580937962764e-05, + "loss": 0.572, + "step": 19003 + }, + { + "epoch": 0.1334628181171183, + "grad_norm": 0.9909110069274902, + "learning_rate": 1.698298875964369e-05, + "loss": 0.5727, + "step": 19034 + }, + { + "epoch": 0.13368018427040348, + "grad_norm": 1.0183087587356567, + "learning_rate": 1.690348705220684e-05, + "loss": 0.5767, + "step": 19065 + }, + { + "epoch": 0.1338975504236887, + "grad_norm": 0.9055935144424438, + "learning_rate": 1.6824076712818156e-05, + "loss": 0.5628, + "step": 19096 + }, + { + "epoch": 0.1341149165769739, + "grad_norm": 0.8832345008850098, + "learning_rate": 1.6744758637612533e-05, + "loss": 0.5688, + "step": 19127 + }, + { + "epoch": 0.13433228273025913, + "grad_norm": 1.1259726285934448, + "learning_rate": 1.6665533721683664e-05, + "loss": 0.5578, + "step": 19158 + }, + { + "epoch": 0.13454964888354434, + "grad_norm": 0.9167343378067017, + "learning_rate": 1.6586402859073974e-05, + "loss": 0.5636, + "step": 19189 + }, + { + "epoch": 0.13476701503682953, + "grad_norm": 0.9861068725585938, + "learning_rate": 1.6507366942764463e-05, + "loss": 0.5681, + "step": 19220 + }, + { + "epoch": 0.13498438119011474, + "grad_norm": 0.9800103306770325, + "learning_rate": 1.6428426864664732e-05, + "loss": 0.5689, + "step": 19251 + }, + { + "epoch": 0.13520174734339996, + "grad_norm": 0.9900636672973633, + "learning_rate": 1.6349583515602816e-05, + "loss": 0.5625, + "step": 19282 + }, + { + "epoch": 0.13541911349668517, + "grad_norm": 0.9756057858467102, + "learning_rate": 1.6270837785315208e-05, + "loss": 0.5634, + "step": 19313 + }, + { + "epoch": 0.1356364796499704, + "grad_norm": 0.9184322953224182, + "learning_rate": 1.619219056243676e-05, + "loss": 0.5713, + "step": 19344 + }, + { + "epoch": 0.13585384580325557, + "grad_norm": 1.003735065460205, + "learning_rate": 1.6113642734490698e-05, + "loss": 0.5626, + "step": 19375 + }, + { + "epoch": 0.1360712119565408, + "grad_norm": 0.8933300375938416, + "learning_rate": 1.6035195187878577e-05, + "loss": 0.5705, + "step": 19406 + }, + { + "epoch": 0.136288578109826, + "grad_norm": 0.997909426689148, + "learning_rate": 1.5956848807870305e-05, + "loss": 0.5645, + "step": 19437 + }, + { + "epoch": 0.13650594426311122, + "grad_norm": 0.9039232730865479, + "learning_rate": 1.587860447859413e-05, + "loss": 0.5551, + "step": 19468 + }, + { + "epoch": 0.13672331041639643, + "grad_norm": 0.9416874647140503, + "learning_rate": 1.5800463083026686e-05, + "loss": 0.5621, + "step": 19499 + }, + { + "epoch": 0.13694067656968162, + "grad_norm": 0.8743234872817993, + "learning_rate": 1.572242550298298e-05, + "loss": 0.5692, + "step": 19530 + }, + { + "epoch": 0.13715804272296683, + "grad_norm": 1.0159176588058472, + "learning_rate": 1.56444926191065e-05, + "loss": 0.5622, + "step": 19561 + }, + { + "epoch": 0.13737540887625205, + "grad_norm": 0.8633915781974792, + "learning_rate": 1.5566665310859257e-05, + "loss": 0.5688, + "step": 19592 + }, + { + "epoch": 0.13759277502953726, + "grad_norm": 0.9839888215065002, + "learning_rate": 1.5488944456511846e-05, + "loss": 0.5691, + "step": 19623 + }, + { + "epoch": 0.13781014118282248, + "grad_norm": 1.0715723037719727, + "learning_rate": 1.5411330933133546e-05, + "loss": 0.5582, + "step": 19654 + }, + { + "epoch": 0.13802750733610766, + "grad_norm": 1.029173493385315, + "learning_rate": 1.533382561658241e-05, + "loss": 0.5678, + "step": 19685 + }, + { + "epoch": 0.13824487348939288, + "grad_norm": 1.1011470556259155, + "learning_rate": 1.525642938149541e-05, + "loss": 0.5692, + "step": 19716 + }, + { + "epoch": 0.1384622396426781, + "grad_norm": 0.9993789196014404, + "learning_rate": 1.5179143101278536e-05, + "loss": 0.5645, + "step": 19747 + }, + { + "epoch": 0.1386796057959633, + "grad_norm": 1.0202093124389648, + "learning_rate": 1.5101967648096955e-05, + "loss": 0.5703, + "step": 19778 + }, + { + "epoch": 0.13889697194924852, + "grad_norm": 1.0126008987426758, + "learning_rate": 1.5024903892865172e-05, + "loss": 0.5639, + "step": 19809 + }, + { + "epoch": 0.1391143381025337, + "grad_norm": 1.0468281507492065, + "learning_rate": 1.4947952705237184e-05, + "loss": 0.5683, + "step": 19840 + }, + { + "epoch": 0.13933170425581892, + "grad_norm": 0.9329802393913269, + "learning_rate": 1.4871114953596682e-05, + "loss": 0.5597, + "step": 19871 + }, + { + "epoch": 0.13954907040910414, + "grad_norm": 0.891503632068634, + "learning_rate": 1.4794391505047256e-05, + "loss": 0.5728, + "step": 19902 + }, + { + "epoch": 0.13976643656238935, + "grad_norm": 0.9752770662307739, + "learning_rate": 1.4717783225402596e-05, + "loss": 0.564, + "step": 19933 + }, + { + "epoch": 0.13998380271567457, + "grad_norm": 0.8956452012062073, + "learning_rate": 1.4641290979176735e-05, + "loss": 0.5661, + "step": 19964 + }, + { + "epoch": 0.14020116886895975, + "grad_norm": 1.072753667831421, + "learning_rate": 1.4564915629574246e-05, + "loss": 0.562, + "step": 19995 + }, + { + "epoch": 0.14041853502224497, + "grad_norm": 0.8971157670021057, + "learning_rate": 1.4488658038480601e-05, + "loss": 0.5613, + "step": 20026 + }, + { + "epoch": 0.14063590117553018, + "grad_norm": 0.8919452428817749, + "learning_rate": 1.4412519066452323e-05, + "loss": 0.5659, + "step": 20057 + }, + { + "epoch": 0.1408532673288154, + "grad_norm": 0.9752078056335449, + "learning_rate": 1.4336499572707373e-05, + "loss": 0.5687, + "step": 20088 + }, + { + "epoch": 0.1410706334821006, + "grad_norm": 0.9520591497421265, + "learning_rate": 1.4260600415115433e-05, + "loss": 0.5673, + "step": 20119 + }, + { + "epoch": 0.1412879996353858, + "grad_norm": 0.8892295956611633, + "learning_rate": 1.4184822450188137e-05, + "loss": 0.5705, + "step": 20150 + }, + { + "epoch": 0.141505365788671, + "grad_norm": 0.9576200842857361, + "learning_rate": 1.410916653306954e-05, + "loss": 0.5667, + "step": 20181 + }, + { + "epoch": 0.14172273194195623, + "grad_norm": 0.9564182162284851, + "learning_rate": 1.403363351752639e-05, + "loss": 0.5595, + "step": 20212 + }, + { + "epoch": 0.14194009809524144, + "grad_norm": 0.9247251749038696, + "learning_rate": 1.3958224255938485e-05, + "loss": 0.5709, + "step": 20243 + }, + { + "epoch": 0.14215746424852665, + "grad_norm": 0.9523617625236511, + "learning_rate": 1.388293959928911e-05, + "loss": 0.5591, + "step": 20274 + }, + { + "epoch": 0.14237483040181184, + "grad_norm": 0.9751485586166382, + "learning_rate": 1.3807780397155379e-05, + "loss": 0.5678, + "step": 20305 + }, + { + "epoch": 0.14259219655509706, + "grad_norm": 1.0090728998184204, + "learning_rate": 1.3732747497698655e-05, + "loss": 0.5569, + "step": 20336 + }, + { + "epoch": 0.14280956270838227, + "grad_norm": 0.8991780281066895, + "learning_rate": 1.3657841747655038e-05, + "loss": 0.5611, + "step": 20367 + }, + { + "epoch": 0.14302692886166748, + "grad_norm": 0.8665379285812378, + "learning_rate": 1.3583063992325706e-05, + "loss": 0.5639, + "step": 20398 + }, + { + "epoch": 0.1432442950149527, + "grad_norm": 0.9348465204238892, + "learning_rate": 1.3508415075567496e-05, + "loss": 0.5582, + "step": 20429 + }, + { + "epoch": 0.1434616611682379, + "grad_norm": 0.8632979989051819, + "learning_rate": 1.343389583978327e-05, + "loss": 0.5672, + "step": 20460 + }, + { + "epoch": 0.1436790273215231, + "grad_norm": 0.9019519686698914, + "learning_rate": 1.3359507125912468e-05, + "loss": 0.5642, + "step": 20491 + }, + { + "epoch": 0.14389639347480832, + "grad_norm": 0.8994531035423279, + "learning_rate": 1.3285249773421627e-05, + "loss": 0.5663, + "step": 20522 + }, + { + "epoch": 0.14411375962809353, + "grad_norm": 0.9270524978637695, + "learning_rate": 1.3211124620294884e-05, + "loss": 0.5643, + "step": 20553 + }, + { + "epoch": 0.14433112578137874, + "grad_norm": 0.8957355618476868, + "learning_rate": 1.313713250302451e-05, + "loss": 0.5632, + "step": 20584 + }, + { + "epoch": 0.14454849193466393, + "grad_norm": 1.0234413146972656, + "learning_rate": 1.3063274256601479e-05, + "loss": 0.5647, + "step": 20615 + }, + { + "epoch": 0.14476585808794915, + "grad_norm": 0.8956789970397949, + "learning_rate": 1.2989550714506086e-05, + "loss": 0.5635, + "step": 20646 + }, + { + "epoch": 0.14498322424123436, + "grad_norm": 0.883823037147522, + "learning_rate": 1.291596270869846e-05, + "loss": 0.5702, + "step": 20677 + }, + { + "epoch": 0.14520059039451957, + "grad_norm": 0.8809013366699219, + "learning_rate": 1.284251106960927e-05, + "loss": 0.5641, + "step": 20708 + }, + { + "epoch": 0.1454179565478048, + "grad_norm": 0.9803751707077026, + "learning_rate": 1.2769196626130263e-05, + "loss": 0.5604, + "step": 20739 + }, + { + "epoch": 0.14563532270108998, + "grad_norm": 0.8637491464614868, + "learning_rate": 1.2696020205604969e-05, + "loss": 0.558, + "step": 20770 + }, + { + "epoch": 0.1458526888543752, + "grad_norm": 0.8922715187072754, + "learning_rate": 1.2622982633819359e-05, + "loss": 0.5619, + "step": 20801 + }, + { + "epoch": 0.1460700550076604, + "grad_norm": 0.9750674366950989, + "learning_rate": 1.2550084734992484e-05, + "loss": 0.5664, + "step": 20832 + }, + { + "epoch": 0.14628742116094562, + "grad_norm": 1.0473570823669434, + "learning_rate": 1.247732733176724e-05, + "loss": 0.5646, + "step": 20863 + }, + { + "epoch": 0.14650478731423083, + "grad_norm": 1.130385160446167, + "learning_rate": 1.2404711245201044e-05, + "loss": 0.5617, + "step": 20894 + }, + { + "epoch": 0.14672215346751602, + "grad_norm": 0.9984387755393982, + "learning_rate": 1.2332237294756535e-05, + "loss": 0.567, + "step": 20925 + }, + { + "epoch": 0.14693951962080123, + "grad_norm": 0.9383957982063293, + "learning_rate": 1.225990629829241e-05, + "loss": 0.5567, + "step": 20956 + }, + { + "epoch": 0.14715688577408645, + "grad_norm": 0.981935977935791, + "learning_rate": 1.2187719072054136e-05, + "loss": 0.5651, + "step": 20987 + }, + { + "epoch": 0.14737425192737166, + "grad_norm": 0.9774724841117859, + "learning_rate": 1.2115676430664735e-05, + "loss": 0.5652, + "step": 21018 + }, + { + "epoch": 0.14759161808065688, + "grad_norm": 0.9714674949645996, + "learning_rate": 1.2043779187115647e-05, + "loss": 0.561, + "step": 21049 + }, + { + "epoch": 0.14780898423394206, + "grad_norm": 0.8881489038467407, + "learning_rate": 1.1972028152757476e-05, + "loss": 0.5594, + "step": 21080 + }, + { + "epoch": 0.14802635038722728, + "grad_norm": 0.961926281452179, + "learning_rate": 1.1900424137290889e-05, + "loss": 0.5611, + "step": 21111 + }, + { + "epoch": 0.1482437165405125, + "grad_norm": 0.9101502895355225, + "learning_rate": 1.1828967948757482e-05, + "loss": 0.5628, + "step": 21142 + }, + { + "epoch": 0.1484610826937977, + "grad_norm": 0.9001050591468811, + "learning_rate": 1.175766039353062e-05, + "loss": 0.5576, + "step": 21173 + }, + { + "epoch": 0.14867844884708292, + "grad_norm": 0.9724435210227966, + "learning_rate": 1.1686502276306382e-05, + "loss": 0.568, + "step": 21204 + }, + { + "epoch": 0.1488958150003681, + "grad_norm": 0.825156569480896, + "learning_rate": 1.1615494400094445e-05, + "loss": 0.5634, + "step": 21235 + }, + { + "epoch": 0.14911318115365332, + "grad_norm": 0.9625114798545837, + "learning_rate": 1.1544637566209029e-05, + "loss": 0.5667, + "step": 21266 + }, + { + "epoch": 0.14933054730693854, + "grad_norm": 1.0243901014328003, + "learning_rate": 1.1473932574259886e-05, + "loss": 0.5629, + "step": 21297 + }, + { + "epoch": 0.14954791346022375, + "grad_norm": 0.9247808456420898, + "learning_rate": 1.1403380222143247e-05, + "loss": 0.563, + "step": 21328 + }, + { + "epoch": 0.14976527961350897, + "grad_norm": 0.8996061682701111, + "learning_rate": 1.1332981306032808e-05, + "loss": 0.5644, + "step": 21359 + }, + { + "epoch": 0.14998264576679415, + "grad_norm": 0.9766656160354614, + "learning_rate": 1.1262736620370762e-05, + "loss": 0.566, + "step": 21390 + }, + { + "epoch": 0.15020001192007937, + "grad_norm": 0.9848279356956482, + "learning_rate": 1.1192646957858854e-05, + "loss": 0.5648, + "step": 21421 + }, + { + "epoch": 0.15041737807336458, + "grad_norm": 0.972819447517395, + "learning_rate": 1.1122713109449381e-05, + "loss": 0.5573, + "step": 21452 + }, + { + "epoch": 0.1506347442266498, + "grad_norm": 0.8826684951782227, + "learning_rate": 1.105293586433634e-05, + "loss": 0.5625, + "step": 21483 + }, + { + "epoch": 0.150852110379935, + "grad_norm": 0.9768727421760559, + "learning_rate": 1.0983316009946446e-05, + "loss": 0.5609, + "step": 21514 + }, + { + "epoch": 0.1510694765332202, + "grad_norm": 0.9463690519332886, + "learning_rate": 1.0913854331930282e-05, + "loss": 0.5605, + "step": 21545 + }, + { + "epoch": 0.1512868426865054, + "grad_norm": 0.9010226130485535, + "learning_rate": 1.0844551614153456e-05, + "loss": 0.5626, + "step": 21576 + }, + { + "epoch": 0.15150420883979063, + "grad_norm": 0.9793362617492676, + "learning_rate": 1.0775408638687725e-05, + "loss": 0.5629, + "step": 21607 + }, + { + "epoch": 0.15172157499307584, + "grad_norm": 0.8516845107078552, + "learning_rate": 1.0706426185802165e-05, + "loss": 0.5655, + "step": 21638 + }, + { + "epoch": 0.15193894114636106, + "grad_norm": 0.9344280958175659, + "learning_rate": 1.0637605033954371e-05, + "loss": 0.5678, + "step": 21669 + }, + { + "epoch": 0.15215630729964624, + "grad_norm": 0.9141379594802856, + "learning_rate": 1.05689459597817e-05, + "loss": 0.5651, + "step": 21700 + }, + { + "epoch": 0.15237367345293146, + "grad_norm": 0.9709919691085815, + "learning_rate": 1.050044973809246e-05, + "loss": 0.5534, + "step": 21731 + }, + { + "epoch": 0.15259103960621667, + "grad_norm": 0.9237218499183655, + "learning_rate": 1.043211714185722e-05, + "loss": 0.5615, + "step": 21762 + }, + { + "epoch": 0.1528084057595019, + "grad_norm": 0.8852784633636475, + "learning_rate": 1.036394894220003e-05, + "loss": 0.5576, + "step": 21793 + }, + { + "epoch": 0.1530257719127871, + "grad_norm": 0.8654377460479736, + "learning_rate": 1.0295945908389751e-05, + "loss": 0.5583, + "step": 21824 + }, + { + "epoch": 0.1532431380660723, + "grad_norm": 1.0151047706604004, + "learning_rate": 1.0228108807831393e-05, + "loss": 0.5626, + "step": 21855 + }, + { + "epoch": 0.1534605042193575, + "grad_norm": 1.0742695331573486, + "learning_rate": 1.01604384060574e-05, + "loss": 0.5546, + "step": 21886 + }, + { + "epoch": 0.15367787037264272, + "grad_norm": 0.9627267718315125, + "learning_rate": 1.009293546671907e-05, + "loss": 0.5701, + "step": 21917 + }, + { + "epoch": 0.15389523652592793, + "grad_norm": 0.9896987080574036, + "learning_rate": 1.002560075157791e-05, + "loss": 0.5573, + "step": 21948 + }, + { + "epoch": 0.15411260267921315, + "grad_norm": 0.8868485689163208, + "learning_rate": 9.958435020496995e-06, + "loss": 0.5553, + "step": 21979 + }, + { + "epoch": 0.15432996883249833, + "grad_norm": 0.8650690317153931, + "learning_rate": 9.89143903143249e-06, + "loss": 0.5638, + "step": 22010 + }, + { + "epoch": 0.15454733498578355, + "grad_norm": 0.8827528357505798, + "learning_rate": 9.824613540425038e-06, + "loss": 0.5618, + "step": 22041 + }, + { + "epoch": 0.15476470113906876, + "grad_norm": 0.9161486625671387, + "learning_rate": 9.757959301591197e-06, + "loss": 0.5547, + "step": 22072 + }, + { + "epoch": 0.15498206729235398, + "grad_norm": 0.8655954599380493, + "learning_rate": 9.691477067115017e-06, + "loss": 0.5575, + "step": 22103 + }, + { + "epoch": 0.1551994334456392, + "grad_norm": 1.1528652906417847, + "learning_rate": 9.625167587239467e-06, + "loss": 0.5562, + "step": 22134 + }, + { + "epoch": 0.15541679959892438, + "grad_norm": 0.9214157462120056, + "learning_rate": 9.559031610258007e-06, + "loss": 0.5505, + "step": 22165 + }, + { + "epoch": 0.1556341657522096, + "grad_norm": 0.9822834730148315, + "learning_rate": 9.493069882506164e-06, + "loss": 0.5651, + "step": 22196 + }, + { + "epoch": 0.1558515319054948, + "grad_norm": 1.0093454122543335, + "learning_rate": 9.427283148353056e-06, + "loss": 0.5629, + "step": 22227 + }, + { + "epoch": 0.15606889805878002, + "grad_norm": 0.9251008629798889, + "learning_rate": 9.361672150193052e-06, + "loss": 0.5644, + "step": 22258 + }, + { + "epoch": 0.15628626421206523, + "grad_norm": 0.9593933820724487, + "learning_rate": 9.29623762843734e-06, + "loss": 0.5626, + "step": 22289 + }, + { + "epoch": 0.15650363036535042, + "grad_norm": 0.9322303533554077, + "learning_rate": 9.230980321505594e-06, + "loss": 0.5549, + "step": 22320 + }, + { + "epoch": 0.15672099651863564, + "grad_norm": 1.0490275621414185, + "learning_rate": 9.165900965817668e-06, + "loss": 0.5527, + "step": 22351 + }, + { + "epoch": 0.15693836267192085, + "grad_norm": 0.9138365387916565, + "learning_rate": 9.101000295785245e-06, + "loss": 0.5609, + "step": 22382 + }, + { + "epoch": 0.15715572882520606, + "grad_norm": 0.9704885482788086, + "learning_rate": 9.036279043803565e-06, + "loss": 0.5559, + "step": 22413 + }, + { + "epoch": 0.15737309497849128, + "grad_norm": 0.9594223499298096, + "learning_rate": 8.971737940243147e-06, + "loss": 0.5533, + "step": 22444 + }, + { + "epoch": 0.15759046113177647, + "grad_norm": 0.9496825933456421, + "learning_rate": 8.907377713441592e-06, + "loss": 0.5631, + "step": 22475 + }, + { + "epoch": 0.15780782728506168, + "grad_norm": 0.8646016120910645, + "learning_rate": 8.843199089695293e-06, + "loss": 0.561, + "step": 22506 + }, + { + "epoch": 0.1580251934383469, + "grad_norm": 0.8263002038002014, + "learning_rate": 8.779202793251311e-06, + "loss": 0.555, + "step": 22537 + }, + { + "epoch": 0.1582425595916321, + "grad_norm": 0.9070886969566345, + "learning_rate": 8.715389546299149e-06, + "loss": 0.5577, + "step": 22568 + }, + { + "epoch": 0.15845992574491732, + "grad_norm": 0.8829283118247986, + "learning_rate": 8.651760068962617e-06, + "loss": 0.5608, + "step": 22599 + }, + { + "epoch": 0.1586772918982025, + "grad_norm": 0.8605303764343262, + "learning_rate": 8.588315079291733e-06, + "loss": 0.5505, + "step": 22630 + }, + { + "epoch": 0.15889465805148772, + "grad_norm": 0.9638768434524536, + "learning_rate": 8.52505529325457e-06, + "loss": 0.5614, + "step": 22661 + }, + { + "epoch": 0.15911202420477294, + "grad_norm": 0.908811628818512, + "learning_rate": 8.461981424729216e-06, + "loss": 0.5606, + "step": 22692 + }, + { + "epoch": 0.15932939035805815, + "grad_norm": 0.9718073010444641, + "learning_rate": 8.399094185495725e-06, + "loss": 0.5615, + "step": 22723 + }, + { + "epoch": 0.15954675651134337, + "grad_norm": 0.9598197937011719, + "learning_rate": 8.336394285228017e-06, + "loss": 0.5512, + "step": 22754 + }, + { + "epoch": 0.15976412266462856, + "grad_norm": 0.8870306611061096, + "learning_rate": 8.273882431485952e-06, + "loss": 0.5565, + "step": 22785 + }, + { + "epoch": 0.15998148881791377, + "grad_norm": 0.7993106842041016, + "learning_rate": 8.211559329707316e-06, + "loss": 0.5529, + "step": 22816 + }, + { + "epoch": 0.16019885497119898, + "grad_norm": 0.8731540441513062, + "learning_rate": 8.149425683199823e-06, + "loss": 0.5586, + "step": 22847 + }, + { + "epoch": 0.1604162211244842, + "grad_norm": 0.8460251688957214, + "learning_rate": 8.08748219313325e-06, + "loss": 0.5583, + "step": 22878 + }, + { + "epoch": 0.1606335872777694, + "grad_norm": 0.9626048803329468, + "learning_rate": 8.025729558531453e-06, + "loss": 0.5558, + "step": 22909 + }, + { + "epoch": 0.1608509534310546, + "grad_norm": 0.9286885261535645, + "learning_rate": 7.964168476264508e-06, + "loss": 0.5611, + "step": 22940 + }, + { + "epoch": 0.16106831958433981, + "grad_norm": 0.8666262030601501, + "learning_rate": 7.902799641040884e-06, + "loss": 0.5646, + "step": 22971 + }, + { + "epoch": 0.16128568573762503, + "grad_norm": 0.9536890387535095, + "learning_rate": 7.841623745399523e-06, + "loss": 0.5562, + "step": 23002 + }, + { + "epoch": 0.16150305189091024, + "grad_norm": 0.9607664942741394, + "learning_rate": 7.780641479702114e-06, + "loss": 0.5623, + "step": 23033 + }, + { + "epoch": 0.16172041804419546, + "grad_norm": 0.9009374380111694, + "learning_rate": 7.719853532125227e-06, + "loss": 0.5578, + "step": 23064 + }, + { + "epoch": 0.16193778419748064, + "grad_norm": 0.7630103230476379, + "learning_rate": 7.65926058865258e-06, + "loss": 0.5601, + "step": 23095 + }, + { + "epoch": 0.16215515035076586, + "grad_norm": 0.8619088530540466, + "learning_rate": 7.598863333067313e-06, + "loss": 0.5571, + "step": 23126 + }, + { + "epoch": 0.16237251650405107, + "grad_norm": 1.0103073120117188, + "learning_rate": 7.538662446944253e-06, + "loss": 0.5587, + "step": 23157 + }, + { + "epoch": 0.1625898826573363, + "grad_norm": 0.9380328059196472, + "learning_rate": 7.478658609642211e-06, + "loss": 0.555, + "step": 23188 + }, + { + "epoch": 0.1628072488106215, + "grad_norm": 0.9486220479011536, + "learning_rate": 7.418852498296327e-06, + "loss": 0.5658, + "step": 23219 + }, + { + "epoch": 0.1630246149639067, + "grad_norm": 0.9146499633789062, + "learning_rate": 7.359244787810457e-06, + "loss": 0.5503, + "step": 23250 + }, + { + "epoch": 0.1632419811171919, + "grad_norm": 0.911389946937561, + "learning_rate": 7.299836150849493e-06, + "loss": 0.5584, + "step": 23281 + }, + { + "epoch": 0.16345934727047712, + "grad_norm": 0.8847711086273193, + "learning_rate": 7.240627257831847e-06, + "loss": 0.5494, + "step": 23312 + }, + { + "epoch": 0.16367671342376233, + "grad_norm": 0.9155315160751343, + "learning_rate": 7.1816187769218195e-06, + "loss": 0.548, + "step": 23343 + }, + { + "epoch": 0.16389407957704755, + "grad_norm": 0.8847165703773499, + "learning_rate": 7.1228113740220895e-06, + "loss": 0.5512, + "step": 23374 + }, + { + "epoch": 0.16411144573033273, + "grad_norm": 1.0043821334838867, + "learning_rate": 7.064205712766226e-06, + "loss": 0.554, + "step": 23405 + }, + { + "epoch": 0.16432881188361795, + "grad_norm": 0.9789336323738098, + "learning_rate": 7.005802454511129e-06, + "loss": 0.5579, + "step": 23436 + }, + { + "epoch": 0.16454617803690316, + "grad_norm": 0.8675613403320312, + "learning_rate": 6.947602258329639e-06, + "loss": 0.5547, + "step": 23467 + }, + { + "epoch": 0.16476354419018838, + "grad_norm": 1.0360661745071411, + "learning_rate": 6.889605781003078e-06, + "loss": 0.5575, + "step": 23498 + }, + { + "epoch": 0.1649809103434736, + "grad_norm": 0.9654151201248169, + "learning_rate": 6.831813677013776e-06, + "loss": 0.5575, + "step": 23529 + }, + { + "epoch": 0.16519827649675878, + "grad_norm": 0.886508584022522, + "learning_rate": 6.774226598537792e-06, + "loss": 0.5552, + "step": 23560 + }, + { + "epoch": 0.165415642650044, + "grad_norm": 0.8399243950843811, + "learning_rate": 6.716845195437482e-06, + "loss": 0.5494, + "step": 23591 + }, + { + "epoch": 0.1656330088033292, + "grad_norm": 0.876013457775116, + "learning_rate": 6.659670115254168e-06, + "loss": 0.5589, + "step": 23622 + }, + { + "epoch": 0.16585037495661442, + "grad_norm": 0.8546695709228516, + "learning_rate": 6.602702003200872e-06, + "loss": 0.5558, + "step": 23653 + }, + { + "epoch": 0.16606774110989964, + "grad_norm": 0.8829993605613708, + "learning_rate": 6.545941502154992e-06, + "loss": 0.5605, + "step": 23684 + }, + { + "epoch": 0.16628510726318482, + "grad_norm": 0.8759157657623291, + "learning_rate": 6.489389252651057e-06, + "loss": 0.5546, + "step": 23715 + }, + { + "epoch": 0.16650247341647004, + "grad_norm": 0.9579117894172668, + "learning_rate": 6.4330458928735325e-06, + "loss": 0.5507, + "step": 23746 + }, + { + "epoch": 0.16671983956975525, + "grad_norm": 0.9086149334907532, + "learning_rate": 6.376912058649559e-06, + "loss": 0.5594, + "step": 23777 + }, + { + "epoch": 0.16693720572304047, + "grad_norm": 0.889070451259613, + "learning_rate": 6.320988383441845e-06, + "loss": 0.5591, + "step": 23808 + }, + { + "epoch": 0.16715457187632568, + "grad_norm": 0.8501099348068237, + "learning_rate": 6.265275498341452e-06, + "loss": 0.5655, + "step": 23839 + }, + { + "epoch": 0.16737193802961087, + "grad_norm": 0.933879554271698, + "learning_rate": 6.209774032060714e-06, + "loss": 0.5499, + "step": 23870 + }, + { + "epoch": 0.16758930418289608, + "grad_norm": 0.8791343569755554, + "learning_rate": 6.1544846109261365e-06, + "loss": 0.5594, + "step": 23901 + }, + { + "epoch": 0.1678066703361813, + "grad_norm": 0.92324298620224, + "learning_rate": 6.099407858871342e-06, + "loss": 0.5581, + "step": 23932 + }, + { + "epoch": 0.1680240364894665, + "grad_norm": 0.9028039574623108, + "learning_rate": 6.044544397429958e-06, + "loss": 0.5572, + "step": 23963 + }, + { + "epoch": 0.16824140264275173, + "grad_norm": 0.9681089520454407, + "learning_rate": 5.989894845728708e-06, + "loss": 0.5547, + "step": 23994 + }, + { + "epoch": 0.1684587687960369, + "grad_norm": 0.8708662390708923, + "learning_rate": 5.9354598204803605e-06, + "loss": 0.5578, + "step": 24025 + }, + { + "epoch": 0.16867613494932213, + "grad_norm": 0.9106061458587646, + "learning_rate": 5.881239935976762e-06, + "loss": 0.5563, + "step": 24056 + }, + { + "epoch": 0.16889350110260734, + "grad_norm": 0.8620893359184265, + "learning_rate": 5.827235804081954e-06, + "loss": 0.5528, + "step": 24087 + }, + { + "epoch": 0.16911086725589256, + "grad_norm": 0.8461076617240906, + "learning_rate": 5.773448034225221e-06, + "loss": 0.5585, + "step": 24118 + }, + { + "epoch": 0.16932823340917777, + "grad_norm": 0.9462336301803589, + "learning_rate": 5.719877233394228e-06, + "loss": 0.5526, + "step": 24149 + }, + { + "epoch": 0.16954559956246296, + "grad_norm": 0.9454036355018616, + "learning_rate": 5.666524006128191e-06, + "loss": 0.5496, + "step": 24180 + }, + { + "epoch": 0.16976296571574817, + "grad_norm": 0.9001603722572327, + "learning_rate": 5.613388954511015e-06, + "loss": 0.5601, + "step": 24211 + }, + { + "epoch": 0.16998033186903339, + "grad_norm": 0.8742856383323669, + "learning_rate": 5.560472678164552e-06, + "loss": 0.5638, + "step": 24242 + }, + { + "epoch": 0.1701976980223186, + "grad_norm": 0.8686881065368652, + "learning_rate": 5.507775774241775e-06, + "loss": 0.5541, + "step": 24273 + }, + { + "epoch": 0.17041506417560381, + "grad_norm": 0.9935572147369385, + "learning_rate": 5.4552988374200945e-06, + "loss": 0.5545, + "step": 24304 + }, + { + "epoch": 0.170632430328889, + "grad_norm": 0.8736170530319214, + "learning_rate": 5.403042459894597e-06, + "loss": 0.555, + "step": 24335 + }, + { + "epoch": 0.17084979648217422, + "grad_norm": 0.977024495601654, + "learning_rate": 5.3510072313714135e-06, + "loss": 0.5497, + "step": 24366 + }, + { + "epoch": 0.17106716263545943, + "grad_norm": 0.9486203789710999, + "learning_rate": 5.2991937390610205e-06, + "loss": 0.5542, + "step": 24397 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8013248630772728e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-24416/training_args.bin b/checkpoint-24416/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d8a57652df56990298f489cbbc4d12b0a5d66dac --- /dev/null +++ b/checkpoint-24416/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff6a84b6c2a69c365aedcd04f176d62d0993a049ac50b28b1fc7fee54f4267a8 +size 5304 diff --git a/checkpoint-27468/config.json b/checkpoint-27468/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-27468/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-27468/generation_config.json b/checkpoint-27468/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-27468/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-27468/model-00001-of-00007.safetensors b/checkpoint-27468/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7ab7cd60e0be26b0b95be12a9bab41bdf43b11eb --- /dev/null +++ b/checkpoint-27468/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b38b10f8e21c6b6ca6569edd2a75cb055056f95ba08df904885119fbb59b47c +size 4886466168 diff --git a/checkpoint-27468/model-00002-of-00007.safetensors b/checkpoint-27468/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-27468/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-27468/model-00003-of-00007.safetensors b/checkpoint-27468/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-27468/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-27468/model-00004-of-00007.safetensors b/checkpoint-27468/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-27468/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-27468/model-00005-of-00007.safetensors b/checkpoint-27468/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-27468/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-27468/model-00006-of-00007.safetensors b/checkpoint-27468/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3ccee7f9e59d692f75fdfa193add8ca883eb857f --- /dev/null +++ b/checkpoint-27468/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecb257e58026304ef44e3e1d7b3374c34d21d44101ef51666de184f174ea3101 +size 4999813120 diff --git a/checkpoint-27468/model-00007-of-00007.safetensors b/checkpoint-27468/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d1113599499ba7874696320b92ffb986eb20d875 --- /dev/null +++ b/checkpoint-27468/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dd1ecbbebf484441ad93a34be45d39296ff4f753281b9dca5e99cb6054d88ec +size 2571158184 diff --git a/checkpoint-27468/model.safetensors.index.json b/checkpoint-27468/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-27468/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-27468/optimizer.pt b/checkpoint-27468/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..42bffaf35a278e76006efe70ef02e6fd1fd28556 --- /dev/null +++ b/checkpoint-27468/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af430695019cdc953b14ce2b13aabac17be38e5fd4e2ea08575999986910a9ba +size 15385036334 diff --git a/checkpoint-27468/rng_state.pth b/checkpoint-27468/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-27468/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-27468/scheduler.pt b/checkpoint-27468/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec6a33a1e16fa727d72d8610d56b97fd04ba15e3 --- /dev/null +++ b/checkpoint-27468/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d47b007e64bffbb0dc51c02560ea2fea14f1ab5035228332be1bd00a38697eb +size 1064 diff --git a/checkpoint-27468/trainer_state.json b/checkpoint-27468/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d30f5c447fe88335aa9d6f9cdfd00d93ea5244ba --- /dev/null +++ b/checkpoint-27468/trainer_state.json @@ -0,0 +1,6235 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1926004354334877, + "eval_steps": 500, + "global_step": 27468, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021736615328520894, + "grad_norm": 5.709163665771484, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.9228, + "step": 31 + }, + { + "epoch": 0.0004347323065704179, + "grad_norm": 3.9758756160736084, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8276, + "step": 62 + }, + { + "epoch": 0.0006520984598556268, + "grad_norm": 3.721677303314209, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7897, + "step": 93 + }, + { + "epoch": 0.0008694646131408358, + "grad_norm": 3.384953498840332, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7641, + "step": 124 + }, + { + "epoch": 0.0010868307664260446, + "grad_norm": 3.19750714302063, + "learning_rate": 5.078636959370905e-06, + "loss": 0.758, + "step": 155 + }, + { + "epoch": 0.0013041969197112536, + "grad_norm": 3.49003267288208, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7644, + "step": 186 + }, + { + "epoch": 0.0015215630729964625, + "grad_norm": 3.323367118835449, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7606, + "step": 217 + }, + { + "epoch": 0.0017389292262816715, + "grad_norm": 3.465822219848633, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7505, + "step": 248 + }, + { + "epoch": 0.0019562953795668804, + "grad_norm": 2.7967450618743896, + "learning_rate": 9.141546526867629e-06, + "loss": 0.7593, + "step": 279 + }, + { + "epoch": 0.002173661532852089, + "grad_norm": 3.5493738651275635, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7514, + "step": 310 + }, + { + "epoch": 0.0023910276861372984, + "grad_norm": 3.514606237411499, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7497, + "step": 341 + }, + { + "epoch": 0.0026083938394225073, + "grad_norm": 3.0246028900146484, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7473, + "step": 372 + }, + { + "epoch": 0.002825759992707716, + "grad_norm": 3.01147723197937, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.739, + "step": 403 + }, + { + "epoch": 0.003043126145992925, + "grad_norm": 3.566333293914795, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7447, + "step": 434 + }, + { + "epoch": 0.003260492299278134, + "grad_norm": 3.8698947429656982, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7447, + "step": 465 + }, + { + "epoch": 0.003477858452563343, + "grad_norm": 2.567028760910034, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7496, + "step": 496 + }, + { + "epoch": 0.003695224605848552, + "grad_norm": 2.8029377460479736, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.7324, + "step": 527 + }, + { + "epoch": 0.003912590759133761, + "grad_norm": 2.862530469894409, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7371, + "step": 558 + }, + { + "epoch": 0.00412995691241897, + "grad_norm": 2.9063901901245117, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.7398, + "step": 589 + }, + { + "epoch": 0.004347323065704178, + "grad_norm": 3.187814235687256, + "learning_rate": 2.031454783748362e-05, + "loss": 0.7362, + "step": 620 + }, + { + "epoch": 0.004564689218989388, + "grad_norm": 2.6158969402313232, + "learning_rate": 2.13302752293578e-05, + "loss": 0.7405, + "step": 651 + }, + { + "epoch": 0.004782055372274597, + "grad_norm": 2.702423334121704, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7313, + "step": 682 + }, + { + "epoch": 0.004999421525559805, + "grad_norm": 2.88313889503479, + "learning_rate": 2.336173001310616e-05, + "loss": 0.7337, + "step": 713 + }, + { + "epoch": 0.0052167876788450146, + "grad_norm": 2.8978841304779053, + "learning_rate": 2.437745740498034e-05, + "loss": 0.73, + "step": 744 + }, + { + "epoch": 0.005434153832130223, + "grad_norm": 2.791414976119995, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.7297, + "step": 775 + }, + { + "epoch": 0.005651519985415432, + "grad_norm": 2.432821035385132, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7326, + "step": 806 + }, + { + "epoch": 0.0058688861387006415, + "grad_norm": 2.430279493331909, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.7333, + "step": 837 + }, + { + "epoch": 0.00608625229198585, + "grad_norm": 2.5513761043548584, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.7296, + "step": 868 + }, + { + "epoch": 0.006303618445271059, + "grad_norm": 2.478562116622925, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.7303, + "step": 899 + }, + { + "epoch": 0.006520984598556268, + "grad_norm": 2.5496723651885986, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.7303, + "step": 930 + }, + { + "epoch": 0.006738350751841477, + "grad_norm": 2.114060640335083, + "learning_rate": 3.148754914809961e-05, + "loss": 0.7184, + "step": 961 + }, + { + "epoch": 0.006955716905126686, + "grad_norm": 2.325977325439453, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.7268, + "step": 992 + }, + { + "epoch": 0.0071730830584118945, + "grad_norm": 2.0549662113189697, + "learning_rate": 3.351900393184797e-05, + "loss": 0.724, + "step": 1023 + }, + { + "epoch": 0.007390449211697104, + "grad_norm": 1.913522481918335, + "learning_rate": 3.453473132372215e-05, + "loss": 0.7206, + "step": 1054 + }, + { + "epoch": 0.007607815364982313, + "grad_norm": 2.0651443004608154, + "learning_rate": 3.555045871559633e-05, + "loss": 0.7239, + "step": 1085 + }, + { + "epoch": 0.007825181518267521, + "grad_norm": 2.2482309341430664, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.728, + "step": 1116 + }, + { + "epoch": 0.00804254767155273, + "grad_norm": 2.349695920944214, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.721, + "step": 1147 + }, + { + "epoch": 0.00825991382483794, + "grad_norm": 2.218843460083008, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.7248, + "step": 1178 + }, + { + "epoch": 0.00847727997812315, + "grad_norm": 2.0839340686798096, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.7233, + "step": 1209 + }, + { + "epoch": 0.008694646131408357, + "grad_norm": 2.0757343769073486, + "learning_rate": 4.062909567496724e-05, + "loss": 0.7166, + "step": 1240 + }, + { + "epoch": 0.008912012284693566, + "grad_norm": 2.052342653274536, + "learning_rate": 4.164482306684142e-05, + "loss": 0.7256, + "step": 1271 + }, + { + "epoch": 0.009129378437978775, + "grad_norm": 2.1202704906463623, + "learning_rate": 4.26605504587156e-05, + "loss": 0.7131, + "step": 1302 + }, + { + "epoch": 0.009346744591263985, + "grad_norm": 2.1945958137512207, + "learning_rate": 4.367627785058978e-05, + "loss": 0.708, + "step": 1333 + }, + { + "epoch": 0.009564110744549194, + "grad_norm": 2.106307029724121, + "learning_rate": 4.469200524246396e-05, + "loss": 0.7139, + "step": 1364 + }, + { + "epoch": 0.009781476897834401, + "grad_norm": 2.3779594898223877, + "learning_rate": 4.570773263433814e-05, + "loss": 0.7205, + "step": 1395 + }, + { + "epoch": 0.00999884305111961, + "grad_norm": 2.001551866531372, + "learning_rate": 4.672346002621232e-05, + "loss": 0.7162, + "step": 1426 + }, + { + "epoch": 0.01021620920440482, + "grad_norm": 2.6994752883911133, + "learning_rate": 4.77391874180865e-05, + "loss": 0.7215, + "step": 1457 + }, + { + "epoch": 0.010433575357690029, + "grad_norm": 2.071122884750366, + "learning_rate": 4.875491480996068e-05, + "loss": 0.7095, + "step": 1488 + }, + { + "epoch": 0.010650941510975238, + "grad_norm": 2.3666610717773438, + "learning_rate": 4.977064220183487e-05, + "loss": 0.7089, + "step": 1519 + }, + { + "epoch": 0.010868307664260446, + "grad_norm": 2.130204677581787, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.7043, + "step": 1550 + }, + { + "epoch": 0.011085673817545655, + "grad_norm": 1.7413716316223145, + "learning_rate": 4.999955597496219e-05, + "loss": 0.7062, + "step": 1581 + }, + { + "epoch": 0.011303039970830864, + "grad_norm": 1.7087843418121338, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.7175, + "step": 1612 + }, + { + "epoch": 0.011520406124116074, + "grad_norm": 2.574871301651001, + "learning_rate": 4.999799067923527e-05, + "loss": 0.7076, + "step": 1643 + }, + { + "epoch": 0.011737772277401283, + "grad_norm": 2.0530433654785156, + "learning_rate": 4.999678487776908e-05, + "loss": 0.708, + "step": 1674 + }, + { + "epoch": 0.011955138430686492, + "grad_norm": 1.9184463024139404, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.7011, + "step": 1705 + }, + { + "epoch": 0.0121725045839717, + "grad_norm": 1.768115520477295, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6995, + "step": 1736 + }, + { + "epoch": 0.012389870737256909, + "grad_norm": 1.6692063808441162, + "learning_rate": 4.999147503179668e-05, + "loss": 0.7019, + "step": 1767 + }, + { + "epoch": 0.012607236890542118, + "grad_norm": 1.803944706916809, + "learning_rate": 4.998914100252672e-05, + "loss": 0.703, + "step": 1798 + }, + { + "epoch": 0.012824603043827328, + "grad_norm": 1.8957322835922241, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6979, + "step": 1829 + }, + { + "epoch": 0.013041969197112537, + "grad_norm": 1.544054627418518, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.7012, + "step": 1860 + }, + { + "epoch": 0.013259335350397744, + "grad_norm": 1.7066351175308228, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6949, + "step": 1891 + }, + { + "epoch": 0.013476701503682954, + "grad_norm": 1.7045214176177979, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.6871, + "step": 1922 + }, + { + "epoch": 0.013694067656968163, + "grad_norm": 2.0414485931396484, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6944, + "step": 1953 + }, + { + "epoch": 0.013911433810253372, + "grad_norm": 1.9210485219955444, + "learning_rate": 4.99692159912661e-05, + "loss": 0.7013, + "step": 1984 + }, + { + "epoch": 0.014128799963538581, + "grad_norm": 1.8733997344970703, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6991, + "step": 2015 + }, + { + "epoch": 0.014346166116823789, + "grad_norm": 1.7118934392929077, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6873, + "step": 2046 + }, + { + "epoch": 0.014563532270108998, + "grad_norm": 1.889208197593689, + "learning_rate": 4.995544899210594e-05, + "loss": 0.692, + "step": 2077 + }, + { + "epoch": 0.014780898423394207, + "grad_norm": 2.0081522464752197, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.6823, + "step": 2108 + }, + { + "epoch": 0.014998264576679417, + "grad_norm": 1.7950328588485718, + "learning_rate": 4.994486281210429e-05, + "loss": 0.686, + "step": 2139 + }, + { + "epoch": 0.015215630729964626, + "grad_norm": 1.758333444595337, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6827, + "step": 2170 + }, + { + "epoch": 0.015432996883249834, + "grad_norm": 1.496063470840454, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6896, + "step": 2201 + }, + { + "epoch": 0.015650363036535043, + "grad_norm": 1.6103190183639526, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6865, + "step": 2232 + }, + { + "epoch": 0.015867729189820254, + "grad_norm": 1.7069604396820068, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6836, + "step": 2263 + }, + { + "epoch": 0.01608509534310546, + "grad_norm": 1.8261148929595947, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.692, + "step": 2294 + }, + { + "epoch": 0.01630246149639067, + "grad_norm": 1.7522642612457275, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6832, + "step": 2325 + }, + { + "epoch": 0.01651982764967588, + "grad_norm": 1.6746042966842651, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6829, + "step": 2356 + }, + { + "epoch": 0.016737193802961087, + "grad_norm": 1.6785473823547363, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6842, + "step": 2387 + }, + { + "epoch": 0.0169545599562463, + "grad_norm": 1.936916470527649, + "learning_rate": 4.988329891293693e-05, + "loss": 0.6798, + "step": 2418 + }, + { + "epoch": 0.017171926109531506, + "grad_norm": 1.5104129314422607, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6743, + "step": 2449 + }, + { + "epoch": 0.017389292262816713, + "grad_norm": 1.3770678043365479, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.6874, + "step": 2480 + }, + { + "epoch": 0.017606658416101924, + "grad_norm": 1.4900861978530884, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.679, + "step": 2511 + }, + { + "epoch": 0.017824024569387132, + "grad_norm": 1.5600172281265259, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6719, + "step": 2542 + }, + { + "epoch": 0.018041390722672343, + "grad_norm": 1.660237431526184, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6773, + "step": 2573 + }, + { + "epoch": 0.01825875687595755, + "grad_norm": 1.4618791341781616, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6767, + "step": 2604 + }, + { + "epoch": 0.018476123029242758, + "grad_norm": 1.5567563772201538, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6706, + "step": 2635 + }, + { + "epoch": 0.01869348918252797, + "grad_norm": 1.4879790544509888, + "learning_rate": 4.980947995086024e-05, + "loss": 0.676, + "step": 2666 + }, + { + "epoch": 0.018910855335813177, + "grad_norm": 1.5527766942977905, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6734, + "step": 2697 + }, + { + "epoch": 0.019128221489098388, + "grad_norm": 1.7922642230987549, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6813, + "step": 2728 + }, + { + "epoch": 0.019345587642383595, + "grad_norm": 1.6453475952148438, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6749, + "step": 2759 + }, + { + "epoch": 0.019562953795668803, + "grad_norm": 1.4713648557662964, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6761, + "step": 2790 + }, + { + "epoch": 0.019780319948954014, + "grad_norm": 1.5513399839401245, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6739, + "step": 2821 + }, + { + "epoch": 0.01999768610223922, + "grad_norm": 1.431796908378601, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6639, + "step": 2852 + }, + { + "epoch": 0.020215052255524432, + "grad_norm": 1.4959752559661865, + "learning_rate": 4.973018858007122e-05, + "loss": 0.662, + "step": 2883 + }, + { + "epoch": 0.02043241840880964, + "grad_norm": 1.4675205945968628, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6659, + "step": 2914 + }, + { + "epoch": 0.020649784562094847, + "grad_norm": 1.7111692428588867, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6732, + "step": 2945 + }, + { + "epoch": 0.020867150715380058, + "grad_norm": 1.542748212814331, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6666, + "step": 2976 + }, + { + "epoch": 0.021084516868665266, + "grad_norm": 1.5013272762298584, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.667, + "step": 3007 + }, + { + "epoch": 0.021301883021950477, + "grad_norm": 1.57589852809906, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6668, + "step": 3038 + }, + { + "epoch": 0.021519249175235684, + "grad_norm": 1.612727403640747, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6653, + "step": 3069 + }, + { + "epoch": 0.021736615328520892, + "grad_norm": 1.5110771656036377, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6694, + "step": 3100 + }, + { + "epoch": 0.021953981481806103, + "grad_norm": 1.5836044549942017, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6634, + "step": 3131 + }, + { + "epoch": 0.02217134763509131, + "grad_norm": 1.5767654180526733, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6633, + "step": 3162 + }, + { + "epoch": 0.02238871378837652, + "grad_norm": 1.6058984994888306, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6646, + "step": 3193 + }, + { + "epoch": 0.02260607994166173, + "grad_norm": 1.4564005136489868, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.663, + "step": 3224 + }, + { + "epoch": 0.022823446094946936, + "grad_norm": 1.5274450778961182, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6554, + "step": 3255 + }, + { + "epoch": 0.023040812248232147, + "grad_norm": 1.524122953414917, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6666, + "step": 3286 + }, + { + "epoch": 0.023258178401517355, + "grad_norm": 1.5017430782318115, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6661, + "step": 3317 + }, + { + "epoch": 0.023475544554802566, + "grad_norm": 1.4247208833694458, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6563, + "step": 3348 + }, + { + "epoch": 0.023692910708087773, + "grad_norm": 1.3451093435287476, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6555, + "step": 3379 + }, + { + "epoch": 0.023910276861372984, + "grad_norm": 1.418442726135254, + "learning_rate": 4.948079823064559e-05, + "loss": 0.6587, + "step": 3410 + }, + { + "epoch": 0.024127643014658192, + "grad_norm": 1.5460575819015503, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6602, + "step": 3441 + }, + { + "epoch": 0.0243450091679434, + "grad_norm": 1.4053966999053955, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6578, + "step": 3472 + }, + { + "epoch": 0.02456237532122861, + "grad_norm": 1.420806646347046, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6544, + "step": 3503 + }, + { + "epoch": 0.024779741474513818, + "grad_norm": 1.3220508098602295, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6642, + "step": 3534 + }, + { + "epoch": 0.02499710762779903, + "grad_norm": 1.3819468021392822, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6534, + "step": 3565 + }, + { + "epoch": 0.025214473781084237, + "grad_norm": 1.4046236276626587, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6547, + "step": 3596 + }, + { + "epoch": 0.025431839934369444, + "grad_norm": 1.965153455734253, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6582, + "step": 3627 + }, + { + "epoch": 0.025649206087654655, + "grad_norm": 1.6758291721343994, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6536, + "step": 3658 + }, + { + "epoch": 0.025866572240939863, + "grad_norm": 1.4467246532440186, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6564, + "step": 3689 + }, + { + "epoch": 0.026083938394225074, + "grad_norm": 1.5305490493774414, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6626, + "step": 3720 + }, + { + "epoch": 0.02630130454751028, + "grad_norm": 1.3826133012771606, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6624, + "step": 3751 + }, + { + "epoch": 0.02651867070079549, + "grad_norm": 1.4920278787612915, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6417, + "step": 3782 + }, + { + "epoch": 0.0267360368540807, + "grad_norm": 14.533783912658691, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.647, + "step": 3813 + }, + { + "epoch": 0.026953403007365907, + "grad_norm": 1.4659409523010254, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6514, + "step": 3844 + }, + { + "epoch": 0.02717076916065112, + "grad_norm": 1.4820138216018677, + "learning_rate": 4.919443027766935e-05, + "loss": 0.6477, + "step": 3875 + }, + { + "epoch": 0.027388135313936326, + "grad_norm": 1.2543880939483643, + "learning_rate": 4.91731463569418e-05, + "loss": 0.6505, + "step": 3906 + }, + { + "epoch": 0.027605501467221533, + "grad_norm": 1.640926718711853, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6488, + "step": 3937 + }, + { + "epoch": 0.027822867620506744, + "grad_norm": 1.3990615606307983, + "learning_rate": 4.912976038673786e-05, + "loss": 0.658, + "step": 3968 + }, + { + "epoch": 0.028040233773791952, + "grad_norm": 1.2064067125320435, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6487, + "step": 3999 + }, + { + "epoch": 0.028257599927077163, + "grad_norm": 1.5103769302368164, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6569, + "step": 4030 + }, + { + "epoch": 0.02847496608036237, + "grad_norm": 1.5314087867736816, + "learning_rate": 4.906263980464644e-05, + "loss": 0.6638, + "step": 4061 + }, + { + "epoch": 0.028692332233647578, + "grad_norm": 1.4619168043136597, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6505, + "step": 4092 + }, + { + "epoch": 0.02890969838693279, + "grad_norm": 1.3979246616363525, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6505, + "step": 4123 + }, + { + "epoch": 0.029127064540217996, + "grad_norm": 1.3866580724716187, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6461, + "step": 4154 + }, + { + "epoch": 0.029344430693503207, + "grad_norm": 1.4662801027297974, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6448, + "step": 4185 + }, + { + "epoch": 0.029561796846788415, + "grad_norm": 1.3171806335449219, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6525, + "step": 4216 + }, + { + "epoch": 0.029779163000073623, + "grad_norm": 1.3289718627929688, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6445, + "step": 4247 + }, + { + "epoch": 0.029996529153358834, + "grad_norm": 1.3527250289916992, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6449, + "step": 4278 + }, + { + "epoch": 0.03021389530664404, + "grad_norm": 1.314674735069275, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6559, + "step": 4309 + }, + { + "epoch": 0.030431261459929252, + "grad_norm": 1.2237507104873657, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6468, + "step": 4340 + }, + { + "epoch": 0.03064862761321446, + "grad_norm": 1.423965573310852, + "learning_rate": 4.882129447892753e-05, + "loss": 0.6465, + "step": 4371 + }, + { + "epoch": 0.030865993766499667, + "grad_norm": 1.271371603012085, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6471, + "step": 4402 + }, + { + "epoch": 0.031083359919784878, + "grad_norm": 1.5492364168167114, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6444, + "step": 4433 + }, + { + "epoch": 0.031300726073070086, + "grad_norm": 1.5139328241348267, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6512, + "step": 4464 + }, + { + "epoch": 0.03151809222635529, + "grad_norm": 1.4510358572006226, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6485, + "step": 4495 + }, + { + "epoch": 0.03173545837964051, + "grad_norm": 1.3228867053985596, + "learning_rate": 4.869052379269719e-05, + "loss": 0.6416, + "step": 4526 + }, + { + "epoch": 0.031952824532925715, + "grad_norm": 1.2731959819793701, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6477, + "step": 4557 + }, + { + "epoch": 0.03217019068621092, + "grad_norm": 1.2540090084075928, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6412, + "step": 4588 + }, + { + "epoch": 0.03238755683949613, + "grad_norm": 1.662154197692871, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6464, + "step": 4619 + }, + { + "epoch": 0.03260492299278134, + "grad_norm": 1.5419702529907227, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6421, + "step": 4650 + }, + { + "epoch": 0.03282228914606655, + "grad_norm": 1.6409112215042114, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6468, + "step": 4681 + }, + { + "epoch": 0.03303965529935176, + "grad_norm": 1.1416597366333008, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6396, + "step": 4712 + }, + { + "epoch": 0.03325702145263697, + "grad_norm": 1.215846061706543, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6517, + "step": 4743 + }, + { + "epoch": 0.033474387605922175, + "grad_norm": 1.3075084686279297, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6389, + "step": 4774 + }, + { + "epoch": 0.03369175375920738, + "grad_norm": 1.6089972257614136, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6371, + "step": 4805 + }, + { + "epoch": 0.0339091199124926, + "grad_norm": 1.3927685022354126, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6355, + "step": 4836 + }, + { + "epoch": 0.034126486065777804, + "grad_norm": 1.197952389717102, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6415, + "step": 4867 + }, + { + "epoch": 0.03434385221906301, + "grad_norm": 1.2738877534866333, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6442, + "step": 4898 + }, + { + "epoch": 0.03456121837234822, + "grad_norm": 1.5164271593093872, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6406, + "step": 4929 + }, + { + "epoch": 0.03477858452563343, + "grad_norm": 1.242473840713501, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6441, + "step": 4960 + }, + { + "epoch": 0.03499595067891864, + "grad_norm": 1.2919869422912598, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6487, + "step": 4991 + }, + { + "epoch": 0.03521331683220385, + "grad_norm": 2.6493895053863525, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6398, + "step": 5022 + }, + { + "epoch": 0.035430682985489056, + "grad_norm": 1.2134305238723755, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6398, + "step": 5053 + }, + { + "epoch": 0.035648049138774264, + "grad_norm": 1.1468703746795654, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.633, + "step": 5084 + }, + { + "epoch": 0.03586541529205947, + "grad_norm": 1.3923726081848145, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6295, + "step": 5115 + }, + { + "epoch": 0.036082781445344686, + "grad_norm": 1.5531644821166992, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6345, + "step": 5146 + }, + { + "epoch": 0.036300147598629894, + "grad_norm": 1.201889991760254, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6414, + "step": 5177 + }, + { + "epoch": 0.0365175137519151, + "grad_norm": 1.4365577697753906, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6364, + "step": 5208 + }, + { + "epoch": 0.03673487990520031, + "grad_norm": 1.210980772972107, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6353, + "step": 5239 + }, + { + "epoch": 0.036952246058485516, + "grad_norm": 1.379381775856018, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6293, + "step": 5270 + }, + { + "epoch": 0.03716961221177073, + "grad_norm": 1.2263178825378418, + "learning_rate": 4.793722210363262e-05, + "loss": 0.629, + "step": 5301 + }, + { + "epoch": 0.03738697836505594, + "grad_norm": 1.2448405027389526, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6474, + "step": 5332 + }, + { + "epoch": 0.037604344518341146, + "grad_norm": 1.257132887840271, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6426, + "step": 5363 + }, + { + "epoch": 0.03782171067162635, + "grad_norm": 1.3763643503189087, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6396, + "step": 5394 + }, + { + "epoch": 0.03803907682491156, + "grad_norm": 1.3164068460464478, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6343, + "step": 5425 + }, + { + "epoch": 0.038256442978196775, + "grad_norm": 1.2034872770309448, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.629, + "step": 5456 + }, + { + "epoch": 0.03847380913148198, + "grad_norm": 1.3156630992889404, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6403, + "step": 5487 + }, + { + "epoch": 0.03869117528476719, + "grad_norm": 1.0719150304794312, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6318, + "step": 5518 + }, + { + "epoch": 0.0389085414380524, + "grad_norm": 1.3054882287979126, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6419, + "step": 5549 + }, + { + "epoch": 0.039125907591337605, + "grad_norm": 1.2081729173660278, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6461, + "step": 5580 + }, + { + "epoch": 0.03934327374462282, + "grad_norm": 1.1728904247283936, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6473, + "step": 5611 + }, + { + "epoch": 0.03956063989790803, + "grad_norm": 1.2552399635314941, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.6373, + "step": 5642 + }, + { + "epoch": 0.039778006051193235, + "grad_norm": 1.299212098121643, + "learning_rate": 4.751783684659e-05, + "loss": 0.6234, + "step": 5673 + }, + { + "epoch": 0.03999537220447844, + "grad_norm": 2.0746827125549316, + "learning_rate": 4.748122674890348e-05, + "loss": 0.6264, + "step": 5704 + }, + { + "epoch": 0.04021273835776365, + "grad_norm": 1.3712407350540161, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6252, + "step": 5735 + }, + { + "epoch": 0.040430104511048864, + "grad_norm": 1.2094186544418335, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6265, + "step": 5766 + }, + { + "epoch": 0.04064747066433407, + "grad_norm": 1.2487757205963135, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6305, + "step": 5797 + }, + { + "epoch": 0.04086483681761928, + "grad_norm": 1.5805151462554932, + "learning_rate": 4.733225355658999e-05, + "loss": 0.631, + "step": 5828 + }, + { + "epoch": 0.04108220297090449, + "grad_norm": 1.2615118026733398, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.636, + "step": 5859 + }, + { + "epoch": 0.041299569124189695, + "grad_norm": 1.6970707178115845, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6271, + "step": 5890 + }, + { + "epoch": 0.04151693527747491, + "grad_norm": 1.4279624223709106, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6267, + "step": 5921 + }, + { + "epoch": 0.041734301430760117, + "grad_norm": 1.3471580743789673, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6289, + "step": 5952 + }, + { + "epoch": 0.041951667584045324, + "grad_norm": 1.5088621377944946, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6263, + "step": 5983 + }, + { + "epoch": 0.04216903373733053, + "grad_norm": 1.3061436414718628, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6266, + "step": 6014 + }, + { + "epoch": 0.04238639989061574, + "grad_norm": 1.1800014972686768, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6259, + "step": 6045 + }, + { + "epoch": 0.042603766043900954, + "grad_norm": 1.195177435874939, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6251, + "step": 6076 + }, + { + "epoch": 0.04282113219718616, + "grad_norm": 1.1905118227005005, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6217, + "step": 6107 + }, + { + "epoch": 0.04303849835047137, + "grad_norm": 1.2270928621292114, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6352, + "step": 6138 + }, + { + "epoch": 0.043255864503756576, + "grad_norm": 1.209226369857788, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6249, + "step": 6169 + }, + { + "epoch": 0.043473230657041784, + "grad_norm": 1.1949187517166138, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6371, + "step": 6200 + }, + { + "epoch": 0.043690596810327, + "grad_norm": 1.2346535921096802, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6177, + "step": 6231 + }, + { + "epoch": 0.043907962963612206, + "grad_norm": 1.2187124490737915, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6268, + "step": 6262 + }, + { + "epoch": 0.04412532911689741, + "grad_norm": 1.2187339067459106, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6261, + "step": 6293 + }, + { + "epoch": 0.04434269527018262, + "grad_norm": 1.320764422416687, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6212, + "step": 6324 + }, + { + "epoch": 0.04456006142346783, + "grad_norm": 1.3396878242492676, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.6241, + "step": 6355 + }, + { + "epoch": 0.04477742757675304, + "grad_norm": 1.2472412586212158, + "learning_rate": 4.661238099862658e-05, + "loss": 0.6241, + "step": 6386 + }, + { + "epoch": 0.04499479373003825, + "grad_norm": 1.3773880004882812, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6245, + "step": 6417 + }, + { + "epoch": 0.04521215988332346, + "grad_norm": 1.1602933406829834, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.6221, + "step": 6448 + }, + { + "epoch": 0.045429526036608665, + "grad_norm": 1.1719809770584106, + "learning_rate": 4.648464661063478e-05, + "loss": 0.629, + "step": 6479 + }, + { + "epoch": 0.04564689218989387, + "grad_norm": 1.1973191499710083, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.6239, + "step": 6510 + }, + { + "epoch": 0.04586425834317909, + "grad_norm": 1.1805142164230347, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.6242, + "step": 6541 + }, + { + "epoch": 0.046081624496464295, + "grad_norm": 1.3194178342819214, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6316, + "step": 6572 + }, + { + "epoch": 0.0462989906497495, + "grad_norm": 1.1395046710968018, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6207, + "step": 6603 + }, + { + "epoch": 0.04651635680303471, + "grad_norm": 1.5031641721725464, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6218, + "step": 6634 + }, + { + "epoch": 0.04673372295631992, + "grad_norm": 1.0985206365585327, + "learning_rate": 4.622264489304762e-05, + "loss": 0.6198, + "step": 6665 + }, + { + "epoch": 0.04695108910960513, + "grad_norm": 1.2820847034454346, + "learning_rate": 4.617813681048434e-05, + "loss": 0.626, + "step": 6696 + }, + { + "epoch": 0.04716845526289034, + "grad_norm": 1.4487061500549316, + "learning_rate": 4.61333897355256e-05, + "loss": 0.622, + "step": 6727 + }, + { + "epoch": 0.04738582141617555, + "grad_norm": 1.1577301025390625, + "learning_rate": 4.608840417313604e-05, + "loss": 0.6244, + "step": 6758 + }, + { + "epoch": 0.047603187569460755, + "grad_norm": 1.1363381147384644, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6206, + "step": 6789 + }, + { + "epoch": 0.04782055372274597, + "grad_norm": 1.22281813621521, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6231, + "step": 6820 + }, + { + "epoch": 0.048037919876031177, + "grad_norm": 1.156031847000122, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6223, + "step": 6851 + }, + { + "epoch": 0.048255286029316384, + "grad_norm": 1.1688473224639893, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6293, + "step": 6882 + }, + { + "epoch": 0.04847265218260159, + "grad_norm": 1.4265236854553223, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6109, + "step": 6913 + }, + { + "epoch": 0.0486900183358868, + "grad_norm": 1.1242969036102295, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6256, + "step": 6944 + }, + { + "epoch": 0.048907384489172014, + "grad_norm": 1.3943792581558228, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6093, + "step": 6975 + }, + { + "epoch": 0.04912475064245722, + "grad_norm": 1.0761889219284058, + "learning_rate": 4.571999560773736e-05, + "loss": 0.6207, + "step": 7006 + }, + { + "epoch": 0.04934211679574243, + "grad_norm": 1.2784191370010376, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6227, + "step": 7037 + }, + { + "epoch": 0.049559482949027636, + "grad_norm": 1.4215577840805054, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6148, + "step": 7068 + }, + { + "epoch": 0.049776849102312844, + "grad_norm": 1.1120682954788208, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.6192, + "step": 7099 + }, + { + "epoch": 0.04999421525559806, + "grad_norm": 1.1718186140060425, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6179, + "step": 7130 + }, + { + "epoch": 0.050211581408883266, + "grad_norm": 1.5078628063201904, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6245, + "step": 7161 + }, + { + "epoch": 0.05042894756216847, + "grad_norm": 1.1740144491195679, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6242, + "step": 7192 + }, + { + "epoch": 0.05064631371545368, + "grad_norm": 1.2450133562088013, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6194, + "step": 7223 + }, + { + "epoch": 0.05086367986873889, + "grad_norm": 1.181887149810791, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6188, + "step": 7254 + }, + { + "epoch": 0.0510810460220241, + "grad_norm": 1.172691822052002, + "learning_rate": 4.528766329570536e-05, + "loss": 0.6191, + "step": 7285 + }, + { + "epoch": 0.05129841217530931, + "grad_norm": 1.148863673210144, + "learning_rate": 4.523847434837447e-05, + "loss": 0.6234, + "step": 7316 + }, + { + "epoch": 0.05151577832859452, + "grad_norm": 1.2533507347106934, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6166, + "step": 7347 + }, + { + "epoch": 0.051733144481879725, + "grad_norm": 1.1888121366500854, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6176, + "step": 7378 + }, + { + "epoch": 0.05195051063516493, + "grad_norm": 1.0766541957855225, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6137, + "step": 7409 + }, + { + "epoch": 0.05216787678845015, + "grad_norm": 1.2277822494506836, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6231, + "step": 7440 + }, + { + "epoch": 0.052385242941735355, + "grad_norm": 1.0808931589126587, + "learning_rate": 4.498911497712155e-05, + "loss": 0.6209, + "step": 7471 + }, + { + "epoch": 0.05260260909502056, + "grad_norm": 1.0840133428573608, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6144, + "step": 7502 + }, + { + "epoch": 0.05281997524830577, + "grad_norm": 1.5881050825119019, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6114, + "step": 7533 + }, + { + "epoch": 0.05303734140159098, + "grad_norm": 1.1180490255355835, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.6134, + "step": 7564 + }, + { + "epoch": 0.05325470755487619, + "grad_norm": 1.1469063758850098, + "learning_rate": 4.478556380141218e-05, + "loss": 0.6122, + "step": 7595 + }, + { + "epoch": 0.0534720737081614, + "grad_norm": 1.2977004051208496, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.615, + "step": 7626 + }, + { + "epoch": 0.05368943986144661, + "grad_norm": 1.0556434392929077, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.6187, + "step": 7657 + }, + { + "epoch": 0.053906806014731815, + "grad_norm": 1.101298451423645, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6123, + "step": 7688 + }, + { + "epoch": 0.05412417216801702, + "grad_norm": 1.262608528137207, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6149, + "step": 7719 + }, + { + "epoch": 0.05434153832130224, + "grad_norm": 1.1554538011550903, + "learning_rate": 4.452610552959183e-05, + "loss": 0.6168, + "step": 7750 + }, + { + "epoch": 0.054558904474587444, + "grad_norm": 1.1653157472610474, + "learning_rate": 4.447355047201428e-05, + "loss": 0.615, + "step": 7781 + }, + { + "epoch": 0.05477627062787265, + "grad_norm": 1.0511231422424316, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.6215, + "step": 7812 + }, + { + "epoch": 0.05499363678115786, + "grad_norm": 1.3468266725540161, + "learning_rate": 4.436778168330484e-05, + "loss": 0.6221, + "step": 7843 + }, + { + "epoch": 0.05521100293444307, + "grad_norm": 1.3238797187805176, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.611, + "step": 7874 + }, + { + "epoch": 0.05542836908772828, + "grad_norm": 1.1107763051986694, + "learning_rate": 4.42611386459262e-05, + "loss": 0.6119, + "step": 7905 + }, + { + "epoch": 0.05564573524101349, + "grad_norm": 1.3667259216308594, + "learning_rate": 4.420749078676133e-05, + "loss": 0.6134, + "step": 7936 + }, + { + "epoch": 0.055863101394298696, + "grad_norm": 1.0336949825286865, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.613, + "step": 7967 + }, + { + "epoch": 0.056080467547583904, + "grad_norm": 1.0911316871643066, + "learning_rate": 4.409954541451762e-05, + "loss": 0.6107, + "step": 7998 + }, + { + "epoch": 0.05629783370086911, + "grad_norm": 1.2516382932662964, + "learning_rate": 4.404524911958764e-05, + "loss": 0.6138, + "step": 8029 + }, + { + "epoch": 0.056515199854154326, + "grad_norm": 1.1678277254104614, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6162, + "step": 8060 + }, + { + "epoch": 0.05673256600743953, + "grad_norm": 1.4072996377944946, + "learning_rate": 4.393601237573607e-05, + "loss": 0.614, + "step": 8091 + }, + { + "epoch": 0.05694993216072474, + "grad_norm": 1.2312837839126587, + "learning_rate": 4.388107315953628e-05, + "loss": 0.6203, + "step": 8122 + }, + { + "epoch": 0.05716729831400995, + "grad_norm": 1.1720649003982544, + "learning_rate": 4.382592087299212e-05, + "loss": 0.619, + "step": 8153 + }, + { + "epoch": 0.057384664467295156, + "grad_norm": 1.0711950063705444, + "learning_rate": 4.377055613848964e-05, + "loss": 0.6124, + "step": 8184 + }, + { + "epoch": 0.05760203062058037, + "grad_norm": 1.1664263010025024, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6073, + "step": 8215 + }, + { + "epoch": 0.05781939677386558, + "grad_norm": 0.9937831163406372, + "learning_rate": 4.365919182713416e-05, + "loss": 0.6101, + "step": 8246 + }, + { + "epoch": 0.058036762927150785, + "grad_norm": 1.0545841455459595, + "learning_rate": 4.360319350701226e-05, + "loss": 0.6089, + "step": 8277 + }, + { + "epoch": 0.05825412908043599, + "grad_norm": 1.120007038116455, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6078, + "step": 8308 + }, + { + "epoch": 0.0584714952337212, + "grad_norm": 1.1087831258773804, + "learning_rate": 4.349056769754021e-05, + "loss": 0.601, + "step": 8339 + }, + { + "epoch": 0.058688861387006415, + "grad_norm": 1.1370675563812256, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.6103, + "step": 8370 + }, + { + "epoch": 0.05890622754029162, + "grad_norm": 1.182654857635498, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.6115, + "step": 8401 + }, + { + "epoch": 0.05912359369357683, + "grad_norm": 1.2606432437896729, + "learning_rate": 4.332006561018488e-05, + "loss": 0.6139, + "step": 8432 + }, + { + "epoch": 0.05934095984686204, + "grad_norm": 1.1618047952651978, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.6053, + "step": 8463 + }, + { + "epoch": 0.059558326000147245, + "grad_norm": 1.1281821727752686, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.6084, + "step": 8494 + }, + { + "epoch": 0.05977569215343246, + "grad_norm": 1.15248703956604, + "learning_rate": 4.314770288177384e-05, + "loss": 0.6128, + "step": 8525 + }, + { + "epoch": 0.05999305830671767, + "grad_norm": 1.2170960903167725, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6134, + "step": 8556 + }, + { + "epoch": 0.060210424460002875, + "grad_norm": 1.085260033607483, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.6212, + "step": 8587 + }, + { + "epoch": 0.06042779061328808, + "grad_norm": 1.2243624925613403, + "learning_rate": 4.297349701798505e-05, + "loss": 0.605, + "step": 8618 + }, + { + "epoch": 0.06064515676657329, + "grad_norm": 1.124769687652588, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.6074, + "step": 8649 + }, + { + "epoch": 0.060862522919858504, + "grad_norm": 1.893545150756836, + "learning_rate": 4.285634454093198e-05, + "loss": 0.6055, + "step": 8680 + }, + { + "epoch": 0.06107988907314371, + "grad_norm": 1.2186870574951172, + "learning_rate": 4.279746571169086e-05, + "loss": 0.6121, + "step": 8711 + }, + { + "epoch": 0.06129725522642892, + "grad_norm": 1.0006957054138184, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.6079, + "step": 8742 + }, + { + "epoch": 0.06151462137971413, + "grad_norm": 1.0913995504379272, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.6107, + "step": 8773 + }, + { + "epoch": 0.061731987532999334, + "grad_norm": 1.4647611379623413, + "learning_rate": 4.261962684116106e-05, + "loss": 0.6188, + "step": 8804 + }, + { + "epoch": 0.06194935368628455, + "grad_norm": 1.0805052518844604, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.6137, + "step": 8835 + }, + { + "epoch": 0.062166719839569756, + "grad_norm": 1.0512675046920776, + "learning_rate": 4.250007230372134e-05, + "loss": 0.6052, + "step": 8866 + }, + { + "epoch": 0.062384085992854964, + "grad_norm": 1.1758863925933838, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.6148, + "step": 8897 + }, + { + "epoch": 0.06260145214614017, + "grad_norm": 1.0526351928710938, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.5973, + "step": 8928 + }, + { + "epoch": 0.06281881829942539, + "grad_norm": 1.077563762664795, + "learning_rate": 4.231926105340768e-05, + "loss": 0.6114, + "step": 8959 + }, + { + "epoch": 0.06303618445271059, + "grad_norm": 1.0116938352584839, + "learning_rate": 4.225859883654776e-05, + "loss": 0.6146, + "step": 8990 + }, + { + "epoch": 0.0632535506059958, + "grad_norm": 1.1967271566390991, + "learning_rate": 4.219774185874569e-05, + "loss": 0.5972, + "step": 9021 + }, + { + "epoch": 0.06347091675928102, + "grad_norm": 1.1610004901885986, + "learning_rate": 4.213669080676418e-05, + "loss": 0.6081, + "step": 9052 + }, + { + "epoch": 0.06368828291256622, + "grad_norm": 1.0253360271453857, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.6118, + "step": 9083 + }, + { + "epoch": 0.06390564906585143, + "grad_norm": 1.140599012374878, + "learning_rate": 4.201400923825648e-05, + "loss": 0.6091, + "step": 9114 + }, + { + "epoch": 0.06412301521913663, + "grad_norm": 1.0821545124053955, + "learning_rate": 4.195238010617511e-05, + "loss": 0.6028, + "step": 9145 + }, + { + "epoch": 0.06434038137242185, + "grad_norm": 1.2161924839019775, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6113, + "step": 9176 + }, + { + "epoch": 0.06455774752570706, + "grad_norm": 1.128986120223999, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.6125, + "step": 9207 + }, + { + "epoch": 0.06477511367899226, + "grad_norm": 1.0566622018814087, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.6055, + "step": 9238 + }, + { + "epoch": 0.06499247983227747, + "grad_norm": 1.0849272012710571, + "learning_rate": 4.170395751189495e-05, + "loss": 0.6059, + "step": 9269 + }, + { + "epoch": 0.06520984598556268, + "grad_norm": 1.2344416379928589, + "learning_rate": 4.164137885110921e-05, + "loss": 0.6081, + "step": 9300 + }, + { + "epoch": 0.06542721213884789, + "grad_norm": 1.0132557153701782, + "learning_rate": 4.157861239462495e-05, + "loss": 0.6103, + "step": 9331 + }, + { + "epoch": 0.0656445782921331, + "grad_norm": 1.0176981687545776, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.6086, + "step": 9362 + }, + { + "epoch": 0.0658619444454183, + "grad_norm": 1.2429553270339966, + "learning_rate": 4.145251892991588e-05, + "loss": 0.6091, + "step": 9393 + }, + { + "epoch": 0.06607931059870352, + "grad_norm": 1.1481480598449707, + "learning_rate": 4.138919334463868e-05, + "loss": 0.6048, + "step": 9424 + }, + { + "epoch": 0.06629667675198872, + "grad_norm": 0.9623891711235046, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.5995, + "step": 9455 + }, + { + "epoch": 0.06651404290527393, + "grad_norm": 1.2031786441802979, + "learning_rate": 4.126198804133398e-05, + "loss": 0.6008, + "step": 9486 + }, + { + "epoch": 0.06673140905855915, + "grad_norm": 1.3027530908584595, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.5993, + "step": 9517 + }, + { + "epoch": 0.06694877521184435, + "grad_norm": 1.1712346076965332, + "learning_rate": 4.113404868280107e-05, + "loss": 0.5996, + "step": 9548 + }, + { + "epoch": 0.06716614136512956, + "grad_norm": 1.1397625207901, + "learning_rate": 4.106980553625457e-05, + "loss": 0.5979, + "step": 9579 + }, + { + "epoch": 0.06738350751841476, + "grad_norm": 1.122983694076538, + "learning_rate": 4.100538104413674e-05, + "loss": 0.599, + "step": 9610 + }, + { + "epoch": 0.06760087367169998, + "grad_norm": 1.0782618522644043, + "learning_rate": 4.09407759334692e-05, + "loss": 0.6005, + "step": 9641 + }, + { + "epoch": 0.0678182398249852, + "grad_norm": 1.0563392639160156, + "learning_rate": 4.087599093331186e-05, + "loss": 0.6105, + "step": 9672 + }, + { + "epoch": 0.0680356059782704, + "grad_norm": 1.0684717893600464, + "learning_rate": 4.081102677475462e-05, + "loss": 0.6092, + "step": 9703 + }, + { + "epoch": 0.06825297213155561, + "grad_norm": 1.152811884880066, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.6035, + "step": 9734 + }, + { + "epoch": 0.06847033828484081, + "grad_norm": 1.1413112878799438, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.6018, + "step": 9765 + }, + { + "epoch": 0.06868770443812602, + "grad_norm": 0.981604814529419, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.6035, + "step": 9796 + }, + { + "epoch": 0.06890507059141124, + "grad_norm": 1.0713250637054443, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.597, + "step": 9827 + }, + { + "epoch": 0.06912243674469644, + "grad_norm": 0.949456512928009, + "learning_rate": 4.048354433517794e-05, + "loss": 0.6063, + "step": 9858 + }, + { + "epoch": 0.06933980289798165, + "grad_norm": 1.0702073574066162, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.5963, + "step": 9889 + }, + { + "epoch": 0.06955716905126685, + "grad_norm": 0.9974942803382874, + "learning_rate": 4.035132306369438e-05, + "loss": 0.611, + "step": 9920 + }, + { + "epoch": 0.06977453520455207, + "grad_norm": 1.0833642482757568, + "learning_rate": 4.028495219804555e-05, + "loss": 0.6014, + "step": 9951 + }, + { + "epoch": 0.06999190135783728, + "grad_norm": 1.181322693824768, + "learning_rate": 4.021840884378864e-05, + "loss": 0.599, + "step": 9982 + }, + { + "epoch": 0.07020926751112248, + "grad_norm": 1.0088189840316772, + "learning_rate": 4.015169375185633e-05, + "loss": 0.5955, + "step": 10013 + }, + { + "epoch": 0.0704266336644077, + "grad_norm": 1.0920222997665405, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.5985, + "step": 10044 + }, + { + "epoch": 0.0706439998176929, + "grad_norm": 1.0450975894927979, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.6069, + "step": 10075 + }, + { + "epoch": 0.07086136597097811, + "grad_norm": 1.1613439321517944, + "learning_rate": 3.995052558835377e-05, + "loss": 0.5992, + "step": 10106 + }, + { + "epoch": 0.07107873212426333, + "grad_norm": 1.2720811367034912, + "learning_rate": 3.988313109368017e-05, + "loss": 0.6012, + "step": 10137 + }, + { + "epoch": 0.07129609827754853, + "grad_norm": 1.0315334796905518, + "learning_rate": 3.981556864489504e-05, + "loss": 0.6149, + "step": 10168 + }, + { + "epoch": 0.07151346443083374, + "grad_norm": 1.3320891857147217, + "learning_rate": 3.974783900443142e-05, + "loss": 0.6013, + "step": 10199 + }, + { + "epoch": 0.07173083058411894, + "grad_norm": 1.7277195453643799, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.5983, + "step": 10230 + }, + { + "epoch": 0.07194819673740416, + "grad_norm": 1.1056753396987915, + "learning_rate": 3.961188120762596e-05, + "loss": 0.5952, + "step": 10261 + }, + { + "epoch": 0.07216556289068937, + "grad_norm": 1.3461558818817139, + "learning_rate": 3.954365458554938e-05, + "loss": 0.5975, + "step": 10292 + }, + { + "epoch": 0.07238292904397457, + "grad_norm": 1.1405422687530518, + "learning_rate": 3.947526384030751e-05, + "loss": 0.602, + "step": 10323 + }, + { + "epoch": 0.07260029519725979, + "grad_norm": 1.2912482023239136, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.6014, + "step": 10354 + }, + { + "epoch": 0.07281766135054499, + "grad_norm": 1.1548583507537842, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.6053, + "step": 10385 + }, + { + "epoch": 0.0730350275038302, + "grad_norm": 1.0558302402496338, + "learning_rate": 3.926911459260109e-05, + "loss": 0.6062, + "step": 10416 + }, + { + "epoch": 0.07325239365711542, + "grad_norm": 1.1658024787902832, + "learning_rate": 3.920007509089102e-05, + "loss": 0.6016, + "step": 10447 + }, + { + "epoch": 0.07346975981040062, + "grad_norm": 1.1093552112579346, + "learning_rate": 3.913087534326357e-05, + "loss": 0.6025, + "step": 10478 + }, + { + "epoch": 0.07368712596368583, + "grad_norm": 1.0908467769622803, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.5954, + "step": 10509 + }, + { + "epoch": 0.07390449211697103, + "grad_norm": 1.0856796503067017, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.5983, + "step": 10540 + }, + { + "epoch": 0.07412185827025625, + "grad_norm": 1.0330291986465454, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.5955, + "step": 10571 + }, + { + "epoch": 0.07433922442354146, + "grad_norm": 0.9688258171081543, + "learning_rate": 3.885248953871491e-05, + "loss": 0.6001, + "step": 10602 + }, + { + "epoch": 0.07455659057682666, + "grad_norm": 1.1808383464813232, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.5996, + "step": 10633 + }, + { + "epoch": 0.07477395673011188, + "grad_norm": 1.1267834901809692, + "learning_rate": 3.871235554965218e-05, + "loss": 0.5962, + "step": 10664 + }, + { + "epoch": 0.07499132288339708, + "grad_norm": 1.1001136302947998, + "learning_rate": 3.864205604623078e-05, + "loss": 0.6031, + "step": 10695 + }, + { + "epoch": 0.07520868903668229, + "grad_norm": 1.009621024131775, + "learning_rate": 3.857160259406107e-05, + "loss": 0.5976, + "step": 10726 + }, + { + "epoch": 0.0754260551899675, + "grad_norm": 0.933811366558075, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.604, + "step": 10757 + }, + { + "epoch": 0.0756434213432527, + "grad_norm": 1.2187680006027222, + "learning_rate": 3.843023702543556e-05, + "loss": 0.5983, + "step": 10788 + }, + { + "epoch": 0.07586078749653792, + "grad_norm": 1.0286976099014282, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.596, + "step": 10819 + }, + { + "epoch": 0.07607815364982312, + "grad_norm": 1.0752719640731812, + "learning_rate": 3.828826522492255e-05, + "loss": 0.596, + "step": 10850 + }, + { + "epoch": 0.07629551980310834, + "grad_norm": 1.1018482446670532, + "learning_rate": 3.821705398930713e-05, + "loss": 0.603, + "step": 10881 + }, + { + "epoch": 0.07651288595639355, + "grad_norm": 0.980965793132782, + "learning_rate": 3.814569360103385e-05, + "loss": 0.6021, + "step": 10912 + }, + { + "epoch": 0.07673025210967875, + "grad_norm": 0.9588106870651245, + "learning_rate": 3.807418486539499e-05, + "loss": 0.6017, + "step": 10943 + }, + { + "epoch": 0.07694761826296397, + "grad_norm": 1.2048044204711914, + "learning_rate": 3.80025285893569e-05, + "loss": 0.6049, + "step": 10974 + }, + { + "epoch": 0.07716498441624917, + "grad_norm": 1.0373165607452393, + "learning_rate": 3.793072558155093e-05, + "loss": 0.5914, + "step": 11005 + }, + { + "epoch": 0.07738235056953438, + "grad_norm": 1.0112730264663696, + "learning_rate": 3.785877665226426e-05, + "loss": 0.5949, + "step": 11036 + }, + { + "epoch": 0.0775997167228196, + "grad_norm": 1.0219500064849854, + "learning_rate": 3.778668261343079e-05, + "loss": 0.5918, + "step": 11067 + }, + { + "epoch": 0.0778170828761048, + "grad_norm": 1.1104828119277954, + "learning_rate": 3.771444427862192e-05, + "loss": 0.6029, + "step": 11098 + }, + { + "epoch": 0.07803444902939001, + "grad_norm": 0.9852742552757263, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.5839, + "step": 11129 + }, + { + "epoch": 0.07825181518267521, + "grad_norm": 1.0593205690383911, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.5966, + "step": 11160 + }, + { + "epoch": 0.07846918133596043, + "grad_norm": 1.2858383655548096, + "learning_rate": 3.749687165842753e-05, + "loss": 0.5939, + "step": 11191 + }, + { + "epoch": 0.07868654748924564, + "grad_norm": 1.2006022930145264, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.5965, + "step": 11222 + }, + { + "epoch": 0.07890391364253084, + "grad_norm": 0.9741569757461548, + "learning_rate": 3.735111675341645e-05, + "loss": 0.5971, + "step": 11253 + }, + { + "epoch": 0.07912127979581605, + "grad_norm": 1.0724074840545654, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.5901, + "step": 11284 + }, + { + "epoch": 0.07933864594910126, + "grad_norm": 1.102207064628601, + "learning_rate": 3.720480432728287e-05, + "loss": 0.602, + "step": 11315 + }, + { + "epoch": 0.07955601210238647, + "grad_norm": 1.0976966619491577, + "learning_rate": 3.71314411067092e-05, + "loss": 0.6002, + "step": 11346 + }, + { + "epoch": 0.07977337825567168, + "grad_norm": 1.0534948110580444, + "learning_rate": 3.70579409844715e-05, + "loss": 0.5801, + "step": 11377 + }, + { + "epoch": 0.07999074440895688, + "grad_norm": 1.1666125059127808, + "learning_rate": 3.698430479000865e-05, + "loss": 0.5992, + "step": 11408 + }, + { + "epoch": 0.0802081105622421, + "grad_norm": 1.0316112041473389, + "learning_rate": 3.691053335429509e-05, + "loss": 0.5942, + "step": 11439 + }, + { + "epoch": 0.0804254767155273, + "grad_norm": 1.0085538625717163, + "learning_rate": 3.683662750983147e-05, + "loss": 0.5978, + "step": 11470 + }, + { + "epoch": 0.08064284286881251, + "grad_norm": 1.0875978469848633, + "learning_rate": 3.676258809063518e-05, + "loss": 0.5949, + "step": 11501 + }, + { + "epoch": 0.08086020902209773, + "grad_norm": 1.1905012130737305, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.6018, + "step": 11532 + }, + { + "epoch": 0.08107757517538293, + "grad_norm": 1.1717733144760132, + "learning_rate": 3.661411187164166e-05, + "loss": 0.5986, + "step": 11563 + }, + { + "epoch": 0.08129494132866814, + "grad_norm": 1.053768515586853, + "learning_rate": 3.65396767473784e-05, + "loss": 0.5831, + "step": 11594 + }, + { + "epoch": 0.08151230748195334, + "grad_norm": 1.095425009727478, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.5967, + "step": 11625 + }, + { + "epoch": 0.08172967363523856, + "grad_norm": 1.005953311920166, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.5834, + "step": 11656 + }, + { + "epoch": 0.08194703978852377, + "grad_norm": 1.0991450548171997, + "learning_rate": 3.63155933997859e-05, + "loss": 0.5951, + "step": 11687 + }, + { + "epoch": 0.08216440594180897, + "grad_norm": 1.1069071292877197, + "learning_rate": 3.624064243537758e-05, + "loss": 0.5989, + "step": 11718 + }, + { + "epoch": 0.08238177209509419, + "grad_norm": 1.099542498588562, + "learning_rate": 3.616556462184716e-05, + "loss": 0.5949, + "step": 11749 + }, + { + "epoch": 0.08259913824837939, + "grad_norm": 1.0950508117675781, + "learning_rate": 3.609036080643755e-05, + "loss": 0.5911, + "step": 11780 + }, + { + "epoch": 0.0828165044016646, + "grad_norm": 0.9489970803260803, + "learning_rate": 3.60150318378136e-05, + "loss": 0.5922, + "step": 11811 + }, + { + "epoch": 0.08303387055494982, + "grad_norm": 0.9877674579620361, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.5927, + "step": 11842 + }, + { + "epoch": 0.08325123670823502, + "grad_norm": 1.060177206993103, + "learning_rate": 3.586400184263408e-05, + "loss": 0.5899, + "step": 11873 + }, + { + "epoch": 0.08346860286152023, + "grad_norm": 1.0531491041183472, + "learning_rate": 3.578830252043148e-05, + "loss": 0.6004, + "step": 11904 + }, + { + "epoch": 0.08368596901480543, + "grad_norm": 0.9791742563247681, + "learning_rate": 3.571248145370125e-05, + "loss": 0.5926, + "step": 11935 + }, + { + "epoch": 0.08390333516809065, + "grad_norm": 1.0492030382156372, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.5914, + "step": 11966 + }, + { + "epoch": 0.08412070132137586, + "grad_norm": 1.2035881280899048, + "learning_rate": 3.556047751054378e-05, + "loss": 0.589, + "step": 11997 + }, + { + "epoch": 0.08433806747466106, + "grad_norm": 1.0384137630462646, + "learning_rate": 3.548429634946039e-05, + "loss": 0.5894, + "step": 12028 + }, + { + "epoch": 0.08455543362794628, + "grad_norm": 1.1555323600769043, + "learning_rate": 3.540799687451768e-05, + "loss": 0.5899, + "step": 12059 + }, + { + "epoch": 0.08477279978123148, + "grad_norm": 1.2114530801773071, + "learning_rate": 3.533157994674485e-05, + "loss": 0.5888, + "step": 12090 + }, + { + "epoch": 0.08499016593451669, + "grad_norm": 1.145213007926941, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.5887, + "step": 12121 + }, + { + "epoch": 0.08520753208780191, + "grad_norm": 1.0313454866409302, + "learning_rate": 3.517839718344311e-05, + "loss": 0.598, + "step": 12152 + }, + { + "epoch": 0.08542489824108711, + "grad_norm": 1.018702745437622, + "learning_rate": 3.510163307656086e-05, + "loss": 0.5931, + "step": 12183 + }, + { + "epoch": 0.08564226439437232, + "grad_norm": 1.0365880727767944, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.5954, + "step": 12214 + }, + { + "epoch": 0.08585963054765752, + "grad_norm": 1.0658811330795288, + "learning_rate": 3.494776374368643e-05, + "loss": 0.5953, + "step": 12245 + }, + { + "epoch": 0.08607699670094274, + "grad_norm": 1.046661615371704, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.5786, + "step": 12276 + }, + { + "epoch": 0.08629436285422795, + "grad_norm": 1.0220295190811157, + "learning_rate": 3.479344537543164e-05, + "loss": 0.5879, + "step": 12307 + }, + { + "epoch": 0.08651172900751315, + "grad_norm": 1.0286898612976074, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.5882, + "step": 12338 + }, + { + "epoch": 0.08672909516079837, + "grad_norm": 1.060483694076538, + "learning_rate": 3.463868493762412e-05, + "loss": 0.5887, + "step": 12369 + }, + { + "epoch": 0.08694646131408357, + "grad_norm": 1.264788269996643, + "learning_rate": 3.456114112492418e-05, + "loss": 0.5895, + "step": 12400 + }, + { + "epoch": 0.08716382746736878, + "grad_norm": 0.9751125574111938, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.5846, + "step": 12431 + }, + { + "epoch": 0.087381193620654, + "grad_norm": 1.1198588609695435, + "learning_rate": 3.440573068727905e-05, + "loss": 0.5878, + "step": 12462 + }, + { + "epoch": 0.0875985597739392, + "grad_norm": 1.0425771474838257, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.5903, + "step": 12493 + }, + { + "epoch": 0.08781592592722441, + "grad_norm": 0.9820422530174255, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.5885, + "step": 12524 + }, + { + "epoch": 0.08803329208050961, + "grad_norm": 1.0234076976776123, + "learning_rate": 3.417182116258899e-05, + "loss": 0.5967, + "step": 12555 + }, + { + "epoch": 0.08825065823379483, + "grad_norm": 0.9766228795051575, + "learning_rate": 3.409364314116074e-05, + "loss": 0.5923, + "step": 12586 + }, + { + "epoch": 0.08846802438708004, + "grad_norm": 0.9033060073852539, + "learning_rate": 3.401536249920559e-05, + "loss": 0.5892, + "step": 12617 + }, + { + "epoch": 0.08868539054036524, + "grad_norm": 1.1264934539794922, + "learning_rate": 3.393698012010998e-05, + "loss": 0.5917, + "step": 12648 + }, + { + "epoch": 0.08890275669365046, + "grad_norm": 1.0052556991577148, + "learning_rate": 3.385849688840839e-05, + "loss": 0.5885, + "step": 12679 + }, + { + "epoch": 0.08912012284693566, + "grad_norm": 1.0888159275054932, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.5885, + "step": 12710 + }, + { + "epoch": 0.08933748900022087, + "grad_norm": 1.0458447933197021, + "learning_rate": 3.370123141100578e-05, + "loss": 0.5923, + "step": 12741 + }, + { + "epoch": 0.08955485515350609, + "grad_norm": 1.055434226989746, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.5889, + "step": 12772 + }, + { + "epoch": 0.08977222130679129, + "grad_norm": 1.000533103942871, + "learning_rate": 3.35435731658559e-05, + "loss": 0.5833, + "step": 12803 + }, + { + "epoch": 0.0899895874600765, + "grad_norm": 1.0112768411636353, + "learning_rate": 3.346459897862552e-05, + "loss": 0.5886, + "step": 12834 + }, + { + "epoch": 0.0902069536133617, + "grad_norm": 0.9883559942245483, + "learning_rate": 3.338552926954613e-05, + "loss": 0.5886, + "step": 12865 + }, + { + "epoch": 0.09042431976664692, + "grad_norm": 1.0933095216751099, + "learning_rate": 3.330636493090868e-05, + "loss": 0.5873, + "step": 12896 + }, + { + "epoch": 0.09064168591993213, + "grad_norm": 1.1089582443237305, + "learning_rate": 3.322710685607193e-05, + "loss": 0.5845, + "step": 12927 + }, + { + "epoch": 0.09085905207321733, + "grad_norm": 1.0311237573623657, + "learning_rate": 3.314775593945251e-05, + "loss": 0.588, + "step": 12958 + }, + { + "epoch": 0.09107641822650255, + "grad_norm": 1.0495847463607788, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.5839, + "step": 12989 + }, + { + "epoch": 0.09129378437978775, + "grad_norm": 1.046921730041504, + "learning_rate": 3.298877916376047e-05, + "loss": 0.5877, + "step": 13020 + }, + { + "epoch": 0.09151115053307296, + "grad_norm": 0.9927343130111694, + "learning_rate": 3.290915509871915e-05, + "loss": 0.5898, + "step": 13051 + }, + { + "epoch": 0.09172851668635817, + "grad_norm": 0.9319870471954346, + "learning_rate": 3.282944177993753e-05, + "loss": 0.5886, + "step": 13082 + }, + { + "epoch": 0.09194588283964338, + "grad_norm": 1.0408544540405273, + "learning_rate": 3.274964010696957e-05, + "loss": 0.5946, + "step": 13113 + }, + { + "epoch": 0.09216324899292859, + "grad_norm": 1.0984251499176025, + "learning_rate": 3.266975098036629e-05, + "loss": 0.5813, + "step": 13144 + }, + { + "epoch": 0.09238061514621379, + "grad_norm": 1.0376449823379517, + "learning_rate": 3.258977530166562e-05, + "loss": 0.5855, + "step": 13175 + }, + { + "epoch": 0.092597981299499, + "grad_norm": 0.9627104997634888, + "learning_rate": 3.250971397338227e-05, + "loss": 0.5846, + "step": 13206 + }, + { + "epoch": 0.09281534745278422, + "grad_norm": 1.2040653228759766, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.5872, + "step": 13237 + }, + { + "epoch": 0.09303271360606942, + "grad_norm": 1.0091586112976074, + "learning_rate": 3.234933798294859e-05, + "loss": 0.5911, + "step": 13268 + }, + { + "epoch": 0.09325007975935463, + "grad_norm": 1.0941787958145142, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.5844, + "step": 13299 + }, + { + "epoch": 0.09346744591263983, + "grad_norm": 0.9720754027366638, + "learning_rate": 3.218863024832985e-05, + "loss": 0.5898, + "step": 13330 + }, + { + "epoch": 0.09368481206592505, + "grad_norm": 0.8847430944442749, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.5804, + "step": 13361 + }, + { + "epoch": 0.09390217821921026, + "grad_norm": 1.0070865154266357, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.5916, + "step": 13392 + }, + { + "epoch": 0.09411954437249546, + "grad_norm": 0.9538979530334473, + "learning_rate": 3.194696249871729e-05, + "loss": 0.5984, + "step": 13423 + }, + { + "epoch": 0.09433691052578068, + "grad_norm": 1.0414469242095947, + "learning_rate": 3.186624857814164e-05, + "loss": 0.5828, + "step": 13454 + }, + { + "epoch": 0.0945542766790659, + "grad_norm": 1.0353257656097412, + "learning_rate": 3.178545717288401e-05, + "loss": 0.5835, + "step": 13485 + }, + { + "epoch": 0.0947716428323511, + "grad_norm": 1.0454679727554321, + "learning_rate": 3.170458919466444e-05, + "loss": 0.5909, + "step": 13516 + }, + { + "epoch": 0.09498900898563631, + "grad_norm": 1.246871829032898, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.5829, + "step": 13547 + }, + { + "epoch": 0.09520637513892151, + "grad_norm": 1.0331645011901855, + "learning_rate": 3.154262717052985e-05, + "loss": 0.5919, + "step": 13578 + }, + { + "epoch": 0.09542374129220672, + "grad_norm": 0.8984120488166809, + "learning_rate": 3.146153495233426e-05, + "loss": 0.5853, + "step": 13609 + }, + { + "epoch": 0.09564110744549194, + "grad_norm": 1.0582929849624634, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.5828, + "step": 13640 + }, + { + "epoch": 0.09585847359877714, + "grad_norm": 1.151041865348816, + "learning_rate": 3.129913267924946e-05, + "loss": 0.582, + "step": 13671 + }, + { + "epoch": 0.09607583975206235, + "grad_norm": 0.9678122401237488, + "learning_rate": 3.121782445704782e-05, + "loss": 0.5865, + "step": 13702 + }, + { + "epoch": 0.09629320590534755, + "grad_norm": 1.033832311630249, + "learning_rate": 3.11364460675423e-05, + "loss": 0.5797, + "step": 13733 + }, + { + "epoch": 0.09651057205863277, + "grad_norm": 0.8917058706283569, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.5829, + "step": 13764 + }, + { + "epoch": 0.09672793821191798, + "grad_norm": 0.9396641850471497, + "learning_rate": 3.097348246077728e-05, + "loss": 0.5825, + "step": 13795 + }, + { + "epoch": 0.09694530436520318, + "grad_norm": 1.0871998071670532, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.5851, + "step": 13826 + }, + { + "epoch": 0.0971626705184884, + "grad_norm": 1.1403570175170898, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.5807, + "step": 13857 + }, + { + "epoch": 0.0973800366717736, + "grad_norm": 1.0246673822402954, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.5795, + "step": 13888 + }, + { + "epoch": 0.09759740282505881, + "grad_norm": 0.9431213140487671, + "learning_rate": 3.064675369851637e-05, + "loss": 0.5825, + "step": 13919 + }, + { + "epoch": 0.09781476897834403, + "grad_norm": 1.1028188467025757, + "learning_rate": 3.056490989455289e-05, + "loss": 0.5885, + "step": 13950 + }, + { + "epoch": 0.09803213513162923, + "grad_norm": 1.125482201576233, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.5849, + "step": 13981 + }, + { + "epoch": 0.09824950128491444, + "grad_norm": 0.9831985831260681, + "learning_rate": 3.040103481317539e-05, + "loss": 0.586, + "step": 14012 + }, + { + "epoch": 0.09846686743819964, + "grad_norm": 1.0038244724273682, + "learning_rate": 3.03190053850694e-05, + "loss": 0.5894, + "step": 14043 + }, + { + "epoch": 0.09868423359148486, + "grad_norm": 0.8766770362854004, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.5773, + "step": 14074 + }, + { + "epoch": 0.09890159974477007, + "grad_norm": 0.9358529448509216, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.5887, + "step": 14105 + }, + { + "epoch": 0.09911896589805527, + "grad_norm": 0.8988749384880066, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.5815, + "step": 14136 + }, + { + "epoch": 0.09933633205134049, + "grad_norm": 1.08226478099823, + "learning_rate": 2.999029669712431e-05, + "loss": 0.5944, + "step": 14167 + }, + { + "epoch": 0.09955369820462569, + "grad_norm": 1.046980619430542, + "learning_rate": 2.990797641805408e-05, + "loss": 0.5824, + "step": 14198 + }, + { + "epoch": 0.0997710643579109, + "grad_norm": 0.96205073595047, + "learning_rate": 2.982560075313704e-05, + "loss": 0.5847, + "step": 14229 + }, + { + "epoch": 0.09998843051119612, + "grad_norm": 0.9914370775222778, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.5824, + "step": 14260 + }, + { + "epoch": 0.10020579666448132, + "grad_norm": 0.922265887260437, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.5907, + "step": 14291 + }, + { + "epoch": 0.10042316281776653, + "grad_norm": 0.971159815788269, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.5806, + "step": 14322 + }, + { + "epoch": 0.10064052897105173, + "grad_norm": 1.1327799558639526, + "learning_rate": 2.949556283611942e-05, + "loss": 0.5812, + "step": 14353 + }, + { + "epoch": 0.10085789512433695, + "grad_norm": 1.1632689237594604, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.5813, + "step": 14384 + }, + { + "epoch": 0.10107526127762216, + "grad_norm": 0.9936057329177856, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.5803, + "step": 14415 + }, + { + "epoch": 0.10129262743090736, + "grad_norm": 1.1123740673065186, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.5767, + "step": 14446 + }, + { + "epoch": 0.10150999358419258, + "grad_norm": 0.908643364906311, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.5797, + "step": 14477 + }, + { + "epoch": 0.10172735973747778, + "grad_norm": 0.8761305809020996, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.589, + "step": 14508 + }, + { + "epoch": 0.10194472589076299, + "grad_norm": 1.0904477834701538, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.5796, + "step": 14539 + }, + { + "epoch": 0.1021620920440482, + "grad_norm": 1.060160756111145, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.5837, + "step": 14570 + }, + { + "epoch": 0.1023794581973334, + "grad_norm": 1.0514498949050903, + "learning_rate": 2.883311164593017e-05, + "loss": 0.5792, + "step": 14601 + }, + { + "epoch": 0.10259682435061862, + "grad_norm": 1.0152157545089722, + "learning_rate": 2.875010077160754e-05, + "loss": 0.574, + "step": 14632 + }, + { + "epoch": 0.10281419050390382, + "grad_norm": 0.974791944026947, + "learning_rate": 2.866704757790741e-05, + "loss": 0.5819, + "step": 14663 + }, + { + "epoch": 0.10303155665718904, + "grad_norm": 0.9271855354309082, + "learning_rate": 2.858395300207376e-05, + "loss": 0.5798, + "step": 14694 + }, + { + "epoch": 0.10324892281047425, + "grad_norm": 0.9933458566665649, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.5796, + "step": 14725 + }, + { + "epoch": 0.10346628896375945, + "grad_norm": 0.9245687127113342, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.5771, + "step": 14756 + }, + { + "epoch": 0.10368365511704467, + "grad_norm": 0.9483000040054321, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.5825, + "step": 14787 + }, + { + "epoch": 0.10390102127032987, + "grad_norm": 0.9054533839225769, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.5817, + "step": 14818 + }, + { + "epoch": 0.10411838742361508, + "grad_norm": 1.0200084447860718, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.5788, + "step": 14849 + }, + { + "epoch": 0.1043357535769003, + "grad_norm": 0.9364084005355835, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.5796, + "step": 14880 + }, + { + "epoch": 0.1045531197301855, + "grad_norm": 0.9658653736114502, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.5825, + "step": 14911 + }, + { + "epoch": 0.10477048588347071, + "grad_norm": 0.8612831234931946, + "learning_rate": 2.791781925709473e-05, + "loss": 0.5842, + "step": 14942 + }, + { + "epoch": 0.10498785203675591, + "grad_norm": 0.9673991799354553, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.5776, + "step": 14973 + }, + { + "epoch": 0.10520521819004112, + "grad_norm": 1.092144250869751, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.5785, + "step": 15004 + }, + { + "epoch": 0.10542258434332634, + "grad_norm": 1.0162559747695923, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.5821, + "step": 15035 + }, + { + "epoch": 0.10563995049661154, + "grad_norm": 0.9578939080238342, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.5781, + "step": 15066 + }, + { + "epoch": 0.10585731664989675, + "grad_norm": 0.9934547543525696, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.5826, + "step": 15097 + }, + { + "epoch": 0.10607468280318195, + "grad_norm": 0.9542795419692993, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.583, + "step": 15128 + }, + { + "epoch": 0.10629204895646717, + "grad_norm": 0.9436337947845459, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.5832, + "step": 15159 + }, + { + "epoch": 0.10650941510975238, + "grad_norm": 0.9200606942176819, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.5766, + "step": 15190 + }, + { + "epoch": 0.10672678126303758, + "grad_norm": 1.0009740591049194, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.5835, + "step": 15221 + }, + { + "epoch": 0.1069441474163228, + "grad_norm": 0.9597145915031433, + "learning_rate": 2.708224532974953e-05, + "loss": 0.5728, + "step": 15252 + }, + { + "epoch": 0.107161513569608, + "grad_norm": 1.248867154121399, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.5822, + "step": 15283 + }, + { + "epoch": 0.10737887972289321, + "grad_norm": 0.8899833559989929, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.5843, + "step": 15314 + }, + { + "epoch": 0.10759624587617843, + "grad_norm": 1.0085718631744385, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.5811, + "step": 15345 + }, + { + "epoch": 0.10781361202946363, + "grad_norm": 0.9277573227882385, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.5774, + "step": 15376 + }, + { + "epoch": 0.10803097818274884, + "grad_norm": 1.199010968208313, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.5735, + "step": 15407 + }, + { + "epoch": 0.10824834433603404, + "grad_norm": 0.9361419081687927, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.5834, + "step": 15438 + }, + { + "epoch": 0.10846571048931926, + "grad_norm": 1.05440092086792, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.5721, + "step": 15469 + }, + { + "epoch": 0.10868307664260447, + "grad_norm": 1.0973948240280151, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.5838, + "step": 15500 + }, + { + "epoch": 0.10890044279588967, + "grad_norm": 0.9417588710784912, + "learning_rate": 2.632819298478939e-05, + "loss": 0.5786, + "step": 15531 + }, + { + "epoch": 0.10911780894917489, + "grad_norm": 0.8763983845710754, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.5766, + "step": 15562 + }, + { + "epoch": 0.10933517510246009, + "grad_norm": 1.0105509757995605, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.5896, + "step": 15593 + }, + { + "epoch": 0.1095525412557453, + "grad_norm": 0.9172413349151611, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.5788, + "step": 15624 + }, + { + "epoch": 0.10976990740903052, + "grad_norm": 0.9640869498252869, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.5791, + "step": 15655 + }, + { + "epoch": 0.10998727356231572, + "grad_norm": 1.0987275838851929, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.581, + "step": 15686 + }, + { + "epoch": 0.11020463971560093, + "grad_norm": 1.0418893098831177, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.569, + "step": 15717 + }, + { + "epoch": 0.11042200586888613, + "grad_norm": 1.0216400623321533, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.5834, + "step": 15748 + }, + { + "epoch": 0.11063937202217135, + "grad_norm": 1.0211747884750366, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.5733, + "step": 15779 + }, + { + "epoch": 0.11085673817545656, + "grad_norm": 0.9743130207061768, + "learning_rate": 2.557292666450159e-05, + "loss": 0.5789, + "step": 15810 + }, + { + "epoch": 0.11107410432874176, + "grad_norm": 1.1765626668930054, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.57, + "step": 15841 + }, + { + "epoch": 0.11129147048202698, + "grad_norm": 0.9354963898658752, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.5894, + "step": 15872 + }, + { + "epoch": 0.11150883663531218, + "grad_norm": 0.8743797540664673, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.5716, + "step": 15903 + }, + { + "epoch": 0.11172620278859739, + "grad_norm": 1.1076644659042358, + "learning_rate": 2.52370435981567e-05, + "loss": 0.5787, + "step": 15934 + }, + { + "epoch": 0.1119435689418826, + "grad_norm": 0.9156807065010071, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.5764, + "step": 15965 + }, + { + "epoch": 0.11216093509516781, + "grad_norm": 1.0239089727401733, + "learning_rate": 2.506908222441045e-05, + "loss": 0.5748, + "step": 15996 + }, + { + "epoch": 0.11237830124845302, + "grad_norm": 1.5095417499542236, + "learning_rate": 2.498509989430187e-05, + "loss": 0.5815, + "step": 16027 + }, + { + "epoch": 0.11259566740173822, + "grad_norm": 0.9298838973045349, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.5742, + "step": 16058 + }, + { + "epoch": 0.11281303355502344, + "grad_norm": 0.8956617712974548, + "learning_rate": 2.481713668624899e-05, + "loss": 0.5856, + "step": 16089 + }, + { + "epoch": 0.11303039970830865, + "grad_norm": 0.950095534324646, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.5786, + "step": 16120 + }, + { + "epoch": 0.11324776586159385, + "grad_norm": 1.0230988264083862, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.5771, + "step": 16151 + }, + { + "epoch": 0.11346513201487907, + "grad_norm": 1.1018470525741577, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.57, + "step": 16182 + }, + { + "epoch": 0.11368249816816427, + "grad_norm": 0.9700168371200562, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.5689, + "step": 16213 + }, + { + "epoch": 0.11389986432144948, + "grad_norm": 0.9069929718971252, + "learning_rate": 2.439728136286796e-05, + "loss": 0.5719, + "step": 16244 + }, + { + "epoch": 0.1141172304747347, + "grad_norm": 0.9254815578460693, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.5704, + "step": 16275 + }, + { + "epoch": 0.1143345966280199, + "grad_norm": 0.9150753021240234, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.5754, + "step": 16306 + }, + { + "epoch": 0.11455196278130511, + "grad_norm": 1.003201961517334, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.5746, + "step": 16337 + }, + { + "epoch": 0.11476932893459031, + "grad_norm": 1.1016685962677002, + "learning_rate": 2.406151384602059e-05, + "loss": 0.5763, + "step": 16368 + }, + { + "epoch": 0.11498669508787553, + "grad_norm": 1.0079994201660156, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.5721, + "step": 16399 + }, + { + "epoch": 0.11520406124116074, + "grad_norm": 0.989470899105072, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.5795, + "step": 16430 + }, + { + "epoch": 0.11542142739444594, + "grad_norm": 1.039035439491272, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.5737, + "step": 16461 + }, + { + "epoch": 0.11563879354773116, + "grad_norm": 0.8659546971321106, + "learning_rate": 2.372591577780202e-05, + "loss": 0.5711, + "step": 16492 + }, + { + "epoch": 0.11585615970101636, + "grad_norm": 0.9558688998222351, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.5763, + "step": 16523 + }, + { + "epoch": 0.11607352585430157, + "grad_norm": 1.0017194747924805, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.5772, + "step": 16554 + }, + { + "epoch": 0.11629089200758679, + "grad_norm": 1.0045812129974365, + "learning_rate": 2.347436487983929e-05, + "loss": 0.5744, + "step": 16585 + }, + { + "epoch": 0.11650825816087199, + "grad_norm": 0.8719637393951416, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.5681, + "step": 16616 + }, + { + "epoch": 0.1167256243141572, + "grad_norm": 0.9029743075370789, + "learning_rate": 2.330674878704035e-05, + "loss": 0.5807, + "step": 16647 + }, + { + "epoch": 0.1169429904674424, + "grad_norm": 0.9439691305160522, + "learning_rate": 2.322296892997561e-05, + "loss": 0.5751, + "step": 16678 + }, + { + "epoch": 0.11716035662072762, + "grad_norm": 0.900688648223877, + "learning_rate": 2.313920912646497e-05, + "loss": 0.5772, + "step": 16709 + }, + { + "epoch": 0.11737772277401283, + "grad_norm": 0.8884438872337341, + "learning_rate": 2.305547032172643e-05, + "loss": 0.5758, + "step": 16740 + }, + { + "epoch": 0.11759508892729803, + "grad_norm": 0.9252585172653198, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.5705, + "step": 16771 + }, + { + "epoch": 0.11781245508058324, + "grad_norm": 0.9447957873344421, + "learning_rate": 2.288805948824212e-05, + "loss": 0.566, + "step": 16802 + }, + { + "epoch": 0.11802982123386845, + "grad_norm": 0.9666566252708435, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.5745, + "step": 16833 + }, + { + "epoch": 0.11824718738715366, + "grad_norm": 0.9459251761436462, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.5775, + "step": 16864 + }, + { + "epoch": 0.11846455354043887, + "grad_norm": 0.8863123059272766, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.5716, + "step": 16895 + }, + { + "epoch": 0.11868191969372408, + "grad_norm": 0.9847676753997803, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.5763, + "step": 16926 + }, + { + "epoch": 0.11889928584700929, + "grad_norm": 1.1111658811569214, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.5752, + "step": 16957 + }, + { + "epoch": 0.11911665200029449, + "grad_norm": 1.0046541690826416, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.574, + "step": 16988 + }, + { + "epoch": 0.1193340181535797, + "grad_norm": 0.8580814599990845, + "learning_rate": 2.230292185905114e-05, + "loss": 0.572, + "step": 17019 + }, + { + "epoch": 0.11955138430686492, + "grad_norm": 0.9188304543495178, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.571, + "step": 17050 + }, + { + "epoch": 0.11976875046015012, + "grad_norm": 0.9079185724258423, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.5792, + "step": 17081 + }, + { + "epoch": 0.11998611661343533, + "grad_norm": 0.9194979071617126, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.5753, + "step": 17112 + }, + { + "epoch": 0.12020348276672053, + "grad_norm": 0.8398452997207642, + "learning_rate": 2.196920634473666e-05, + "loss": 0.5653, + "step": 17143 + }, + { + "epoch": 0.12042084892000575, + "grad_norm": 0.9888772368431091, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.5747, + "step": 17174 + }, + { + "epoch": 0.12063821507329096, + "grad_norm": 0.9137700796127319, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.5728, + "step": 17205 + }, + { + "epoch": 0.12085558122657616, + "grad_norm": 1.058064579963684, + "learning_rate": 2.171927553439363e-05, + "loss": 0.5717, + "step": 17236 + }, + { + "epoch": 0.12107294737986138, + "grad_norm": 0.9835705757141113, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.5725, + "step": 17267 + }, + { + "epoch": 0.12129031353314658, + "grad_norm": 0.918863832950592, + "learning_rate": 2.155283853988844e-05, + "loss": 0.5676, + "step": 17298 + }, + { + "epoch": 0.1215076796864318, + "grad_norm": 0.9384900331497192, + "learning_rate": 2.146967792431106e-05, + "loss": 0.5768, + "step": 17329 + }, + { + "epoch": 0.12172504583971701, + "grad_norm": 1.060088038444519, + "learning_rate": 2.138655714793849e-05, + "loss": 0.5642, + "step": 17360 + }, + { + "epoch": 0.12194241199300221, + "grad_norm": 0.999266505241394, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.5669, + "step": 17391 + }, + { + "epoch": 0.12215977814628742, + "grad_norm": 0.8633130788803101, + "learning_rate": 2.122043886437421e-05, + "loss": 0.5661, + "step": 17422 + }, + { + "epoch": 0.12237714429957262, + "grad_norm": 0.9396159052848816, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.5743, + "step": 17453 + }, + { + "epoch": 0.12259451045285784, + "grad_norm": 0.9990928173065186, + "learning_rate": 2.105449118766347e-05, + "loss": 0.5707, + "step": 17484 + }, + { + "epoch": 0.12281187660614305, + "grad_norm": 0.9732767939567566, + "learning_rate": 2.097158366805287e-05, + "loss": 0.571, + "step": 17515 + }, + { + "epoch": 0.12302924275942825, + "grad_norm": 20.362672805786133, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.5586, + "step": 17546 + }, + { + "epoch": 0.12324660891271347, + "grad_norm": 0.976889431476593, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.5693, + "step": 17577 + }, + { + "epoch": 0.12346397506599867, + "grad_norm": 0.907172679901123, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.5714, + "step": 17608 + }, + { + "epoch": 0.12368134121928388, + "grad_norm": 0.8816654086112976, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.5669, + "step": 17639 + }, + { + "epoch": 0.1238987073725691, + "grad_norm": 0.9616197943687439, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.5739, + "step": 17670 + }, + { + "epoch": 0.1241160735258543, + "grad_norm": 0.9188937544822693, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.5689, + "step": 17701 + }, + { + "epoch": 0.12433343967913951, + "grad_norm": 0.9845620393753052, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.5716, + "step": 17732 + }, + { + "epoch": 0.12455080583242471, + "grad_norm": 0.8922098278999329, + "learning_rate": 2.031003855589343e-05, + "loss": 0.5648, + "step": 17763 + }, + { + "epoch": 0.12476817198570993, + "grad_norm": 0.9715010523796082, + "learning_rate": 2.022757379528727e-05, + "loss": 0.5664, + "step": 17794 + }, + { + "epoch": 0.12498553813899514, + "grad_norm": 1.0769156217575073, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.5689, + "step": 17825 + }, + { + "epoch": 0.12520290429228034, + "grad_norm": 0.9304386973381042, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.5772, + "step": 17856 + }, + { + "epoch": 0.12542027044556556, + "grad_norm": 1.0523558855056763, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.5686, + "step": 17887 + }, + { + "epoch": 0.12563763659885077, + "grad_norm": 1.029064655303955, + "learning_rate": 1.989826261153015e-05, + "loss": 0.5629, + "step": 17918 + }, + { + "epoch": 0.12585500275213599, + "grad_norm": 1.0367600917816162, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.5682, + "step": 17949 + }, + { + "epoch": 0.12607236890542117, + "grad_norm": 1.047844648361206, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.571, + "step": 17980 + }, + { + "epoch": 0.1262897350587064, + "grad_norm": 0.9374393820762634, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.5731, + "step": 18011 + }, + { + "epoch": 0.1265071012119916, + "grad_norm": 1.0163381099700928, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.5769, + "step": 18042 + }, + { + "epoch": 0.12672446736527682, + "grad_norm": 0.9243590235710144, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.5697, + "step": 18073 + }, + { + "epoch": 0.12694183351856203, + "grad_norm": 1.0359089374542236, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.5639, + "step": 18104 + }, + { + "epoch": 0.12715919967184722, + "grad_norm": 0.841151773929596, + "learning_rate": 1.932422022132275e-05, + "loss": 0.5792, + "step": 18135 + }, + { + "epoch": 0.12737656582513243, + "grad_norm": 1.0070539712905884, + "learning_rate": 1.924246297341414e-05, + "loss": 0.5669, + "step": 18166 + }, + { + "epoch": 0.12759393197841765, + "grad_norm": 0.9453309178352356, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.5683, + "step": 18197 + }, + { + "epoch": 0.12781129813170286, + "grad_norm": 0.9628680348396301, + "learning_rate": 1.907914431855156e-05, + "loss": 0.5711, + "step": 18228 + }, + { + "epoch": 0.12802866428498808, + "grad_norm": 0.9396767616271973, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.5709, + "step": 18259 + }, + { + "epoch": 0.12824603043827326, + "grad_norm": 0.9093485474586487, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.5689, + "step": 18290 + }, + { + "epoch": 0.12846339659155848, + "grad_norm": 0.8730084896087646, + "learning_rate": 1.883466975572098e-05, + "loss": 0.5744, + "step": 18321 + }, + { + "epoch": 0.1286807627448437, + "grad_norm": 0.9706755876541138, + "learning_rate": 1.8753316159214312e-05, + "loss": 0.5572, + "step": 18352 + }, + { + "epoch": 0.1288981288981289, + "grad_norm": 0.9472910165786743, + "learning_rate": 1.8672033055685766e-05, + "loss": 0.5726, + "step": 18383 + }, + { + "epoch": 0.12911549505141412, + "grad_norm": 0.9355587959289551, + "learning_rate": 1.8590821362404116e-05, + "loss": 0.5673, + "step": 18414 + }, + { + "epoch": 0.1293328612046993, + "grad_norm": 0.9303567409515381, + "learning_rate": 1.8509681995832294e-05, + "loss": 0.5616, + "step": 18445 + }, + { + "epoch": 0.12955022735798452, + "grad_norm": 0.9067112803459167, + "learning_rate": 1.8428615871617004e-05, + "loss": 0.5661, + "step": 18476 + }, + { + "epoch": 0.12976759351126974, + "grad_norm": 0.899079442024231, + "learning_rate": 1.8347623904578448e-05, + "loss": 0.5726, + "step": 18507 + }, + { + "epoch": 0.12998495966455495, + "grad_norm": 0.8478329181671143, + "learning_rate": 1.8266707008699975e-05, + "loss": 0.5633, + "step": 18538 + }, + { + "epoch": 0.13020232581784016, + "grad_norm": 0.910685122013092, + "learning_rate": 1.818586609711774e-05, + "loss": 0.5683, + "step": 18569 + }, + { + "epoch": 0.13041969197112535, + "grad_norm": 0.9179863333702087, + "learning_rate": 1.8105102082110462e-05, + "loss": 0.5753, + "step": 18600 + }, + { + "epoch": 0.13063705812441057, + "grad_norm": 0.9042870402336121, + "learning_rate": 1.8024415875089058e-05, + "loss": 0.5752, + "step": 18631 + }, + { + "epoch": 0.13085442427769578, + "grad_norm": 0.9494644999504089, + "learning_rate": 1.7943808386586407e-05, + "loss": 0.5635, + "step": 18662 + }, + { + "epoch": 0.131071790430981, + "grad_norm": 0.9707177877426147, + "learning_rate": 1.7863280526247073e-05, + "loss": 0.5623, + "step": 18693 + }, + { + "epoch": 0.1312891565842662, + "grad_norm": 0.9590293169021606, + "learning_rate": 1.7782833202817003e-05, + "loss": 0.5799, + "step": 18724 + }, + { + "epoch": 0.1315065227375514, + "grad_norm": 0.9343449473381042, + "learning_rate": 1.7702467324133327e-05, + "loss": 0.5757, + "step": 18755 + }, + { + "epoch": 0.1317238888908366, + "grad_norm": 0.9229467511177063, + "learning_rate": 1.7622183797114042e-05, + "loss": 0.5644, + "step": 18786 + }, + { + "epoch": 0.13194125504412182, + "grad_norm": 0.9312314987182617, + "learning_rate": 1.7541983527747838e-05, + "loss": 0.5693, + "step": 18817 + }, + { + "epoch": 0.13215862119740704, + "grad_norm": 0.8548254370689392, + "learning_rate": 1.746186742108387e-05, + "loss": 0.5713, + "step": 18848 + }, + { + "epoch": 0.13237598735069225, + "grad_norm": 1.0379942655563354, + "learning_rate": 1.73818363812215e-05, + "loss": 0.5696, + "step": 18879 + }, + { + "epoch": 0.13259335350397744, + "grad_norm": 1.0847291946411133, + "learning_rate": 1.7301891311300153e-05, + "loss": 0.5656, + "step": 18910 + }, + { + "epoch": 0.13281071965726265, + "grad_norm": 0.969327449798584, + "learning_rate": 1.7222033113489055e-05, + "loss": 0.5654, + "step": 18941 + }, + { + "epoch": 0.13302808581054787, + "grad_norm": 0.9928266406059265, + "learning_rate": 1.7142262688977127e-05, + "loss": 0.5635, + "step": 18972 + }, + { + "epoch": 0.13324545196383308, + "grad_norm": 0.8415375351905823, + "learning_rate": 1.7062580937962764e-05, + "loss": 0.572, + "step": 19003 + }, + { + "epoch": 0.1334628181171183, + "grad_norm": 0.9909110069274902, + "learning_rate": 1.698298875964369e-05, + "loss": 0.5727, + "step": 19034 + }, + { + "epoch": 0.13368018427040348, + "grad_norm": 1.0183087587356567, + "learning_rate": 1.690348705220684e-05, + "loss": 0.5767, + "step": 19065 + }, + { + "epoch": 0.1338975504236887, + "grad_norm": 0.9055935144424438, + "learning_rate": 1.6824076712818156e-05, + "loss": 0.5628, + "step": 19096 + }, + { + "epoch": 0.1341149165769739, + "grad_norm": 0.8832345008850098, + "learning_rate": 1.6744758637612533e-05, + "loss": 0.5688, + "step": 19127 + }, + { + "epoch": 0.13433228273025913, + "grad_norm": 1.1259726285934448, + "learning_rate": 1.6665533721683664e-05, + "loss": 0.5578, + "step": 19158 + }, + { + "epoch": 0.13454964888354434, + "grad_norm": 0.9167343378067017, + "learning_rate": 1.6586402859073974e-05, + "loss": 0.5636, + "step": 19189 + }, + { + "epoch": 0.13476701503682953, + "grad_norm": 0.9861068725585938, + "learning_rate": 1.6507366942764463e-05, + "loss": 0.5681, + "step": 19220 + }, + { + "epoch": 0.13498438119011474, + "grad_norm": 0.9800103306770325, + "learning_rate": 1.6428426864664732e-05, + "loss": 0.5689, + "step": 19251 + }, + { + "epoch": 0.13520174734339996, + "grad_norm": 0.9900636672973633, + "learning_rate": 1.6349583515602816e-05, + "loss": 0.5625, + "step": 19282 + }, + { + "epoch": 0.13541911349668517, + "grad_norm": 0.9756057858467102, + "learning_rate": 1.6270837785315208e-05, + "loss": 0.5634, + "step": 19313 + }, + { + "epoch": 0.1356364796499704, + "grad_norm": 0.9184322953224182, + "learning_rate": 1.619219056243676e-05, + "loss": 0.5713, + "step": 19344 + }, + { + "epoch": 0.13585384580325557, + "grad_norm": 1.003735065460205, + "learning_rate": 1.6113642734490698e-05, + "loss": 0.5626, + "step": 19375 + }, + { + "epoch": 0.1360712119565408, + "grad_norm": 0.8933300375938416, + "learning_rate": 1.6035195187878577e-05, + "loss": 0.5705, + "step": 19406 + }, + { + "epoch": 0.136288578109826, + "grad_norm": 0.997909426689148, + "learning_rate": 1.5956848807870305e-05, + "loss": 0.5645, + "step": 19437 + }, + { + "epoch": 0.13650594426311122, + "grad_norm": 0.9039232730865479, + "learning_rate": 1.587860447859413e-05, + "loss": 0.5551, + "step": 19468 + }, + { + "epoch": 0.13672331041639643, + "grad_norm": 0.9416874647140503, + "learning_rate": 1.5800463083026686e-05, + "loss": 0.5621, + "step": 19499 + }, + { + "epoch": 0.13694067656968162, + "grad_norm": 0.8743234872817993, + "learning_rate": 1.572242550298298e-05, + "loss": 0.5692, + "step": 19530 + }, + { + "epoch": 0.13715804272296683, + "grad_norm": 1.0159176588058472, + "learning_rate": 1.56444926191065e-05, + "loss": 0.5622, + "step": 19561 + }, + { + "epoch": 0.13737540887625205, + "grad_norm": 0.8633915781974792, + "learning_rate": 1.5566665310859257e-05, + "loss": 0.5688, + "step": 19592 + }, + { + "epoch": 0.13759277502953726, + "grad_norm": 0.9839888215065002, + "learning_rate": 1.5488944456511846e-05, + "loss": 0.5691, + "step": 19623 + }, + { + "epoch": 0.13781014118282248, + "grad_norm": 1.0715723037719727, + "learning_rate": 1.5411330933133546e-05, + "loss": 0.5582, + "step": 19654 + }, + { + "epoch": 0.13802750733610766, + "grad_norm": 1.029173493385315, + "learning_rate": 1.533382561658241e-05, + "loss": 0.5678, + "step": 19685 + }, + { + "epoch": 0.13824487348939288, + "grad_norm": 1.1011470556259155, + "learning_rate": 1.525642938149541e-05, + "loss": 0.5692, + "step": 19716 + }, + { + "epoch": 0.1384622396426781, + "grad_norm": 0.9993789196014404, + "learning_rate": 1.5179143101278536e-05, + "loss": 0.5645, + "step": 19747 + }, + { + "epoch": 0.1386796057959633, + "grad_norm": 1.0202093124389648, + "learning_rate": 1.5101967648096955e-05, + "loss": 0.5703, + "step": 19778 + }, + { + "epoch": 0.13889697194924852, + "grad_norm": 1.0126008987426758, + "learning_rate": 1.5024903892865172e-05, + "loss": 0.5639, + "step": 19809 + }, + { + "epoch": 0.1391143381025337, + "grad_norm": 1.0468281507492065, + "learning_rate": 1.4947952705237184e-05, + "loss": 0.5683, + "step": 19840 + }, + { + "epoch": 0.13933170425581892, + "grad_norm": 0.9329802393913269, + "learning_rate": 1.4871114953596682e-05, + "loss": 0.5597, + "step": 19871 + }, + { + "epoch": 0.13954907040910414, + "grad_norm": 0.891503632068634, + "learning_rate": 1.4794391505047256e-05, + "loss": 0.5728, + "step": 19902 + }, + { + "epoch": 0.13976643656238935, + "grad_norm": 0.9752770662307739, + "learning_rate": 1.4717783225402596e-05, + "loss": 0.564, + "step": 19933 + }, + { + "epoch": 0.13998380271567457, + "grad_norm": 0.8956452012062073, + "learning_rate": 1.4641290979176735e-05, + "loss": 0.5661, + "step": 19964 + }, + { + "epoch": 0.14020116886895975, + "grad_norm": 1.072753667831421, + "learning_rate": 1.4564915629574246e-05, + "loss": 0.562, + "step": 19995 + }, + { + "epoch": 0.14041853502224497, + "grad_norm": 0.8971157670021057, + "learning_rate": 1.4488658038480601e-05, + "loss": 0.5613, + "step": 20026 + }, + { + "epoch": 0.14063590117553018, + "grad_norm": 0.8919452428817749, + "learning_rate": 1.4412519066452323e-05, + "loss": 0.5659, + "step": 20057 + }, + { + "epoch": 0.1408532673288154, + "grad_norm": 0.9752078056335449, + "learning_rate": 1.4336499572707373e-05, + "loss": 0.5687, + "step": 20088 + }, + { + "epoch": 0.1410706334821006, + "grad_norm": 0.9520591497421265, + "learning_rate": 1.4260600415115433e-05, + "loss": 0.5673, + "step": 20119 + }, + { + "epoch": 0.1412879996353858, + "grad_norm": 0.8892295956611633, + "learning_rate": 1.4184822450188137e-05, + "loss": 0.5705, + "step": 20150 + }, + { + "epoch": 0.141505365788671, + "grad_norm": 0.9576200842857361, + "learning_rate": 1.410916653306954e-05, + "loss": 0.5667, + "step": 20181 + }, + { + "epoch": 0.14172273194195623, + "grad_norm": 0.9564182162284851, + "learning_rate": 1.403363351752639e-05, + "loss": 0.5595, + "step": 20212 + }, + { + "epoch": 0.14194009809524144, + "grad_norm": 0.9247251749038696, + "learning_rate": 1.3958224255938485e-05, + "loss": 0.5709, + "step": 20243 + }, + { + "epoch": 0.14215746424852665, + "grad_norm": 0.9523617625236511, + "learning_rate": 1.388293959928911e-05, + "loss": 0.5591, + "step": 20274 + }, + { + "epoch": 0.14237483040181184, + "grad_norm": 0.9751485586166382, + "learning_rate": 1.3807780397155379e-05, + "loss": 0.5678, + "step": 20305 + }, + { + "epoch": 0.14259219655509706, + "grad_norm": 1.0090728998184204, + "learning_rate": 1.3732747497698655e-05, + "loss": 0.5569, + "step": 20336 + }, + { + "epoch": 0.14280956270838227, + "grad_norm": 0.8991780281066895, + "learning_rate": 1.3657841747655038e-05, + "loss": 0.5611, + "step": 20367 + }, + { + "epoch": 0.14302692886166748, + "grad_norm": 0.8665379285812378, + "learning_rate": 1.3583063992325706e-05, + "loss": 0.5639, + "step": 20398 + }, + { + "epoch": 0.1432442950149527, + "grad_norm": 0.9348465204238892, + "learning_rate": 1.3508415075567496e-05, + "loss": 0.5582, + "step": 20429 + }, + { + "epoch": 0.1434616611682379, + "grad_norm": 0.8632979989051819, + "learning_rate": 1.343389583978327e-05, + "loss": 0.5672, + "step": 20460 + }, + { + "epoch": 0.1436790273215231, + "grad_norm": 0.9019519686698914, + "learning_rate": 1.3359507125912468e-05, + "loss": 0.5642, + "step": 20491 + }, + { + "epoch": 0.14389639347480832, + "grad_norm": 0.8994531035423279, + "learning_rate": 1.3285249773421627e-05, + "loss": 0.5663, + "step": 20522 + }, + { + "epoch": 0.14411375962809353, + "grad_norm": 0.9270524978637695, + "learning_rate": 1.3211124620294884e-05, + "loss": 0.5643, + "step": 20553 + }, + { + "epoch": 0.14433112578137874, + "grad_norm": 0.8957355618476868, + "learning_rate": 1.313713250302451e-05, + "loss": 0.5632, + "step": 20584 + }, + { + "epoch": 0.14454849193466393, + "grad_norm": 1.0234413146972656, + "learning_rate": 1.3063274256601479e-05, + "loss": 0.5647, + "step": 20615 + }, + { + "epoch": 0.14476585808794915, + "grad_norm": 0.8956789970397949, + "learning_rate": 1.2989550714506086e-05, + "loss": 0.5635, + "step": 20646 + }, + { + "epoch": 0.14498322424123436, + "grad_norm": 0.883823037147522, + "learning_rate": 1.291596270869846e-05, + "loss": 0.5702, + "step": 20677 + }, + { + "epoch": 0.14520059039451957, + "grad_norm": 0.8809013366699219, + "learning_rate": 1.284251106960927e-05, + "loss": 0.5641, + "step": 20708 + }, + { + "epoch": 0.1454179565478048, + "grad_norm": 0.9803751707077026, + "learning_rate": 1.2769196626130263e-05, + "loss": 0.5604, + "step": 20739 + }, + { + "epoch": 0.14563532270108998, + "grad_norm": 0.8637491464614868, + "learning_rate": 1.2696020205604969e-05, + "loss": 0.558, + "step": 20770 + }, + { + "epoch": 0.1458526888543752, + "grad_norm": 0.8922715187072754, + "learning_rate": 1.2622982633819359e-05, + "loss": 0.5619, + "step": 20801 + }, + { + "epoch": 0.1460700550076604, + "grad_norm": 0.9750674366950989, + "learning_rate": 1.2550084734992484e-05, + "loss": 0.5664, + "step": 20832 + }, + { + "epoch": 0.14628742116094562, + "grad_norm": 1.0473570823669434, + "learning_rate": 1.247732733176724e-05, + "loss": 0.5646, + "step": 20863 + }, + { + "epoch": 0.14650478731423083, + "grad_norm": 1.130385160446167, + "learning_rate": 1.2404711245201044e-05, + "loss": 0.5617, + "step": 20894 + }, + { + "epoch": 0.14672215346751602, + "grad_norm": 0.9984387755393982, + "learning_rate": 1.2332237294756535e-05, + "loss": 0.567, + "step": 20925 + }, + { + "epoch": 0.14693951962080123, + "grad_norm": 0.9383957982063293, + "learning_rate": 1.225990629829241e-05, + "loss": 0.5567, + "step": 20956 + }, + { + "epoch": 0.14715688577408645, + "grad_norm": 0.981935977935791, + "learning_rate": 1.2187719072054136e-05, + "loss": 0.5651, + "step": 20987 + }, + { + "epoch": 0.14737425192737166, + "grad_norm": 0.9774724841117859, + "learning_rate": 1.2115676430664735e-05, + "loss": 0.5652, + "step": 21018 + }, + { + "epoch": 0.14759161808065688, + "grad_norm": 0.9714674949645996, + "learning_rate": 1.2043779187115647e-05, + "loss": 0.561, + "step": 21049 + }, + { + "epoch": 0.14780898423394206, + "grad_norm": 0.8881489038467407, + "learning_rate": 1.1972028152757476e-05, + "loss": 0.5594, + "step": 21080 + }, + { + "epoch": 0.14802635038722728, + "grad_norm": 0.961926281452179, + "learning_rate": 1.1900424137290889e-05, + "loss": 0.5611, + "step": 21111 + }, + { + "epoch": 0.1482437165405125, + "grad_norm": 0.9101502895355225, + "learning_rate": 1.1828967948757482e-05, + "loss": 0.5628, + "step": 21142 + }, + { + "epoch": 0.1484610826937977, + "grad_norm": 0.9001050591468811, + "learning_rate": 1.175766039353062e-05, + "loss": 0.5576, + "step": 21173 + }, + { + "epoch": 0.14867844884708292, + "grad_norm": 0.9724435210227966, + "learning_rate": 1.1686502276306382e-05, + "loss": 0.568, + "step": 21204 + }, + { + "epoch": 0.1488958150003681, + "grad_norm": 0.825156569480896, + "learning_rate": 1.1615494400094445e-05, + "loss": 0.5634, + "step": 21235 + }, + { + "epoch": 0.14911318115365332, + "grad_norm": 0.9625114798545837, + "learning_rate": 1.1544637566209029e-05, + "loss": 0.5667, + "step": 21266 + }, + { + "epoch": 0.14933054730693854, + "grad_norm": 1.0243901014328003, + "learning_rate": 1.1473932574259886e-05, + "loss": 0.5629, + "step": 21297 + }, + { + "epoch": 0.14954791346022375, + "grad_norm": 0.9247808456420898, + "learning_rate": 1.1403380222143247e-05, + "loss": 0.563, + "step": 21328 + }, + { + "epoch": 0.14976527961350897, + "grad_norm": 0.8996061682701111, + "learning_rate": 1.1332981306032808e-05, + "loss": 0.5644, + "step": 21359 + }, + { + "epoch": 0.14998264576679415, + "grad_norm": 0.9766656160354614, + "learning_rate": 1.1262736620370762e-05, + "loss": 0.566, + "step": 21390 + }, + { + "epoch": 0.15020001192007937, + "grad_norm": 0.9848279356956482, + "learning_rate": 1.1192646957858854e-05, + "loss": 0.5648, + "step": 21421 + }, + { + "epoch": 0.15041737807336458, + "grad_norm": 0.972819447517395, + "learning_rate": 1.1122713109449381e-05, + "loss": 0.5573, + "step": 21452 + }, + { + "epoch": 0.1506347442266498, + "grad_norm": 0.8826684951782227, + "learning_rate": 1.105293586433634e-05, + "loss": 0.5625, + "step": 21483 + }, + { + "epoch": 0.150852110379935, + "grad_norm": 0.9768727421760559, + "learning_rate": 1.0983316009946446e-05, + "loss": 0.5609, + "step": 21514 + }, + { + "epoch": 0.1510694765332202, + "grad_norm": 0.9463690519332886, + "learning_rate": 1.0913854331930282e-05, + "loss": 0.5605, + "step": 21545 + }, + { + "epoch": 0.1512868426865054, + "grad_norm": 0.9010226130485535, + "learning_rate": 1.0844551614153456e-05, + "loss": 0.5626, + "step": 21576 + }, + { + "epoch": 0.15150420883979063, + "grad_norm": 0.9793362617492676, + "learning_rate": 1.0775408638687725e-05, + "loss": 0.5629, + "step": 21607 + }, + { + "epoch": 0.15172157499307584, + "grad_norm": 0.8516845107078552, + "learning_rate": 1.0706426185802165e-05, + "loss": 0.5655, + "step": 21638 + }, + { + "epoch": 0.15193894114636106, + "grad_norm": 0.9344280958175659, + "learning_rate": 1.0637605033954371e-05, + "loss": 0.5678, + "step": 21669 + }, + { + "epoch": 0.15215630729964624, + "grad_norm": 0.9141379594802856, + "learning_rate": 1.05689459597817e-05, + "loss": 0.5651, + "step": 21700 + }, + { + "epoch": 0.15237367345293146, + "grad_norm": 0.9709919691085815, + "learning_rate": 1.050044973809246e-05, + "loss": 0.5534, + "step": 21731 + }, + { + "epoch": 0.15259103960621667, + "grad_norm": 0.9237218499183655, + "learning_rate": 1.043211714185722e-05, + "loss": 0.5615, + "step": 21762 + }, + { + "epoch": 0.1528084057595019, + "grad_norm": 0.8852784633636475, + "learning_rate": 1.036394894220003e-05, + "loss": 0.5576, + "step": 21793 + }, + { + "epoch": 0.1530257719127871, + "grad_norm": 0.8654377460479736, + "learning_rate": 1.0295945908389751e-05, + "loss": 0.5583, + "step": 21824 + }, + { + "epoch": 0.1532431380660723, + "grad_norm": 1.0151047706604004, + "learning_rate": 1.0228108807831393e-05, + "loss": 0.5626, + "step": 21855 + }, + { + "epoch": 0.1534605042193575, + "grad_norm": 1.0742695331573486, + "learning_rate": 1.01604384060574e-05, + "loss": 0.5546, + "step": 21886 + }, + { + "epoch": 0.15367787037264272, + "grad_norm": 0.9627267718315125, + "learning_rate": 1.009293546671907e-05, + "loss": 0.5701, + "step": 21917 + }, + { + "epoch": 0.15389523652592793, + "grad_norm": 0.9896987080574036, + "learning_rate": 1.002560075157791e-05, + "loss": 0.5573, + "step": 21948 + }, + { + "epoch": 0.15411260267921315, + "grad_norm": 0.8868485689163208, + "learning_rate": 9.958435020496995e-06, + "loss": 0.5553, + "step": 21979 + }, + { + "epoch": 0.15432996883249833, + "grad_norm": 0.8650690317153931, + "learning_rate": 9.89143903143249e-06, + "loss": 0.5638, + "step": 22010 + }, + { + "epoch": 0.15454733498578355, + "grad_norm": 0.8827528357505798, + "learning_rate": 9.824613540425038e-06, + "loss": 0.5618, + "step": 22041 + }, + { + "epoch": 0.15476470113906876, + "grad_norm": 0.9161486625671387, + "learning_rate": 9.757959301591197e-06, + "loss": 0.5547, + "step": 22072 + }, + { + "epoch": 0.15498206729235398, + "grad_norm": 0.8655954599380493, + "learning_rate": 9.691477067115017e-06, + "loss": 0.5575, + "step": 22103 + }, + { + "epoch": 0.1551994334456392, + "grad_norm": 1.1528652906417847, + "learning_rate": 9.625167587239467e-06, + "loss": 0.5562, + "step": 22134 + }, + { + "epoch": 0.15541679959892438, + "grad_norm": 0.9214157462120056, + "learning_rate": 9.559031610258007e-06, + "loss": 0.5505, + "step": 22165 + }, + { + "epoch": 0.1556341657522096, + "grad_norm": 0.9822834730148315, + "learning_rate": 9.493069882506164e-06, + "loss": 0.5651, + "step": 22196 + }, + { + "epoch": 0.1558515319054948, + "grad_norm": 1.0093454122543335, + "learning_rate": 9.427283148353056e-06, + "loss": 0.5629, + "step": 22227 + }, + { + "epoch": 0.15606889805878002, + "grad_norm": 0.9251008629798889, + "learning_rate": 9.361672150193052e-06, + "loss": 0.5644, + "step": 22258 + }, + { + "epoch": 0.15628626421206523, + "grad_norm": 0.9593933820724487, + "learning_rate": 9.29623762843734e-06, + "loss": 0.5626, + "step": 22289 + }, + { + "epoch": 0.15650363036535042, + "grad_norm": 0.9322303533554077, + "learning_rate": 9.230980321505594e-06, + "loss": 0.5549, + "step": 22320 + }, + { + "epoch": 0.15672099651863564, + "grad_norm": 1.0490275621414185, + "learning_rate": 9.165900965817668e-06, + "loss": 0.5527, + "step": 22351 + }, + { + "epoch": 0.15693836267192085, + "grad_norm": 0.9138365387916565, + "learning_rate": 9.101000295785245e-06, + "loss": 0.5609, + "step": 22382 + }, + { + "epoch": 0.15715572882520606, + "grad_norm": 0.9704885482788086, + "learning_rate": 9.036279043803565e-06, + "loss": 0.5559, + "step": 22413 + }, + { + "epoch": 0.15737309497849128, + "grad_norm": 0.9594223499298096, + "learning_rate": 8.971737940243147e-06, + "loss": 0.5533, + "step": 22444 + }, + { + "epoch": 0.15759046113177647, + "grad_norm": 0.9496825933456421, + "learning_rate": 8.907377713441592e-06, + "loss": 0.5631, + "step": 22475 + }, + { + "epoch": 0.15780782728506168, + "grad_norm": 0.8646016120910645, + "learning_rate": 8.843199089695293e-06, + "loss": 0.561, + "step": 22506 + }, + { + "epoch": 0.1580251934383469, + "grad_norm": 0.8263002038002014, + "learning_rate": 8.779202793251311e-06, + "loss": 0.555, + "step": 22537 + }, + { + "epoch": 0.1582425595916321, + "grad_norm": 0.9070886969566345, + "learning_rate": 8.715389546299149e-06, + "loss": 0.5577, + "step": 22568 + }, + { + "epoch": 0.15845992574491732, + "grad_norm": 0.8829283118247986, + "learning_rate": 8.651760068962617e-06, + "loss": 0.5608, + "step": 22599 + }, + { + "epoch": 0.1586772918982025, + "grad_norm": 0.8605303764343262, + "learning_rate": 8.588315079291733e-06, + "loss": 0.5505, + "step": 22630 + }, + { + "epoch": 0.15889465805148772, + "grad_norm": 0.9638768434524536, + "learning_rate": 8.52505529325457e-06, + "loss": 0.5614, + "step": 22661 + }, + { + "epoch": 0.15911202420477294, + "grad_norm": 0.908811628818512, + "learning_rate": 8.461981424729216e-06, + "loss": 0.5606, + "step": 22692 + }, + { + "epoch": 0.15932939035805815, + "grad_norm": 0.9718073010444641, + "learning_rate": 8.399094185495725e-06, + "loss": 0.5615, + "step": 22723 + }, + { + "epoch": 0.15954675651134337, + "grad_norm": 0.9598197937011719, + "learning_rate": 8.336394285228017e-06, + "loss": 0.5512, + "step": 22754 + }, + { + "epoch": 0.15976412266462856, + "grad_norm": 0.8870306611061096, + "learning_rate": 8.273882431485952e-06, + "loss": 0.5565, + "step": 22785 + }, + { + "epoch": 0.15998148881791377, + "grad_norm": 0.7993106842041016, + "learning_rate": 8.211559329707316e-06, + "loss": 0.5529, + "step": 22816 + }, + { + "epoch": 0.16019885497119898, + "grad_norm": 0.8731540441513062, + "learning_rate": 8.149425683199823e-06, + "loss": 0.5586, + "step": 22847 + }, + { + "epoch": 0.1604162211244842, + "grad_norm": 0.8460251688957214, + "learning_rate": 8.08748219313325e-06, + "loss": 0.5583, + "step": 22878 + }, + { + "epoch": 0.1606335872777694, + "grad_norm": 0.9626048803329468, + "learning_rate": 8.025729558531453e-06, + "loss": 0.5558, + "step": 22909 + }, + { + "epoch": 0.1608509534310546, + "grad_norm": 0.9286885261535645, + "learning_rate": 7.964168476264508e-06, + "loss": 0.5611, + "step": 22940 + }, + { + "epoch": 0.16106831958433981, + "grad_norm": 0.8666262030601501, + "learning_rate": 7.902799641040884e-06, + "loss": 0.5646, + "step": 22971 + }, + { + "epoch": 0.16128568573762503, + "grad_norm": 0.9536890387535095, + "learning_rate": 7.841623745399523e-06, + "loss": 0.5562, + "step": 23002 + }, + { + "epoch": 0.16150305189091024, + "grad_norm": 0.9607664942741394, + "learning_rate": 7.780641479702114e-06, + "loss": 0.5623, + "step": 23033 + }, + { + "epoch": 0.16172041804419546, + "grad_norm": 0.9009374380111694, + "learning_rate": 7.719853532125227e-06, + "loss": 0.5578, + "step": 23064 + }, + { + "epoch": 0.16193778419748064, + "grad_norm": 0.7630103230476379, + "learning_rate": 7.65926058865258e-06, + "loss": 0.5601, + "step": 23095 + }, + { + "epoch": 0.16215515035076586, + "grad_norm": 0.8619088530540466, + "learning_rate": 7.598863333067313e-06, + "loss": 0.5571, + "step": 23126 + }, + { + "epoch": 0.16237251650405107, + "grad_norm": 1.0103073120117188, + "learning_rate": 7.538662446944253e-06, + "loss": 0.5587, + "step": 23157 + }, + { + "epoch": 0.1625898826573363, + "grad_norm": 0.9380328059196472, + "learning_rate": 7.478658609642211e-06, + "loss": 0.555, + "step": 23188 + }, + { + "epoch": 0.1628072488106215, + "grad_norm": 0.9486220479011536, + "learning_rate": 7.418852498296327e-06, + "loss": 0.5658, + "step": 23219 + }, + { + "epoch": 0.1630246149639067, + "grad_norm": 0.9146499633789062, + "learning_rate": 7.359244787810457e-06, + "loss": 0.5503, + "step": 23250 + }, + { + "epoch": 0.1632419811171919, + "grad_norm": 0.911389946937561, + "learning_rate": 7.299836150849493e-06, + "loss": 0.5584, + "step": 23281 + }, + { + "epoch": 0.16345934727047712, + "grad_norm": 0.8847711086273193, + "learning_rate": 7.240627257831847e-06, + "loss": 0.5494, + "step": 23312 + }, + { + "epoch": 0.16367671342376233, + "grad_norm": 0.9155315160751343, + "learning_rate": 7.1816187769218195e-06, + "loss": 0.548, + "step": 23343 + }, + { + "epoch": 0.16389407957704755, + "grad_norm": 0.8847165703773499, + "learning_rate": 7.1228113740220895e-06, + "loss": 0.5512, + "step": 23374 + }, + { + "epoch": 0.16411144573033273, + "grad_norm": 1.0043821334838867, + "learning_rate": 7.064205712766226e-06, + "loss": 0.554, + "step": 23405 + }, + { + "epoch": 0.16432881188361795, + "grad_norm": 0.9789336323738098, + "learning_rate": 7.005802454511129e-06, + "loss": 0.5579, + "step": 23436 + }, + { + "epoch": 0.16454617803690316, + "grad_norm": 0.8675613403320312, + "learning_rate": 6.947602258329639e-06, + "loss": 0.5547, + "step": 23467 + }, + { + "epoch": 0.16476354419018838, + "grad_norm": 1.0360661745071411, + "learning_rate": 6.889605781003078e-06, + "loss": 0.5575, + "step": 23498 + }, + { + "epoch": 0.1649809103434736, + "grad_norm": 0.9654151201248169, + "learning_rate": 6.831813677013776e-06, + "loss": 0.5575, + "step": 23529 + }, + { + "epoch": 0.16519827649675878, + "grad_norm": 0.886508584022522, + "learning_rate": 6.774226598537792e-06, + "loss": 0.5552, + "step": 23560 + }, + { + "epoch": 0.165415642650044, + "grad_norm": 0.8399243950843811, + "learning_rate": 6.716845195437482e-06, + "loss": 0.5494, + "step": 23591 + }, + { + "epoch": 0.1656330088033292, + "grad_norm": 0.876013457775116, + "learning_rate": 6.659670115254168e-06, + "loss": 0.5589, + "step": 23622 + }, + { + "epoch": 0.16585037495661442, + "grad_norm": 0.8546695709228516, + "learning_rate": 6.602702003200872e-06, + "loss": 0.5558, + "step": 23653 + }, + { + "epoch": 0.16606774110989964, + "grad_norm": 0.8829993605613708, + "learning_rate": 6.545941502154992e-06, + "loss": 0.5605, + "step": 23684 + }, + { + "epoch": 0.16628510726318482, + "grad_norm": 0.8759157657623291, + "learning_rate": 6.489389252651057e-06, + "loss": 0.5546, + "step": 23715 + }, + { + "epoch": 0.16650247341647004, + "grad_norm": 0.9579117894172668, + "learning_rate": 6.4330458928735325e-06, + "loss": 0.5507, + "step": 23746 + }, + { + "epoch": 0.16671983956975525, + "grad_norm": 0.9086149334907532, + "learning_rate": 6.376912058649559e-06, + "loss": 0.5594, + "step": 23777 + }, + { + "epoch": 0.16693720572304047, + "grad_norm": 0.889070451259613, + "learning_rate": 6.320988383441845e-06, + "loss": 0.5591, + "step": 23808 + }, + { + "epoch": 0.16715457187632568, + "grad_norm": 0.8501099348068237, + "learning_rate": 6.265275498341452e-06, + "loss": 0.5655, + "step": 23839 + }, + { + "epoch": 0.16737193802961087, + "grad_norm": 0.933879554271698, + "learning_rate": 6.209774032060714e-06, + "loss": 0.5499, + "step": 23870 + }, + { + "epoch": 0.16758930418289608, + "grad_norm": 0.8791343569755554, + "learning_rate": 6.1544846109261365e-06, + "loss": 0.5594, + "step": 23901 + }, + { + "epoch": 0.1678066703361813, + "grad_norm": 0.92324298620224, + "learning_rate": 6.099407858871342e-06, + "loss": 0.5581, + "step": 23932 + }, + { + "epoch": 0.1680240364894665, + "grad_norm": 0.9028039574623108, + "learning_rate": 6.044544397429958e-06, + "loss": 0.5572, + "step": 23963 + }, + { + "epoch": 0.16824140264275173, + "grad_norm": 0.9681089520454407, + "learning_rate": 5.989894845728708e-06, + "loss": 0.5547, + "step": 23994 + }, + { + "epoch": 0.1684587687960369, + "grad_norm": 0.8708662390708923, + "learning_rate": 5.9354598204803605e-06, + "loss": 0.5578, + "step": 24025 + }, + { + "epoch": 0.16867613494932213, + "grad_norm": 0.9106061458587646, + "learning_rate": 5.881239935976762e-06, + "loss": 0.5563, + "step": 24056 + }, + { + "epoch": 0.16889350110260734, + "grad_norm": 0.8620893359184265, + "learning_rate": 5.827235804081954e-06, + "loss": 0.5528, + "step": 24087 + }, + { + "epoch": 0.16911086725589256, + "grad_norm": 0.8461076617240906, + "learning_rate": 5.773448034225221e-06, + "loss": 0.5585, + "step": 24118 + }, + { + "epoch": 0.16932823340917777, + "grad_norm": 0.9462336301803589, + "learning_rate": 5.719877233394228e-06, + "loss": 0.5526, + "step": 24149 + }, + { + "epoch": 0.16954559956246296, + "grad_norm": 0.9454036355018616, + "learning_rate": 5.666524006128191e-06, + "loss": 0.5496, + "step": 24180 + }, + { + "epoch": 0.16976296571574817, + "grad_norm": 0.9001603722572327, + "learning_rate": 5.613388954511015e-06, + "loss": 0.5601, + "step": 24211 + }, + { + "epoch": 0.16998033186903339, + "grad_norm": 0.8742856383323669, + "learning_rate": 5.560472678164552e-06, + "loss": 0.5638, + "step": 24242 + }, + { + "epoch": 0.1701976980223186, + "grad_norm": 0.8686881065368652, + "learning_rate": 5.507775774241775e-06, + "loss": 0.5541, + "step": 24273 + }, + { + "epoch": 0.17041506417560381, + "grad_norm": 0.9935572147369385, + "learning_rate": 5.4552988374200945e-06, + "loss": 0.5545, + "step": 24304 + }, + { + "epoch": 0.170632430328889, + "grad_norm": 0.8736170530319214, + "learning_rate": 5.403042459894597e-06, + "loss": 0.555, + "step": 24335 + }, + { + "epoch": 0.17084979648217422, + "grad_norm": 0.977024495601654, + "learning_rate": 5.3510072313714135e-06, + "loss": 0.5497, + "step": 24366 + }, + { + "epoch": 0.17106716263545943, + "grad_norm": 0.9486203789710999, + "learning_rate": 5.2991937390610205e-06, + "loss": 0.5542, + "step": 24397 + }, + { + "epoch": 0.17128452878874464, + "grad_norm": 0.9372376203536987, + "learning_rate": 5.247602567671625e-06, + "loss": 0.5553, + "step": 24428 + }, + { + "epoch": 0.17150189494202986, + "grad_norm": 0.86311274766922, + "learning_rate": 5.196234299402603e-06, + "loss": 0.5609, + "step": 24459 + }, + { + "epoch": 0.17171926109531505, + "grad_norm": 0.8967480659484863, + "learning_rate": 5.145089513937865e-06, + "loss": 0.5513, + "step": 24490 + }, + { + "epoch": 0.17193662724860026, + "grad_norm": 1.0366567373275757, + "learning_rate": 5.094168788439369e-06, + "loss": 0.5606, + "step": 24521 + }, + { + "epoch": 0.17215399340188547, + "grad_norm": 0.9255256652832031, + "learning_rate": 5.043472697540594e-06, + "loss": 0.5531, + "step": 24552 + }, + { + "epoch": 0.1723713595551707, + "grad_norm": 0.921812117099762, + "learning_rate": 4.993001813340012e-06, + "loss": 0.556, + "step": 24583 + }, + { + "epoch": 0.1725887257084559, + "grad_norm": 0.8789228200912476, + "learning_rate": 4.942756705394702e-06, + "loss": 0.5506, + "step": 24614 + }, + { + "epoch": 0.1728060918617411, + "grad_norm": 0.90621018409729, + "learning_rate": 4.892737940713884e-06, + "loss": 0.553, + "step": 24645 + }, + { + "epoch": 0.1730234580150263, + "grad_norm": 0.8067776560783386, + "learning_rate": 4.842946083752511e-06, + "loss": 0.5519, + "step": 24676 + }, + { + "epoch": 0.17324082416831152, + "grad_norm": 0.9581841230392456, + "learning_rate": 4.79338169640493e-06, + "loss": 0.5528, + "step": 24707 + }, + { + "epoch": 0.17345819032159673, + "grad_norm": 0.8731085062026978, + "learning_rate": 4.74404533799851e-06, + "loss": 0.5598, + "step": 24738 + }, + { + "epoch": 0.17367555647488195, + "grad_norm": 0.9524958729743958, + "learning_rate": 4.694937565287344e-06, + "loss": 0.5621, + "step": 24769 + }, + { + "epoch": 0.17389292262816713, + "grad_norm": 0.8445264101028442, + "learning_rate": 4.646058932445985e-06, + "loss": 0.5533, + "step": 24800 + }, + { + "epoch": 0.17411028878145235, + "grad_norm": 0.916232705116272, + "learning_rate": 4.597409991063148e-06, + "loss": 0.5455, + "step": 24831 + }, + { + "epoch": 0.17432765493473756, + "grad_norm": 0.8723344206809998, + "learning_rate": 4.5489912901355375e-06, + "loss": 0.5604, + "step": 24862 + }, + { + "epoch": 0.17454502108802278, + "grad_norm": 1.078605055809021, + "learning_rate": 4.500803376061608e-06, + "loss": 0.5541, + "step": 24893 + }, + { + "epoch": 0.174762387241308, + "grad_norm": 0.994999349117279, + "learning_rate": 4.45284679263541e-06, + "loss": 0.554, + "step": 24924 + }, + { + "epoch": 0.17497975339459318, + "grad_norm": 0.8538706302642822, + "learning_rate": 4.4051220810404775e-06, + "loss": 0.5584, + "step": 24955 + }, + { + "epoch": 0.1751971195478784, + "grad_norm": 0.9859182238578796, + "learning_rate": 4.3576297798437025e-06, + "loss": 0.5518, + "step": 24986 + }, + { + "epoch": 0.1754144857011636, + "grad_norm": 0.9955600500106812, + "learning_rate": 4.3103704249892436e-06, + "loss": 0.5528, + "step": 25017 + }, + { + "epoch": 0.17563185185444882, + "grad_norm": 0.8618113994598389, + "learning_rate": 4.263344549792487e-06, + "loss": 0.5538, + "step": 25048 + }, + { + "epoch": 0.17584921800773404, + "grad_norm": 0.9471727013587952, + "learning_rate": 4.216552684934056e-06, + "loss": 0.5519, + "step": 25079 + }, + { + "epoch": 0.17606658416101922, + "grad_norm": 0.9868952035903931, + "learning_rate": 4.169995358453777e-06, + "loss": 0.5488, + "step": 25110 + }, + { + "epoch": 0.17628395031430444, + "grad_norm": 0.9616749286651611, + "learning_rate": 4.123673095744757e-06, + "loss": 0.5548, + "step": 25141 + }, + { + "epoch": 0.17650131646758965, + "grad_norm": 0.8719627261161804, + "learning_rate": 4.077586419547435e-06, + "loss": 0.5537, + "step": 25172 + }, + { + "epoch": 0.17671868262087487, + "grad_norm": 0.9148105382919312, + "learning_rate": 4.03173584994368e-06, + "loss": 0.5496, + "step": 25203 + }, + { + "epoch": 0.17693604877416008, + "grad_norm": 0.8709606528282166, + "learning_rate": 3.986121904350948e-06, + "loss": 0.5484, + "step": 25234 + }, + { + "epoch": 0.17715341492744527, + "grad_norm": 0.9007585644721985, + "learning_rate": 3.940745097516407e-06, + "loss": 0.5612, + "step": 25265 + }, + { + "epoch": 0.17737078108073048, + "grad_norm": 0.9081540107727051, + "learning_rate": 3.89560594151116e-06, + "loss": 0.548, + "step": 25296 + }, + { + "epoch": 0.1775881472340157, + "grad_norm": 0.9592068195343018, + "learning_rate": 3.850704945724456e-06, + "loss": 0.5524, + "step": 25327 + }, + { + "epoch": 0.1778055133873009, + "grad_norm": 0.84175705909729, + "learning_rate": 3.8060426168579077e-06, + "loss": 0.5527, + "step": 25358 + }, + { + "epoch": 0.17802287954058613, + "grad_norm": 0.9197003841400146, + "learning_rate": 3.7616194589198407e-06, + "loss": 0.5525, + "step": 25389 + }, + { + "epoch": 0.1782402456938713, + "grad_norm": 0.9052679538726807, + "learning_rate": 3.7174359732195574e-06, + "loss": 0.5567, + "step": 25420 + }, + { + "epoch": 0.17845761184715653, + "grad_norm": 0.8521902561187744, + "learning_rate": 3.673492658361677e-06, + "loss": 0.5509, + "step": 25451 + }, + { + "epoch": 0.17867497800044174, + "grad_norm": 0.8626607060432434, + "learning_rate": 3.6297900102405467e-06, + "loss": 0.552, + "step": 25482 + }, + { + "epoch": 0.17889234415372696, + "grad_norm": 0.876883864402771, + "learning_rate": 3.586328522034607e-06, + "loss": 0.5505, + "step": 25513 + }, + { + "epoch": 0.17910971030701217, + "grad_norm": 0.899641215801239, + "learning_rate": 3.543108684200838e-06, + "loss": 0.5564, + "step": 25544 + }, + { + "epoch": 0.17932707646029736, + "grad_norm": 0.8642168045043945, + "learning_rate": 3.5001309844692464e-06, + "loss": 0.5585, + "step": 25575 + }, + { + "epoch": 0.17954444261358257, + "grad_norm": 0.8817245364189148, + "learning_rate": 3.4573959078373215e-06, + "loss": 0.5574, + "step": 25606 + }, + { + "epoch": 0.1797618087668678, + "grad_norm": 0.8665372729301453, + "learning_rate": 3.4149039365646063e-06, + "loss": 0.5573, + "step": 25637 + }, + { + "epoch": 0.179979174920153, + "grad_norm": 0.8398141264915466, + "learning_rate": 3.3726555501672143e-06, + "loss": 0.5528, + "step": 25668 + }, + { + "epoch": 0.18019654107343822, + "grad_norm": 0.8872261047363281, + "learning_rate": 3.33065122541244e-06, + "loss": 0.5579, + "step": 25699 + }, + { + "epoch": 0.1804139072267234, + "grad_norm": 0.8895812630653381, + "learning_rate": 3.288891436313385e-06, + "loss": 0.5521, + "step": 25730 + }, + { + "epoch": 0.18063127338000862, + "grad_norm": 1.0453636646270752, + "learning_rate": 3.2473766541235963e-06, + "loss": 0.5489, + "step": 25761 + }, + { + "epoch": 0.18084863953329383, + "grad_norm": 0.8738374710083008, + "learning_rate": 3.2061073473317466e-06, + "loss": 0.5518, + "step": 25792 + }, + { + "epoch": 0.18106600568657905, + "grad_norm": 0.8890265226364136, + "learning_rate": 3.1650839816563444e-06, + "loss": 0.5531, + "step": 25823 + }, + { + "epoch": 0.18128337183986426, + "grad_norm": 0.8346575498580933, + "learning_rate": 3.1243070200405093e-06, + "loss": 0.5565, + "step": 25854 + }, + { + "epoch": 0.18150073799314945, + "grad_norm": 0.9863383769989014, + "learning_rate": 3.0837769226467e-06, + "loss": 0.555, + "step": 25885 + }, + { + "epoch": 0.18171810414643466, + "grad_norm": 0.903465747833252, + "learning_rate": 3.0434941468515666e-06, + "loss": 0.5502, + "step": 25916 + }, + { + "epoch": 0.18193547029971988, + "grad_norm": 0.9531137347221375, + "learning_rate": 3.003459147240753e-06, + "loss": 0.5514, + "step": 25947 + }, + { + "epoch": 0.1821528364530051, + "grad_norm": 0.8925647139549255, + "learning_rate": 2.9636723756037875e-06, + "loss": 0.55, + "step": 25978 + }, + { + "epoch": 0.1823702026062903, + "grad_norm": 0.920185923576355, + "learning_rate": 2.9241342809289833e-06, + "loss": 0.5526, + "step": 26009 + }, + { + "epoch": 0.1825875687595755, + "grad_norm": 0.813601553440094, + "learning_rate": 2.8848453093983594e-06, + "loss": 0.5469, + "step": 26040 + }, + { + "epoch": 0.1828049349128607, + "grad_norm": 0.8758238554000854, + "learning_rate": 2.8458059043826257e-06, + "loss": 0.5522, + "step": 26071 + }, + { + "epoch": 0.18302230106614592, + "grad_norm": 0.8929989337921143, + "learning_rate": 2.807016506436172e-06, + "loss": 0.5504, + "step": 26102 + }, + { + "epoch": 0.18323966721943113, + "grad_norm": 0.9408402442932129, + "learning_rate": 2.7684775532920566e-06, + "loss": 0.5534, + "step": 26133 + }, + { + "epoch": 0.18345703337271635, + "grad_norm": 0.9396249651908875, + "learning_rate": 2.7301894798571425e-06, + "loss": 0.549, + "step": 26164 + }, + { + "epoch": 0.18367439952600154, + "grad_norm": 0.826866626739502, + "learning_rate": 2.6921527182071386e-06, + "loss": 0.5521, + "step": 26195 + }, + { + "epoch": 0.18389176567928675, + "grad_norm": 0.9264094233512878, + "learning_rate": 2.654367697581725e-06, + "loss": 0.5561, + "step": 26226 + }, + { + "epoch": 0.18410913183257197, + "grad_norm": 0.9079062938690186, + "learning_rate": 2.6168348443797175e-06, + "loss": 0.5476, + "step": 26257 + }, + { + "epoch": 0.18432649798585718, + "grad_norm": 0.8511149287223816, + "learning_rate": 2.5795545821542757e-06, + "loss": 0.553, + "step": 26288 + }, + { + "epoch": 0.1845438641391424, + "grad_norm": 1.0137633085250854, + "learning_rate": 2.54252733160808e-06, + "loss": 0.5512, + "step": 26319 + }, + { + "epoch": 0.18476123029242758, + "grad_norm": 0.8501981496810913, + "learning_rate": 2.5057535105886294e-06, + "loss": 0.5534, + "step": 26350 + }, + { + "epoch": 0.1849785964457128, + "grad_norm": 0.9779496788978577, + "learning_rate": 2.4692335340834953e-06, + "loss": 0.5528, + "step": 26381 + }, + { + "epoch": 0.185195962598998, + "grad_norm": 1.0374913215637207, + "learning_rate": 2.432967814215639e-06, + "loss": 0.5547, + "step": 26412 + }, + { + "epoch": 0.18541332875228322, + "grad_norm": 0.9530802369117737, + "learning_rate": 2.396956760238794e-06, + "loss": 0.5523, + "step": 26443 + }, + { + "epoch": 0.18563069490556844, + "grad_norm": 0.8887461423873901, + "learning_rate": 2.361200778532796e-06, + "loss": 0.5584, + "step": 26474 + }, + { + "epoch": 0.18584806105885363, + "grad_norm": 0.8762808442115784, + "learning_rate": 2.325700272599049e-06, + "loss": 0.5523, + "step": 26505 + }, + { + "epoch": 0.18606542721213884, + "grad_norm": 0.9088528156280518, + "learning_rate": 2.2904556430559415e-06, + "loss": 0.5524, + "step": 26536 + }, + { + "epoch": 0.18628279336542405, + "grad_norm": 0.8415138721466064, + "learning_rate": 2.2554672876343106e-06, + "loss": 0.5591, + "step": 26567 + }, + { + "epoch": 0.18650015951870927, + "grad_norm": 0.9040454030036926, + "learning_rate": 2.220735601173002e-06, + "loss": 0.5512, + "step": 26598 + }, + { + "epoch": 0.18671752567199448, + "grad_norm": 0.8373351693153381, + "learning_rate": 2.186260975614382e-06, + "loss": 0.5625, + "step": 26629 + }, + { + "epoch": 0.18693489182527967, + "grad_norm": 0.9010198712348938, + "learning_rate": 2.1520437999999034e-06, + "loss": 0.5495, + "step": 26660 + }, + { + "epoch": 0.18715225797856488, + "grad_norm": 0.9319360256195068, + "learning_rate": 2.1180844604657526e-06, + "loss": 0.5482, + "step": 26691 + }, + { + "epoch": 0.1873696241318501, + "grad_norm": 0.8212644457817078, + "learning_rate": 2.084383340238455e-06, + "loss": 0.5496, + "step": 26722 + }, + { + "epoch": 0.1875869902851353, + "grad_norm": 0.8496743440628052, + "learning_rate": 2.0509408196305704e-06, + "loss": 0.5578, + "step": 26753 + }, + { + "epoch": 0.18780435643842053, + "grad_norm": 0.9512404799461365, + "learning_rate": 2.017757276036403e-06, + "loss": 0.5512, + "step": 26784 + }, + { + "epoch": 0.18802172259170571, + "grad_norm": 0.8317390084266663, + "learning_rate": 1.984833083927726e-06, + "loss": 0.5529, + "step": 26815 + }, + { + "epoch": 0.18823908874499093, + "grad_norm": 0.9206368327140808, + "learning_rate": 1.952168614849581e-06, + "loss": 0.5565, + "step": 26846 + }, + { + "epoch": 0.18845645489827614, + "grad_norm": 0.8799408674240112, + "learning_rate": 1.919764237416058e-06, + "loss": 0.5492, + "step": 26877 + }, + { + "epoch": 0.18867382105156136, + "grad_norm": 0.8770999908447266, + "learning_rate": 1.8876203173061463e-06, + "loss": 0.5542, + "step": 26908 + }, + { + "epoch": 0.18889118720484657, + "grad_norm": 0.9064630270004272, + "learning_rate": 1.8557372172596206e-06, + "loss": 0.547, + "step": 26939 + }, + { + "epoch": 0.1891085533581318, + "grad_norm": 0.8288804292678833, + "learning_rate": 1.8241152970729341e-06, + "loss": 0.5603, + "step": 26970 + }, + { + "epoch": 0.18932591951141697, + "grad_norm": 0.8370488882064819, + "learning_rate": 1.7927549135951572e-06, + "loss": 0.5507, + "step": 27001 + }, + { + "epoch": 0.1895432856647022, + "grad_norm": 0.9462096095085144, + "learning_rate": 1.7616564207239477e-06, + "loss": 0.5614, + "step": 27032 + }, + { + "epoch": 0.1897606518179874, + "grad_norm": 0.8369693160057068, + "learning_rate": 1.730820169401584e-06, + "loss": 0.5474, + "step": 27063 + }, + { + "epoch": 0.18997801797127262, + "grad_norm": 0.9731806516647339, + "learning_rate": 1.7002465076109558e-06, + "loss": 0.5496, + "step": 27094 + }, + { + "epoch": 0.19019538412455783, + "grad_norm": 0.9371094703674316, + "learning_rate": 1.6699357803716898e-06, + "loss": 0.5478, + "step": 27125 + }, + { + "epoch": 0.19041275027784302, + "grad_norm": 0.8841030597686768, + "learning_rate": 1.6398883297362305e-06, + "loss": 0.5554, + "step": 27156 + }, + { + "epoch": 0.19063011643112823, + "grad_norm": 0.9003316760063171, + "learning_rate": 1.6101044947859606e-06, + "loss": 0.5534, + "step": 27187 + }, + { + "epoch": 0.19084748258441345, + "grad_norm": 1.0026649236679077, + "learning_rate": 1.5805846116274114e-06, + "loss": 0.5577, + "step": 27218 + }, + { + "epoch": 0.19106484873769866, + "grad_norm": 0.9244189262390137, + "learning_rate": 1.5513290133884611e-06, + "loss": 0.5573, + "step": 27249 + }, + { + "epoch": 0.19128221489098388, + "grad_norm": 0.9474987983703613, + "learning_rate": 1.5223380302145512e-06, + "loss": 0.5427, + "step": 27280 + }, + { + "epoch": 0.19149958104426906, + "grad_norm": 0.8063711524009705, + "learning_rate": 1.4936119892649925e-06, + "loss": 0.5399, + "step": 27311 + }, + { + "epoch": 0.19171694719755428, + "grad_norm": 0.9248738288879395, + "learning_rate": 1.4651512147092482e-06, + "loss": 0.5672, + "step": 27342 + }, + { + "epoch": 0.1919343133508395, + "grad_norm": 0.9307349324226379, + "learning_rate": 1.4369560277232908e-06, + "loss": 0.552, + "step": 27373 + }, + { + "epoch": 0.1921516795041247, + "grad_norm": 0.8733169436454773, + "learning_rate": 1.409026746485978e-06, + "loss": 0.5476, + "step": 27404 + }, + { + "epoch": 0.19236904565740992, + "grad_norm": 0.892084002494812, + "learning_rate": 1.3813636861754464e-06, + "loss": 0.5472, + "step": 27435 + }, + { + "epoch": 0.1925864118106951, + "grad_norm": 0.929440975189209, + "learning_rate": 1.3539671589655773e-06, + "loss": 0.5453, + "step": 27466 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.026490470961932e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-27468/training_args.bin b/checkpoint-27468/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d8a57652df56990298f489cbbc4d12b0a5d66dac --- /dev/null +++ b/checkpoint-27468/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff6a84b6c2a69c365aedcd04f176d62d0993a049ac50b28b1fc7fee54f4267a8 +size 5304 diff --git a/checkpoint-30517/config.json b/checkpoint-30517/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-30517/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-30517/generation_config.json b/checkpoint-30517/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-30517/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-30517/model-00001-of-00007.safetensors b/checkpoint-30517/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..97abb2f17a827f9774d5aa677414a589f23ef5be --- /dev/null +++ b/checkpoint-30517/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c41e81deedcb92f32c0979819265f9e6d661b5e330e34fa8c6bef52aea07f00 +size 4886466168 diff --git a/checkpoint-30517/model-00002-of-00007.safetensors b/checkpoint-30517/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-30517/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-30517/model-00003-of-00007.safetensors b/checkpoint-30517/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-30517/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-30517/model-00004-of-00007.safetensors b/checkpoint-30517/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-30517/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-30517/model-00005-of-00007.safetensors b/checkpoint-30517/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-30517/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-30517/model-00006-of-00007.safetensors b/checkpoint-30517/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ea06fe8cbc640bfc1ac1a9c2602ab737f12e0995 --- /dev/null +++ b/checkpoint-30517/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16591b9c3d31c4b6c7f17ad0dab0336936bc07e351f0024897753b9bf6ba44ec +size 4999813120 diff --git a/checkpoint-30517/model-00007-of-00007.safetensors b/checkpoint-30517/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7557ca0d07977541c067f3000ab306980af7cff4 --- /dev/null +++ b/checkpoint-30517/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfa19a4c3bb04e7e6fb6b1c326503b5fdfcbd7288ffaf21fe85973e99e4141c0 +size 2571158184 diff --git a/checkpoint-30517/model.safetensors.index.json b/checkpoint-30517/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-30517/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-30517/optimizer.pt b/checkpoint-30517/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..382e13835c36995b1f1a9429dd2278f4204caffc --- /dev/null +++ b/checkpoint-30517/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c686b7b4392d330feb3b39b2685f79139aba947d942be96180cda010c2bd4c6 +size 15385036334 diff --git a/checkpoint-30517/rng_state.pth b/checkpoint-30517/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-30517/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-30517/scheduler.pt b/checkpoint-30517/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d585b62f4a9bc8119a4f1d03cf2bb269e99411b --- /dev/null +++ b/checkpoint-30517/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d91c597306f26392e8da0d33a09fdcca77cfdeb5ad42248045521772fa3d64e +size 1064 diff --git a/checkpoint-30517/trainer_state.json b/checkpoint-30517/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0f13f301dd723e41d60d6b8647eaada64a31db63 --- /dev/null +++ b/checkpoint-30517/trainer_state.json @@ -0,0 +1,6921 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.21397944838079747, + "eval_steps": 500, + "global_step": 30517, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021736615328520894, + "grad_norm": 5.709163665771484, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.9228, + "step": 31 + }, + { + "epoch": 0.0004347323065704179, + "grad_norm": 3.9758756160736084, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8276, + "step": 62 + }, + { + "epoch": 0.0006520984598556268, + "grad_norm": 3.721677303314209, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7897, + "step": 93 + }, + { + "epoch": 0.0008694646131408358, + "grad_norm": 3.384953498840332, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7641, + "step": 124 + }, + { + "epoch": 0.0010868307664260446, + "grad_norm": 3.19750714302063, + "learning_rate": 5.078636959370905e-06, + "loss": 0.758, + "step": 155 + }, + { + "epoch": 0.0013041969197112536, + "grad_norm": 3.49003267288208, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7644, + "step": 186 + }, + { + "epoch": 0.0015215630729964625, + "grad_norm": 3.323367118835449, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7606, + "step": 217 + }, + { + "epoch": 0.0017389292262816715, + "grad_norm": 3.465822219848633, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7505, + "step": 248 + }, + { + "epoch": 0.0019562953795668804, + "grad_norm": 2.7967450618743896, + "learning_rate": 9.141546526867629e-06, + "loss": 0.7593, + "step": 279 + }, + { + "epoch": 0.002173661532852089, + "grad_norm": 3.5493738651275635, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7514, + "step": 310 + }, + { + "epoch": 0.0023910276861372984, + "grad_norm": 3.514606237411499, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7497, + "step": 341 + }, + { + "epoch": 0.0026083938394225073, + "grad_norm": 3.0246028900146484, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7473, + "step": 372 + }, + { + "epoch": 0.002825759992707716, + "grad_norm": 3.01147723197937, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.739, + "step": 403 + }, + { + "epoch": 0.003043126145992925, + "grad_norm": 3.566333293914795, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7447, + "step": 434 + }, + { + "epoch": 0.003260492299278134, + "grad_norm": 3.8698947429656982, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7447, + "step": 465 + }, + { + "epoch": 0.003477858452563343, + "grad_norm": 2.567028760910034, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7496, + "step": 496 + }, + { + "epoch": 0.003695224605848552, + "grad_norm": 2.8029377460479736, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.7324, + "step": 527 + }, + { + "epoch": 0.003912590759133761, + "grad_norm": 2.862530469894409, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7371, + "step": 558 + }, + { + "epoch": 0.00412995691241897, + "grad_norm": 2.9063901901245117, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.7398, + "step": 589 + }, + { + "epoch": 0.004347323065704178, + "grad_norm": 3.187814235687256, + "learning_rate": 2.031454783748362e-05, + "loss": 0.7362, + "step": 620 + }, + { + "epoch": 0.004564689218989388, + "grad_norm": 2.6158969402313232, + "learning_rate": 2.13302752293578e-05, + "loss": 0.7405, + "step": 651 + }, + { + "epoch": 0.004782055372274597, + "grad_norm": 2.702423334121704, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7313, + "step": 682 + }, + { + "epoch": 0.004999421525559805, + "grad_norm": 2.88313889503479, + "learning_rate": 2.336173001310616e-05, + "loss": 0.7337, + "step": 713 + }, + { + "epoch": 0.0052167876788450146, + "grad_norm": 2.8978841304779053, + "learning_rate": 2.437745740498034e-05, + "loss": 0.73, + "step": 744 + }, + { + "epoch": 0.005434153832130223, + "grad_norm": 2.791414976119995, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.7297, + "step": 775 + }, + { + "epoch": 0.005651519985415432, + "grad_norm": 2.432821035385132, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7326, + "step": 806 + }, + { + "epoch": 0.0058688861387006415, + "grad_norm": 2.430279493331909, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.7333, + "step": 837 + }, + { + "epoch": 0.00608625229198585, + "grad_norm": 2.5513761043548584, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.7296, + "step": 868 + }, + { + "epoch": 0.006303618445271059, + "grad_norm": 2.478562116622925, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.7303, + "step": 899 + }, + { + "epoch": 0.006520984598556268, + "grad_norm": 2.5496723651885986, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.7303, + "step": 930 + }, + { + "epoch": 0.006738350751841477, + "grad_norm": 2.114060640335083, + "learning_rate": 3.148754914809961e-05, + "loss": 0.7184, + "step": 961 + }, + { + "epoch": 0.006955716905126686, + "grad_norm": 2.325977325439453, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.7268, + "step": 992 + }, + { + "epoch": 0.0071730830584118945, + "grad_norm": 2.0549662113189697, + "learning_rate": 3.351900393184797e-05, + "loss": 0.724, + "step": 1023 + }, + { + "epoch": 0.007390449211697104, + "grad_norm": 1.913522481918335, + "learning_rate": 3.453473132372215e-05, + "loss": 0.7206, + "step": 1054 + }, + { + "epoch": 0.007607815364982313, + "grad_norm": 2.0651443004608154, + "learning_rate": 3.555045871559633e-05, + "loss": 0.7239, + "step": 1085 + }, + { + "epoch": 0.007825181518267521, + "grad_norm": 2.2482309341430664, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.728, + "step": 1116 + }, + { + "epoch": 0.00804254767155273, + "grad_norm": 2.349695920944214, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.721, + "step": 1147 + }, + { + "epoch": 0.00825991382483794, + "grad_norm": 2.218843460083008, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.7248, + "step": 1178 + }, + { + "epoch": 0.00847727997812315, + "grad_norm": 2.0839340686798096, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.7233, + "step": 1209 + }, + { + "epoch": 0.008694646131408357, + "grad_norm": 2.0757343769073486, + "learning_rate": 4.062909567496724e-05, + "loss": 0.7166, + "step": 1240 + }, + { + "epoch": 0.008912012284693566, + "grad_norm": 2.052342653274536, + "learning_rate": 4.164482306684142e-05, + "loss": 0.7256, + "step": 1271 + }, + { + "epoch": 0.009129378437978775, + "grad_norm": 2.1202704906463623, + "learning_rate": 4.26605504587156e-05, + "loss": 0.7131, + "step": 1302 + }, + { + "epoch": 0.009346744591263985, + "grad_norm": 2.1945958137512207, + "learning_rate": 4.367627785058978e-05, + "loss": 0.708, + "step": 1333 + }, + { + "epoch": 0.009564110744549194, + "grad_norm": 2.106307029724121, + "learning_rate": 4.469200524246396e-05, + "loss": 0.7139, + "step": 1364 + }, + { + "epoch": 0.009781476897834401, + "grad_norm": 2.3779594898223877, + "learning_rate": 4.570773263433814e-05, + "loss": 0.7205, + "step": 1395 + }, + { + "epoch": 0.00999884305111961, + "grad_norm": 2.001551866531372, + "learning_rate": 4.672346002621232e-05, + "loss": 0.7162, + "step": 1426 + }, + { + "epoch": 0.01021620920440482, + "grad_norm": 2.6994752883911133, + "learning_rate": 4.77391874180865e-05, + "loss": 0.7215, + "step": 1457 + }, + { + "epoch": 0.010433575357690029, + "grad_norm": 2.071122884750366, + "learning_rate": 4.875491480996068e-05, + "loss": 0.7095, + "step": 1488 + }, + { + "epoch": 0.010650941510975238, + "grad_norm": 2.3666610717773438, + "learning_rate": 4.977064220183487e-05, + "loss": 0.7089, + "step": 1519 + }, + { + "epoch": 0.010868307664260446, + "grad_norm": 2.130204677581787, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.7043, + "step": 1550 + }, + { + "epoch": 0.011085673817545655, + "grad_norm": 1.7413716316223145, + "learning_rate": 4.999955597496219e-05, + "loss": 0.7062, + "step": 1581 + }, + { + "epoch": 0.011303039970830864, + "grad_norm": 1.7087843418121338, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.7175, + "step": 1612 + }, + { + "epoch": 0.011520406124116074, + "grad_norm": 2.574871301651001, + "learning_rate": 4.999799067923527e-05, + "loss": 0.7076, + "step": 1643 + }, + { + "epoch": 0.011737772277401283, + "grad_norm": 2.0530433654785156, + "learning_rate": 4.999678487776908e-05, + "loss": 0.708, + "step": 1674 + }, + { + "epoch": 0.011955138430686492, + "grad_norm": 1.9184463024139404, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.7011, + "step": 1705 + }, + { + "epoch": 0.0121725045839717, + "grad_norm": 1.768115520477295, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6995, + "step": 1736 + }, + { + "epoch": 0.012389870737256909, + "grad_norm": 1.6692063808441162, + "learning_rate": 4.999147503179668e-05, + "loss": 0.7019, + "step": 1767 + }, + { + "epoch": 0.012607236890542118, + "grad_norm": 1.803944706916809, + "learning_rate": 4.998914100252672e-05, + "loss": 0.703, + "step": 1798 + }, + { + "epoch": 0.012824603043827328, + "grad_norm": 1.8957322835922241, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6979, + "step": 1829 + }, + { + "epoch": 0.013041969197112537, + "grad_norm": 1.544054627418518, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.7012, + "step": 1860 + }, + { + "epoch": 0.013259335350397744, + "grad_norm": 1.7066351175308228, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6949, + "step": 1891 + }, + { + "epoch": 0.013476701503682954, + "grad_norm": 1.7045214176177979, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.6871, + "step": 1922 + }, + { + "epoch": 0.013694067656968163, + "grad_norm": 2.0414485931396484, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6944, + "step": 1953 + }, + { + "epoch": 0.013911433810253372, + "grad_norm": 1.9210485219955444, + "learning_rate": 4.99692159912661e-05, + "loss": 0.7013, + "step": 1984 + }, + { + "epoch": 0.014128799963538581, + "grad_norm": 1.8733997344970703, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6991, + "step": 2015 + }, + { + "epoch": 0.014346166116823789, + "grad_norm": 1.7118934392929077, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6873, + "step": 2046 + }, + { + "epoch": 0.014563532270108998, + "grad_norm": 1.889208197593689, + "learning_rate": 4.995544899210594e-05, + "loss": 0.692, + "step": 2077 + }, + { + "epoch": 0.014780898423394207, + "grad_norm": 2.0081522464752197, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.6823, + "step": 2108 + }, + { + "epoch": 0.014998264576679417, + "grad_norm": 1.7950328588485718, + "learning_rate": 4.994486281210429e-05, + "loss": 0.686, + "step": 2139 + }, + { + "epoch": 0.015215630729964626, + "grad_norm": 1.758333444595337, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6827, + "step": 2170 + }, + { + "epoch": 0.015432996883249834, + "grad_norm": 1.496063470840454, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6896, + "step": 2201 + }, + { + "epoch": 0.015650363036535043, + "grad_norm": 1.6103190183639526, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6865, + "step": 2232 + }, + { + "epoch": 0.015867729189820254, + "grad_norm": 1.7069604396820068, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6836, + "step": 2263 + }, + { + "epoch": 0.01608509534310546, + "grad_norm": 1.8261148929595947, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.692, + "step": 2294 + }, + { + "epoch": 0.01630246149639067, + "grad_norm": 1.7522642612457275, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6832, + "step": 2325 + }, + { + "epoch": 0.01651982764967588, + "grad_norm": 1.6746042966842651, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6829, + "step": 2356 + }, + { + "epoch": 0.016737193802961087, + "grad_norm": 1.6785473823547363, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6842, + "step": 2387 + }, + { + "epoch": 0.0169545599562463, + "grad_norm": 1.936916470527649, + "learning_rate": 4.988329891293693e-05, + "loss": 0.6798, + "step": 2418 + }, + { + "epoch": 0.017171926109531506, + "grad_norm": 1.5104129314422607, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6743, + "step": 2449 + }, + { + "epoch": 0.017389292262816713, + "grad_norm": 1.3770678043365479, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.6874, + "step": 2480 + }, + { + "epoch": 0.017606658416101924, + "grad_norm": 1.4900861978530884, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.679, + "step": 2511 + }, + { + "epoch": 0.017824024569387132, + "grad_norm": 1.5600172281265259, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6719, + "step": 2542 + }, + { + "epoch": 0.018041390722672343, + "grad_norm": 1.660237431526184, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6773, + "step": 2573 + }, + { + "epoch": 0.01825875687595755, + "grad_norm": 1.4618791341781616, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6767, + "step": 2604 + }, + { + "epoch": 0.018476123029242758, + "grad_norm": 1.5567563772201538, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6706, + "step": 2635 + }, + { + "epoch": 0.01869348918252797, + "grad_norm": 1.4879790544509888, + "learning_rate": 4.980947995086024e-05, + "loss": 0.676, + "step": 2666 + }, + { + "epoch": 0.018910855335813177, + "grad_norm": 1.5527766942977905, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6734, + "step": 2697 + }, + { + "epoch": 0.019128221489098388, + "grad_norm": 1.7922642230987549, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6813, + "step": 2728 + }, + { + "epoch": 0.019345587642383595, + "grad_norm": 1.6453475952148438, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6749, + "step": 2759 + }, + { + "epoch": 0.019562953795668803, + "grad_norm": 1.4713648557662964, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6761, + "step": 2790 + }, + { + "epoch": 0.019780319948954014, + "grad_norm": 1.5513399839401245, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6739, + "step": 2821 + }, + { + "epoch": 0.01999768610223922, + "grad_norm": 1.431796908378601, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6639, + "step": 2852 + }, + { + "epoch": 0.020215052255524432, + "grad_norm": 1.4959752559661865, + "learning_rate": 4.973018858007122e-05, + "loss": 0.662, + "step": 2883 + }, + { + "epoch": 0.02043241840880964, + "grad_norm": 1.4675205945968628, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6659, + "step": 2914 + }, + { + "epoch": 0.020649784562094847, + "grad_norm": 1.7111692428588867, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6732, + "step": 2945 + }, + { + "epoch": 0.020867150715380058, + "grad_norm": 1.542748212814331, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6666, + "step": 2976 + }, + { + "epoch": 0.021084516868665266, + "grad_norm": 1.5013272762298584, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.667, + "step": 3007 + }, + { + "epoch": 0.021301883021950477, + "grad_norm": 1.57589852809906, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6668, + "step": 3038 + }, + { + "epoch": 0.021519249175235684, + "grad_norm": 1.612727403640747, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6653, + "step": 3069 + }, + { + "epoch": 0.021736615328520892, + "grad_norm": 1.5110771656036377, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6694, + "step": 3100 + }, + { + "epoch": 0.021953981481806103, + "grad_norm": 1.5836044549942017, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6634, + "step": 3131 + }, + { + "epoch": 0.02217134763509131, + "grad_norm": 1.5767654180526733, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6633, + "step": 3162 + }, + { + "epoch": 0.02238871378837652, + "grad_norm": 1.6058984994888306, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6646, + "step": 3193 + }, + { + "epoch": 0.02260607994166173, + "grad_norm": 1.4564005136489868, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.663, + "step": 3224 + }, + { + "epoch": 0.022823446094946936, + "grad_norm": 1.5274450778961182, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6554, + "step": 3255 + }, + { + "epoch": 0.023040812248232147, + "grad_norm": 1.524122953414917, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6666, + "step": 3286 + }, + { + "epoch": 0.023258178401517355, + "grad_norm": 1.5017430782318115, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6661, + "step": 3317 + }, + { + "epoch": 0.023475544554802566, + "grad_norm": 1.4247208833694458, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6563, + "step": 3348 + }, + { + "epoch": 0.023692910708087773, + "grad_norm": 1.3451093435287476, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6555, + "step": 3379 + }, + { + "epoch": 0.023910276861372984, + "grad_norm": 1.418442726135254, + "learning_rate": 4.948079823064559e-05, + "loss": 0.6587, + "step": 3410 + }, + { + "epoch": 0.024127643014658192, + "grad_norm": 1.5460575819015503, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6602, + "step": 3441 + }, + { + "epoch": 0.0243450091679434, + "grad_norm": 1.4053966999053955, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6578, + "step": 3472 + }, + { + "epoch": 0.02456237532122861, + "grad_norm": 1.420806646347046, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6544, + "step": 3503 + }, + { + "epoch": 0.024779741474513818, + "grad_norm": 1.3220508098602295, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6642, + "step": 3534 + }, + { + "epoch": 0.02499710762779903, + "grad_norm": 1.3819468021392822, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6534, + "step": 3565 + }, + { + "epoch": 0.025214473781084237, + "grad_norm": 1.4046236276626587, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6547, + "step": 3596 + }, + { + "epoch": 0.025431839934369444, + "grad_norm": 1.965153455734253, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6582, + "step": 3627 + }, + { + "epoch": 0.025649206087654655, + "grad_norm": 1.6758291721343994, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6536, + "step": 3658 + }, + { + "epoch": 0.025866572240939863, + "grad_norm": 1.4467246532440186, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6564, + "step": 3689 + }, + { + "epoch": 0.026083938394225074, + "grad_norm": 1.5305490493774414, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6626, + "step": 3720 + }, + { + "epoch": 0.02630130454751028, + "grad_norm": 1.3826133012771606, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6624, + "step": 3751 + }, + { + "epoch": 0.02651867070079549, + "grad_norm": 1.4920278787612915, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6417, + "step": 3782 + }, + { + "epoch": 0.0267360368540807, + "grad_norm": 14.533783912658691, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.647, + "step": 3813 + }, + { + "epoch": 0.026953403007365907, + "grad_norm": 1.4659409523010254, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6514, + "step": 3844 + }, + { + "epoch": 0.02717076916065112, + "grad_norm": 1.4820138216018677, + "learning_rate": 4.919443027766935e-05, + "loss": 0.6477, + "step": 3875 + }, + { + "epoch": 0.027388135313936326, + "grad_norm": 1.2543880939483643, + "learning_rate": 4.91731463569418e-05, + "loss": 0.6505, + "step": 3906 + }, + { + "epoch": 0.027605501467221533, + "grad_norm": 1.640926718711853, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6488, + "step": 3937 + }, + { + "epoch": 0.027822867620506744, + "grad_norm": 1.3990615606307983, + "learning_rate": 4.912976038673786e-05, + "loss": 0.658, + "step": 3968 + }, + { + "epoch": 0.028040233773791952, + "grad_norm": 1.2064067125320435, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6487, + "step": 3999 + }, + { + "epoch": 0.028257599927077163, + "grad_norm": 1.5103769302368164, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6569, + "step": 4030 + }, + { + "epoch": 0.02847496608036237, + "grad_norm": 1.5314087867736816, + "learning_rate": 4.906263980464644e-05, + "loss": 0.6638, + "step": 4061 + }, + { + "epoch": 0.028692332233647578, + "grad_norm": 1.4619168043136597, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6505, + "step": 4092 + }, + { + "epoch": 0.02890969838693279, + "grad_norm": 1.3979246616363525, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6505, + "step": 4123 + }, + { + "epoch": 0.029127064540217996, + "grad_norm": 1.3866580724716187, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6461, + "step": 4154 + }, + { + "epoch": 0.029344430693503207, + "grad_norm": 1.4662801027297974, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6448, + "step": 4185 + }, + { + "epoch": 0.029561796846788415, + "grad_norm": 1.3171806335449219, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6525, + "step": 4216 + }, + { + "epoch": 0.029779163000073623, + "grad_norm": 1.3289718627929688, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6445, + "step": 4247 + }, + { + "epoch": 0.029996529153358834, + "grad_norm": 1.3527250289916992, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6449, + "step": 4278 + }, + { + "epoch": 0.03021389530664404, + "grad_norm": 1.314674735069275, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6559, + "step": 4309 + }, + { + "epoch": 0.030431261459929252, + "grad_norm": 1.2237507104873657, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6468, + "step": 4340 + }, + { + "epoch": 0.03064862761321446, + "grad_norm": 1.423965573310852, + "learning_rate": 4.882129447892753e-05, + "loss": 0.6465, + "step": 4371 + }, + { + "epoch": 0.030865993766499667, + "grad_norm": 1.271371603012085, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6471, + "step": 4402 + }, + { + "epoch": 0.031083359919784878, + "grad_norm": 1.5492364168167114, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6444, + "step": 4433 + }, + { + "epoch": 0.031300726073070086, + "grad_norm": 1.5139328241348267, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6512, + "step": 4464 + }, + { + "epoch": 0.03151809222635529, + "grad_norm": 1.4510358572006226, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6485, + "step": 4495 + }, + { + "epoch": 0.03173545837964051, + "grad_norm": 1.3228867053985596, + "learning_rate": 4.869052379269719e-05, + "loss": 0.6416, + "step": 4526 + }, + { + "epoch": 0.031952824532925715, + "grad_norm": 1.2731959819793701, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6477, + "step": 4557 + }, + { + "epoch": 0.03217019068621092, + "grad_norm": 1.2540090084075928, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6412, + "step": 4588 + }, + { + "epoch": 0.03238755683949613, + "grad_norm": 1.662154197692871, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6464, + "step": 4619 + }, + { + "epoch": 0.03260492299278134, + "grad_norm": 1.5419702529907227, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6421, + "step": 4650 + }, + { + "epoch": 0.03282228914606655, + "grad_norm": 1.6409112215042114, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6468, + "step": 4681 + }, + { + "epoch": 0.03303965529935176, + "grad_norm": 1.1416597366333008, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6396, + "step": 4712 + }, + { + "epoch": 0.03325702145263697, + "grad_norm": 1.215846061706543, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6517, + "step": 4743 + }, + { + "epoch": 0.033474387605922175, + "grad_norm": 1.3075084686279297, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6389, + "step": 4774 + }, + { + "epoch": 0.03369175375920738, + "grad_norm": 1.6089972257614136, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6371, + "step": 4805 + }, + { + "epoch": 0.0339091199124926, + "grad_norm": 1.3927685022354126, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6355, + "step": 4836 + }, + { + "epoch": 0.034126486065777804, + "grad_norm": 1.197952389717102, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6415, + "step": 4867 + }, + { + "epoch": 0.03434385221906301, + "grad_norm": 1.2738877534866333, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6442, + "step": 4898 + }, + { + "epoch": 0.03456121837234822, + "grad_norm": 1.5164271593093872, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6406, + "step": 4929 + }, + { + "epoch": 0.03477858452563343, + "grad_norm": 1.242473840713501, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6441, + "step": 4960 + }, + { + "epoch": 0.03499595067891864, + "grad_norm": 1.2919869422912598, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6487, + "step": 4991 + }, + { + "epoch": 0.03521331683220385, + "grad_norm": 2.6493895053863525, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6398, + "step": 5022 + }, + { + "epoch": 0.035430682985489056, + "grad_norm": 1.2134305238723755, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6398, + "step": 5053 + }, + { + "epoch": 0.035648049138774264, + "grad_norm": 1.1468703746795654, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.633, + "step": 5084 + }, + { + "epoch": 0.03586541529205947, + "grad_norm": 1.3923726081848145, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6295, + "step": 5115 + }, + { + "epoch": 0.036082781445344686, + "grad_norm": 1.5531644821166992, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6345, + "step": 5146 + }, + { + "epoch": 0.036300147598629894, + "grad_norm": 1.201889991760254, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6414, + "step": 5177 + }, + { + "epoch": 0.0365175137519151, + "grad_norm": 1.4365577697753906, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6364, + "step": 5208 + }, + { + "epoch": 0.03673487990520031, + "grad_norm": 1.210980772972107, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6353, + "step": 5239 + }, + { + "epoch": 0.036952246058485516, + "grad_norm": 1.379381775856018, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6293, + "step": 5270 + }, + { + "epoch": 0.03716961221177073, + "grad_norm": 1.2263178825378418, + "learning_rate": 4.793722210363262e-05, + "loss": 0.629, + "step": 5301 + }, + { + "epoch": 0.03738697836505594, + "grad_norm": 1.2448405027389526, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6474, + "step": 5332 + }, + { + "epoch": 0.037604344518341146, + "grad_norm": 1.257132887840271, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6426, + "step": 5363 + }, + { + "epoch": 0.03782171067162635, + "grad_norm": 1.3763643503189087, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6396, + "step": 5394 + }, + { + "epoch": 0.03803907682491156, + "grad_norm": 1.3164068460464478, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6343, + "step": 5425 + }, + { + "epoch": 0.038256442978196775, + "grad_norm": 1.2034872770309448, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.629, + "step": 5456 + }, + { + "epoch": 0.03847380913148198, + "grad_norm": 1.3156630992889404, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6403, + "step": 5487 + }, + { + "epoch": 0.03869117528476719, + "grad_norm": 1.0719150304794312, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6318, + "step": 5518 + }, + { + "epoch": 0.0389085414380524, + "grad_norm": 1.3054882287979126, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6419, + "step": 5549 + }, + { + "epoch": 0.039125907591337605, + "grad_norm": 1.2081729173660278, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6461, + "step": 5580 + }, + { + "epoch": 0.03934327374462282, + "grad_norm": 1.1728904247283936, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6473, + "step": 5611 + }, + { + "epoch": 0.03956063989790803, + "grad_norm": 1.2552399635314941, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.6373, + "step": 5642 + }, + { + "epoch": 0.039778006051193235, + "grad_norm": 1.299212098121643, + "learning_rate": 4.751783684659e-05, + "loss": 0.6234, + "step": 5673 + }, + { + "epoch": 0.03999537220447844, + "grad_norm": 2.0746827125549316, + "learning_rate": 4.748122674890348e-05, + "loss": 0.6264, + "step": 5704 + }, + { + "epoch": 0.04021273835776365, + "grad_norm": 1.3712407350540161, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6252, + "step": 5735 + }, + { + "epoch": 0.040430104511048864, + "grad_norm": 1.2094186544418335, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6265, + "step": 5766 + }, + { + "epoch": 0.04064747066433407, + "grad_norm": 1.2487757205963135, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6305, + "step": 5797 + }, + { + "epoch": 0.04086483681761928, + "grad_norm": 1.5805151462554932, + "learning_rate": 4.733225355658999e-05, + "loss": 0.631, + "step": 5828 + }, + { + "epoch": 0.04108220297090449, + "grad_norm": 1.2615118026733398, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.636, + "step": 5859 + }, + { + "epoch": 0.041299569124189695, + "grad_norm": 1.6970707178115845, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6271, + "step": 5890 + }, + { + "epoch": 0.04151693527747491, + "grad_norm": 1.4279624223709106, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6267, + "step": 5921 + }, + { + "epoch": 0.041734301430760117, + "grad_norm": 1.3471580743789673, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6289, + "step": 5952 + }, + { + "epoch": 0.041951667584045324, + "grad_norm": 1.5088621377944946, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6263, + "step": 5983 + }, + { + "epoch": 0.04216903373733053, + "grad_norm": 1.3061436414718628, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6266, + "step": 6014 + }, + { + "epoch": 0.04238639989061574, + "grad_norm": 1.1800014972686768, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6259, + "step": 6045 + }, + { + "epoch": 0.042603766043900954, + "grad_norm": 1.195177435874939, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6251, + "step": 6076 + }, + { + "epoch": 0.04282113219718616, + "grad_norm": 1.1905118227005005, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6217, + "step": 6107 + }, + { + "epoch": 0.04303849835047137, + "grad_norm": 1.2270928621292114, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6352, + "step": 6138 + }, + { + "epoch": 0.043255864503756576, + "grad_norm": 1.209226369857788, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6249, + "step": 6169 + }, + { + "epoch": 0.043473230657041784, + "grad_norm": 1.1949187517166138, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6371, + "step": 6200 + }, + { + "epoch": 0.043690596810327, + "grad_norm": 1.2346535921096802, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6177, + "step": 6231 + }, + { + "epoch": 0.043907962963612206, + "grad_norm": 1.2187124490737915, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6268, + "step": 6262 + }, + { + "epoch": 0.04412532911689741, + "grad_norm": 1.2187339067459106, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6261, + "step": 6293 + }, + { + "epoch": 0.04434269527018262, + "grad_norm": 1.320764422416687, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6212, + "step": 6324 + }, + { + "epoch": 0.04456006142346783, + "grad_norm": 1.3396878242492676, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.6241, + "step": 6355 + }, + { + "epoch": 0.04477742757675304, + "grad_norm": 1.2472412586212158, + "learning_rate": 4.661238099862658e-05, + "loss": 0.6241, + "step": 6386 + }, + { + "epoch": 0.04499479373003825, + "grad_norm": 1.3773880004882812, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6245, + "step": 6417 + }, + { + "epoch": 0.04521215988332346, + "grad_norm": 1.1602933406829834, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.6221, + "step": 6448 + }, + { + "epoch": 0.045429526036608665, + "grad_norm": 1.1719809770584106, + "learning_rate": 4.648464661063478e-05, + "loss": 0.629, + "step": 6479 + }, + { + "epoch": 0.04564689218989387, + "grad_norm": 1.1973191499710083, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.6239, + "step": 6510 + }, + { + "epoch": 0.04586425834317909, + "grad_norm": 1.1805142164230347, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.6242, + "step": 6541 + }, + { + "epoch": 0.046081624496464295, + "grad_norm": 1.3194178342819214, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6316, + "step": 6572 + }, + { + "epoch": 0.0462989906497495, + "grad_norm": 1.1395046710968018, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6207, + "step": 6603 + }, + { + "epoch": 0.04651635680303471, + "grad_norm": 1.5031641721725464, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6218, + "step": 6634 + }, + { + "epoch": 0.04673372295631992, + "grad_norm": 1.0985206365585327, + "learning_rate": 4.622264489304762e-05, + "loss": 0.6198, + "step": 6665 + }, + { + "epoch": 0.04695108910960513, + "grad_norm": 1.2820847034454346, + "learning_rate": 4.617813681048434e-05, + "loss": 0.626, + "step": 6696 + }, + { + "epoch": 0.04716845526289034, + "grad_norm": 1.4487061500549316, + "learning_rate": 4.61333897355256e-05, + "loss": 0.622, + "step": 6727 + }, + { + "epoch": 0.04738582141617555, + "grad_norm": 1.1577301025390625, + "learning_rate": 4.608840417313604e-05, + "loss": 0.6244, + "step": 6758 + }, + { + "epoch": 0.047603187569460755, + "grad_norm": 1.1363381147384644, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6206, + "step": 6789 + }, + { + "epoch": 0.04782055372274597, + "grad_norm": 1.22281813621521, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6231, + "step": 6820 + }, + { + "epoch": 0.048037919876031177, + "grad_norm": 1.156031847000122, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6223, + "step": 6851 + }, + { + "epoch": 0.048255286029316384, + "grad_norm": 1.1688473224639893, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6293, + "step": 6882 + }, + { + "epoch": 0.04847265218260159, + "grad_norm": 1.4265236854553223, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6109, + "step": 6913 + }, + { + "epoch": 0.0486900183358868, + "grad_norm": 1.1242969036102295, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6256, + "step": 6944 + }, + { + "epoch": 0.048907384489172014, + "grad_norm": 1.3943792581558228, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6093, + "step": 6975 + }, + { + "epoch": 0.04912475064245722, + "grad_norm": 1.0761889219284058, + "learning_rate": 4.571999560773736e-05, + "loss": 0.6207, + "step": 7006 + }, + { + "epoch": 0.04934211679574243, + "grad_norm": 1.2784191370010376, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6227, + "step": 7037 + }, + { + "epoch": 0.049559482949027636, + "grad_norm": 1.4215577840805054, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6148, + "step": 7068 + }, + { + "epoch": 0.049776849102312844, + "grad_norm": 1.1120682954788208, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.6192, + "step": 7099 + }, + { + "epoch": 0.04999421525559806, + "grad_norm": 1.1718186140060425, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6179, + "step": 7130 + }, + { + "epoch": 0.050211581408883266, + "grad_norm": 1.5078628063201904, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6245, + "step": 7161 + }, + { + "epoch": 0.05042894756216847, + "grad_norm": 1.1740144491195679, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6242, + "step": 7192 + }, + { + "epoch": 0.05064631371545368, + "grad_norm": 1.2450133562088013, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6194, + "step": 7223 + }, + { + "epoch": 0.05086367986873889, + "grad_norm": 1.181887149810791, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6188, + "step": 7254 + }, + { + "epoch": 0.0510810460220241, + "grad_norm": 1.172691822052002, + "learning_rate": 4.528766329570536e-05, + "loss": 0.6191, + "step": 7285 + }, + { + "epoch": 0.05129841217530931, + "grad_norm": 1.148863673210144, + "learning_rate": 4.523847434837447e-05, + "loss": 0.6234, + "step": 7316 + }, + { + "epoch": 0.05151577832859452, + "grad_norm": 1.2533507347106934, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6166, + "step": 7347 + }, + { + "epoch": 0.051733144481879725, + "grad_norm": 1.1888121366500854, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6176, + "step": 7378 + }, + { + "epoch": 0.05195051063516493, + "grad_norm": 1.0766541957855225, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6137, + "step": 7409 + }, + { + "epoch": 0.05216787678845015, + "grad_norm": 1.2277822494506836, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6231, + "step": 7440 + }, + { + "epoch": 0.052385242941735355, + "grad_norm": 1.0808931589126587, + "learning_rate": 4.498911497712155e-05, + "loss": 0.6209, + "step": 7471 + }, + { + "epoch": 0.05260260909502056, + "grad_norm": 1.0840133428573608, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6144, + "step": 7502 + }, + { + "epoch": 0.05281997524830577, + "grad_norm": 1.5881050825119019, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6114, + "step": 7533 + }, + { + "epoch": 0.05303734140159098, + "grad_norm": 1.1180490255355835, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.6134, + "step": 7564 + }, + { + "epoch": 0.05325470755487619, + "grad_norm": 1.1469063758850098, + "learning_rate": 4.478556380141218e-05, + "loss": 0.6122, + "step": 7595 + }, + { + "epoch": 0.0534720737081614, + "grad_norm": 1.2977004051208496, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.615, + "step": 7626 + }, + { + "epoch": 0.05368943986144661, + "grad_norm": 1.0556434392929077, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.6187, + "step": 7657 + }, + { + "epoch": 0.053906806014731815, + "grad_norm": 1.101298451423645, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6123, + "step": 7688 + }, + { + "epoch": 0.05412417216801702, + "grad_norm": 1.262608528137207, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6149, + "step": 7719 + }, + { + "epoch": 0.05434153832130224, + "grad_norm": 1.1554538011550903, + "learning_rate": 4.452610552959183e-05, + "loss": 0.6168, + "step": 7750 + }, + { + "epoch": 0.054558904474587444, + "grad_norm": 1.1653157472610474, + "learning_rate": 4.447355047201428e-05, + "loss": 0.615, + "step": 7781 + }, + { + "epoch": 0.05477627062787265, + "grad_norm": 1.0511231422424316, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.6215, + "step": 7812 + }, + { + "epoch": 0.05499363678115786, + "grad_norm": 1.3468266725540161, + "learning_rate": 4.436778168330484e-05, + "loss": 0.6221, + "step": 7843 + }, + { + "epoch": 0.05521100293444307, + "grad_norm": 1.3238797187805176, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.611, + "step": 7874 + }, + { + "epoch": 0.05542836908772828, + "grad_norm": 1.1107763051986694, + "learning_rate": 4.42611386459262e-05, + "loss": 0.6119, + "step": 7905 + }, + { + "epoch": 0.05564573524101349, + "grad_norm": 1.3667259216308594, + "learning_rate": 4.420749078676133e-05, + "loss": 0.6134, + "step": 7936 + }, + { + "epoch": 0.055863101394298696, + "grad_norm": 1.0336949825286865, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.613, + "step": 7967 + }, + { + "epoch": 0.056080467547583904, + "grad_norm": 1.0911316871643066, + "learning_rate": 4.409954541451762e-05, + "loss": 0.6107, + "step": 7998 + }, + { + "epoch": 0.05629783370086911, + "grad_norm": 1.2516382932662964, + "learning_rate": 4.404524911958764e-05, + "loss": 0.6138, + "step": 8029 + }, + { + "epoch": 0.056515199854154326, + "grad_norm": 1.1678277254104614, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6162, + "step": 8060 + }, + { + "epoch": 0.05673256600743953, + "grad_norm": 1.4072996377944946, + "learning_rate": 4.393601237573607e-05, + "loss": 0.614, + "step": 8091 + }, + { + "epoch": 0.05694993216072474, + "grad_norm": 1.2312837839126587, + "learning_rate": 4.388107315953628e-05, + "loss": 0.6203, + "step": 8122 + }, + { + "epoch": 0.05716729831400995, + "grad_norm": 1.1720649003982544, + "learning_rate": 4.382592087299212e-05, + "loss": 0.619, + "step": 8153 + }, + { + "epoch": 0.057384664467295156, + "grad_norm": 1.0711950063705444, + "learning_rate": 4.377055613848964e-05, + "loss": 0.6124, + "step": 8184 + }, + { + "epoch": 0.05760203062058037, + "grad_norm": 1.1664263010025024, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6073, + "step": 8215 + }, + { + "epoch": 0.05781939677386558, + "grad_norm": 0.9937831163406372, + "learning_rate": 4.365919182713416e-05, + "loss": 0.6101, + "step": 8246 + }, + { + "epoch": 0.058036762927150785, + "grad_norm": 1.0545841455459595, + "learning_rate": 4.360319350701226e-05, + "loss": 0.6089, + "step": 8277 + }, + { + "epoch": 0.05825412908043599, + "grad_norm": 1.120007038116455, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6078, + "step": 8308 + }, + { + "epoch": 0.0584714952337212, + "grad_norm": 1.1087831258773804, + "learning_rate": 4.349056769754021e-05, + "loss": 0.601, + "step": 8339 + }, + { + "epoch": 0.058688861387006415, + "grad_norm": 1.1370675563812256, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.6103, + "step": 8370 + }, + { + "epoch": 0.05890622754029162, + "grad_norm": 1.182654857635498, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.6115, + "step": 8401 + }, + { + "epoch": 0.05912359369357683, + "grad_norm": 1.2606432437896729, + "learning_rate": 4.332006561018488e-05, + "loss": 0.6139, + "step": 8432 + }, + { + "epoch": 0.05934095984686204, + "grad_norm": 1.1618047952651978, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.6053, + "step": 8463 + }, + { + "epoch": 0.059558326000147245, + "grad_norm": 1.1281821727752686, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.6084, + "step": 8494 + }, + { + "epoch": 0.05977569215343246, + "grad_norm": 1.15248703956604, + "learning_rate": 4.314770288177384e-05, + "loss": 0.6128, + "step": 8525 + }, + { + "epoch": 0.05999305830671767, + "grad_norm": 1.2170960903167725, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6134, + "step": 8556 + }, + { + "epoch": 0.060210424460002875, + "grad_norm": 1.085260033607483, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.6212, + "step": 8587 + }, + { + "epoch": 0.06042779061328808, + "grad_norm": 1.2243624925613403, + "learning_rate": 4.297349701798505e-05, + "loss": 0.605, + "step": 8618 + }, + { + "epoch": 0.06064515676657329, + "grad_norm": 1.124769687652588, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.6074, + "step": 8649 + }, + { + "epoch": 0.060862522919858504, + "grad_norm": 1.893545150756836, + "learning_rate": 4.285634454093198e-05, + "loss": 0.6055, + "step": 8680 + }, + { + "epoch": 0.06107988907314371, + "grad_norm": 1.2186870574951172, + "learning_rate": 4.279746571169086e-05, + "loss": 0.6121, + "step": 8711 + }, + { + "epoch": 0.06129725522642892, + "grad_norm": 1.0006957054138184, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.6079, + "step": 8742 + }, + { + "epoch": 0.06151462137971413, + "grad_norm": 1.0913995504379272, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.6107, + "step": 8773 + }, + { + "epoch": 0.061731987532999334, + "grad_norm": 1.4647611379623413, + "learning_rate": 4.261962684116106e-05, + "loss": 0.6188, + "step": 8804 + }, + { + "epoch": 0.06194935368628455, + "grad_norm": 1.0805052518844604, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.6137, + "step": 8835 + }, + { + "epoch": 0.062166719839569756, + "grad_norm": 1.0512675046920776, + "learning_rate": 4.250007230372134e-05, + "loss": 0.6052, + "step": 8866 + }, + { + "epoch": 0.062384085992854964, + "grad_norm": 1.1758863925933838, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.6148, + "step": 8897 + }, + { + "epoch": 0.06260145214614017, + "grad_norm": 1.0526351928710938, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.5973, + "step": 8928 + }, + { + "epoch": 0.06281881829942539, + "grad_norm": 1.077563762664795, + "learning_rate": 4.231926105340768e-05, + "loss": 0.6114, + "step": 8959 + }, + { + "epoch": 0.06303618445271059, + "grad_norm": 1.0116938352584839, + "learning_rate": 4.225859883654776e-05, + "loss": 0.6146, + "step": 8990 + }, + { + "epoch": 0.0632535506059958, + "grad_norm": 1.1967271566390991, + "learning_rate": 4.219774185874569e-05, + "loss": 0.5972, + "step": 9021 + }, + { + "epoch": 0.06347091675928102, + "grad_norm": 1.1610004901885986, + "learning_rate": 4.213669080676418e-05, + "loss": 0.6081, + "step": 9052 + }, + { + "epoch": 0.06368828291256622, + "grad_norm": 1.0253360271453857, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.6118, + "step": 9083 + }, + { + "epoch": 0.06390564906585143, + "grad_norm": 1.140599012374878, + "learning_rate": 4.201400923825648e-05, + "loss": 0.6091, + "step": 9114 + }, + { + "epoch": 0.06412301521913663, + "grad_norm": 1.0821545124053955, + "learning_rate": 4.195238010617511e-05, + "loss": 0.6028, + "step": 9145 + }, + { + "epoch": 0.06434038137242185, + "grad_norm": 1.2161924839019775, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.6113, + "step": 9176 + }, + { + "epoch": 0.06455774752570706, + "grad_norm": 1.128986120223999, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.6125, + "step": 9207 + }, + { + "epoch": 0.06477511367899226, + "grad_norm": 1.0566622018814087, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.6055, + "step": 9238 + }, + { + "epoch": 0.06499247983227747, + "grad_norm": 1.0849272012710571, + "learning_rate": 4.170395751189495e-05, + "loss": 0.6059, + "step": 9269 + }, + { + "epoch": 0.06520984598556268, + "grad_norm": 1.2344416379928589, + "learning_rate": 4.164137885110921e-05, + "loss": 0.6081, + "step": 9300 + }, + { + "epoch": 0.06542721213884789, + "grad_norm": 1.0132557153701782, + "learning_rate": 4.157861239462495e-05, + "loss": 0.6103, + "step": 9331 + }, + { + "epoch": 0.0656445782921331, + "grad_norm": 1.0176981687545776, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.6086, + "step": 9362 + }, + { + "epoch": 0.0658619444454183, + "grad_norm": 1.2429553270339966, + "learning_rate": 4.145251892991588e-05, + "loss": 0.6091, + "step": 9393 + }, + { + "epoch": 0.06607931059870352, + "grad_norm": 1.1481480598449707, + "learning_rate": 4.138919334463868e-05, + "loss": 0.6048, + "step": 9424 + }, + { + "epoch": 0.06629667675198872, + "grad_norm": 0.9623891711235046, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.5995, + "step": 9455 + }, + { + "epoch": 0.06651404290527393, + "grad_norm": 1.2031786441802979, + "learning_rate": 4.126198804133398e-05, + "loss": 0.6008, + "step": 9486 + }, + { + "epoch": 0.06673140905855915, + "grad_norm": 1.3027530908584595, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.5993, + "step": 9517 + }, + { + "epoch": 0.06694877521184435, + "grad_norm": 1.1712346076965332, + "learning_rate": 4.113404868280107e-05, + "loss": 0.5996, + "step": 9548 + }, + { + "epoch": 0.06716614136512956, + "grad_norm": 1.1397625207901, + "learning_rate": 4.106980553625457e-05, + "loss": 0.5979, + "step": 9579 + }, + { + "epoch": 0.06738350751841476, + "grad_norm": 1.122983694076538, + "learning_rate": 4.100538104413674e-05, + "loss": 0.599, + "step": 9610 + }, + { + "epoch": 0.06760087367169998, + "grad_norm": 1.0782618522644043, + "learning_rate": 4.09407759334692e-05, + "loss": 0.6005, + "step": 9641 + }, + { + "epoch": 0.0678182398249852, + "grad_norm": 1.0563392639160156, + "learning_rate": 4.087599093331186e-05, + "loss": 0.6105, + "step": 9672 + }, + { + "epoch": 0.0680356059782704, + "grad_norm": 1.0684717893600464, + "learning_rate": 4.081102677475462e-05, + "loss": 0.6092, + "step": 9703 + }, + { + "epoch": 0.06825297213155561, + "grad_norm": 1.152811884880066, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.6035, + "step": 9734 + }, + { + "epoch": 0.06847033828484081, + "grad_norm": 1.1413112878799438, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.6018, + "step": 9765 + }, + { + "epoch": 0.06868770443812602, + "grad_norm": 0.981604814529419, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.6035, + "step": 9796 + }, + { + "epoch": 0.06890507059141124, + "grad_norm": 1.0713250637054443, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.597, + "step": 9827 + }, + { + "epoch": 0.06912243674469644, + "grad_norm": 0.949456512928009, + "learning_rate": 4.048354433517794e-05, + "loss": 0.6063, + "step": 9858 + }, + { + "epoch": 0.06933980289798165, + "grad_norm": 1.0702073574066162, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.5963, + "step": 9889 + }, + { + "epoch": 0.06955716905126685, + "grad_norm": 0.9974942803382874, + "learning_rate": 4.035132306369438e-05, + "loss": 0.611, + "step": 9920 + }, + { + "epoch": 0.06977453520455207, + "grad_norm": 1.0833642482757568, + "learning_rate": 4.028495219804555e-05, + "loss": 0.6014, + "step": 9951 + }, + { + "epoch": 0.06999190135783728, + "grad_norm": 1.181322693824768, + "learning_rate": 4.021840884378864e-05, + "loss": 0.599, + "step": 9982 + }, + { + "epoch": 0.07020926751112248, + "grad_norm": 1.0088189840316772, + "learning_rate": 4.015169375185633e-05, + "loss": 0.5955, + "step": 10013 + }, + { + "epoch": 0.0704266336644077, + "grad_norm": 1.0920222997665405, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.5985, + "step": 10044 + }, + { + "epoch": 0.0706439998176929, + "grad_norm": 1.0450975894927979, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.6069, + "step": 10075 + }, + { + "epoch": 0.07086136597097811, + "grad_norm": 1.1613439321517944, + "learning_rate": 3.995052558835377e-05, + "loss": 0.5992, + "step": 10106 + }, + { + "epoch": 0.07107873212426333, + "grad_norm": 1.2720811367034912, + "learning_rate": 3.988313109368017e-05, + "loss": 0.6012, + "step": 10137 + }, + { + "epoch": 0.07129609827754853, + "grad_norm": 1.0315334796905518, + "learning_rate": 3.981556864489504e-05, + "loss": 0.6149, + "step": 10168 + }, + { + "epoch": 0.07151346443083374, + "grad_norm": 1.3320891857147217, + "learning_rate": 3.974783900443142e-05, + "loss": 0.6013, + "step": 10199 + }, + { + "epoch": 0.07173083058411894, + "grad_norm": 1.7277195453643799, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.5983, + "step": 10230 + }, + { + "epoch": 0.07194819673740416, + "grad_norm": 1.1056753396987915, + "learning_rate": 3.961188120762596e-05, + "loss": 0.5952, + "step": 10261 + }, + { + "epoch": 0.07216556289068937, + "grad_norm": 1.3461558818817139, + "learning_rate": 3.954365458554938e-05, + "loss": 0.5975, + "step": 10292 + }, + { + "epoch": 0.07238292904397457, + "grad_norm": 1.1405422687530518, + "learning_rate": 3.947526384030751e-05, + "loss": 0.602, + "step": 10323 + }, + { + "epoch": 0.07260029519725979, + "grad_norm": 1.2912482023239136, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.6014, + "step": 10354 + }, + { + "epoch": 0.07281766135054499, + "grad_norm": 1.1548583507537842, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.6053, + "step": 10385 + }, + { + "epoch": 0.0730350275038302, + "grad_norm": 1.0558302402496338, + "learning_rate": 3.926911459260109e-05, + "loss": 0.6062, + "step": 10416 + }, + { + "epoch": 0.07325239365711542, + "grad_norm": 1.1658024787902832, + "learning_rate": 3.920007509089102e-05, + "loss": 0.6016, + "step": 10447 + }, + { + "epoch": 0.07346975981040062, + "grad_norm": 1.1093552112579346, + "learning_rate": 3.913087534326357e-05, + "loss": 0.6025, + "step": 10478 + }, + { + "epoch": 0.07368712596368583, + "grad_norm": 1.0908467769622803, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.5954, + "step": 10509 + }, + { + "epoch": 0.07390449211697103, + "grad_norm": 1.0856796503067017, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.5983, + "step": 10540 + }, + { + "epoch": 0.07412185827025625, + "grad_norm": 1.0330291986465454, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.5955, + "step": 10571 + }, + { + "epoch": 0.07433922442354146, + "grad_norm": 0.9688258171081543, + "learning_rate": 3.885248953871491e-05, + "loss": 0.6001, + "step": 10602 + }, + { + "epoch": 0.07455659057682666, + "grad_norm": 1.1808383464813232, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.5996, + "step": 10633 + }, + { + "epoch": 0.07477395673011188, + "grad_norm": 1.1267834901809692, + "learning_rate": 3.871235554965218e-05, + "loss": 0.5962, + "step": 10664 + }, + { + "epoch": 0.07499132288339708, + "grad_norm": 1.1001136302947998, + "learning_rate": 3.864205604623078e-05, + "loss": 0.6031, + "step": 10695 + }, + { + "epoch": 0.07520868903668229, + "grad_norm": 1.009621024131775, + "learning_rate": 3.857160259406107e-05, + "loss": 0.5976, + "step": 10726 + }, + { + "epoch": 0.0754260551899675, + "grad_norm": 0.933811366558075, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.604, + "step": 10757 + }, + { + "epoch": 0.0756434213432527, + "grad_norm": 1.2187680006027222, + "learning_rate": 3.843023702543556e-05, + "loss": 0.5983, + "step": 10788 + }, + { + "epoch": 0.07586078749653792, + "grad_norm": 1.0286976099014282, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.596, + "step": 10819 + }, + { + "epoch": 0.07607815364982312, + "grad_norm": 1.0752719640731812, + "learning_rate": 3.828826522492255e-05, + "loss": 0.596, + "step": 10850 + }, + { + "epoch": 0.07629551980310834, + "grad_norm": 1.1018482446670532, + "learning_rate": 3.821705398930713e-05, + "loss": 0.603, + "step": 10881 + }, + { + "epoch": 0.07651288595639355, + "grad_norm": 0.980965793132782, + "learning_rate": 3.814569360103385e-05, + "loss": 0.6021, + "step": 10912 + }, + { + "epoch": 0.07673025210967875, + "grad_norm": 0.9588106870651245, + "learning_rate": 3.807418486539499e-05, + "loss": 0.6017, + "step": 10943 + }, + { + "epoch": 0.07694761826296397, + "grad_norm": 1.2048044204711914, + "learning_rate": 3.80025285893569e-05, + "loss": 0.6049, + "step": 10974 + }, + { + "epoch": 0.07716498441624917, + "grad_norm": 1.0373165607452393, + "learning_rate": 3.793072558155093e-05, + "loss": 0.5914, + "step": 11005 + }, + { + "epoch": 0.07738235056953438, + "grad_norm": 1.0112730264663696, + "learning_rate": 3.785877665226426e-05, + "loss": 0.5949, + "step": 11036 + }, + { + "epoch": 0.0775997167228196, + "grad_norm": 1.0219500064849854, + "learning_rate": 3.778668261343079e-05, + "loss": 0.5918, + "step": 11067 + }, + { + "epoch": 0.0778170828761048, + "grad_norm": 1.1104828119277954, + "learning_rate": 3.771444427862192e-05, + "loss": 0.6029, + "step": 11098 + }, + { + "epoch": 0.07803444902939001, + "grad_norm": 0.9852742552757263, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.5839, + "step": 11129 + }, + { + "epoch": 0.07825181518267521, + "grad_norm": 1.0593205690383911, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.5966, + "step": 11160 + }, + { + "epoch": 0.07846918133596043, + "grad_norm": 1.2858383655548096, + "learning_rate": 3.749687165842753e-05, + "loss": 0.5939, + "step": 11191 + }, + { + "epoch": 0.07868654748924564, + "grad_norm": 1.2006022930145264, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.5965, + "step": 11222 + }, + { + "epoch": 0.07890391364253084, + "grad_norm": 0.9741569757461548, + "learning_rate": 3.735111675341645e-05, + "loss": 0.5971, + "step": 11253 + }, + { + "epoch": 0.07912127979581605, + "grad_norm": 1.0724074840545654, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.5901, + "step": 11284 + }, + { + "epoch": 0.07933864594910126, + "grad_norm": 1.102207064628601, + "learning_rate": 3.720480432728287e-05, + "loss": 0.602, + "step": 11315 + }, + { + "epoch": 0.07955601210238647, + "grad_norm": 1.0976966619491577, + "learning_rate": 3.71314411067092e-05, + "loss": 0.6002, + "step": 11346 + }, + { + "epoch": 0.07977337825567168, + "grad_norm": 1.0534948110580444, + "learning_rate": 3.70579409844715e-05, + "loss": 0.5801, + "step": 11377 + }, + { + "epoch": 0.07999074440895688, + "grad_norm": 1.1666125059127808, + "learning_rate": 3.698430479000865e-05, + "loss": 0.5992, + "step": 11408 + }, + { + "epoch": 0.0802081105622421, + "grad_norm": 1.0316112041473389, + "learning_rate": 3.691053335429509e-05, + "loss": 0.5942, + "step": 11439 + }, + { + "epoch": 0.0804254767155273, + "grad_norm": 1.0085538625717163, + "learning_rate": 3.683662750983147e-05, + "loss": 0.5978, + "step": 11470 + }, + { + "epoch": 0.08064284286881251, + "grad_norm": 1.0875978469848633, + "learning_rate": 3.676258809063518e-05, + "loss": 0.5949, + "step": 11501 + }, + { + "epoch": 0.08086020902209773, + "grad_norm": 1.1905012130737305, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.6018, + "step": 11532 + }, + { + "epoch": 0.08107757517538293, + "grad_norm": 1.1717733144760132, + "learning_rate": 3.661411187164166e-05, + "loss": 0.5986, + "step": 11563 + }, + { + "epoch": 0.08129494132866814, + "grad_norm": 1.053768515586853, + "learning_rate": 3.65396767473784e-05, + "loss": 0.5831, + "step": 11594 + }, + { + "epoch": 0.08151230748195334, + "grad_norm": 1.095425009727478, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.5967, + "step": 11625 + }, + { + "epoch": 0.08172967363523856, + "grad_norm": 1.005953311920166, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.5834, + "step": 11656 + }, + { + "epoch": 0.08194703978852377, + "grad_norm": 1.0991450548171997, + "learning_rate": 3.63155933997859e-05, + "loss": 0.5951, + "step": 11687 + }, + { + "epoch": 0.08216440594180897, + "grad_norm": 1.1069071292877197, + "learning_rate": 3.624064243537758e-05, + "loss": 0.5989, + "step": 11718 + }, + { + "epoch": 0.08238177209509419, + "grad_norm": 1.099542498588562, + "learning_rate": 3.616556462184716e-05, + "loss": 0.5949, + "step": 11749 + }, + { + "epoch": 0.08259913824837939, + "grad_norm": 1.0950508117675781, + "learning_rate": 3.609036080643755e-05, + "loss": 0.5911, + "step": 11780 + }, + { + "epoch": 0.0828165044016646, + "grad_norm": 0.9489970803260803, + "learning_rate": 3.60150318378136e-05, + "loss": 0.5922, + "step": 11811 + }, + { + "epoch": 0.08303387055494982, + "grad_norm": 0.9877674579620361, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.5927, + "step": 11842 + }, + { + "epoch": 0.08325123670823502, + "grad_norm": 1.060177206993103, + "learning_rate": 3.586400184263408e-05, + "loss": 0.5899, + "step": 11873 + }, + { + "epoch": 0.08346860286152023, + "grad_norm": 1.0531491041183472, + "learning_rate": 3.578830252043148e-05, + "loss": 0.6004, + "step": 11904 + }, + { + "epoch": 0.08368596901480543, + "grad_norm": 0.9791742563247681, + "learning_rate": 3.571248145370125e-05, + "loss": 0.5926, + "step": 11935 + }, + { + "epoch": 0.08390333516809065, + "grad_norm": 1.0492030382156372, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.5914, + "step": 11966 + }, + { + "epoch": 0.08412070132137586, + "grad_norm": 1.2035881280899048, + "learning_rate": 3.556047751054378e-05, + "loss": 0.589, + "step": 11997 + }, + { + "epoch": 0.08433806747466106, + "grad_norm": 1.0384137630462646, + "learning_rate": 3.548429634946039e-05, + "loss": 0.5894, + "step": 12028 + }, + { + "epoch": 0.08455543362794628, + "grad_norm": 1.1555323600769043, + "learning_rate": 3.540799687451768e-05, + "loss": 0.5899, + "step": 12059 + }, + { + "epoch": 0.08477279978123148, + "grad_norm": 1.2114530801773071, + "learning_rate": 3.533157994674485e-05, + "loss": 0.5888, + "step": 12090 + }, + { + "epoch": 0.08499016593451669, + "grad_norm": 1.145213007926941, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.5887, + "step": 12121 + }, + { + "epoch": 0.08520753208780191, + "grad_norm": 1.0313454866409302, + "learning_rate": 3.517839718344311e-05, + "loss": 0.598, + "step": 12152 + }, + { + "epoch": 0.08542489824108711, + "grad_norm": 1.018702745437622, + "learning_rate": 3.510163307656086e-05, + "loss": 0.5931, + "step": 12183 + }, + { + "epoch": 0.08564226439437232, + "grad_norm": 1.0365880727767944, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.5954, + "step": 12214 + }, + { + "epoch": 0.08585963054765752, + "grad_norm": 1.0658811330795288, + "learning_rate": 3.494776374368643e-05, + "loss": 0.5953, + "step": 12245 + }, + { + "epoch": 0.08607699670094274, + "grad_norm": 1.046661615371704, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.5786, + "step": 12276 + }, + { + "epoch": 0.08629436285422795, + "grad_norm": 1.0220295190811157, + "learning_rate": 3.479344537543164e-05, + "loss": 0.5879, + "step": 12307 + }, + { + "epoch": 0.08651172900751315, + "grad_norm": 1.0286898612976074, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.5882, + "step": 12338 + }, + { + "epoch": 0.08672909516079837, + "grad_norm": 1.060483694076538, + "learning_rate": 3.463868493762412e-05, + "loss": 0.5887, + "step": 12369 + }, + { + "epoch": 0.08694646131408357, + "grad_norm": 1.264788269996643, + "learning_rate": 3.456114112492418e-05, + "loss": 0.5895, + "step": 12400 + }, + { + "epoch": 0.08716382746736878, + "grad_norm": 0.9751125574111938, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.5846, + "step": 12431 + }, + { + "epoch": 0.087381193620654, + "grad_norm": 1.1198588609695435, + "learning_rate": 3.440573068727905e-05, + "loss": 0.5878, + "step": 12462 + }, + { + "epoch": 0.0875985597739392, + "grad_norm": 1.0425771474838257, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.5903, + "step": 12493 + }, + { + "epoch": 0.08781592592722441, + "grad_norm": 0.9820422530174255, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.5885, + "step": 12524 + }, + { + "epoch": 0.08803329208050961, + "grad_norm": 1.0234076976776123, + "learning_rate": 3.417182116258899e-05, + "loss": 0.5967, + "step": 12555 + }, + { + "epoch": 0.08825065823379483, + "grad_norm": 0.9766228795051575, + "learning_rate": 3.409364314116074e-05, + "loss": 0.5923, + "step": 12586 + }, + { + "epoch": 0.08846802438708004, + "grad_norm": 0.9033060073852539, + "learning_rate": 3.401536249920559e-05, + "loss": 0.5892, + "step": 12617 + }, + { + "epoch": 0.08868539054036524, + "grad_norm": 1.1264934539794922, + "learning_rate": 3.393698012010998e-05, + "loss": 0.5917, + "step": 12648 + }, + { + "epoch": 0.08890275669365046, + "grad_norm": 1.0052556991577148, + "learning_rate": 3.385849688840839e-05, + "loss": 0.5885, + "step": 12679 + }, + { + "epoch": 0.08912012284693566, + "grad_norm": 1.0888159275054932, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.5885, + "step": 12710 + }, + { + "epoch": 0.08933748900022087, + "grad_norm": 1.0458447933197021, + "learning_rate": 3.370123141100578e-05, + "loss": 0.5923, + "step": 12741 + }, + { + "epoch": 0.08955485515350609, + "grad_norm": 1.055434226989746, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.5889, + "step": 12772 + }, + { + "epoch": 0.08977222130679129, + "grad_norm": 1.000533103942871, + "learning_rate": 3.35435731658559e-05, + "loss": 0.5833, + "step": 12803 + }, + { + "epoch": 0.0899895874600765, + "grad_norm": 1.0112768411636353, + "learning_rate": 3.346459897862552e-05, + "loss": 0.5886, + "step": 12834 + }, + { + "epoch": 0.0902069536133617, + "grad_norm": 0.9883559942245483, + "learning_rate": 3.338552926954613e-05, + "loss": 0.5886, + "step": 12865 + }, + { + "epoch": 0.09042431976664692, + "grad_norm": 1.0933095216751099, + "learning_rate": 3.330636493090868e-05, + "loss": 0.5873, + "step": 12896 + }, + { + "epoch": 0.09064168591993213, + "grad_norm": 1.1089582443237305, + "learning_rate": 3.322710685607193e-05, + "loss": 0.5845, + "step": 12927 + }, + { + "epoch": 0.09085905207321733, + "grad_norm": 1.0311237573623657, + "learning_rate": 3.314775593945251e-05, + "loss": 0.588, + "step": 12958 + }, + { + "epoch": 0.09107641822650255, + "grad_norm": 1.0495847463607788, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.5839, + "step": 12989 + }, + { + "epoch": 0.09129378437978775, + "grad_norm": 1.046921730041504, + "learning_rate": 3.298877916376047e-05, + "loss": 0.5877, + "step": 13020 + }, + { + "epoch": 0.09151115053307296, + "grad_norm": 0.9927343130111694, + "learning_rate": 3.290915509871915e-05, + "loss": 0.5898, + "step": 13051 + }, + { + "epoch": 0.09172851668635817, + "grad_norm": 0.9319870471954346, + "learning_rate": 3.282944177993753e-05, + "loss": 0.5886, + "step": 13082 + }, + { + "epoch": 0.09194588283964338, + "grad_norm": 1.0408544540405273, + "learning_rate": 3.274964010696957e-05, + "loss": 0.5946, + "step": 13113 + }, + { + "epoch": 0.09216324899292859, + "grad_norm": 1.0984251499176025, + "learning_rate": 3.266975098036629e-05, + "loss": 0.5813, + "step": 13144 + }, + { + "epoch": 0.09238061514621379, + "grad_norm": 1.0376449823379517, + "learning_rate": 3.258977530166562e-05, + "loss": 0.5855, + "step": 13175 + }, + { + "epoch": 0.092597981299499, + "grad_norm": 0.9627104997634888, + "learning_rate": 3.250971397338227e-05, + "loss": 0.5846, + "step": 13206 + }, + { + "epoch": 0.09281534745278422, + "grad_norm": 1.2040653228759766, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.5872, + "step": 13237 + }, + { + "epoch": 0.09303271360606942, + "grad_norm": 1.0091586112976074, + "learning_rate": 3.234933798294859e-05, + "loss": 0.5911, + "step": 13268 + }, + { + "epoch": 0.09325007975935463, + "grad_norm": 1.0941787958145142, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.5844, + "step": 13299 + }, + { + "epoch": 0.09346744591263983, + "grad_norm": 0.9720754027366638, + "learning_rate": 3.218863024832985e-05, + "loss": 0.5898, + "step": 13330 + }, + { + "epoch": 0.09368481206592505, + "grad_norm": 0.8847430944442749, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.5804, + "step": 13361 + }, + { + "epoch": 0.09390217821921026, + "grad_norm": 1.0070865154266357, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.5916, + "step": 13392 + }, + { + "epoch": 0.09411954437249546, + "grad_norm": 0.9538979530334473, + "learning_rate": 3.194696249871729e-05, + "loss": 0.5984, + "step": 13423 + }, + { + "epoch": 0.09433691052578068, + "grad_norm": 1.0414469242095947, + "learning_rate": 3.186624857814164e-05, + "loss": 0.5828, + "step": 13454 + }, + { + "epoch": 0.0945542766790659, + "grad_norm": 1.0353257656097412, + "learning_rate": 3.178545717288401e-05, + "loss": 0.5835, + "step": 13485 + }, + { + "epoch": 0.0947716428323511, + "grad_norm": 1.0454679727554321, + "learning_rate": 3.170458919466444e-05, + "loss": 0.5909, + "step": 13516 + }, + { + "epoch": 0.09498900898563631, + "grad_norm": 1.246871829032898, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.5829, + "step": 13547 + }, + { + "epoch": 0.09520637513892151, + "grad_norm": 1.0331645011901855, + "learning_rate": 3.154262717052985e-05, + "loss": 0.5919, + "step": 13578 + }, + { + "epoch": 0.09542374129220672, + "grad_norm": 0.8984120488166809, + "learning_rate": 3.146153495233426e-05, + "loss": 0.5853, + "step": 13609 + }, + { + "epoch": 0.09564110744549194, + "grad_norm": 1.0582929849624634, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.5828, + "step": 13640 + }, + { + "epoch": 0.09585847359877714, + "grad_norm": 1.151041865348816, + "learning_rate": 3.129913267924946e-05, + "loss": 0.582, + "step": 13671 + }, + { + "epoch": 0.09607583975206235, + "grad_norm": 0.9678122401237488, + "learning_rate": 3.121782445704782e-05, + "loss": 0.5865, + "step": 13702 + }, + { + "epoch": 0.09629320590534755, + "grad_norm": 1.033832311630249, + "learning_rate": 3.11364460675423e-05, + "loss": 0.5797, + "step": 13733 + }, + { + "epoch": 0.09651057205863277, + "grad_norm": 0.8917058706283569, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.5829, + "step": 13764 + }, + { + "epoch": 0.09672793821191798, + "grad_norm": 0.9396641850471497, + "learning_rate": 3.097348246077728e-05, + "loss": 0.5825, + "step": 13795 + }, + { + "epoch": 0.09694530436520318, + "grad_norm": 1.0871998071670532, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.5851, + "step": 13826 + }, + { + "epoch": 0.0971626705184884, + "grad_norm": 1.1403570175170898, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.5807, + "step": 13857 + }, + { + "epoch": 0.0973800366717736, + "grad_norm": 1.0246673822402954, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.5795, + "step": 13888 + }, + { + "epoch": 0.09759740282505881, + "grad_norm": 0.9431213140487671, + "learning_rate": 3.064675369851637e-05, + "loss": 0.5825, + "step": 13919 + }, + { + "epoch": 0.09781476897834403, + "grad_norm": 1.1028188467025757, + "learning_rate": 3.056490989455289e-05, + "loss": 0.5885, + "step": 13950 + }, + { + "epoch": 0.09803213513162923, + "grad_norm": 1.125482201576233, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.5849, + "step": 13981 + }, + { + "epoch": 0.09824950128491444, + "grad_norm": 0.9831985831260681, + "learning_rate": 3.040103481317539e-05, + "loss": 0.586, + "step": 14012 + }, + { + "epoch": 0.09846686743819964, + "grad_norm": 1.0038244724273682, + "learning_rate": 3.03190053850694e-05, + "loss": 0.5894, + "step": 14043 + }, + { + "epoch": 0.09868423359148486, + "grad_norm": 0.8766770362854004, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.5773, + "step": 14074 + }, + { + "epoch": 0.09890159974477007, + "grad_norm": 0.9358529448509216, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.5887, + "step": 14105 + }, + { + "epoch": 0.09911896589805527, + "grad_norm": 0.8988749384880066, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.5815, + "step": 14136 + }, + { + "epoch": 0.09933633205134049, + "grad_norm": 1.08226478099823, + "learning_rate": 2.999029669712431e-05, + "loss": 0.5944, + "step": 14167 + }, + { + "epoch": 0.09955369820462569, + "grad_norm": 1.046980619430542, + "learning_rate": 2.990797641805408e-05, + "loss": 0.5824, + "step": 14198 + }, + { + "epoch": 0.0997710643579109, + "grad_norm": 0.96205073595047, + "learning_rate": 2.982560075313704e-05, + "loss": 0.5847, + "step": 14229 + }, + { + "epoch": 0.09998843051119612, + "grad_norm": 0.9914370775222778, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.5824, + "step": 14260 + }, + { + "epoch": 0.10020579666448132, + "grad_norm": 0.922265887260437, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.5907, + "step": 14291 + }, + { + "epoch": 0.10042316281776653, + "grad_norm": 0.971159815788269, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.5806, + "step": 14322 + }, + { + "epoch": 0.10064052897105173, + "grad_norm": 1.1327799558639526, + "learning_rate": 2.949556283611942e-05, + "loss": 0.5812, + "step": 14353 + }, + { + "epoch": 0.10085789512433695, + "grad_norm": 1.1632689237594604, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.5813, + "step": 14384 + }, + { + "epoch": 0.10107526127762216, + "grad_norm": 0.9936057329177856, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.5803, + "step": 14415 + }, + { + "epoch": 0.10129262743090736, + "grad_norm": 1.1123740673065186, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.5767, + "step": 14446 + }, + { + "epoch": 0.10150999358419258, + "grad_norm": 0.908643364906311, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.5797, + "step": 14477 + }, + { + "epoch": 0.10172735973747778, + "grad_norm": 0.8761305809020996, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.589, + "step": 14508 + }, + { + "epoch": 0.10194472589076299, + "grad_norm": 1.0904477834701538, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.5796, + "step": 14539 + }, + { + "epoch": 0.1021620920440482, + "grad_norm": 1.060160756111145, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.5837, + "step": 14570 + }, + { + "epoch": 0.1023794581973334, + "grad_norm": 1.0514498949050903, + "learning_rate": 2.883311164593017e-05, + "loss": 0.5792, + "step": 14601 + }, + { + "epoch": 0.10259682435061862, + "grad_norm": 1.0152157545089722, + "learning_rate": 2.875010077160754e-05, + "loss": 0.574, + "step": 14632 + }, + { + "epoch": 0.10281419050390382, + "grad_norm": 0.974791944026947, + "learning_rate": 2.866704757790741e-05, + "loss": 0.5819, + "step": 14663 + }, + { + "epoch": 0.10303155665718904, + "grad_norm": 0.9271855354309082, + "learning_rate": 2.858395300207376e-05, + "loss": 0.5798, + "step": 14694 + }, + { + "epoch": 0.10324892281047425, + "grad_norm": 0.9933458566665649, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.5796, + "step": 14725 + }, + { + "epoch": 0.10346628896375945, + "grad_norm": 0.9245687127113342, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.5771, + "step": 14756 + }, + { + "epoch": 0.10368365511704467, + "grad_norm": 0.9483000040054321, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.5825, + "step": 14787 + }, + { + "epoch": 0.10390102127032987, + "grad_norm": 0.9054533839225769, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.5817, + "step": 14818 + }, + { + "epoch": 0.10411838742361508, + "grad_norm": 1.0200084447860718, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.5788, + "step": 14849 + }, + { + "epoch": 0.1043357535769003, + "grad_norm": 0.9364084005355835, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.5796, + "step": 14880 + }, + { + "epoch": 0.1045531197301855, + "grad_norm": 0.9658653736114502, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.5825, + "step": 14911 + }, + { + "epoch": 0.10477048588347071, + "grad_norm": 0.8612831234931946, + "learning_rate": 2.791781925709473e-05, + "loss": 0.5842, + "step": 14942 + }, + { + "epoch": 0.10498785203675591, + "grad_norm": 0.9673991799354553, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.5776, + "step": 14973 + }, + { + "epoch": 0.10520521819004112, + "grad_norm": 1.092144250869751, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.5785, + "step": 15004 + }, + { + "epoch": 0.10542258434332634, + "grad_norm": 1.0162559747695923, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.5821, + "step": 15035 + }, + { + "epoch": 0.10563995049661154, + "grad_norm": 0.9578939080238342, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.5781, + "step": 15066 + }, + { + "epoch": 0.10585731664989675, + "grad_norm": 0.9934547543525696, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.5826, + "step": 15097 + }, + { + "epoch": 0.10607468280318195, + "grad_norm": 0.9542795419692993, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.583, + "step": 15128 + }, + { + "epoch": 0.10629204895646717, + "grad_norm": 0.9436337947845459, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.5832, + "step": 15159 + }, + { + "epoch": 0.10650941510975238, + "grad_norm": 0.9200606942176819, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.5766, + "step": 15190 + }, + { + "epoch": 0.10672678126303758, + "grad_norm": 1.0009740591049194, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.5835, + "step": 15221 + }, + { + "epoch": 0.1069441474163228, + "grad_norm": 0.9597145915031433, + "learning_rate": 2.708224532974953e-05, + "loss": 0.5728, + "step": 15252 + }, + { + "epoch": 0.107161513569608, + "grad_norm": 1.248867154121399, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.5822, + "step": 15283 + }, + { + "epoch": 0.10737887972289321, + "grad_norm": 0.8899833559989929, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.5843, + "step": 15314 + }, + { + "epoch": 0.10759624587617843, + "grad_norm": 1.0085718631744385, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.5811, + "step": 15345 + }, + { + "epoch": 0.10781361202946363, + "grad_norm": 0.9277573227882385, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.5774, + "step": 15376 + }, + { + "epoch": 0.10803097818274884, + "grad_norm": 1.199010968208313, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.5735, + "step": 15407 + }, + { + "epoch": 0.10824834433603404, + "grad_norm": 0.9361419081687927, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.5834, + "step": 15438 + }, + { + "epoch": 0.10846571048931926, + "grad_norm": 1.05440092086792, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.5721, + "step": 15469 + }, + { + "epoch": 0.10868307664260447, + "grad_norm": 1.0973948240280151, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.5838, + "step": 15500 + }, + { + "epoch": 0.10890044279588967, + "grad_norm": 0.9417588710784912, + "learning_rate": 2.632819298478939e-05, + "loss": 0.5786, + "step": 15531 + }, + { + "epoch": 0.10911780894917489, + "grad_norm": 0.8763983845710754, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.5766, + "step": 15562 + }, + { + "epoch": 0.10933517510246009, + "grad_norm": 1.0105509757995605, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.5896, + "step": 15593 + }, + { + "epoch": 0.1095525412557453, + "grad_norm": 0.9172413349151611, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.5788, + "step": 15624 + }, + { + "epoch": 0.10976990740903052, + "grad_norm": 0.9640869498252869, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.5791, + "step": 15655 + }, + { + "epoch": 0.10998727356231572, + "grad_norm": 1.0987275838851929, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.581, + "step": 15686 + }, + { + "epoch": 0.11020463971560093, + "grad_norm": 1.0418893098831177, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.569, + "step": 15717 + }, + { + "epoch": 0.11042200586888613, + "grad_norm": 1.0216400623321533, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.5834, + "step": 15748 + }, + { + "epoch": 0.11063937202217135, + "grad_norm": 1.0211747884750366, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.5733, + "step": 15779 + }, + { + "epoch": 0.11085673817545656, + "grad_norm": 0.9743130207061768, + "learning_rate": 2.557292666450159e-05, + "loss": 0.5789, + "step": 15810 + }, + { + "epoch": 0.11107410432874176, + "grad_norm": 1.1765626668930054, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.57, + "step": 15841 + }, + { + "epoch": 0.11129147048202698, + "grad_norm": 0.9354963898658752, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.5894, + "step": 15872 + }, + { + "epoch": 0.11150883663531218, + "grad_norm": 0.8743797540664673, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.5716, + "step": 15903 + }, + { + "epoch": 0.11172620278859739, + "grad_norm": 1.1076644659042358, + "learning_rate": 2.52370435981567e-05, + "loss": 0.5787, + "step": 15934 + }, + { + "epoch": 0.1119435689418826, + "grad_norm": 0.9156807065010071, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.5764, + "step": 15965 + }, + { + "epoch": 0.11216093509516781, + "grad_norm": 1.0239089727401733, + "learning_rate": 2.506908222441045e-05, + "loss": 0.5748, + "step": 15996 + }, + { + "epoch": 0.11237830124845302, + "grad_norm": 1.5095417499542236, + "learning_rate": 2.498509989430187e-05, + "loss": 0.5815, + "step": 16027 + }, + { + "epoch": 0.11259566740173822, + "grad_norm": 0.9298838973045349, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.5742, + "step": 16058 + }, + { + "epoch": 0.11281303355502344, + "grad_norm": 0.8956617712974548, + "learning_rate": 2.481713668624899e-05, + "loss": 0.5856, + "step": 16089 + }, + { + "epoch": 0.11303039970830865, + "grad_norm": 0.950095534324646, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.5786, + "step": 16120 + }, + { + "epoch": 0.11324776586159385, + "grad_norm": 1.0230988264083862, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.5771, + "step": 16151 + }, + { + "epoch": 0.11346513201487907, + "grad_norm": 1.1018470525741577, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.57, + "step": 16182 + }, + { + "epoch": 0.11368249816816427, + "grad_norm": 0.9700168371200562, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.5689, + "step": 16213 + }, + { + "epoch": 0.11389986432144948, + "grad_norm": 0.9069929718971252, + "learning_rate": 2.439728136286796e-05, + "loss": 0.5719, + "step": 16244 + }, + { + "epoch": 0.1141172304747347, + "grad_norm": 0.9254815578460693, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.5704, + "step": 16275 + }, + { + "epoch": 0.1143345966280199, + "grad_norm": 0.9150753021240234, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.5754, + "step": 16306 + }, + { + "epoch": 0.11455196278130511, + "grad_norm": 1.003201961517334, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.5746, + "step": 16337 + }, + { + "epoch": 0.11476932893459031, + "grad_norm": 1.1016685962677002, + "learning_rate": 2.406151384602059e-05, + "loss": 0.5763, + "step": 16368 + }, + { + "epoch": 0.11498669508787553, + "grad_norm": 1.0079994201660156, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.5721, + "step": 16399 + }, + { + "epoch": 0.11520406124116074, + "grad_norm": 0.989470899105072, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.5795, + "step": 16430 + }, + { + "epoch": 0.11542142739444594, + "grad_norm": 1.039035439491272, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.5737, + "step": 16461 + }, + { + "epoch": 0.11563879354773116, + "grad_norm": 0.8659546971321106, + "learning_rate": 2.372591577780202e-05, + "loss": 0.5711, + "step": 16492 + }, + { + "epoch": 0.11585615970101636, + "grad_norm": 0.9558688998222351, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.5763, + "step": 16523 + }, + { + "epoch": 0.11607352585430157, + "grad_norm": 1.0017194747924805, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.5772, + "step": 16554 + }, + { + "epoch": 0.11629089200758679, + "grad_norm": 1.0045812129974365, + "learning_rate": 2.347436487983929e-05, + "loss": 0.5744, + "step": 16585 + }, + { + "epoch": 0.11650825816087199, + "grad_norm": 0.8719637393951416, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.5681, + "step": 16616 + }, + { + "epoch": 0.1167256243141572, + "grad_norm": 0.9029743075370789, + "learning_rate": 2.330674878704035e-05, + "loss": 0.5807, + "step": 16647 + }, + { + "epoch": 0.1169429904674424, + "grad_norm": 0.9439691305160522, + "learning_rate": 2.322296892997561e-05, + "loss": 0.5751, + "step": 16678 + }, + { + "epoch": 0.11716035662072762, + "grad_norm": 0.900688648223877, + "learning_rate": 2.313920912646497e-05, + "loss": 0.5772, + "step": 16709 + }, + { + "epoch": 0.11737772277401283, + "grad_norm": 0.8884438872337341, + "learning_rate": 2.305547032172643e-05, + "loss": 0.5758, + "step": 16740 + }, + { + "epoch": 0.11759508892729803, + "grad_norm": 0.9252585172653198, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.5705, + "step": 16771 + }, + { + "epoch": 0.11781245508058324, + "grad_norm": 0.9447957873344421, + "learning_rate": 2.288805948824212e-05, + "loss": 0.566, + "step": 16802 + }, + { + "epoch": 0.11802982123386845, + "grad_norm": 0.9666566252708435, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.5745, + "step": 16833 + }, + { + "epoch": 0.11824718738715366, + "grad_norm": 0.9459251761436462, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.5775, + "step": 16864 + }, + { + "epoch": 0.11846455354043887, + "grad_norm": 0.8863123059272766, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.5716, + "step": 16895 + }, + { + "epoch": 0.11868191969372408, + "grad_norm": 0.9847676753997803, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.5763, + "step": 16926 + }, + { + "epoch": 0.11889928584700929, + "grad_norm": 1.1111658811569214, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.5752, + "step": 16957 + }, + { + "epoch": 0.11911665200029449, + "grad_norm": 1.0046541690826416, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.574, + "step": 16988 + }, + { + "epoch": 0.1193340181535797, + "grad_norm": 0.8580814599990845, + "learning_rate": 2.230292185905114e-05, + "loss": 0.572, + "step": 17019 + }, + { + "epoch": 0.11955138430686492, + "grad_norm": 0.9188304543495178, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.571, + "step": 17050 + }, + { + "epoch": 0.11976875046015012, + "grad_norm": 0.9079185724258423, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.5792, + "step": 17081 + }, + { + "epoch": 0.11998611661343533, + "grad_norm": 0.9194979071617126, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.5753, + "step": 17112 + }, + { + "epoch": 0.12020348276672053, + "grad_norm": 0.8398452997207642, + "learning_rate": 2.196920634473666e-05, + "loss": 0.5653, + "step": 17143 + }, + { + "epoch": 0.12042084892000575, + "grad_norm": 0.9888772368431091, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.5747, + "step": 17174 + }, + { + "epoch": 0.12063821507329096, + "grad_norm": 0.9137700796127319, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.5728, + "step": 17205 + }, + { + "epoch": 0.12085558122657616, + "grad_norm": 1.058064579963684, + "learning_rate": 2.171927553439363e-05, + "loss": 0.5717, + "step": 17236 + }, + { + "epoch": 0.12107294737986138, + "grad_norm": 0.9835705757141113, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.5725, + "step": 17267 + }, + { + "epoch": 0.12129031353314658, + "grad_norm": 0.918863832950592, + "learning_rate": 2.155283853988844e-05, + "loss": 0.5676, + "step": 17298 + }, + { + "epoch": 0.1215076796864318, + "grad_norm": 0.9384900331497192, + "learning_rate": 2.146967792431106e-05, + "loss": 0.5768, + "step": 17329 + }, + { + "epoch": 0.12172504583971701, + "grad_norm": 1.060088038444519, + "learning_rate": 2.138655714793849e-05, + "loss": 0.5642, + "step": 17360 + }, + { + "epoch": 0.12194241199300221, + "grad_norm": 0.999266505241394, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.5669, + "step": 17391 + }, + { + "epoch": 0.12215977814628742, + "grad_norm": 0.8633130788803101, + "learning_rate": 2.122043886437421e-05, + "loss": 0.5661, + "step": 17422 + }, + { + "epoch": 0.12237714429957262, + "grad_norm": 0.9396159052848816, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.5743, + "step": 17453 + }, + { + "epoch": 0.12259451045285784, + "grad_norm": 0.9990928173065186, + "learning_rate": 2.105449118766347e-05, + "loss": 0.5707, + "step": 17484 + }, + { + "epoch": 0.12281187660614305, + "grad_norm": 0.9732767939567566, + "learning_rate": 2.097158366805287e-05, + "loss": 0.571, + "step": 17515 + }, + { + "epoch": 0.12302924275942825, + "grad_norm": 20.362672805786133, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.5586, + "step": 17546 + }, + { + "epoch": 0.12324660891271347, + "grad_norm": 0.976889431476593, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.5693, + "step": 17577 + }, + { + "epoch": 0.12346397506599867, + "grad_norm": 0.907172679901123, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.5714, + "step": 17608 + }, + { + "epoch": 0.12368134121928388, + "grad_norm": 0.8816654086112976, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.5669, + "step": 17639 + }, + { + "epoch": 0.1238987073725691, + "grad_norm": 0.9616197943687439, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.5739, + "step": 17670 + }, + { + "epoch": 0.1241160735258543, + "grad_norm": 0.9188937544822693, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.5689, + "step": 17701 + }, + { + "epoch": 0.12433343967913951, + "grad_norm": 0.9845620393753052, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.5716, + "step": 17732 + }, + { + "epoch": 0.12455080583242471, + "grad_norm": 0.8922098278999329, + "learning_rate": 2.031003855589343e-05, + "loss": 0.5648, + "step": 17763 + }, + { + "epoch": 0.12476817198570993, + "grad_norm": 0.9715010523796082, + "learning_rate": 2.022757379528727e-05, + "loss": 0.5664, + "step": 17794 + }, + { + "epoch": 0.12498553813899514, + "grad_norm": 1.0769156217575073, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.5689, + "step": 17825 + }, + { + "epoch": 0.12520290429228034, + "grad_norm": 0.9304386973381042, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.5772, + "step": 17856 + }, + { + "epoch": 0.12542027044556556, + "grad_norm": 1.0523558855056763, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.5686, + "step": 17887 + }, + { + "epoch": 0.12563763659885077, + "grad_norm": 1.029064655303955, + "learning_rate": 1.989826261153015e-05, + "loss": 0.5629, + "step": 17918 + }, + { + "epoch": 0.12585500275213599, + "grad_norm": 1.0367600917816162, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.5682, + "step": 17949 + }, + { + "epoch": 0.12607236890542117, + "grad_norm": 1.047844648361206, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.571, + "step": 17980 + }, + { + "epoch": 0.1262897350587064, + "grad_norm": 0.9374393820762634, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.5731, + "step": 18011 + }, + { + "epoch": 0.1265071012119916, + "grad_norm": 1.0163381099700928, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.5769, + "step": 18042 + }, + { + "epoch": 0.12672446736527682, + "grad_norm": 0.9243590235710144, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.5697, + "step": 18073 + }, + { + "epoch": 0.12694183351856203, + "grad_norm": 1.0359089374542236, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.5639, + "step": 18104 + }, + { + "epoch": 0.12715919967184722, + "grad_norm": 0.841151773929596, + "learning_rate": 1.932422022132275e-05, + "loss": 0.5792, + "step": 18135 + }, + { + "epoch": 0.12737656582513243, + "grad_norm": 1.0070539712905884, + "learning_rate": 1.924246297341414e-05, + "loss": 0.5669, + "step": 18166 + }, + { + "epoch": 0.12759393197841765, + "grad_norm": 0.9453309178352356, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.5683, + "step": 18197 + }, + { + "epoch": 0.12781129813170286, + "grad_norm": 0.9628680348396301, + "learning_rate": 1.907914431855156e-05, + "loss": 0.5711, + "step": 18228 + }, + { + "epoch": 0.12802866428498808, + "grad_norm": 0.9396767616271973, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.5709, + "step": 18259 + }, + { + "epoch": 0.12824603043827326, + "grad_norm": 0.9093485474586487, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.5689, + "step": 18290 + }, + { + "epoch": 0.12846339659155848, + "grad_norm": 0.8730084896087646, + "learning_rate": 1.883466975572098e-05, + "loss": 0.5744, + "step": 18321 + }, + { + "epoch": 0.1286807627448437, + "grad_norm": 0.9706755876541138, + "learning_rate": 1.8753316159214312e-05, + "loss": 0.5572, + "step": 18352 + }, + { + "epoch": 0.1288981288981289, + "grad_norm": 0.9472910165786743, + "learning_rate": 1.8672033055685766e-05, + "loss": 0.5726, + "step": 18383 + }, + { + "epoch": 0.12911549505141412, + "grad_norm": 0.9355587959289551, + "learning_rate": 1.8590821362404116e-05, + "loss": 0.5673, + "step": 18414 + }, + { + "epoch": 0.1293328612046993, + "grad_norm": 0.9303567409515381, + "learning_rate": 1.8509681995832294e-05, + "loss": 0.5616, + "step": 18445 + }, + { + "epoch": 0.12955022735798452, + "grad_norm": 0.9067112803459167, + "learning_rate": 1.8428615871617004e-05, + "loss": 0.5661, + "step": 18476 + }, + { + "epoch": 0.12976759351126974, + "grad_norm": 0.899079442024231, + "learning_rate": 1.8347623904578448e-05, + "loss": 0.5726, + "step": 18507 + }, + { + "epoch": 0.12998495966455495, + "grad_norm": 0.8478329181671143, + "learning_rate": 1.8266707008699975e-05, + "loss": 0.5633, + "step": 18538 + }, + { + "epoch": 0.13020232581784016, + "grad_norm": 0.910685122013092, + "learning_rate": 1.818586609711774e-05, + "loss": 0.5683, + "step": 18569 + }, + { + "epoch": 0.13041969197112535, + "grad_norm": 0.9179863333702087, + "learning_rate": 1.8105102082110462e-05, + "loss": 0.5753, + "step": 18600 + }, + { + "epoch": 0.13063705812441057, + "grad_norm": 0.9042870402336121, + "learning_rate": 1.8024415875089058e-05, + "loss": 0.5752, + "step": 18631 + }, + { + "epoch": 0.13085442427769578, + "grad_norm": 0.9494644999504089, + "learning_rate": 1.7943808386586407e-05, + "loss": 0.5635, + "step": 18662 + }, + { + "epoch": 0.131071790430981, + "grad_norm": 0.9707177877426147, + "learning_rate": 1.7863280526247073e-05, + "loss": 0.5623, + "step": 18693 + }, + { + "epoch": 0.1312891565842662, + "grad_norm": 0.9590293169021606, + "learning_rate": 1.7782833202817003e-05, + "loss": 0.5799, + "step": 18724 + }, + { + "epoch": 0.1315065227375514, + "grad_norm": 0.9343449473381042, + "learning_rate": 1.7702467324133327e-05, + "loss": 0.5757, + "step": 18755 + }, + { + "epoch": 0.1317238888908366, + "grad_norm": 0.9229467511177063, + "learning_rate": 1.7622183797114042e-05, + "loss": 0.5644, + "step": 18786 + }, + { + "epoch": 0.13194125504412182, + "grad_norm": 0.9312314987182617, + "learning_rate": 1.7541983527747838e-05, + "loss": 0.5693, + "step": 18817 + }, + { + "epoch": 0.13215862119740704, + "grad_norm": 0.8548254370689392, + "learning_rate": 1.746186742108387e-05, + "loss": 0.5713, + "step": 18848 + }, + { + "epoch": 0.13237598735069225, + "grad_norm": 1.0379942655563354, + "learning_rate": 1.73818363812215e-05, + "loss": 0.5696, + "step": 18879 + }, + { + "epoch": 0.13259335350397744, + "grad_norm": 1.0847291946411133, + "learning_rate": 1.7301891311300153e-05, + "loss": 0.5656, + "step": 18910 + }, + { + "epoch": 0.13281071965726265, + "grad_norm": 0.969327449798584, + "learning_rate": 1.7222033113489055e-05, + "loss": 0.5654, + "step": 18941 + }, + { + "epoch": 0.13302808581054787, + "grad_norm": 0.9928266406059265, + "learning_rate": 1.7142262688977127e-05, + "loss": 0.5635, + "step": 18972 + }, + { + "epoch": 0.13324545196383308, + "grad_norm": 0.8415375351905823, + "learning_rate": 1.7062580937962764e-05, + "loss": 0.572, + "step": 19003 + }, + { + "epoch": 0.1334628181171183, + "grad_norm": 0.9909110069274902, + "learning_rate": 1.698298875964369e-05, + "loss": 0.5727, + "step": 19034 + }, + { + "epoch": 0.13368018427040348, + "grad_norm": 1.0183087587356567, + "learning_rate": 1.690348705220684e-05, + "loss": 0.5767, + "step": 19065 + }, + { + "epoch": 0.1338975504236887, + "grad_norm": 0.9055935144424438, + "learning_rate": 1.6824076712818156e-05, + "loss": 0.5628, + "step": 19096 + }, + { + "epoch": 0.1341149165769739, + "grad_norm": 0.8832345008850098, + "learning_rate": 1.6744758637612533e-05, + "loss": 0.5688, + "step": 19127 + }, + { + "epoch": 0.13433228273025913, + "grad_norm": 1.1259726285934448, + "learning_rate": 1.6665533721683664e-05, + "loss": 0.5578, + "step": 19158 + }, + { + "epoch": 0.13454964888354434, + "grad_norm": 0.9167343378067017, + "learning_rate": 1.6586402859073974e-05, + "loss": 0.5636, + "step": 19189 + }, + { + "epoch": 0.13476701503682953, + "grad_norm": 0.9861068725585938, + "learning_rate": 1.6507366942764463e-05, + "loss": 0.5681, + "step": 19220 + }, + { + "epoch": 0.13498438119011474, + "grad_norm": 0.9800103306770325, + "learning_rate": 1.6428426864664732e-05, + "loss": 0.5689, + "step": 19251 + }, + { + "epoch": 0.13520174734339996, + "grad_norm": 0.9900636672973633, + "learning_rate": 1.6349583515602816e-05, + "loss": 0.5625, + "step": 19282 + }, + { + "epoch": 0.13541911349668517, + "grad_norm": 0.9756057858467102, + "learning_rate": 1.6270837785315208e-05, + "loss": 0.5634, + "step": 19313 + }, + { + "epoch": 0.1356364796499704, + "grad_norm": 0.9184322953224182, + "learning_rate": 1.619219056243676e-05, + "loss": 0.5713, + "step": 19344 + }, + { + "epoch": 0.13585384580325557, + "grad_norm": 1.003735065460205, + "learning_rate": 1.6113642734490698e-05, + "loss": 0.5626, + "step": 19375 + }, + { + "epoch": 0.1360712119565408, + "grad_norm": 0.8933300375938416, + "learning_rate": 1.6035195187878577e-05, + "loss": 0.5705, + "step": 19406 + }, + { + "epoch": 0.136288578109826, + "grad_norm": 0.997909426689148, + "learning_rate": 1.5956848807870305e-05, + "loss": 0.5645, + "step": 19437 + }, + { + "epoch": 0.13650594426311122, + "grad_norm": 0.9039232730865479, + "learning_rate": 1.587860447859413e-05, + "loss": 0.5551, + "step": 19468 + }, + { + "epoch": 0.13672331041639643, + "grad_norm": 0.9416874647140503, + "learning_rate": 1.5800463083026686e-05, + "loss": 0.5621, + "step": 19499 + }, + { + "epoch": 0.13694067656968162, + "grad_norm": 0.8743234872817993, + "learning_rate": 1.572242550298298e-05, + "loss": 0.5692, + "step": 19530 + }, + { + "epoch": 0.13715804272296683, + "grad_norm": 1.0159176588058472, + "learning_rate": 1.56444926191065e-05, + "loss": 0.5622, + "step": 19561 + }, + { + "epoch": 0.13737540887625205, + "grad_norm": 0.8633915781974792, + "learning_rate": 1.5566665310859257e-05, + "loss": 0.5688, + "step": 19592 + }, + { + "epoch": 0.13759277502953726, + "grad_norm": 0.9839888215065002, + "learning_rate": 1.5488944456511846e-05, + "loss": 0.5691, + "step": 19623 + }, + { + "epoch": 0.13781014118282248, + "grad_norm": 1.0715723037719727, + "learning_rate": 1.5411330933133546e-05, + "loss": 0.5582, + "step": 19654 + }, + { + "epoch": 0.13802750733610766, + "grad_norm": 1.029173493385315, + "learning_rate": 1.533382561658241e-05, + "loss": 0.5678, + "step": 19685 + }, + { + "epoch": 0.13824487348939288, + "grad_norm": 1.1011470556259155, + "learning_rate": 1.525642938149541e-05, + "loss": 0.5692, + "step": 19716 + }, + { + "epoch": 0.1384622396426781, + "grad_norm": 0.9993789196014404, + "learning_rate": 1.5179143101278536e-05, + "loss": 0.5645, + "step": 19747 + }, + { + "epoch": 0.1386796057959633, + "grad_norm": 1.0202093124389648, + "learning_rate": 1.5101967648096955e-05, + "loss": 0.5703, + "step": 19778 + }, + { + "epoch": 0.13889697194924852, + "grad_norm": 1.0126008987426758, + "learning_rate": 1.5024903892865172e-05, + "loss": 0.5639, + "step": 19809 + }, + { + "epoch": 0.1391143381025337, + "grad_norm": 1.0468281507492065, + "learning_rate": 1.4947952705237184e-05, + "loss": 0.5683, + "step": 19840 + }, + { + "epoch": 0.13933170425581892, + "grad_norm": 0.9329802393913269, + "learning_rate": 1.4871114953596682e-05, + "loss": 0.5597, + "step": 19871 + }, + { + "epoch": 0.13954907040910414, + "grad_norm": 0.891503632068634, + "learning_rate": 1.4794391505047256e-05, + "loss": 0.5728, + "step": 19902 + }, + { + "epoch": 0.13976643656238935, + "grad_norm": 0.9752770662307739, + "learning_rate": 1.4717783225402596e-05, + "loss": 0.564, + "step": 19933 + }, + { + "epoch": 0.13998380271567457, + "grad_norm": 0.8956452012062073, + "learning_rate": 1.4641290979176735e-05, + "loss": 0.5661, + "step": 19964 + }, + { + "epoch": 0.14020116886895975, + "grad_norm": 1.072753667831421, + "learning_rate": 1.4564915629574246e-05, + "loss": 0.562, + "step": 19995 + }, + { + "epoch": 0.14041853502224497, + "grad_norm": 0.8971157670021057, + "learning_rate": 1.4488658038480601e-05, + "loss": 0.5613, + "step": 20026 + }, + { + "epoch": 0.14063590117553018, + "grad_norm": 0.8919452428817749, + "learning_rate": 1.4412519066452323e-05, + "loss": 0.5659, + "step": 20057 + }, + { + "epoch": 0.1408532673288154, + "grad_norm": 0.9752078056335449, + "learning_rate": 1.4336499572707373e-05, + "loss": 0.5687, + "step": 20088 + }, + { + "epoch": 0.1410706334821006, + "grad_norm": 0.9520591497421265, + "learning_rate": 1.4260600415115433e-05, + "loss": 0.5673, + "step": 20119 + }, + { + "epoch": 0.1412879996353858, + "grad_norm": 0.8892295956611633, + "learning_rate": 1.4184822450188137e-05, + "loss": 0.5705, + "step": 20150 + }, + { + "epoch": 0.141505365788671, + "grad_norm": 0.9576200842857361, + "learning_rate": 1.410916653306954e-05, + "loss": 0.5667, + "step": 20181 + }, + { + "epoch": 0.14172273194195623, + "grad_norm": 0.9564182162284851, + "learning_rate": 1.403363351752639e-05, + "loss": 0.5595, + "step": 20212 + }, + { + "epoch": 0.14194009809524144, + "grad_norm": 0.9247251749038696, + "learning_rate": 1.3958224255938485e-05, + "loss": 0.5709, + "step": 20243 + }, + { + "epoch": 0.14215746424852665, + "grad_norm": 0.9523617625236511, + "learning_rate": 1.388293959928911e-05, + "loss": 0.5591, + "step": 20274 + }, + { + "epoch": 0.14237483040181184, + "grad_norm": 0.9751485586166382, + "learning_rate": 1.3807780397155379e-05, + "loss": 0.5678, + "step": 20305 + }, + { + "epoch": 0.14259219655509706, + "grad_norm": 1.0090728998184204, + "learning_rate": 1.3732747497698655e-05, + "loss": 0.5569, + "step": 20336 + }, + { + "epoch": 0.14280956270838227, + "grad_norm": 0.8991780281066895, + "learning_rate": 1.3657841747655038e-05, + "loss": 0.5611, + "step": 20367 + }, + { + "epoch": 0.14302692886166748, + "grad_norm": 0.8665379285812378, + "learning_rate": 1.3583063992325706e-05, + "loss": 0.5639, + "step": 20398 + }, + { + "epoch": 0.1432442950149527, + "grad_norm": 0.9348465204238892, + "learning_rate": 1.3508415075567496e-05, + "loss": 0.5582, + "step": 20429 + }, + { + "epoch": 0.1434616611682379, + "grad_norm": 0.8632979989051819, + "learning_rate": 1.343389583978327e-05, + "loss": 0.5672, + "step": 20460 + }, + { + "epoch": 0.1436790273215231, + "grad_norm": 0.9019519686698914, + "learning_rate": 1.3359507125912468e-05, + "loss": 0.5642, + "step": 20491 + }, + { + "epoch": 0.14389639347480832, + "grad_norm": 0.8994531035423279, + "learning_rate": 1.3285249773421627e-05, + "loss": 0.5663, + "step": 20522 + }, + { + "epoch": 0.14411375962809353, + "grad_norm": 0.9270524978637695, + "learning_rate": 1.3211124620294884e-05, + "loss": 0.5643, + "step": 20553 + }, + { + "epoch": 0.14433112578137874, + "grad_norm": 0.8957355618476868, + "learning_rate": 1.313713250302451e-05, + "loss": 0.5632, + "step": 20584 + }, + { + "epoch": 0.14454849193466393, + "grad_norm": 1.0234413146972656, + "learning_rate": 1.3063274256601479e-05, + "loss": 0.5647, + "step": 20615 + }, + { + "epoch": 0.14476585808794915, + "grad_norm": 0.8956789970397949, + "learning_rate": 1.2989550714506086e-05, + "loss": 0.5635, + "step": 20646 + }, + { + "epoch": 0.14498322424123436, + "grad_norm": 0.883823037147522, + "learning_rate": 1.291596270869846e-05, + "loss": 0.5702, + "step": 20677 + }, + { + "epoch": 0.14520059039451957, + "grad_norm": 0.8809013366699219, + "learning_rate": 1.284251106960927e-05, + "loss": 0.5641, + "step": 20708 + }, + { + "epoch": 0.1454179565478048, + "grad_norm": 0.9803751707077026, + "learning_rate": 1.2769196626130263e-05, + "loss": 0.5604, + "step": 20739 + }, + { + "epoch": 0.14563532270108998, + "grad_norm": 0.8637491464614868, + "learning_rate": 1.2696020205604969e-05, + "loss": 0.558, + "step": 20770 + }, + { + "epoch": 0.1458526888543752, + "grad_norm": 0.8922715187072754, + "learning_rate": 1.2622982633819359e-05, + "loss": 0.5619, + "step": 20801 + }, + { + "epoch": 0.1460700550076604, + "grad_norm": 0.9750674366950989, + "learning_rate": 1.2550084734992484e-05, + "loss": 0.5664, + "step": 20832 + }, + { + "epoch": 0.14628742116094562, + "grad_norm": 1.0473570823669434, + "learning_rate": 1.247732733176724e-05, + "loss": 0.5646, + "step": 20863 + }, + { + "epoch": 0.14650478731423083, + "grad_norm": 1.130385160446167, + "learning_rate": 1.2404711245201044e-05, + "loss": 0.5617, + "step": 20894 + }, + { + "epoch": 0.14672215346751602, + "grad_norm": 0.9984387755393982, + "learning_rate": 1.2332237294756535e-05, + "loss": 0.567, + "step": 20925 + }, + { + "epoch": 0.14693951962080123, + "grad_norm": 0.9383957982063293, + "learning_rate": 1.225990629829241e-05, + "loss": 0.5567, + "step": 20956 + }, + { + "epoch": 0.14715688577408645, + "grad_norm": 0.981935977935791, + "learning_rate": 1.2187719072054136e-05, + "loss": 0.5651, + "step": 20987 + }, + { + "epoch": 0.14737425192737166, + "grad_norm": 0.9774724841117859, + "learning_rate": 1.2115676430664735e-05, + "loss": 0.5652, + "step": 21018 + }, + { + "epoch": 0.14759161808065688, + "grad_norm": 0.9714674949645996, + "learning_rate": 1.2043779187115647e-05, + "loss": 0.561, + "step": 21049 + }, + { + "epoch": 0.14780898423394206, + "grad_norm": 0.8881489038467407, + "learning_rate": 1.1972028152757476e-05, + "loss": 0.5594, + "step": 21080 + }, + { + "epoch": 0.14802635038722728, + "grad_norm": 0.961926281452179, + "learning_rate": 1.1900424137290889e-05, + "loss": 0.5611, + "step": 21111 + }, + { + "epoch": 0.1482437165405125, + "grad_norm": 0.9101502895355225, + "learning_rate": 1.1828967948757482e-05, + "loss": 0.5628, + "step": 21142 + }, + { + "epoch": 0.1484610826937977, + "grad_norm": 0.9001050591468811, + "learning_rate": 1.175766039353062e-05, + "loss": 0.5576, + "step": 21173 + }, + { + "epoch": 0.14867844884708292, + "grad_norm": 0.9724435210227966, + "learning_rate": 1.1686502276306382e-05, + "loss": 0.568, + "step": 21204 + }, + { + "epoch": 0.1488958150003681, + "grad_norm": 0.825156569480896, + "learning_rate": 1.1615494400094445e-05, + "loss": 0.5634, + "step": 21235 + }, + { + "epoch": 0.14911318115365332, + "grad_norm": 0.9625114798545837, + "learning_rate": 1.1544637566209029e-05, + "loss": 0.5667, + "step": 21266 + }, + { + "epoch": 0.14933054730693854, + "grad_norm": 1.0243901014328003, + "learning_rate": 1.1473932574259886e-05, + "loss": 0.5629, + "step": 21297 + }, + { + "epoch": 0.14954791346022375, + "grad_norm": 0.9247808456420898, + "learning_rate": 1.1403380222143247e-05, + "loss": 0.563, + "step": 21328 + }, + { + "epoch": 0.14976527961350897, + "grad_norm": 0.8996061682701111, + "learning_rate": 1.1332981306032808e-05, + "loss": 0.5644, + "step": 21359 + }, + { + "epoch": 0.14998264576679415, + "grad_norm": 0.9766656160354614, + "learning_rate": 1.1262736620370762e-05, + "loss": 0.566, + "step": 21390 + }, + { + "epoch": 0.15020001192007937, + "grad_norm": 0.9848279356956482, + "learning_rate": 1.1192646957858854e-05, + "loss": 0.5648, + "step": 21421 + }, + { + "epoch": 0.15041737807336458, + "grad_norm": 0.972819447517395, + "learning_rate": 1.1122713109449381e-05, + "loss": 0.5573, + "step": 21452 + }, + { + "epoch": 0.1506347442266498, + "grad_norm": 0.8826684951782227, + "learning_rate": 1.105293586433634e-05, + "loss": 0.5625, + "step": 21483 + }, + { + "epoch": 0.150852110379935, + "grad_norm": 0.9768727421760559, + "learning_rate": 1.0983316009946446e-05, + "loss": 0.5609, + "step": 21514 + }, + { + "epoch": 0.1510694765332202, + "grad_norm": 0.9463690519332886, + "learning_rate": 1.0913854331930282e-05, + "loss": 0.5605, + "step": 21545 + }, + { + "epoch": 0.1512868426865054, + "grad_norm": 0.9010226130485535, + "learning_rate": 1.0844551614153456e-05, + "loss": 0.5626, + "step": 21576 + }, + { + "epoch": 0.15150420883979063, + "grad_norm": 0.9793362617492676, + "learning_rate": 1.0775408638687725e-05, + "loss": 0.5629, + "step": 21607 + }, + { + "epoch": 0.15172157499307584, + "grad_norm": 0.8516845107078552, + "learning_rate": 1.0706426185802165e-05, + "loss": 0.5655, + "step": 21638 + }, + { + "epoch": 0.15193894114636106, + "grad_norm": 0.9344280958175659, + "learning_rate": 1.0637605033954371e-05, + "loss": 0.5678, + "step": 21669 + }, + { + "epoch": 0.15215630729964624, + "grad_norm": 0.9141379594802856, + "learning_rate": 1.05689459597817e-05, + "loss": 0.5651, + "step": 21700 + }, + { + "epoch": 0.15237367345293146, + "grad_norm": 0.9709919691085815, + "learning_rate": 1.050044973809246e-05, + "loss": 0.5534, + "step": 21731 + }, + { + "epoch": 0.15259103960621667, + "grad_norm": 0.9237218499183655, + "learning_rate": 1.043211714185722e-05, + "loss": 0.5615, + "step": 21762 + }, + { + "epoch": 0.1528084057595019, + "grad_norm": 0.8852784633636475, + "learning_rate": 1.036394894220003e-05, + "loss": 0.5576, + "step": 21793 + }, + { + "epoch": 0.1530257719127871, + "grad_norm": 0.8654377460479736, + "learning_rate": 1.0295945908389751e-05, + "loss": 0.5583, + "step": 21824 + }, + { + "epoch": 0.1532431380660723, + "grad_norm": 1.0151047706604004, + "learning_rate": 1.0228108807831393e-05, + "loss": 0.5626, + "step": 21855 + }, + { + "epoch": 0.1534605042193575, + "grad_norm": 1.0742695331573486, + "learning_rate": 1.01604384060574e-05, + "loss": 0.5546, + "step": 21886 + }, + { + "epoch": 0.15367787037264272, + "grad_norm": 0.9627267718315125, + "learning_rate": 1.009293546671907e-05, + "loss": 0.5701, + "step": 21917 + }, + { + "epoch": 0.15389523652592793, + "grad_norm": 0.9896987080574036, + "learning_rate": 1.002560075157791e-05, + "loss": 0.5573, + "step": 21948 + }, + { + "epoch": 0.15411260267921315, + "grad_norm": 0.8868485689163208, + "learning_rate": 9.958435020496995e-06, + "loss": 0.5553, + "step": 21979 + }, + { + "epoch": 0.15432996883249833, + "grad_norm": 0.8650690317153931, + "learning_rate": 9.89143903143249e-06, + "loss": 0.5638, + "step": 22010 + }, + { + "epoch": 0.15454733498578355, + "grad_norm": 0.8827528357505798, + "learning_rate": 9.824613540425038e-06, + "loss": 0.5618, + "step": 22041 + }, + { + "epoch": 0.15476470113906876, + "grad_norm": 0.9161486625671387, + "learning_rate": 9.757959301591197e-06, + "loss": 0.5547, + "step": 22072 + }, + { + "epoch": 0.15498206729235398, + "grad_norm": 0.8655954599380493, + "learning_rate": 9.691477067115017e-06, + "loss": 0.5575, + "step": 22103 + }, + { + "epoch": 0.1551994334456392, + "grad_norm": 1.1528652906417847, + "learning_rate": 9.625167587239467e-06, + "loss": 0.5562, + "step": 22134 + }, + { + "epoch": 0.15541679959892438, + "grad_norm": 0.9214157462120056, + "learning_rate": 9.559031610258007e-06, + "loss": 0.5505, + "step": 22165 + }, + { + "epoch": 0.1556341657522096, + "grad_norm": 0.9822834730148315, + "learning_rate": 9.493069882506164e-06, + "loss": 0.5651, + "step": 22196 + }, + { + "epoch": 0.1558515319054948, + "grad_norm": 1.0093454122543335, + "learning_rate": 9.427283148353056e-06, + "loss": 0.5629, + "step": 22227 + }, + { + "epoch": 0.15606889805878002, + "grad_norm": 0.9251008629798889, + "learning_rate": 9.361672150193052e-06, + "loss": 0.5644, + "step": 22258 + }, + { + "epoch": 0.15628626421206523, + "grad_norm": 0.9593933820724487, + "learning_rate": 9.29623762843734e-06, + "loss": 0.5626, + "step": 22289 + }, + { + "epoch": 0.15650363036535042, + "grad_norm": 0.9322303533554077, + "learning_rate": 9.230980321505594e-06, + "loss": 0.5549, + "step": 22320 + }, + { + "epoch": 0.15672099651863564, + "grad_norm": 1.0490275621414185, + "learning_rate": 9.165900965817668e-06, + "loss": 0.5527, + "step": 22351 + }, + { + "epoch": 0.15693836267192085, + "grad_norm": 0.9138365387916565, + "learning_rate": 9.101000295785245e-06, + "loss": 0.5609, + "step": 22382 + }, + { + "epoch": 0.15715572882520606, + "grad_norm": 0.9704885482788086, + "learning_rate": 9.036279043803565e-06, + "loss": 0.5559, + "step": 22413 + }, + { + "epoch": 0.15737309497849128, + "grad_norm": 0.9594223499298096, + "learning_rate": 8.971737940243147e-06, + "loss": 0.5533, + "step": 22444 + }, + { + "epoch": 0.15759046113177647, + "grad_norm": 0.9496825933456421, + "learning_rate": 8.907377713441592e-06, + "loss": 0.5631, + "step": 22475 + }, + { + "epoch": 0.15780782728506168, + "grad_norm": 0.8646016120910645, + "learning_rate": 8.843199089695293e-06, + "loss": 0.561, + "step": 22506 + }, + { + "epoch": 0.1580251934383469, + "grad_norm": 0.8263002038002014, + "learning_rate": 8.779202793251311e-06, + "loss": 0.555, + "step": 22537 + }, + { + "epoch": 0.1582425595916321, + "grad_norm": 0.9070886969566345, + "learning_rate": 8.715389546299149e-06, + "loss": 0.5577, + "step": 22568 + }, + { + "epoch": 0.15845992574491732, + "grad_norm": 0.8829283118247986, + "learning_rate": 8.651760068962617e-06, + "loss": 0.5608, + "step": 22599 + }, + { + "epoch": 0.1586772918982025, + "grad_norm": 0.8605303764343262, + "learning_rate": 8.588315079291733e-06, + "loss": 0.5505, + "step": 22630 + }, + { + "epoch": 0.15889465805148772, + "grad_norm": 0.9638768434524536, + "learning_rate": 8.52505529325457e-06, + "loss": 0.5614, + "step": 22661 + }, + { + "epoch": 0.15911202420477294, + "grad_norm": 0.908811628818512, + "learning_rate": 8.461981424729216e-06, + "loss": 0.5606, + "step": 22692 + }, + { + "epoch": 0.15932939035805815, + "grad_norm": 0.9718073010444641, + "learning_rate": 8.399094185495725e-06, + "loss": 0.5615, + "step": 22723 + }, + { + "epoch": 0.15954675651134337, + "grad_norm": 0.9598197937011719, + "learning_rate": 8.336394285228017e-06, + "loss": 0.5512, + "step": 22754 + }, + { + "epoch": 0.15976412266462856, + "grad_norm": 0.8870306611061096, + "learning_rate": 8.273882431485952e-06, + "loss": 0.5565, + "step": 22785 + }, + { + "epoch": 0.15998148881791377, + "grad_norm": 0.7993106842041016, + "learning_rate": 8.211559329707316e-06, + "loss": 0.5529, + "step": 22816 + }, + { + "epoch": 0.16019885497119898, + "grad_norm": 0.8731540441513062, + "learning_rate": 8.149425683199823e-06, + "loss": 0.5586, + "step": 22847 + }, + { + "epoch": 0.1604162211244842, + "grad_norm": 0.8460251688957214, + "learning_rate": 8.08748219313325e-06, + "loss": 0.5583, + "step": 22878 + }, + { + "epoch": 0.1606335872777694, + "grad_norm": 0.9626048803329468, + "learning_rate": 8.025729558531453e-06, + "loss": 0.5558, + "step": 22909 + }, + { + "epoch": 0.1608509534310546, + "grad_norm": 0.9286885261535645, + "learning_rate": 7.964168476264508e-06, + "loss": 0.5611, + "step": 22940 + }, + { + "epoch": 0.16106831958433981, + "grad_norm": 0.8666262030601501, + "learning_rate": 7.902799641040884e-06, + "loss": 0.5646, + "step": 22971 + }, + { + "epoch": 0.16128568573762503, + "grad_norm": 0.9536890387535095, + "learning_rate": 7.841623745399523e-06, + "loss": 0.5562, + "step": 23002 + }, + { + "epoch": 0.16150305189091024, + "grad_norm": 0.9607664942741394, + "learning_rate": 7.780641479702114e-06, + "loss": 0.5623, + "step": 23033 + }, + { + "epoch": 0.16172041804419546, + "grad_norm": 0.9009374380111694, + "learning_rate": 7.719853532125227e-06, + "loss": 0.5578, + "step": 23064 + }, + { + "epoch": 0.16193778419748064, + "grad_norm": 0.7630103230476379, + "learning_rate": 7.65926058865258e-06, + "loss": 0.5601, + "step": 23095 + }, + { + "epoch": 0.16215515035076586, + "grad_norm": 0.8619088530540466, + "learning_rate": 7.598863333067313e-06, + "loss": 0.5571, + "step": 23126 + }, + { + "epoch": 0.16237251650405107, + "grad_norm": 1.0103073120117188, + "learning_rate": 7.538662446944253e-06, + "loss": 0.5587, + "step": 23157 + }, + { + "epoch": 0.1625898826573363, + "grad_norm": 0.9380328059196472, + "learning_rate": 7.478658609642211e-06, + "loss": 0.555, + "step": 23188 + }, + { + "epoch": 0.1628072488106215, + "grad_norm": 0.9486220479011536, + "learning_rate": 7.418852498296327e-06, + "loss": 0.5658, + "step": 23219 + }, + { + "epoch": 0.1630246149639067, + "grad_norm": 0.9146499633789062, + "learning_rate": 7.359244787810457e-06, + "loss": 0.5503, + "step": 23250 + }, + { + "epoch": 0.1632419811171919, + "grad_norm": 0.911389946937561, + "learning_rate": 7.299836150849493e-06, + "loss": 0.5584, + "step": 23281 + }, + { + "epoch": 0.16345934727047712, + "grad_norm": 0.8847711086273193, + "learning_rate": 7.240627257831847e-06, + "loss": 0.5494, + "step": 23312 + }, + { + "epoch": 0.16367671342376233, + "grad_norm": 0.9155315160751343, + "learning_rate": 7.1816187769218195e-06, + "loss": 0.548, + "step": 23343 + }, + { + "epoch": 0.16389407957704755, + "grad_norm": 0.8847165703773499, + "learning_rate": 7.1228113740220895e-06, + "loss": 0.5512, + "step": 23374 + }, + { + "epoch": 0.16411144573033273, + "grad_norm": 1.0043821334838867, + "learning_rate": 7.064205712766226e-06, + "loss": 0.554, + "step": 23405 + }, + { + "epoch": 0.16432881188361795, + "grad_norm": 0.9789336323738098, + "learning_rate": 7.005802454511129e-06, + "loss": 0.5579, + "step": 23436 + }, + { + "epoch": 0.16454617803690316, + "grad_norm": 0.8675613403320312, + "learning_rate": 6.947602258329639e-06, + "loss": 0.5547, + "step": 23467 + }, + { + "epoch": 0.16476354419018838, + "grad_norm": 1.0360661745071411, + "learning_rate": 6.889605781003078e-06, + "loss": 0.5575, + "step": 23498 + }, + { + "epoch": 0.1649809103434736, + "grad_norm": 0.9654151201248169, + "learning_rate": 6.831813677013776e-06, + "loss": 0.5575, + "step": 23529 + }, + { + "epoch": 0.16519827649675878, + "grad_norm": 0.886508584022522, + "learning_rate": 6.774226598537792e-06, + "loss": 0.5552, + "step": 23560 + }, + { + "epoch": 0.165415642650044, + "grad_norm": 0.8399243950843811, + "learning_rate": 6.716845195437482e-06, + "loss": 0.5494, + "step": 23591 + }, + { + "epoch": 0.1656330088033292, + "grad_norm": 0.876013457775116, + "learning_rate": 6.659670115254168e-06, + "loss": 0.5589, + "step": 23622 + }, + { + "epoch": 0.16585037495661442, + "grad_norm": 0.8546695709228516, + "learning_rate": 6.602702003200872e-06, + "loss": 0.5558, + "step": 23653 + }, + { + "epoch": 0.16606774110989964, + "grad_norm": 0.8829993605613708, + "learning_rate": 6.545941502154992e-06, + "loss": 0.5605, + "step": 23684 + }, + { + "epoch": 0.16628510726318482, + "grad_norm": 0.8759157657623291, + "learning_rate": 6.489389252651057e-06, + "loss": 0.5546, + "step": 23715 + }, + { + "epoch": 0.16650247341647004, + "grad_norm": 0.9579117894172668, + "learning_rate": 6.4330458928735325e-06, + "loss": 0.5507, + "step": 23746 + }, + { + "epoch": 0.16671983956975525, + "grad_norm": 0.9086149334907532, + "learning_rate": 6.376912058649559e-06, + "loss": 0.5594, + "step": 23777 + }, + { + "epoch": 0.16693720572304047, + "grad_norm": 0.889070451259613, + "learning_rate": 6.320988383441845e-06, + "loss": 0.5591, + "step": 23808 + }, + { + "epoch": 0.16715457187632568, + "grad_norm": 0.8501099348068237, + "learning_rate": 6.265275498341452e-06, + "loss": 0.5655, + "step": 23839 + }, + { + "epoch": 0.16737193802961087, + "grad_norm": 0.933879554271698, + "learning_rate": 6.209774032060714e-06, + "loss": 0.5499, + "step": 23870 + }, + { + "epoch": 0.16758930418289608, + "grad_norm": 0.8791343569755554, + "learning_rate": 6.1544846109261365e-06, + "loss": 0.5594, + "step": 23901 + }, + { + "epoch": 0.1678066703361813, + "grad_norm": 0.92324298620224, + "learning_rate": 6.099407858871342e-06, + "loss": 0.5581, + "step": 23932 + }, + { + "epoch": 0.1680240364894665, + "grad_norm": 0.9028039574623108, + "learning_rate": 6.044544397429958e-06, + "loss": 0.5572, + "step": 23963 + }, + { + "epoch": 0.16824140264275173, + "grad_norm": 0.9681089520454407, + "learning_rate": 5.989894845728708e-06, + "loss": 0.5547, + "step": 23994 + }, + { + "epoch": 0.1684587687960369, + "grad_norm": 0.8708662390708923, + "learning_rate": 5.9354598204803605e-06, + "loss": 0.5578, + "step": 24025 + }, + { + "epoch": 0.16867613494932213, + "grad_norm": 0.9106061458587646, + "learning_rate": 5.881239935976762e-06, + "loss": 0.5563, + "step": 24056 + }, + { + "epoch": 0.16889350110260734, + "grad_norm": 0.8620893359184265, + "learning_rate": 5.827235804081954e-06, + "loss": 0.5528, + "step": 24087 + }, + { + "epoch": 0.16911086725589256, + "grad_norm": 0.8461076617240906, + "learning_rate": 5.773448034225221e-06, + "loss": 0.5585, + "step": 24118 + }, + { + "epoch": 0.16932823340917777, + "grad_norm": 0.9462336301803589, + "learning_rate": 5.719877233394228e-06, + "loss": 0.5526, + "step": 24149 + }, + { + "epoch": 0.16954559956246296, + "grad_norm": 0.9454036355018616, + "learning_rate": 5.666524006128191e-06, + "loss": 0.5496, + "step": 24180 + }, + { + "epoch": 0.16976296571574817, + "grad_norm": 0.9001603722572327, + "learning_rate": 5.613388954511015e-06, + "loss": 0.5601, + "step": 24211 + }, + { + "epoch": 0.16998033186903339, + "grad_norm": 0.8742856383323669, + "learning_rate": 5.560472678164552e-06, + "loss": 0.5638, + "step": 24242 + }, + { + "epoch": 0.1701976980223186, + "grad_norm": 0.8686881065368652, + "learning_rate": 5.507775774241775e-06, + "loss": 0.5541, + "step": 24273 + }, + { + "epoch": 0.17041506417560381, + "grad_norm": 0.9935572147369385, + "learning_rate": 5.4552988374200945e-06, + "loss": 0.5545, + "step": 24304 + }, + { + "epoch": 0.170632430328889, + "grad_norm": 0.8736170530319214, + "learning_rate": 5.403042459894597e-06, + "loss": 0.555, + "step": 24335 + }, + { + "epoch": 0.17084979648217422, + "grad_norm": 0.977024495601654, + "learning_rate": 5.3510072313714135e-06, + "loss": 0.5497, + "step": 24366 + }, + { + "epoch": 0.17106716263545943, + "grad_norm": 0.9486203789710999, + "learning_rate": 5.2991937390610205e-06, + "loss": 0.5542, + "step": 24397 + }, + { + "epoch": 0.17128452878874464, + "grad_norm": 0.9372376203536987, + "learning_rate": 5.247602567671625e-06, + "loss": 0.5553, + "step": 24428 + }, + { + "epoch": 0.17150189494202986, + "grad_norm": 0.86311274766922, + "learning_rate": 5.196234299402603e-06, + "loss": 0.5609, + "step": 24459 + }, + { + "epoch": 0.17171926109531505, + "grad_norm": 0.8967480659484863, + "learning_rate": 5.145089513937865e-06, + "loss": 0.5513, + "step": 24490 + }, + { + "epoch": 0.17193662724860026, + "grad_norm": 1.0366567373275757, + "learning_rate": 5.094168788439369e-06, + "loss": 0.5606, + "step": 24521 + }, + { + "epoch": 0.17215399340188547, + "grad_norm": 0.9255256652832031, + "learning_rate": 5.043472697540594e-06, + "loss": 0.5531, + "step": 24552 + }, + { + "epoch": 0.1723713595551707, + "grad_norm": 0.921812117099762, + "learning_rate": 4.993001813340012e-06, + "loss": 0.556, + "step": 24583 + }, + { + "epoch": 0.1725887257084559, + "grad_norm": 0.8789228200912476, + "learning_rate": 4.942756705394702e-06, + "loss": 0.5506, + "step": 24614 + }, + { + "epoch": 0.1728060918617411, + "grad_norm": 0.90621018409729, + "learning_rate": 4.892737940713884e-06, + "loss": 0.553, + "step": 24645 + }, + { + "epoch": 0.1730234580150263, + "grad_norm": 0.8067776560783386, + "learning_rate": 4.842946083752511e-06, + "loss": 0.5519, + "step": 24676 + }, + { + "epoch": 0.17324082416831152, + "grad_norm": 0.9581841230392456, + "learning_rate": 4.79338169640493e-06, + "loss": 0.5528, + "step": 24707 + }, + { + "epoch": 0.17345819032159673, + "grad_norm": 0.8731085062026978, + "learning_rate": 4.74404533799851e-06, + "loss": 0.5598, + "step": 24738 + }, + { + "epoch": 0.17367555647488195, + "grad_norm": 0.9524958729743958, + "learning_rate": 4.694937565287344e-06, + "loss": 0.5621, + "step": 24769 + }, + { + "epoch": 0.17389292262816713, + "grad_norm": 0.8445264101028442, + "learning_rate": 4.646058932445985e-06, + "loss": 0.5533, + "step": 24800 + }, + { + "epoch": 0.17411028878145235, + "grad_norm": 0.916232705116272, + "learning_rate": 4.597409991063148e-06, + "loss": 0.5455, + "step": 24831 + }, + { + "epoch": 0.17432765493473756, + "grad_norm": 0.8723344206809998, + "learning_rate": 4.5489912901355375e-06, + "loss": 0.5604, + "step": 24862 + }, + { + "epoch": 0.17454502108802278, + "grad_norm": 1.078605055809021, + "learning_rate": 4.500803376061608e-06, + "loss": 0.5541, + "step": 24893 + }, + { + "epoch": 0.174762387241308, + "grad_norm": 0.994999349117279, + "learning_rate": 4.45284679263541e-06, + "loss": 0.554, + "step": 24924 + }, + { + "epoch": 0.17497975339459318, + "grad_norm": 0.8538706302642822, + "learning_rate": 4.4051220810404775e-06, + "loss": 0.5584, + "step": 24955 + }, + { + "epoch": 0.1751971195478784, + "grad_norm": 0.9859182238578796, + "learning_rate": 4.3576297798437025e-06, + "loss": 0.5518, + "step": 24986 + }, + { + "epoch": 0.1754144857011636, + "grad_norm": 0.9955600500106812, + "learning_rate": 4.3103704249892436e-06, + "loss": 0.5528, + "step": 25017 + }, + { + "epoch": 0.17563185185444882, + "grad_norm": 0.8618113994598389, + "learning_rate": 4.263344549792487e-06, + "loss": 0.5538, + "step": 25048 + }, + { + "epoch": 0.17584921800773404, + "grad_norm": 0.9471727013587952, + "learning_rate": 4.216552684934056e-06, + "loss": 0.5519, + "step": 25079 + }, + { + "epoch": 0.17606658416101922, + "grad_norm": 0.9868952035903931, + "learning_rate": 4.169995358453777e-06, + "loss": 0.5488, + "step": 25110 + }, + { + "epoch": 0.17628395031430444, + "grad_norm": 0.9616749286651611, + "learning_rate": 4.123673095744757e-06, + "loss": 0.5548, + "step": 25141 + }, + { + "epoch": 0.17650131646758965, + "grad_norm": 0.8719627261161804, + "learning_rate": 4.077586419547435e-06, + "loss": 0.5537, + "step": 25172 + }, + { + "epoch": 0.17671868262087487, + "grad_norm": 0.9148105382919312, + "learning_rate": 4.03173584994368e-06, + "loss": 0.5496, + "step": 25203 + }, + { + "epoch": 0.17693604877416008, + "grad_norm": 0.8709606528282166, + "learning_rate": 3.986121904350948e-06, + "loss": 0.5484, + "step": 25234 + }, + { + "epoch": 0.17715341492744527, + "grad_norm": 0.9007585644721985, + "learning_rate": 3.940745097516407e-06, + "loss": 0.5612, + "step": 25265 + }, + { + "epoch": 0.17737078108073048, + "grad_norm": 0.9081540107727051, + "learning_rate": 3.89560594151116e-06, + "loss": 0.548, + "step": 25296 + }, + { + "epoch": 0.1775881472340157, + "grad_norm": 0.9592068195343018, + "learning_rate": 3.850704945724456e-06, + "loss": 0.5524, + "step": 25327 + }, + { + "epoch": 0.1778055133873009, + "grad_norm": 0.84175705909729, + "learning_rate": 3.8060426168579077e-06, + "loss": 0.5527, + "step": 25358 + }, + { + "epoch": 0.17802287954058613, + "grad_norm": 0.9197003841400146, + "learning_rate": 3.7616194589198407e-06, + "loss": 0.5525, + "step": 25389 + }, + { + "epoch": 0.1782402456938713, + "grad_norm": 0.9052679538726807, + "learning_rate": 3.7174359732195574e-06, + "loss": 0.5567, + "step": 25420 + }, + { + "epoch": 0.17845761184715653, + "grad_norm": 0.8521902561187744, + "learning_rate": 3.673492658361677e-06, + "loss": 0.5509, + "step": 25451 + }, + { + "epoch": 0.17867497800044174, + "grad_norm": 0.8626607060432434, + "learning_rate": 3.6297900102405467e-06, + "loss": 0.552, + "step": 25482 + }, + { + "epoch": 0.17889234415372696, + "grad_norm": 0.876883864402771, + "learning_rate": 3.586328522034607e-06, + "loss": 0.5505, + "step": 25513 + }, + { + "epoch": 0.17910971030701217, + "grad_norm": 0.899641215801239, + "learning_rate": 3.543108684200838e-06, + "loss": 0.5564, + "step": 25544 + }, + { + "epoch": 0.17932707646029736, + "grad_norm": 0.8642168045043945, + "learning_rate": 3.5001309844692464e-06, + "loss": 0.5585, + "step": 25575 + }, + { + "epoch": 0.17954444261358257, + "grad_norm": 0.8817245364189148, + "learning_rate": 3.4573959078373215e-06, + "loss": 0.5574, + "step": 25606 + }, + { + "epoch": 0.1797618087668678, + "grad_norm": 0.8665372729301453, + "learning_rate": 3.4149039365646063e-06, + "loss": 0.5573, + "step": 25637 + }, + { + "epoch": 0.179979174920153, + "grad_norm": 0.8398141264915466, + "learning_rate": 3.3726555501672143e-06, + "loss": 0.5528, + "step": 25668 + }, + { + "epoch": 0.18019654107343822, + "grad_norm": 0.8872261047363281, + "learning_rate": 3.33065122541244e-06, + "loss": 0.5579, + "step": 25699 + }, + { + "epoch": 0.1804139072267234, + "grad_norm": 0.8895812630653381, + "learning_rate": 3.288891436313385e-06, + "loss": 0.5521, + "step": 25730 + }, + { + "epoch": 0.18063127338000862, + "grad_norm": 1.0453636646270752, + "learning_rate": 3.2473766541235963e-06, + "loss": 0.5489, + "step": 25761 + }, + { + "epoch": 0.18084863953329383, + "grad_norm": 0.8738374710083008, + "learning_rate": 3.2061073473317466e-06, + "loss": 0.5518, + "step": 25792 + }, + { + "epoch": 0.18106600568657905, + "grad_norm": 0.8890265226364136, + "learning_rate": 3.1650839816563444e-06, + "loss": 0.5531, + "step": 25823 + }, + { + "epoch": 0.18128337183986426, + "grad_norm": 0.8346575498580933, + "learning_rate": 3.1243070200405093e-06, + "loss": 0.5565, + "step": 25854 + }, + { + "epoch": 0.18150073799314945, + "grad_norm": 0.9863383769989014, + "learning_rate": 3.0837769226467e-06, + "loss": 0.555, + "step": 25885 + }, + { + "epoch": 0.18171810414643466, + "grad_norm": 0.903465747833252, + "learning_rate": 3.0434941468515666e-06, + "loss": 0.5502, + "step": 25916 + }, + { + "epoch": 0.18193547029971988, + "grad_norm": 0.9531137347221375, + "learning_rate": 3.003459147240753e-06, + "loss": 0.5514, + "step": 25947 + }, + { + "epoch": 0.1821528364530051, + "grad_norm": 0.8925647139549255, + "learning_rate": 2.9636723756037875e-06, + "loss": 0.55, + "step": 25978 + }, + { + "epoch": 0.1823702026062903, + "grad_norm": 0.920185923576355, + "learning_rate": 2.9241342809289833e-06, + "loss": 0.5526, + "step": 26009 + }, + { + "epoch": 0.1825875687595755, + "grad_norm": 0.813601553440094, + "learning_rate": 2.8848453093983594e-06, + "loss": 0.5469, + "step": 26040 + }, + { + "epoch": 0.1828049349128607, + "grad_norm": 0.8758238554000854, + "learning_rate": 2.8458059043826257e-06, + "loss": 0.5522, + "step": 26071 + }, + { + "epoch": 0.18302230106614592, + "grad_norm": 0.8929989337921143, + "learning_rate": 2.807016506436172e-06, + "loss": 0.5504, + "step": 26102 + }, + { + "epoch": 0.18323966721943113, + "grad_norm": 0.9408402442932129, + "learning_rate": 2.7684775532920566e-06, + "loss": 0.5534, + "step": 26133 + }, + { + "epoch": 0.18345703337271635, + "grad_norm": 0.9396249651908875, + "learning_rate": 2.7301894798571425e-06, + "loss": 0.549, + "step": 26164 + }, + { + "epoch": 0.18367439952600154, + "grad_norm": 0.826866626739502, + "learning_rate": 2.6921527182071386e-06, + "loss": 0.5521, + "step": 26195 + }, + { + "epoch": 0.18389176567928675, + "grad_norm": 0.9264094233512878, + "learning_rate": 2.654367697581725e-06, + "loss": 0.5561, + "step": 26226 + }, + { + "epoch": 0.18410913183257197, + "grad_norm": 0.9079062938690186, + "learning_rate": 2.6168348443797175e-06, + "loss": 0.5476, + "step": 26257 + }, + { + "epoch": 0.18432649798585718, + "grad_norm": 0.8511149287223816, + "learning_rate": 2.5795545821542757e-06, + "loss": 0.553, + "step": 26288 + }, + { + "epoch": 0.1845438641391424, + "grad_norm": 1.0137633085250854, + "learning_rate": 2.54252733160808e-06, + "loss": 0.5512, + "step": 26319 + }, + { + "epoch": 0.18476123029242758, + "grad_norm": 0.8501981496810913, + "learning_rate": 2.5057535105886294e-06, + "loss": 0.5534, + "step": 26350 + }, + { + "epoch": 0.1849785964457128, + "grad_norm": 0.9779496788978577, + "learning_rate": 2.4692335340834953e-06, + "loss": 0.5528, + "step": 26381 + }, + { + "epoch": 0.185195962598998, + "grad_norm": 1.0374913215637207, + "learning_rate": 2.432967814215639e-06, + "loss": 0.5547, + "step": 26412 + }, + { + "epoch": 0.18541332875228322, + "grad_norm": 0.9530802369117737, + "learning_rate": 2.396956760238794e-06, + "loss": 0.5523, + "step": 26443 + }, + { + "epoch": 0.18563069490556844, + "grad_norm": 0.8887461423873901, + "learning_rate": 2.361200778532796e-06, + "loss": 0.5584, + "step": 26474 + }, + { + "epoch": 0.18584806105885363, + "grad_norm": 0.8762808442115784, + "learning_rate": 2.325700272599049e-06, + "loss": 0.5523, + "step": 26505 + }, + { + "epoch": 0.18606542721213884, + "grad_norm": 0.9088528156280518, + "learning_rate": 2.2904556430559415e-06, + "loss": 0.5524, + "step": 26536 + }, + { + "epoch": 0.18628279336542405, + "grad_norm": 0.8415138721466064, + "learning_rate": 2.2554672876343106e-06, + "loss": 0.5591, + "step": 26567 + }, + { + "epoch": 0.18650015951870927, + "grad_norm": 0.9040454030036926, + "learning_rate": 2.220735601173002e-06, + "loss": 0.5512, + "step": 26598 + }, + { + "epoch": 0.18671752567199448, + "grad_norm": 0.8373351693153381, + "learning_rate": 2.186260975614382e-06, + "loss": 0.5625, + "step": 26629 + }, + { + "epoch": 0.18693489182527967, + "grad_norm": 0.9010198712348938, + "learning_rate": 2.1520437999999034e-06, + "loss": 0.5495, + "step": 26660 + }, + { + "epoch": 0.18715225797856488, + "grad_norm": 0.9319360256195068, + "learning_rate": 2.1180844604657526e-06, + "loss": 0.5482, + "step": 26691 + }, + { + "epoch": 0.1873696241318501, + "grad_norm": 0.8212644457817078, + "learning_rate": 2.084383340238455e-06, + "loss": 0.5496, + "step": 26722 + }, + { + "epoch": 0.1875869902851353, + "grad_norm": 0.8496743440628052, + "learning_rate": 2.0509408196305704e-06, + "loss": 0.5578, + "step": 26753 + }, + { + "epoch": 0.18780435643842053, + "grad_norm": 0.9512404799461365, + "learning_rate": 2.017757276036403e-06, + "loss": 0.5512, + "step": 26784 + }, + { + "epoch": 0.18802172259170571, + "grad_norm": 0.8317390084266663, + "learning_rate": 1.984833083927726e-06, + "loss": 0.5529, + "step": 26815 + }, + { + "epoch": 0.18823908874499093, + "grad_norm": 0.9206368327140808, + "learning_rate": 1.952168614849581e-06, + "loss": 0.5565, + "step": 26846 + }, + { + "epoch": 0.18845645489827614, + "grad_norm": 0.8799408674240112, + "learning_rate": 1.919764237416058e-06, + "loss": 0.5492, + "step": 26877 + }, + { + "epoch": 0.18867382105156136, + "grad_norm": 0.8770999908447266, + "learning_rate": 1.8876203173061463e-06, + "loss": 0.5542, + "step": 26908 + }, + { + "epoch": 0.18889118720484657, + "grad_norm": 0.9064630270004272, + "learning_rate": 1.8557372172596206e-06, + "loss": 0.547, + "step": 26939 + }, + { + "epoch": 0.1891085533581318, + "grad_norm": 0.8288804292678833, + "learning_rate": 1.8241152970729341e-06, + "loss": 0.5603, + "step": 26970 + }, + { + "epoch": 0.18932591951141697, + "grad_norm": 0.8370488882064819, + "learning_rate": 1.7927549135951572e-06, + "loss": 0.5507, + "step": 27001 + }, + { + "epoch": 0.1895432856647022, + "grad_norm": 0.9462096095085144, + "learning_rate": 1.7616564207239477e-06, + "loss": 0.5614, + "step": 27032 + }, + { + "epoch": 0.1897606518179874, + "grad_norm": 0.8369693160057068, + "learning_rate": 1.730820169401584e-06, + "loss": 0.5474, + "step": 27063 + }, + { + "epoch": 0.18997801797127262, + "grad_norm": 0.9731806516647339, + "learning_rate": 1.7002465076109558e-06, + "loss": 0.5496, + "step": 27094 + }, + { + "epoch": 0.19019538412455783, + "grad_norm": 0.9371094703674316, + "learning_rate": 1.6699357803716898e-06, + "loss": 0.5478, + "step": 27125 + }, + { + "epoch": 0.19041275027784302, + "grad_norm": 0.8841030597686768, + "learning_rate": 1.6398883297362305e-06, + "loss": 0.5554, + "step": 27156 + }, + { + "epoch": 0.19063011643112823, + "grad_norm": 0.9003316760063171, + "learning_rate": 1.6101044947859606e-06, + "loss": 0.5534, + "step": 27187 + }, + { + "epoch": 0.19084748258441345, + "grad_norm": 1.0026649236679077, + "learning_rate": 1.5805846116274114e-06, + "loss": 0.5577, + "step": 27218 + }, + { + "epoch": 0.19106484873769866, + "grad_norm": 0.9244189262390137, + "learning_rate": 1.5513290133884611e-06, + "loss": 0.5573, + "step": 27249 + }, + { + "epoch": 0.19128221489098388, + "grad_norm": 0.9474987983703613, + "learning_rate": 1.5223380302145512e-06, + "loss": 0.5427, + "step": 27280 + }, + { + "epoch": 0.19149958104426906, + "grad_norm": 0.8063711524009705, + "learning_rate": 1.4936119892649925e-06, + "loss": 0.5399, + "step": 27311 + }, + { + "epoch": 0.19171694719755428, + "grad_norm": 0.9248738288879395, + "learning_rate": 1.4651512147092482e-06, + "loss": 0.5672, + "step": 27342 + }, + { + "epoch": 0.1919343133508395, + "grad_norm": 0.9307349324226379, + "learning_rate": 1.4369560277232908e-06, + "loss": 0.552, + "step": 27373 + }, + { + "epoch": 0.1921516795041247, + "grad_norm": 0.8733169436454773, + "learning_rate": 1.409026746485978e-06, + "loss": 0.5476, + "step": 27404 + }, + { + "epoch": 0.19236904565740992, + "grad_norm": 0.892084002494812, + "learning_rate": 1.3813636861754464e-06, + "loss": 0.5472, + "step": 27435 + }, + { + "epoch": 0.1925864118106951, + "grad_norm": 0.929440975189209, + "learning_rate": 1.3539671589655773e-06, + "loss": 0.5453, + "step": 27466 + }, + { + "epoch": 0.19280377796398032, + "grad_norm": 0.9144075512886047, + "learning_rate": 1.3268374740224548e-06, + "loss": 0.5462, + "step": 27497 + }, + { + "epoch": 0.19302114411726554, + "grad_norm": 0.990763247013092, + "learning_rate": 1.2999749375008807e-06, + "loss": 0.5496, + "step": 27528 + }, + { + "epoch": 0.19323851027055075, + "grad_norm": 0.9114391803741455, + "learning_rate": 1.2733798525409346e-06, + "loss": 0.5468, + "step": 27559 + }, + { + "epoch": 0.19345587642383597, + "grad_norm": 0.9742738008499146, + "learning_rate": 1.2470525192645383e-06, + "loss": 0.5495, + "step": 27590 + }, + { + "epoch": 0.19367324257712115, + "grad_norm": 0.9740488529205322, + "learning_rate": 1.2209932347720666e-06, + "loss": 0.5475, + "step": 27621 + }, + { + "epoch": 0.19389060873040637, + "grad_norm": 0.8361204266548157, + "learning_rate": 1.1952022931389972e-06, + "loss": 0.5607, + "step": 27652 + }, + { + "epoch": 0.19410797488369158, + "grad_norm": 0.948847770690918, + "learning_rate": 1.1696799854126083e-06, + "loss": 0.5548, + "step": 27683 + }, + { + "epoch": 0.1943253410369768, + "grad_norm": 0.912126362323761, + "learning_rate": 1.1444265996086694e-06, + "loss": 0.5611, + "step": 27714 + }, + { + "epoch": 0.194542707190262, + "grad_norm": 0.9116827845573425, + "learning_rate": 1.119442420708211e-06, + "loss": 0.5505, + "step": 27745 + }, + { + "epoch": 0.1947600733435472, + "grad_norm": 0.8980589509010315, + "learning_rate": 1.0947277306542964e-06, + "loss": 0.5588, + "step": 27776 + }, + { + "epoch": 0.1949774394968324, + "grad_norm": 0.8803214430809021, + "learning_rate": 1.0702828083488353e-06, + "loss": 0.5559, + "step": 27807 + }, + { + "epoch": 0.19519480565011763, + "grad_norm": 0.9106587767601013, + "learning_rate": 1.0461079296494647e-06, + "loss": 0.5522, + "step": 27838 + }, + { + "epoch": 0.19541217180340284, + "grad_norm": 0.871856689453125, + "learning_rate": 1.0222033673663978e-06, + "loss": 0.5552, + "step": 27869 + }, + { + "epoch": 0.19562953795668805, + "grad_norm": 0.8996092677116394, + "learning_rate": 9.985693912593713e-07, + "loss": 0.5579, + "step": 27900 + }, + { + "epoch": 0.19584690410997324, + "grad_norm": 0.9253154397010803, + "learning_rate": 9.752062680346035e-07, + "loss": 0.5521, + "step": 27931 + }, + { + "epoch": 0.19606427026325846, + "grad_norm": 0.9413549900054932, + "learning_rate": 9.521142613417494e-07, + "loss": 0.5511, + "step": 27962 + }, + { + "epoch": 0.19628163641654367, + "grad_norm": 1.0021663904190063, + "learning_rate": 9.292936317709722e-07, + "loss": 0.5522, + "step": 27993 + }, + { + "epoch": 0.19649900256982888, + "grad_norm": 0.910113513469696, + "learning_rate": 9.067446368499793e-07, + "loss": 0.5423, + "step": 28024 + }, + { + "epoch": 0.1967163687231141, + "grad_norm": 0.9891364574432373, + "learning_rate": 8.844675310411055e-07, + "loss": 0.5527, + "step": 28055 + }, + { + "epoch": 0.19693373487639929, + "grad_norm": 0.8617024421691895, + "learning_rate": 8.6246256573847e-07, + "loss": 0.5519, + "step": 28086 + }, + { + "epoch": 0.1971511010296845, + "grad_norm": 0.8602914214134216, + "learning_rate": 8.407299892651127e-07, + "loss": 0.5533, + "step": 28117 + }, + { + "epoch": 0.19736846718296971, + "grad_norm": 0.8551098704338074, + "learning_rate": 8.19270046870202e-07, + "loss": 0.5579, + "step": 28148 + }, + { + "epoch": 0.19758583333625493, + "grad_norm": 0.8507790565490723, + "learning_rate": 7.980829807262752e-07, + "loss": 0.5472, + "step": 28179 + }, + { + "epoch": 0.19780319948954014, + "grad_norm": 0.9151542782783508, + "learning_rate": 7.771690299264889e-07, + "loss": 0.554, + "step": 28210 + }, + { + "epoch": 0.19802056564282533, + "grad_norm": 0.9357946515083313, + "learning_rate": 7.565284304819426e-07, + "loss": 0.5436, + "step": 28241 + }, + { + "epoch": 0.19823793179611054, + "grad_norm": 0.9336006045341492, + "learning_rate": 7.361614153189922e-07, + "loss": 0.5583, + "step": 28272 + }, + { + "epoch": 0.19845529794939576, + "grad_norm": 1.0290875434875488, + "learning_rate": 7.160682142766328e-07, + "loss": 0.5559, + "step": 28303 + }, + { + "epoch": 0.19867266410268097, + "grad_norm": 0.817413866519928, + "learning_rate": 6.962490541039091e-07, + "loss": 0.5593, + "step": 28334 + }, + { + "epoch": 0.1988900302559662, + "grad_norm": 0.8156284689903259, + "learning_rate": 6.767041584573531e-07, + "loss": 0.5562, + "step": 28365 + }, + { + "epoch": 0.19910739640925137, + "grad_norm": 0.8753253221511841, + "learning_rate": 6.574337478984532e-07, + "loss": 0.547, + "step": 28396 + }, + { + "epoch": 0.1993247625625366, + "grad_norm": 0.8520931005477905, + "learning_rate": 6.384380398911732e-07, + "loss": 0.5489, + "step": 28427 + }, + { + "epoch": 0.1995421287158218, + "grad_norm": 0.8849247097969055, + "learning_rate": 6.197172487994951e-07, + "loss": 0.5454, + "step": 28458 + }, + { + "epoch": 0.19975949486910702, + "grad_norm": 0.8934049606323242, + "learning_rate": 6.012715858850021e-07, + "loss": 0.5533, + "step": 28489 + }, + { + "epoch": 0.19997686102239223, + "grad_norm": 0.8335791230201721, + "learning_rate": 5.831012593044971e-07, + "loss": 0.5455, + "step": 28520 + }, + { + "epoch": 0.20019422717567742, + "grad_norm": 0.8317821621894836, + "learning_rate": 5.652064741076435e-07, + "loss": 0.5497, + "step": 28551 + }, + { + "epoch": 0.20041159332896263, + "grad_norm": 0.8726831078529358, + "learning_rate": 5.475874322346558e-07, + "loss": 0.5481, + "step": 28582 + }, + { + "epoch": 0.20062895948224785, + "grad_norm": 0.9333974719047546, + "learning_rate": 5.30244332514035e-07, + "loss": 0.5449, + "step": 28613 + }, + { + "epoch": 0.20084632563553306, + "grad_norm": 0.8533473610877991, + "learning_rate": 5.131773706602977e-07, + "loss": 0.5519, + "step": 28644 + }, + { + "epoch": 0.20106369178881828, + "grad_norm": 0.9046511054039001, + "learning_rate": 4.963867392717897e-07, + "loss": 0.5518, + "step": 28675 + }, + { + "epoch": 0.20128105794210346, + "grad_norm": 0.8681894540786743, + "learning_rate": 4.798726278285093e-07, + "loss": 0.5636, + "step": 28706 + }, + { + "epoch": 0.20149842409538868, + "grad_norm": 0.8766895532608032, + "learning_rate": 4.6363522268995097e-07, + "loss": 0.5567, + "step": 28737 + }, + { + "epoch": 0.2017157902486739, + "grad_norm": 0.8112674951553345, + "learning_rate": 4.4767470709302927e-07, + "loss": 0.5524, + "step": 28768 + }, + { + "epoch": 0.2019331564019591, + "grad_norm": 0.8420015573501587, + "learning_rate": 4.319912611499971e-07, + "loss": 0.5551, + "step": 28799 + }, + { + "epoch": 0.20215052255524432, + "grad_norm": 0.9402871131896973, + "learning_rate": 4.1658506184640564e-07, + "loss": 0.5502, + "step": 28830 + }, + { + "epoch": 0.2023678887085295, + "grad_norm": 0.9862967729568481, + "learning_rate": 4.0145628303911996e-07, + "loss": 0.5551, + "step": 28861 + }, + { + "epoch": 0.20258525486181472, + "grad_norm": 0.8821402192115784, + "learning_rate": 3.866050954543565e-07, + "loss": 0.5502, + "step": 28892 + }, + { + "epoch": 0.20280262101509994, + "grad_norm": 0.9727492332458496, + "learning_rate": 3.720316666857432e-07, + "loss": 0.5536, + "step": 28923 + }, + { + "epoch": 0.20301998716838515, + "grad_norm": 0.8336794376373291, + "learning_rate": 3.5773616119244845e-07, + "loss": 0.5394, + "step": 28954 + }, + { + "epoch": 0.20323735332167037, + "grad_norm": 0.9501764178276062, + "learning_rate": 3.437187402973052e-07, + "loss": 0.5422, + "step": 28985 + }, + { + "epoch": 0.20345471947495555, + "grad_norm": 0.8623166680335999, + "learning_rate": 3.2997956218500104e-07, + "loss": 0.5492, + "step": 29016 + }, + { + "epoch": 0.20367208562824077, + "grad_norm": 0.918738603591919, + "learning_rate": 3.165187819003018e-07, + "loss": 0.5512, + "step": 29047 + }, + { + "epoch": 0.20388945178152598, + "grad_norm": 0.9619165062904358, + "learning_rate": 3.033365513462755e-07, + "loss": 0.5431, + "step": 29078 + }, + { + "epoch": 0.2041068179348112, + "grad_norm": 1.0213710069656372, + "learning_rate": 2.9043301928260437e-07, + "loss": 0.557, + "step": 29109 + }, + { + "epoch": 0.2043241840880964, + "grad_norm": 0.8963508009910583, + "learning_rate": 2.7780833132389773e-07, + "loss": 0.5531, + "step": 29140 + }, + { + "epoch": 0.2045415502413816, + "grad_norm": 0.8586835861206055, + "learning_rate": 2.6546262993803473e-07, + "loss": 0.5526, + "step": 29171 + }, + { + "epoch": 0.2047589163946668, + "grad_norm": 0.9326640367507935, + "learning_rate": 2.533960544445879e-07, + "loss": 0.5506, + "step": 29202 + }, + { + "epoch": 0.20497628254795203, + "grad_norm": 0.8862287998199463, + "learning_rate": 2.416087410132134e-07, + "loss": 0.5489, + "step": 29233 + }, + { + "epoch": 0.20519364870123724, + "grad_norm": 0.8497974276542664, + "learning_rate": 2.301008226621465e-07, + "loss": 0.5488, + "step": 29264 + }, + { + "epoch": 0.20541101485452246, + "grad_norm": 0.8367092609405518, + "learning_rate": 2.1887242925668073e-07, + "loss": 0.5443, + "step": 29295 + }, + { + "epoch": 0.20562838100780764, + "grad_norm": 0.9077014923095703, + "learning_rate": 2.0792368750770785e-07, + "loss": 0.5498, + "step": 29326 + }, + { + "epoch": 0.20584574716109286, + "grad_norm": 0.8816044330596924, + "learning_rate": 1.9725472097028851e-07, + "loss": 0.5586, + "step": 29357 + }, + { + "epoch": 0.20606311331437807, + "grad_norm": 0.8443469405174255, + "learning_rate": 1.8686565004226718e-07, + "loss": 0.5516, + "step": 29388 + }, + { + "epoch": 0.20628047946766329, + "grad_norm": 0.8607436418533325, + "learning_rate": 1.7675659196288995e-07, + "loss": 0.5553, + "step": 29419 + }, + { + "epoch": 0.2064978456209485, + "grad_norm": 0.8874748349189758, + "learning_rate": 1.6692766081150556e-07, + "loss": 0.554, + "step": 29450 + }, + { + "epoch": 0.2067152117742337, + "grad_norm": 0.8883106708526611, + "learning_rate": 1.5737896750626647e-07, + "loss": 0.5477, + "step": 29481 + }, + { + "epoch": 0.2069325779275189, + "grad_norm": 1.0266894102096558, + "learning_rate": 1.4811061980287976e-07, + "loss": 0.5575, + "step": 29512 + }, + { + "epoch": 0.20714994408080412, + "grad_norm": 0.8207541704177856, + "learning_rate": 1.3912272229338886e-07, + "loss": 0.5452, + "step": 29543 + }, + { + "epoch": 0.20736731023408933, + "grad_norm": 0.8725236654281616, + "learning_rate": 1.3041537640499645e-07, + "loss": 0.5478, + "step": 29574 + }, + { + "epoch": 0.20758467638737454, + "grad_norm": 0.904036283493042, + "learning_rate": 1.2198868039891564e-07, + "loss": 0.5472, + "step": 29605 + }, + { + "epoch": 0.20780204254065973, + "grad_norm": 0.813684344291687, + "learning_rate": 1.138427293692651e-07, + "loss": 0.5474, + "step": 29636 + }, + { + "epoch": 0.20801940869394495, + "grad_norm": 0.9013733863830566, + "learning_rate": 1.0597761524199778e-07, + "loss": 0.5545, + "step": 29667 + }, + { + "epoch": 0.20823677484723016, + "grad_norm": 0.7993410229682922, + "learning_rate": 9.839342677385455e-08, + "loss": 0.5508, + "step": 29698 + }, + { + "epoch": 0.20845414100051537, + "grad_norm": 0.8909832239151001, + "learning_rate": 9.109024955137325e-08, + "loss": 0.5526, + "step": 29729 + }, + { + "epoch": 0.2086715071538006, + "grad_norm": 0.8944339156150818, + "learning_rate": 8.406816598991729e-08, + "loss": 0.5482, + "step": 29760 + }, + { + "epoch": 0.20888887330708578, + "grad_norm": 0.9002408981323242, + "learning_rate": 7.73272553327431e-08, + "loss": 0.5513, + "step": 29791 + }, + { + "epoch": 0.209106239460371, + "grad_norm": 0.8991537690162659, + "learning_rate": 7.086759365011186e-08, + "loss": 0.5455, + "step": 29822 + }, + { + "epoch": 0.2093236056136562, + "grad_norm": 0.8367495536804199, + "learning_rate": 6.468925383842639e-08, + "loss": 0.5473, + "step": 29853 + }, + { + "epoch": 0.20954097176694142, + "grad_norm": 0.9395527243614197, + "learning_rate": 5.8792305619415067e-08, + "loss": 0.5483, + "step": 29884 + }, + { + "epoch": 0.20975833792022663, + "grad_norm": 0.8302240371704102, + "learning_rate": 5.317681553933529e-08, + "loss": 0.5479, + "step": 29915 + }, + { + "epoch": 0.20997570407351182, + "grad_norm": 0.8468736410140991, + "learning_rate": 4.78428469682296e-08, + "loss": 0.5475, + "step": 29946 + }, + { + "epoch": 0.21019307022679704, + "grad_norm": 0.9445973634719849, + "learning_rate": 4.2790460099206844e-08, + "loss": 0.551, + "step": 29977 + }, + { + "epoch": 0.21041043638008225, + "grad_norm": 0.9727746248245239, + "learning_rate": 3.801971194777043e-08, + "loss": 0.5519, + "step": 30008 + }, + { + "epoch": 0.21062780253336746, + "grad_norm": 0.8241652250289917, + "learning_rate": 3.353065635115782e-08, + "loss": 0.5509, + "step": 30039 + }, + { + "epoch": 0.21084516868665268, + "grad_norm": 0.9168987274169922, + "learning_rate": 2.93233439677576e-08, + "loss": 0.5555, + "step": 30070 + }, + { + "epoch": 0.21106253483993787, + "grad_norm": 0.8214668035507202, + "learning_rate": 2.539782227651555e-08, + "loss": 0.5541, + "step": 30101 + }, + { + "epoch": 0.21127990099322308, + "grad_norm": 0.8454606533050537, + "learning_rate": 2.175413557641004e-08, + "loss": 0.5541, + "step": 30132 + }, + { + "epoch": 0.2114972671465083, + "grad_norm": 0.8458406329154968, + "learning_rate": 1.839232498594967e-08, + "loss": 0.5574, + "step": 30163 + }, + { + "epoch": 0.2117146332997935, + "grad_norm": 0.8554269671440125, + "learning_rate": 1.5312428442712522e-08, + "loss": 0.5505, + "step": 30194 + }, + { + "epoch": 0.21193199945307872, + "grad_norm": 0.8779774308204651, + "learning_rate": 1.2514480702913168e-08, + "loss": 0.5508, + "step": 30225 + }, + { + "epoch": 0.2121493656063639, + "grad_norm": 0.9194534420967102, + "learning_rate": 9.998513341005766e-09, + "loss": 0.5497, + "step": 30256 + }, + { + "epoch": 0.21236673175964912, + "grad_norm": 0.8502130508422852, + "learning_rate": 7.764554749345454e-09, + "loss": 0.5585, + "step": 30287 + }, + { + "epoch": 0.21258409791293434, + "grad_norm": 0.9179947972297668, + "learning_rate": 5.812630137849717e-09, + "loss": 0.5517, + "step": 30318 + }, + { + "epoch": 0.21280146406621955, + "grad_norm": 0.8943513035774231, + "learning_rate": 4.142761533723616e-09, + "loss": 0.5517, + "step": 30349 + }, + { + "epoch": 0.21301883021950477, + "grad_norm": 0.8810676336288452, + "learning_rate": 2.7549677812044317e-09, + "loss": 0.552, + "step": 30380 + }, + { + "epoch": 0.21323619637278995, + "grad_norm": 0.9478496313095093, + "learning_rate": 1.6492645413590525e-09, + "loss": 0.555, + "step": 30411 + }, + { + "epoch": 0.21345356252607517, + "grad_norm": 0.8304721713066101, + "learning_rate": 8.256642918980096e-10, + "loss": 0.5483, + "step": 30442 + }, + { + "epoch": 0.21367092867936038, + "grad_norm": 0.9064939022064209, + "learning_rate": 2.841763270367004e-10, + "loss": 0.5407, + "step": 30473 + }, + { + "epoch": 0.2138882948326456, + "grad_norm": 0.8490622043609619, + "learning_rate": 2.480675739269245e-11, + "loss": 0.5502, + "step": 30504 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.251434749612104e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-30517/training_args.bin b/checkpoint-30517/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d8a57652df56990298f489cbbc4d12b0a5d66dac --- /dev/null +++ b/checkpoint-30517/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff6a84b6c2a69c365aedcd04f176d62d0993a049ac50b28b1fc7fee54f4267a8 +size 5304 diff --git a/checkpoint-3052/config.json b/checkpoint-3052/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-3052/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-3052/generation_config.json b/checkpoint-3052/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-3052/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-3052/model-00001-of-00007.safetensors b/checkpoint-3052/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3fc0aee913a4cd51ce7b8e3b392efe3b3d369642 --- /dev/null +++ b/checkpoint-3052/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39802451e7e940df199a9aa294e6de40ec14e451247d1301db93d75f7417a99a +size 4886466168 diff --git a/checkpoint-3052/model-00002-of-00007.safetensors b/checkpoint-3052/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-3052/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-3052/model-00003-of-00007.safetensors b/checkpoint-3052/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-3052/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-3052/model-00004-of-00007.safetensors b/checkpoint-3052/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-3052/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-3052/model-00005-of-00007.safetensors b/checkpoint-3052/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-3052/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-3052/model-00006-of-00007.safetensors b/checkpoint-3052/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ebfea43f34c388a37f5a06df993b508d9285ffc5 --- /dev/null +++ b/checkpoint-3052/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:789e69d89c25eeaed4359d7a40a2ba05521852b4b3b4356a26fc3acf20b79ea3 +size 4999813120 diff --git a/checkpoint-3052/model-00007-of-00007.safetensors b/checkpoint-3052/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2d512b82a2939eaab24f982bf51f242ccfc8cdc8 --- /dev/null +++ b/checkpoint-3052/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f4746e0b85febfaae1f7e1a398b9f13359b0310020b4f78bcff8811e8ad1bf3 +size 2571158184 diff --git a/checkpoint-3052/model.safetensors.index.json b/checkpoint-3052/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-3052/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-3052/optimizer.pt b/checkpoint-3052/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5e24f1a5cfb2677e7996437e788d988bfae43d7 --- /dev/null +++ b/checkpoint-3052/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32e49659214c52875e76586822a39f1fd4a6e666747bb3e26cbbb7138600f669 +size 15385036334 diff --git a/checkpoint-3052/rng_state.pth b/checkpoint-3052/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-3052/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-3052/scheduler.pt b/checkpoint-3052/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b2454d919340cd4d989697a74a27016c58dc3aa --- /dev/null +++ b/checkpoint-3052/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ed9d7fea0b9f468b8c97fd491e0f5a211b8ff197e5f8111c42fc974ecafed4c +size 1064 diff --git a/checkpoint-3052/trainer_state.json b/checkpoint-3052/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..405d3710b2a09c66aeb5b1f5be8c126f7ed072f0 --- /dev/null +++ b/checkpoint-3052/trainer_state.json @@ -0,0 +1,719 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.021400048381498636, + "eval_steps": 500, + "global_step": 3052, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021736615328520894, + "grad_norm": 5.709163665771484, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.9228, + "step": 31 + }, + { + "epoch": 0.0004347323065704179, + "grad_norm": 3.9758756160736084, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8276, + "step": 62 + }, + { + "epoch": 0.0006520984598556268, + "grad_norm": 3.721677303314209, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7897, + "step": 93 + }, + { + "epoch": 0.0008694646131408358, + "grad_norm": 3.384953498840332, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7641, + "step": 124 + }, + { + "epoch": 0.0010868307664260446, + "grad_norm": 3.19750714302063, + "learning_rate": 5.078636959370905e-06, + "loss": 0.758, + "step": 155 + }, + { + "epoch": 0.0013041969197112536, + "grad_norm": 3.49003267288208, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7644, + "step": 186 + }, + { + "epoch": 0.0015215630729964625, + "grad_norm": 3.323367118835449, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7606, + "step": 217 + }, + { + "epoch": 0.0017389292262816715, + "grad_norm": 3.465822219848633, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7505, + "step": 248 + }, + { + "epoch": 0.0019562953795668804, + "grad_norm": 2.7967450618743896, + "learning_rate": 9.141546526867629e-06, + "loss": 0.7593, + "step": 279 + }, + { + "epoch": 0.002173661532852089, + "grad_norm": 3.5493738651275635, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7514, + "step": 310 + }, + { + "epoch": 0.0023910276861372984, + "grad_norm": 3.514606237411499, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7497, + "step": 341 + }, + { + "epoch": 0.0026083938394225073, + "grad_norm": 3.0246028900146484, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7473, + "step": 372 + }, + { + "epoch": 0.002825759992707716, + "grad_norm": 3.01147723197937, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.739, + "step": 403 + }, + { + "epoch": 0.003043126145992925, + "grad_norm": 3.566333293914795, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7447, + "step": 434 + }, + { + "epoch": 0.003260492299278134, + "grad_norm": 3.8698947429656982, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7447, + "step": 465 + }, + { + "epoch": 0.003477858452563343, + "grad_norm": 2.567028760910034, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7496, + "step": 496 + }, + { + "epoch": 0.003695224605848552, + "grad_norm": 2.8029377460479736, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.7324, + "step": 527 + }, + { + "epoch": 0.003912590759133761, + "grad_norm": 2.862530469894409, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7371, + "step": 558 + }, + { + "epoch": 0.00412995691241897, + "grad_norm": 2.9063901901245117, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.7398, + "step": 589 + }, + { + "epoch": 0.004347323065704178, + "grad_norm": 3.187814235687256, + "learning_rate": 2.031454783748362e-05, + "loss": 0.7362, + "step": 620 + }, + { + "epoch": 0.004564689218989388, + "grad_norm": 2.6158969402313232, + "learning_rate": 2.13302752293578e-05, + "loss": 0.7405, + "step": 651 + }, + { + "epoch": 0.004782055372274597, + "grad_norm": 2.702423334121704, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7313, + "step": 682 + }, + { + "epoch": 0.004999421525559805, + "grad_norm": 2.88313889503479, + "learning_rate": 2.336173001310616e-05, + "loss": 0.7337, + "step": 713 + }, + { + "epoch": 0.0052167876788450146, + "grad_norm": 2.8978841304779053, + "learning_rate": 2.437745740498034e-05, + "loss": 0.73, + "step": 744 + }, + { + "epoch": 0.005434153832130223, + "grad_norm": 2.791414976119995, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.7297, + "step": 775 + }, + { + "epoch": 0.005651519985415432, + "grad_norm": 2.432821035385132, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7326, + "step": 806 + }, + { + "epoch": 0.0058688861387006415, + "grad_norm": 2.430279493331909, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.7333, + "step": 837 + }, + { + "epoch": 0.00608625229198585, + "grad_norm": 2.5513761043548584, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.7296, + "step": 868 + }, + { + "epoch": 0.006303618445271059, + "grad_norm": 2.478562116622925, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.7303, + "step": 899 + }, + { + "epoch": 0.006520984598556268, + "grad_norm": 2.5496723651885986, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.7303, + "step": 930 + }, + { + "epoch": 0.006738350751841477, + "grad_norm": 2.114060640335083, + "learning_rate": 3.148754914809961e-05, + "loss": 0.7184, + "step": 961 + }, + { + "epoch": 0.006955716905126686, + "grad_norm": 2.325977325439453, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.7268, + "step": 992 + }, + { + "epoch": 0.0071730830584118945, + "grad_norm": 2.0549662113189697, + "learning_rate": 3.351900393184797e-05, + "loss": 0.724, + "step": 1023 + }, + { + "epoch": 0.007390449211697104, + "grad_norm": 1.913522481918335, + "learning_rate": 3.453473132372215e-05, + "loss": 0.7206, + "step": 1054 + }, + { + "epoch": 0.007607815364982313, + "grad_norm": 2.0651443004608154, + "learning_rate": 3.555045871559633e-05, + "loss": 0.7239, + "step": 1085 + }, + { + "epoch": 0.007825181518267521, + "grad_norm": 2.2482309341430664, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.728, + "step": 1116 + }, + { + "epoch": 0.00804254767155273, + "grad_norm": 2.349695920944214, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.721, + "step": 1147 + }, + { + "epoch": 0.00825991382483794, + "grad_norm": 2.218843460083008, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.7248, + "step": 1178 + }, + { + "epoch": 0.00847727997812315, + "grad_norm": 2.0839340686798096, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.7233, + "step": 1209 + }, + { + "epoch": 0.008694646131408357, + "grad_norm": 2.0757343769073486, + "learning_rate": 4.062909567496724e-05, + "loss": 0.7166, + "step": 1240 + }, + { + "epoch": 0.008912012284693566, + "grad_norm": 2.052342653274536, + "learning_rate": 4.164482306684142e-05, + "loss": 0.7256, + "step": 1271 + }, + { + "epoch": 0.009129378437978775, + "grad_norm": 2.1202704906463623, + "learning_rate": 4.26605504587156e-05, + "loss": 0.7131, + "step": 1302 + }, + { + "epoch": 0.009346744591263985, + "grad_norm": 2.1945958137512207, + "learning_rate": 4.367627785058978e-05, + "loss": 0.708, + "step": 1333 + }, + { + "epoch": 0.009564110744549194, + "grad_norm": 2.106307029724121, + "learning_rate": 4.469200524246396e-05, + "loss": 0.7139, + "step": 1364 + }, + { + "epoch": 0.009781476897834401, + "grad_norm": 2.3779594898223877, + "learning_rate": 4.570773263433814e-05, + "loss": 0.7205, + "step": 1395 + }, + { + "epoch": 0.00999884305111961, + "grad_norm": 2.001551866531372, + "learning_rate": 4.672346002621232e-05, + "loss": 0.7162, + "step": 1426 + }, + { + "epoch": 0.01021620920440482, + "grad_norm": 2.6994752883911133, + "learning_rate": 4.77391874180865e-05, + "loss": 0.7215, + "step": 1457 + }, + { + "epoch": 0.010433575357690029, + "grad_norm": 2.071122884750366, + "learning_rate": 4.875491480996068e-05, + "loss": 0.7095, + "step": 1488 + }, + { + "epoch": 0.010650941510975238, + "grad_norm": 2.3666610717773438, + "learning_rate": 4.977064220183487e-05, + "loss": 0.7089, + "step": 1519 + }, + { + "epoch": 0.010868307664260446, + "grad_norm": 2.130204677581787, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.7043, + "step": 1550 + }, + { + "epoch": 0.011085673817545655, + "grad_norm": 1.7413716316223145, + "learning_rate": 4.999955597496219e-05, + "loss": 0.7062, + "step": 1581 + }, + { + "epoch": 0.011303039970830864, + "grad_norm": 1.7087843418121338, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.7175, + "step": 1612 + }, + { + "epoch": 0.011520406124116074, + "grad_norm": 2.574871301651001, + "learning_rate": 4.999799067923527e-05, + "loss": 0.7076, + "step": 1643 + }, + { + "epoch": 0.011737772277401283, + "grad_norm": 2.0530433654785156, + "learning_rate": 4.999678487776908e-05, + "loss": 0.708, + "step": 1674 + }, + { + "epoch": 0.011955138430686492, + "grad_norm": 1.9184463024139404, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.7011, + "step": 1705 + }, + { + "epoch": 0.0121725045839717, + "grad_norm": 1.768115520477295, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6995, + "step": 1736 + }, + { + "epoch": 0.012389870737256909, + "grad_norm": 1.6692063808441162, + "learning_rate": 4.999147503179668e-05, + "loss": 0.7019, + "step": 1767 + }, + { + "epoch": 0.012607236890542118, + "grad_norm": 1.803944706916809, + "learning_rate": 4.998914100252672e-05, + "loss": 0.703, + "step": 1798 + }, + { + "epoch": 0.012824603043827328, + "grad_norm": 1.8957322835922241, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6979, + "step": 1829 + }, + { + "epoch": 0.013041969197112537, + "grad_norm": 1.544054627418518, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.7012, + "step": 1860 + }, + { + "epoch": 0.013259335350397744, + "grad_norm": 1.7066351175308228, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6949, + "step": 1891 + }, + { + "epoch": 0.013476701503682954, + "grad_norm": 1.7045214176177979, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.6871, + "step": 1922 + }, + { + "epoch": 0.013694067656968163, + "grad_norm": 2.0414485931396484, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6944, + "step": 1953 + }, + { + "epoch": 0.013911433810253372, + "grad_norm": 1.9210485219955444, + "learning_rate": 4.99692159912661e-05, + "loss": 0.7013, + "step": 1984 + }, + { + "epoch": 0.014128799963538581, + "grad_norm": 1.8733997344970703, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6991, + "step": 2015 + }, + { + "epoch": 0.014346166116823789, + "grad_norm": 1.7118934392929077, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6873, + "step": 2046 + }, + { + "epoch": 0.014563532270108998, + "grad_norm": 1.889208197593689, + "learning_rate": 4.995544899210594e-05, + "loss": 0.692, + "step": 2077 + }, + { + "epoch": 0.014780898423394207, + "grad_norm": 2.0081522464752197, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.6823, + "step": 2108 + }, + { + "epoch": 0.014998264576679417, + "grad_norm": 1.7950328588485718, + "learning_rate": 4.994486281210429e-05, + "loss": 0.686, + "step": 2139 + }, + { + "epoch": 0.015215630729964626, + "grad_norm": 1.758333444595337, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6827, + "step": 2170 + }, + { + "epoch": 0.015432996883249834, + "grad_norm": 1.496063470840454, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6896, + "step": 2201 + }, + { + "epoch": 0.015650363036535043, + "grad_norm": 1.6103190183639526, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6865, + "step": 2232 + }, + { + "epoch": 0.015867729189820254, + "grad_norm": 1.7069604396820068, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6836, + "step": 2263 + }, + { + "epoch": 0.01608509534310546, + "grad_norm": 1.8261148929595947, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.692, + "step": 2294 + }, + { + "epoch": 0.01630246149639067, + "grad_norm": 1.7522642612457275, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6832, + "step": 2325 + }, + { + "epoch": 0.01651982764967588, + "grad_norm": 1.6746042966842651, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6829, + "step": 2356 + }, + { + "epoch": 0.016737193802961087, + "grad_norm": 1.6785473823547363, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6842, + "step": 2387 + }, + { + "epoch": 0.0169545599562463, + "grad_norm": 1.936916470527649, + "learning_rate": 4.988329891293693e-05, + "loss": 0.6798, + "step": 2418 + }, + { + "epoch": 0.017171926109531506, + "grad_norm": 1.5104129314422607, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6743, + "step": 2449 + }, + { + "epoch": 0.017389292262816713, + "grad_norm": 1.3770678043365479, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.6874, + "step": 2480 + }, + { + "epoch": 0.017606658416101924, + "grad_norm": 1.4900861978530884, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.679, + "step": 2511 + }, + { + "epoch": 0.017824024569387132, + "grad_norm": 1.5600172281265259, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6719, + "step": 2542 + }, + { + "epoch": 0.018041390722672343, + "grad_norm": 1.660237431526184, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6773, + "step": 2573 + }, + { + "epoch": 0.01825875687595755, + "grad_norm": 1.4618791341781616, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6767, + "step": 2604 + }, + { + "epoch": 0.018476123029242758, + "grad_norm": 1.5567563772201538, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6706, + "step": 2635 + }, + { + "epoch": 0.01869348918252797, + "grad_norm": 1.4879790544509888, + "learning_rate": 4.980947995086024e-05, + "loss": 0.676, + "step": 2666 + }, + { + "epoch": 0.018910855335813177, + "grad_norm": 1.5527766942977905, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6734, + "step": 2697 + }, + { + "epoch": 0.019128221489098388, + "grad_norm": 1.7922642230987549, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6813, + "step": 2728 + }, + { + "epoch": 0.019345587642383595, + "grad_norm": 1.6453475952148438, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6749, + "step": 2759 + }, + { + "epoch": 0.019562953795668803, + "grad_norm": 1.4713648557662964, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6761, + "step": 2790 + }, + { + "epoch": 0.019780319948954014, + "grad_norm": 1.5513399839401245, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6739, + "step": 2821 + }, + { + "epoch": 0.01999768610223922, + "grad_norm": 1.431796908378601, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6639, + "step": 2852 + }, + { + "epoch": 0.020215052255524432, + "grad_norm": 1.4959752559661865, + "learning_rate": 4.973018858007122e-05, + "loss": 0.662, + "step": 2883 + }, + { + "epoch": 0.02043241840880964, + "grad_norm": 1.4675205945968628, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6659, + "step": 2914 + }, + { + "epoch": 0.020649784562094847, + "grad_norm": 1.7111692428588867, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6732, + "step": 2945 + }, + { + "epoch": 0.020867150715380058, + "grad_norm": 1.542748212814331, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6666, + "step": 2976 + }, + { + "epoch": 0.021084516868665266, + "grad_norm": 1.5013272762298584, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.667, + "step": 3007 + }, + { + "epoch": 0.021301883021950477, + "grad_norm": 1.57589852809906, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6668, + "step": 3038 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.251656078846591e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3052/training_args.bin b/checkpoint-3052/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d8a57652df56990298f489cbbc4d12b0a5d66dac --- /dev/null +++ b/checkpoint-3052/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff6a84b6c2a69c365aedcd04f176d62d0993a049ac50b28b1fc7fee54f4267a8 +size 5304 diff --git a/checkpoint-6104/config.json b/checkpoint-6104/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-6104/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-6104/generation_config.json b/checkpoint-6104/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-6104/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-6104/model-00001-of-00007.safetensors b/checkpoint-6104/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7b984eaa638e9c6c625cb9a886e134a7746a7504 --- /dev/null +++ b/checkpoint-6104/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:073f3079011b3df396486f51e172b2f1dc1f8a54d8ed4bf2838ae13315fe0eb2 +size 4886466168 diff --git a/checkpoint-6104/model-00002-of-00007.safetensors b/checkpoint-6104/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-6104/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-6104/model-00003-of-00007.safetensors b/checkpoint-6104/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-6104/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-6104/model-00004-of-00007.safetensors b/checkpoint-6104/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-6104/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-6104/model-00005-of-00007.safetensors b/checkpoint-6104/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-6104/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-6104/model-00006-of-00007.safetensors b/checkpoint-6104/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fee2867f3b5f1e6d17dbb9bf137ca02f1630cb16 --- /dev/null +++ b/checkpoint-6104/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78d7fb8a377758887f24823600edf0c2647002aecf4f780e0778ae525f044786 +size 4999813120 diff --git a/checkpoint-6104/model-00007-of-00007.safetensors b/checkpoint-6104/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..425f7dfb977ac5b20caa9c71ee2e806b6c69d0ce --- /dev/null +++ b/checkpoint-6104/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33b854e42fe29c464d7aa3c11d1aa7fad45689a8b015796d36278a876e55936e +size 2571158184 diff --git a/checkpoint-6104/model.safetensors.index.json b/checkpoint-6104/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-6104/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-6104/optimizer.pt b/checkpoint-6104/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0f980e02bfcfc01296ed7f375d6e12367cf7adf --- /dev/null +++ b/checkpoint-6104/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a34fceb5044e96d334d4b9cd7d17767d4f44d7c80c9484d97fb9f7e016b02481 +size 15385036334 diff --git a/checkpoint-6104/rng_state.pth b/checkpoint-6104/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-6104/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-6104/scheduler.pt b/checkpoint-6104/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f5e5514ba898102fcdb5731bba1ae1c2957e6e5 --- /dev/null +++ b/checkpoint-6104/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:107e0617754026d870a7da422dabb716a8dc7d3a550066ff507e37f8f0818429 +size 1064 diff --git a/checkpoint-6104/trainer_state.json b/checkpoint-6104/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ad7a8001c567ac8075132614cccd2d3afe24a64d --- /dev/null +++ b/checkpoint-6104/trainer_state.json @@ -0,0 +1,1405 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.04280009676299727, + "eval_steps": 500, + "global_step": 6104, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021736615328520894, + "grad_norm": 5.709163665771484, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.9228, + "step": 31 + }, + { + "epoch": 0.0004347323065704179, + "grad_norm": 3.9758756160736084, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8276, + "step": 62 + }, + { + "epoch": 0.0006520984598556268, + "grad_norm": 3.721677303314209, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7897, + "step": 93 + }, + { + "epoch": 0.0008694646131408358, + "grad_norm": 3.384953498840332, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7641, + "step": 124 + }, + { + "epoch": 0.0010868307664260446, + "grad_norm": 3.19750714302063, + "learning_rate": 5.078636959370905e-06, + "loss": 0.758, + "step": 155 + }, + { + "epoch": 0.0013041969197112536, + "grad_norm": 3.49003267288208, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7644, + "step": 186 + }, + { + "epoch": 0.0015215630729964625, + "grad_norm": 3.323367118835449, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7606, + "step": 217 + }, + { + "epoch": 0.0017389292262816715, + "grad_norm": 3.465822219848633, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7505, + "step": 248 + }, + { + "epoch": 0.0019562953795668804, + "grad_norm": 2.7967450618743896, + "learning_rate": 9.141546526867629e-06, + "loss": 0.7593, + "step": 279 + }, + { + "epoch": 0.002173661532852089, + "grad_norm": 3.5493738651275635, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7514, + "step": 310 + }, + { + "epoch": 0.0023910276861372984, + "grad_norm": 3.514606237411499, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7497, + "step": 341 + }, + { + "epoch": 0.0026083938394225073, + "grad_norm": 3.0246028900146484, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7473, + "step": 372 + }, + { + "epoch": 0.002825759992707716, + "grad_norm": 3.01147723197937, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.739, + "step": 403 + }, + { + "epoch": 0.003043126145992925, + "grad_norm": 3.566333293914795, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7447, + "step": 434 + }, + { + "epoch": 0.003260492299278134, + "grad_norm": 3.8698947429656982, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7447, + "step": 465 + }, + { + "epoch": 0.003477858452563343, + "grad_norm": 2.567028760910034, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7496, + "step": 496 + }, + { + "epoch": 0.003695224605848552, + "grad_norm": 2.8029377460479736, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.7324, + "step": 527 + }, + { + "epoch": 0.003912590759133761, + "grad_norm": 2.862530469894409, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7371, + "step": 558 + }, + { + "epoch": 0.00412995691241897, + "grad_norm": 2.9063901901245117, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.7398, + "step": 589 + }, + { + "epoch": 0.004347323065704178, + "grad_norm": 3.187814235687256, + "learning_rate": 2.031454783748362e-05, + "loss": 0.7362, + "step": 620 + }, + { + "epoch": 0.004564689218989388, + "grad_norm": 2.6158969402313232, + "learning_rate": 2.13302752293578e-05, + "loss": 0.7405, + "step": 651 + }, + { + "epoch": 0.004782055372274597, + "grad_norm": 2.702423334121704, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7313, + "step": 682 + }, + { + "epoch": 0.004999421525559805, + "grad_norm": 2.88313889503479, + "learning_rate": 2.336173001310616e-05, + "loss": 0.7337, + "step": 713 + }, + { + "epoch": 0.0052167876788450146, + "grad_norm": 2.8978841304779053, + "learning_rate": 2.437745740498034e-05, + "loss": 0.73, + "step": 744 + }, + { + "epoch": 0.005434153832130223, + "grad_norm": 2.791414976119995, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.7297, + "step": 775 + }, + { + "epoch": 0.005651519985415432, + "grad_norm": 2.432821035385132, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7326, + "step": 806 + }, + { + "epoch": 0.0058688861387006415, + "grad_norm": 2.430279493331909, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.7333, + "step": 837 + }, + { + "epoch": 0.00608625229198585, + "grad_norm": 2.5513761043548584, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.7296, + "step": 868 + }, + { + "epoch": 0.006303618445271059, + "grad_norm": 2.478562116622925, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.7303, + "step": 899 + }, + { + "epoch": 0.006520984598556268, + "grad_norm": 2.5496723651885986, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.7303, + "step": 930 + }, + { + "epoch": 0.006738350751841477, + "grad_norm": 2.114060640335083, + "learning_rate": 3.148754914809961e-05, + "loss": 0.7184, + "step": 961 + }, + { + "epoch": 0.006955716905126686, + "grad_norm": 2.325977325439453, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.7268, + "step": 992 + }, + { + "epoch": 0.0071730830584118945, + "grad_norm": 2.0549662113189697, + "learning_rate": 3.351900393184797e-05, + "loss": 0.724, + "step": 1023 + }, + { + "epoch": 0.007390449211697104, + "grad_norm": 1.913522481918335, + "learning_rate": 3.453473132372215e-05, + "loss": 0.7206, + "step": 1054 + }, + { + "epoch": 0.007607815364982313, + "grad_norm": 2.0651443004608154, + "learning_rate": 3.555045871559633e-05, + "loss": 0.7239, + "step": 1085 + }, + { + "epoch": 0.007825181518267521, + "grad_norm": 2.2482309341430664, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.728, + "step": 1116 + }, + { + "epoch": 0.00804254767155273, + "grad_norm": 2.349695920944214, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.721, + "step": 1147 + }, + { + "epoch": 0.00825991382483794, + "grad_norm": 2.218843460083008, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.7248, + "step": 1178 + }, + { + "epoch": 0.00847727997812315, + "grad_norm": 2.0839340686798096, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.7233, + "step": 1209 + }, + { + "epoch": 0.008694646131408357, + "grad_norm": 2.0757343769073486, + "learning_rate": 4.062909567496724e-05, + "loss": 0.7166, + "step": 1240 + }, + { + "epoch": 0.008912012284693566, + "grad_norm": 2.052342653274536, + "learning_rate": 4.164482306684142e-05, + "loss": 0.7256, + "step": 1271 + }, + { + "epoch": 0.009129378437978775, + "grad_norm": 2.1202704906463623, + "learning_rate": 4.26605504587156e-05, + "loss": 0.7131, + "step": 1302 + }, + { + "epoch": 0.009346744591263985, + "grad_norm": 2.1945958137512207, + "learning_rate": 4.367627785058978e-05, + "loss": 0.708, + "step": 1333 + }, + { + "epoch": 0.009564110744549194, + "grad_norm": 2.106307029724121, + "learning_rate": 4.469200524246396e-05, + "loss": 0.7139, + "step": 1364 + }, + { + "epoch": 0.009781476897834401, + "grad_norm": 2.3779594898223877, + "learning_rate": 4.570773263433814e-05, + "loss": 0.7205, + "step": 1395 + }, + { + "epoch": 0.00999884305111961, + "grad_norm": 2.001551866531372, + "learning_rate": 4.672346002621232e-05, + "loss": 0.7162, + "step": 1426 + }, + { + "epoch": 0.01021620920440482, + "grad_norm": 2.6994752883911133, + "learning_rate": 4.77391874180865e-05, + "loss": 0.7215, + "step": 1457 + }, + { + "epoch": 0.010433575357690029, + "grad_norm": 2.071122884750366, + "learning_rate": 4.875491480996068e-05, + "loss": 0.7095, + "step": 1488 + }, + { + "epoch": 0.010650941510975238, + "grad_norm": 2.3666610717773438, + "learning_rate": 4.977064220183487e-05, + "loss": 0.7089, + "step": 1519 + }, + { + "epoch": 0.010868307664260446, + "grad_norm": 2.130204677581787, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.7043, + "step": 1550 + }, + { + "epoch": 0.011085673817545655, + "grad_norm": 1.7413716316223145, + "learning_rate": 4.999955597496219e-05, + "loss": 0.7062, + "step": 1581 + }, + { + "epoch": 0.011303039970830864, + "grad_norm": 1.7087843418121338, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.7175, + "step": 1612 + }, + { + "epoch": 0.011520406124116074, + "grad_norm": 2.574871301651001, + "learning_rate": 4.999799067923527e-05, + "loss": 0.7076, + "step": 1643 + }, + { + "epoch": 0.011737772277401283, + "grad_norm": 2.0530433654785156, + "learning_rate": 4.999678487776908e-05, + "loss": 0.708, + "step": 1674 + }, + { + "epoch": 0.011955138430686492, + "grad_norm": 1.9184463024139404, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.7011, + "step": 1705 + }, + { + "epoch": 0.0121725045839717, + "grad_norm": 1.768115520477295, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6995, + "step": 1736 + }, + { + "epoch": 0.012389870737256909, + "grad_norm": 1.6692063808441162, + "learning_rate": 4.999147503179668e-05, + "loss": 0.7019, + "step": 1767 + }, + { + "epoch": 0.012607236890542118, + "grad_norm": 1.803944706916809, + "learning_rate": 4.998914100252672e-05, + "loss": 0.703, + "step": 1798 + }, + { + "epoch": 0.012824603043827328, + "grad_norm": 1.8957322835922241, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6979, + "step": 1829 + }, + { + "epoch": 0.013041969197112537, + "grad_norm": 1.544054627418518, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.7012, + "step": 1860 + }, + { + "epoch": 0.013259335350397744, + "grad_norm": 1.7066351175308228, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6949, + "step": 1891 + }, + { + "epoch": 0.013476701503682954, + "grad_norm": 1.7045214176177979, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.6871, + "step": 1922 + }, + { + "epoch": 0.013694067656968163, + "grad_norm": 2.0414485931396484, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6944, + "step": 1953 + }, + { + "epoch": 0.013911433810253372, + "grad_norm": 1.9210485219955444, + "learning_rate": 4.99692159912661e-05, + "loss": 0.7013, + "step": 1984 + }, + { + "epoch": 0.014128799963538581, + "grad_norm": 1.8733997344970703, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6991, + "step": 2015 + }, + { + "epoch": 0.014346166116823789, + "grad_norm": 1.7118934392929077, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6873, + "step": 2046 + }, + { + "epoch": 0.014563532270108998, + "grad_norm": 1.889208197593689, + "learning_rate": 4.995544899210594e-05, + "loss": 0.692, + "step": 2077 + }, + { + "epoch": 0.014780898423394207, + "grad_norm": 2.0081522464752197, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.6823, + "step": 2108 + }, + { + "epoch": 0.014998264576679417, + "grad_norm": 1.7950328588485718, + "learning_rate": 4.994486281210429e-05, + "loss": 0.686, + "step": 2139 + }, + { + "epoch": 0.015215630729964626, + "grad_norm": 1.758333444595337, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6827, + "step": 2170 + }, + { + "epoch": 0.015432996883249834, + "grad_norm": 1.496063470840454, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6896, + "step": 2201 + }, + { + "epoch": 0.015650363036535043, + "grad_norm": 1.6103190183639526, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6865, + "step": 2232 + }, + { + "epoch": 0.015867729189820254, + "grad_norm": 1.7069604396820068, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6836, + "step": 2263 + }, + { + "epoch": 0.01608509534310546, + "grad_norm": 1.8261148929595947, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.692, + "step": 2294 + }, + { + "epoch": 0.01630246149639067, + "grad_norm": 1.7522642612457275, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6832, + "step": 2325 + }, + { + "epoch": 0.01651982764967588, + "grad_norm": 1.6746042966842651, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6829, + "step": 2356 + }, + { + "epoch": 0.016737193802961087, + "grad_norm": 1.6785473823547363, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6842, + "step": 2387 + }, + { + "epoch": 0.0169545599562463, + "grad_norm": 1.936916470527649, + "learning_rate": 4.988329891293693e-05, + "loss": 0.6798, + "step": 2418 + }, + { + "epoch": 0.017171926109531506, + "grad_norm": 1.5104129314422607, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6743, + "step": 2449 + }, + { + "epoch": 0.017389292262816713, + "grad_norm": 1.3770678043365479, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.6874, + "step": 2480 + }, + { + "epoch": 0.017606658416101924, + "grad_norm": 1.4900861978530884, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.679, + "step": 2511 + }, + { + "epoch": 0.017824024569387132, + "grad_norm": 1.5600172281265259, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6719, + "step": 2542 + }, + { + "epoch": 0.018041390722672343, + "grad_norm": 1.660237431526184, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6773, + "step": 2573 + }, + { + "epoch": 0.01825875687595755, + "grad_norm": 1.4618791341781616, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6767, + "step": 2604 + }, + { + "epoch": 0.018476123029242758, + "grad_norm": 1.5567563772201538, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6706, + "step": 2635 + }, + { + "epoch": 0.01869348918252797, + "grad_norm": 1.4879790544509888, + "learning_rate": 4.980947995086024e-05, + "loss": 0.676, + "step": 2666 + }, + { + "epoch": 0.018910855335813177, + "grad_norm": 1.5527766942977905, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6734, + "step": 2697 + }, + { + "epoch": 0.019128221489098388, + "grad_norm": 1.7922642230987549, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6813, + "step": 2728 + }, + { + "epoch": 0.019345587642383595, + "grad_norm": 1.6453475952148438, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6749, + "step": 2759 + }, + { + "epoch": 0.019562953795668803, + "grad_norm": 1.4713648557662964, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6761, + "step": 2790 + }, + { + "epoch": 0.019780319948954014, + "grad_norm": 1.5513399839401245, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6739, + "step": 2821 + }, + { + "epoch": 0.01999768610223922, + "grad_norm": 1.431796908378601, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6639, + "step": 2852 + }, + { + "epoch": 0.020215052255524432, + "grad_norm": 1.4959752559661865, + "learning_rate": 4.973018858007122e-05, + "loss": 0.662, + "step": 2883 + }, + { + "epoch": 0.02043241840880964, + "grad_norm": 1.4675205945968628, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6659, + "step": 2914 + }, + { + "epoch": 0.020649784562094847, + "grad_norm": 1.7111692428588867, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6732, + "step": 2945 + }, + { + "epoch": 0.020867150715380058, + "grad_norm": 1.542748212814331, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6666, + "step": 2976 + }, + { + "epoch": 0.021084516868665266, + "grad_norm": 1.5013272762298584, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.667, + "step": 3007 + }, + { + "epoch": 0.021301883021950477, + "grad_norm": 1.57589852809906, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6668, + "step": 3038 + }, + { + "epoch": 0.021519249175235684, + "grad_norm": 1.612727403640747, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6653, + "step": 3069 + }, + { + "epoch": 0.021736615328520892, + "grad_norm": 1.5110771656036377, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6694, + "step": 3100 + }, + { + "epoch": 0.021953981481806103, + "grad_norm": 1.5836044549942017, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6634, + "step": 3131 + }, + { + "epoch": 0.02217134763509131, + "grad_norm": 1.5767654180526733, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6633, + "step": 3162 + }, + { + "epoch": 0.02238871378837652, + "grad_norm": 1.6058984994888306, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6646, + "step": 3193 + }, + { + "epoch": 0.02260607994166173, + "grad_norm": 1.4564005136489868, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.663, + "step": 3224 + }, + { + "epoch": 0.022823446094946936, + "grad_norm": 1.5274450778961182, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6554, + "step": 3255 + }, + { + "epoch": 0.023040812248232147, + "grad_norm": 1.524122953414917, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6666, + "step": 3286 + }, + { + "epoch": 0.023258178401517355, + "grad_norm": 1.5017430782318115, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6661, + "step": 3317 + }, + { + "epoch": 0.023475544554802566, + "grad_norm": 1.4247208833694458, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6563, + "step": 3348 + }, + { + "epoch": 0.023692910708087773, + "grad_norm": 1.3451093435287476, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6555, + "step": 3379 + }, + { + "epoch": 0.023910276861372984, + "grad_norm": 1.418442726135254, + "learning_rate": 4.948079823064559e-05, + "loss": 0.6587, + "step": 3410 + }, + { + "epoch": 0.024127643014658192, + "grad_norm": 1.5460575819015503, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6602, + "step": 3441 + }, + { + "epoch": 0.0243450091679434, + "grad_norm": 1.4053966999053955, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6578, + "step": 3472 + }, + { + "epoch": 0.02456237532122861, + "grad_norm": 1.420806646347046, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6544, + "step": 3503 + }, + { + "epoch": 0.024779741474513818, + "grad_norm": 1.3220508098602295, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6642, + "step": 3534 + }, + { + "epoch": 0.02499710762779903, + "grad_norm": 1.3819468021392822, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6534, + "step": 3565 + }, + { + "epoch": 0.025214473781084237, + "grad_norm": 1.4046236276626587, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6547, + "step": 3596 + }, + { + "epoch": 0.025431839934369444, + "grad_norm": 1.965153455734253, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6582, + "step": 3627 + }, + { + "epoch": 0.025649206087654655, + "grad_norm": 1.6758291721343994, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6536, + "step": 3658 + }, + { + "epoch": 0.025866572240939863, + "grad_norm": 1.4467246532440186, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6564, + "step": 3689 + }, + { + "epoch": 0.026083938394225074, + "grad_norm": 1.5305490493774414, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6626, + "step": 3720 + }, + { + "epoch": 0.02630130454751028, + "grad_norm": 1.3826133012771606, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6624, + "step": 3751 + }, + { + "epoch": 0.02651867070079549, + "grad_norm": 1.4920278787612915, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6417, + "step": 3782 + }, + { + "epoch": 0.0267360368540807, + "grad_norm": 14.533783912658691, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.647, + "step": 3813 + }, + { + "epoch": 0.026953403007365907, + "grad_norm": 1.4659409523010254, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6514, + "step": 3844 + }, + { + "epoch": 0.02717076916065112, + "grad_norm": 1.4820138216018677, + "learning_rate": 4.919443027766935e-05, + "loss": 0.6477, + "step": 3875 + }, + { + "epoch": 0.027388135313936326, + "grad_norm": 1.2543880939483643, + "learning_rate": 4.91731463569418e-05, + "loss": 0.6505, + "step": 3906 + }, + { + "epoch": 0.027605501467221533, + "grad_norm": 1.640926718711853, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6488, + "step": 3937 + }, + { + "epoch": 0.027822867620506744, + "grad_norm": 1.3990615606307983, + "learning_rate": 4.912976038673786e-05, + "loss": 0.658, + "step": 3968 + }, + { + "epoch": 0.028040233773791952, + "grad_norm": 1.2064067125320435, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6487, + "step": 3999 + }, + { + "epoch": 0.028257599927077163, + "grad_norm": 1.5103769302368164, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6569, + "step": 4030 + }, + { + "epoch": 0.02847496608036237, + "grad_norm": 1.5314087867736816, + "learning_rate": 4.906263980464644e-05, + "loss": 0.6638, + "step": 4061 + }, + { + "epoch": 0.028692332233647578, + "grad_norm": 1.4619168043136597, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6505, + "step": 4092 + }, + { + "epoch": 0.02890969838693279, + "grad_norm": 1.3979246616363525, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6505, + "step": 4123 + }, + { + "epoch": 0.029127064540217996, + "grad_norm": 1.3866580724716187, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6461, + "step": 4154 + }, + { + "epoch": 0.029344430693503207, + "grad_norm": 1.4662801027297974, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6448, + "step": 4185 + }, + { + "epoch": 0.029561796846788415, + "grad_norm": 1.3171806335449219, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6525, + "step": 4216 + }, + { + "epoch": 0.029779163000073623, + "grad_norm": 1.3289718627929688, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6445, + "step": 4247 + }, + { + "epoch": 0.029996529153358834, + "grad_norm": 1.3527250289916992, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6449, + "step": 4278 + }, + { + "epoch": 0.03021389530664404, + "grad_norm": 1.314674735069275, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6559, + "step": 4309 + }, + { + "epoch": 0.030431261459929252, + "grad_norm": 1.2237507104873657, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6468, + "step": 4340 + }, + { + "epoch": 0.03064862761321446, + "grad_norm": 1.423965573310852, + "learning_rate": 4.882129447892753e-05, + "loss": 0.6465, + "step": 4371 + }, + { + "epoch": 0.030865993766499667, + "grad_norm": 1.271371603012085, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6471, + "step": 4402 + }, + { + "epoch": 0.031083359919784878, + "grad_norm": 1.5492364168167114, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6444, + "step": 4433 + }, + { + "epoch": 0.031300726073070086, + "grad_norm": 1.5139328241348267, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6512, + "step": 4464 + }, + { + "epoch": 0.03151809222635529, + "grad_norm": 1.4510358572006226, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6485, + "step": 4495 + }, + { + "epoch": 0.03173545837964051, + "grad_norm": 1.3228867053985596, + "learning_rate": 4.869052379269719e-05, + "loss": 0.6416, + "step": 4526 + }, + { + "epoch": 0.031952824532925715, + "grad_norm": 1.2731959819793701, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6477, + "step": 4557 + }, + { + "epoch": 0.03217019068621092, + "grad_norm": 1.2540090084075928, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6412, + "step": 4588 + }, + { + "epoch": 0.03238755683949613, + "grad_norm": 1.662154197692871, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6464, + "step": 4619 + }, + { + "epoch": 0.03260492299278134, + "grad_norm": 1.5419702529907227, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6421, + "step": 4650 + }, + { + "epoch": 0.03282228914606655, + "grad_norm": 1.6409112215042114, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6468, + "step": 4681 + }, + { + "epoch": 0.03303965529935176, + "grad_norm": 1.1416597366333008, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6396, + "step": 4712 + }, + { + "epoch": 0.03325702145263697, + "grad_norm": 1.215846061706543, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6517, + "step": 4743 + }, + { + "epoch": 0.033474387605922175, + "grad_norm": 1.3075084686279297, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6389, + "step": 4774 + }, + { + "epoch": 0.03369175375920738, + "grad_norm": 1.6089972257614136, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6371, + "step": 4805 + }, + { + "epoch": 0.0339091199124926, + "grad_norm": 1.3927685022354126, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6355, + "step": 4836 + }, + { + "epoch": 0.034126486065777804, + "grad_norm": 1.197952389717102, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6415, + "step": 4867 + }, + { + "epoch": 0.03434385221906301, + "grad_norm": 1.2738877534866333, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6442, + "step": 4898 + }, + { + "epoch": 0.03456121837234822, + "grad_norm": 1.5164271593093872, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6406, + "step": 4929 + }, + { + "epoch": 0.03477858452563343, + "grad_norm": 1.242473840713501, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6441, + "step": 4960 + }, + { + "epoch": 0.03499595067891864, + "grad_norm": 1.2919869422912598, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6487, + "step": 4991 + }, + { + "epoch": 0.03521331683220385, + "grad_norm": 2.6493895053863525, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6398, + "step": 5022 + }, + { + "epoch": 0.035430682985489056, + "grad_norm": 1.2134305238723755, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6398, + "step": 5053 + }, + { + "epoch": 0.035648049138774264, + "grad_norm": 1.1468703746795654, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.633, + "step": 5084 + }, + { + "epoch": 0.03586541529205947, + "grad_norm": 1.3923726081848145, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6295, + "step": 5115 + }, + { + "epoch": 0.036082781445344686, + "grad_norm": 1.5531644821166992, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6345, + "step": 5146 + }, + { + "epoch": 0.036300147598629894, + "grad_norm": 1.201889991760254, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6414, + "step": 5177 + }, + { + "epoch": 0.0365175137519151, + "grad_norm": 1.4365577697753906, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6364, + "step": 5208 + }, + { + "epoch": 0.03673487990520031, + "grad_norm": 1.210980772972107, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6353, + "step": 5239 + }, + { + "epoch": 0.036952246058485516, + "grad_norm": 1.379381775856018, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6293, + "step": 5270 + }, + { + "epoch": 0.03716961221177073, + "grad_norm": 1.2263178825378418, + "learning_rate": 4.793722210363262e-05, + "loss": 0.629, + "step": 5301 + }, + { + "epoch": 0.03738697836505594, + "grad_norm": 1.2448405027389526, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6474, + "step": 5332 + }, + { + "epoch": 0.037604344518341146, + "grad_norm": 1.257132887840271, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6426, + "step": 5363 + }, + { + "epoch": 0.03782171067162635, + "grad_norm": 1.3763643503189087, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6396, + "step": 5394 + }, + { + "epoch": 0.03803907682491156, + "grad_norm": 1.3164068460464478, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6343, + "step": 5425 + }, + { + "epoch": 0.038256442978196775, + "grad_norm": 1.2034872770309448, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.629, + "step": 5456 + }, + { + "epoch": 0.03847380913148198, + "grad_norm": 1.3156630992889404, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6403, + "step": 5487 + }, + { + "epoch": 0.03869117528476719, + "grad_norm": 1.0719150304794312, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6318, + "step": 5518 + }, + { + "epoch": 0.0389085414380524, + "grad_norm": 1.3054882287979126, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6419, + "step": 5549 + }, + { + "epoch": 0.039125907591337605, + "grad_norm": 1.2081729173660278, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6461, + "step": 5580 + }, + { + "epoch": 0.03934327374462282, + "grad_norm": 1.1728904247283936, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6473, + "step": 5611 + }, + { + "epoch": 0.03956063989790803, + "grad_norm": 1.2552399635314941, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.6373, + "step": 5642 + }, + { + "epoch": 0.039778006051193235, + "grad_norm": 1.299212098121643, + "learning_rate": 4.751783684659e-05, + "loss": 0.6234, + "step": 5673 + }, + { + "epoch": 0.03999537220447844, + "grad_norm": 2.0746827125549316, + "learning_rate": 4.748122674890348e-05, + "loss": 0.6264, + "step": 5704 + }, + { + "epoch": 0.04021273835776365, + "grad_norm": 1.3712407350540161, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6252, + "step": 5735 + }, + { + "epoch": 0.040430104511048864, + "grad_norm": 1.2094186544418335, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6265, + "step": 5766 + }, + { + "epoch": 0.04064747066433407, + "grad_norm": 1.2487757205963135, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6305, + "step": 5797 + }, + { + "epoch": 0.04086483681761928, + "grad_norm": 1.5805151462554932, + "learning_rate": 4.733225355658999e-05, + "loss": 0.631, + "step": 5828 + }, + { + "epoch": 0.04108220297090449, + "grad_norm": 1.2615118026733398, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.636, + "step": 5859 + }, + { + "epoch": 0.041299569124189695, + "grad_norm": 1.6970707178115845, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6271, + "step": 5890 + }, + { + "epoch": 0.04151693527747491, + "grad_norm": 1.4279624223709106, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6267, + "step": 5921 + }, + { + "epoch": 0.041734301430760117, + "grad_norm": 1.3471580743789673, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6289, + "step": 5952 + }, + { + "epoch": 0.041951667584045324, + "grad_norm": 1.5088621377944946, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6263, + "step": 5983 + }, + { + "epoch": 0.04216903373733053, + "grad_norm": 1.3061436414718628, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6266, + "step": 6014 + }, + { + "epoch": 0.04238639989061574, + "grad_norm": 1.1800014972686768, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6259, + "step": 6045 + }, + { + "epoch": 0.042603766043900954, + "grad_norm": 1.195177435874939, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6251, + "step": 6076 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.503312157693182e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6104/training_args.bin b/checkpoint-6104/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d8a57652df56990298f489cbbc4d12b0a5d66dac --- /dev/null +++ b/checkpoint-6104/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff6a84b6c2a69c365aedcd04f176d62d0993a049ac50b28b1fc7fee54f4267a8 +size 5304 diff --git a/checkpoint-9156/config.json b/checkpoint-9156/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-9156/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-9156/generation_config.json b/checkpoint-9156/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-9156/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-9156/model-00001-of-00007.safetensors b/checkpoint-9156/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..943ea24d6cfb1a0da4e85ba4036d1be03ef5ee6f --- /dev/null +++ b/checkpoint-9156/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:067f7db1cf974220271e734e538227c098d66c84d2ba0c39ec77feaa16f29c1c +size 4886466168 diff --git a/checkpoint-9156/model-00002-of-00007.safetensors b/checkpoint-9156/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-9156/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-9156/model-00003-of-00007.safetensors b/checkpoint-9156/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-9156/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-9156/model-00004-of-00007.safetensors b/checkpoint-9156/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-9156/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-9156/model-00005-of-00007.safetensors b/checkpoint-9156/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-9156/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-9156/model-00006-of-00007.safetensors b/checkpoint-9156/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..812517f8abbe5eea9ee7fb82517d03ff5c7823e1 --- /dev/null +++ b/checkpoint-9156/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a7154f7d3e47fd5447182a7fa43710555d5d61c03f4ca2f72e1ee3906ccfa01 +size 4999813120 diff --git a/checkpoint-9156/model-00007-of-00007.safetensors b/checkpoint-9156/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e8b2124404c1ad854fcbb61e8b077f4f266122ec --- /dev/null +++ b/checkpoint-9156/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dc5a22a5170ef2ac97163954ad40859a76d51b77e089a64053cdae93d7929eb +size 2571158184 diff --git a/checkpoint-9156/model.safetensors.index.json b/checkpoint-9156/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-9156/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-9156/optimizer.pt b/checkpoint-9156/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d33900218ef7ea90bc85f7ff391ec4ceecfd716 --- /dev/null +++ b/checkpoint-9156/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33552352fc7bfd138ed5bc0de5123fbb5326d46c61c39124531fb3de0d94c36e +size 15385036334 diff --git a/checkpoint-9156/rng_state.pth b/checkpoint-9156/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-9156/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-9156/scheduler.pt b/checkpoint-9156/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5456a295e7e9e24785bebf5e96ccb62dbbac4f62 --- /dev/null +++ b/checkpoint-9156/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c3f410c61b11096714461ebc2a4aa1b4573d0d0c3eb997bda14fafb34cdc922 +size 1064 diff --git a/checkpoint-9156/trainer_state.json b/checkpoint-9156/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..70afa8efc0b590f947a0aebef729ec0e16b3c7b1 --- /dev/null +++ b/checkpoint-9156/trainer_state.json @@ -0,0 +1,2098 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.0642001451444959, + "eval_steps": 500, + "global_step": 9156, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021736615328520894, + "grad_norm": 5.709163665771484, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.9228, + "step": 31 + }, + { + "epoch": 0.0004347323065704179, + "grad_norm": 3.9758756160736084, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8276, + "step": 62 + }, + { + "epoch": 0.0006520984598556268, + "grad_norm": 3.721677303314209, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.7897, + "step": 93 + }, + { + "epoch": 0.0008694646131408358, + "grad_norm": 3.384953498840332, + "learning_rate": 4.062909567496723e-06, + "loss": 0.7641, + "step": 124 + }, + { + "epoch": 0.0010868307664260446, + "grad_norm": 3.19750714302063, + "learning_rate": 5.078636959370905e-06, + "loss": 0.758, + "step": 155 + }, + { + "epoch": 0.0013041969197112536, + "grad_norm": 3.49003267288208, + "learning_rate": 6.094364351245085e-06, + "loss": 0.7644, + "step": 186 + }, + { + "epoch": 0.0015215630729964625, + "grad_norm": 3.323367118835449, + "learning_rate": 7.110091743119267e-06, + "loss": 0.7606, + "step": 217 + }, + { + "epoch": 0.0017389292262816715, + "grad_norm": 3.465822219848633, + "learning_rate": 8.125819134993446e-06, + "loss": 0.7505, + "step": 248 + }, + { + "epoch": 0.0019562953795668804, + "grad_norm": 2.7967450618743896, + "learning_rate": 9.141546526867629e-06, + "loss": 0.7593, + "step": 279 + }, + { + "epoch": 0.002173661532852089, + "grad_norm": 3.5493738651275635, + "learning_rate": 1.015727391874181e-05, + "loss": 0.7514, + "step": 310 + }, + { + "epoch": 0.0023910276861372984, + "grad_norm": 3.514606237411499, + "learning_rate": 1.117300131061599e-05, + "loss": 0.7497, + "step": 341 + }, + { + "epoch": 0.0026083938394225073, + "grad_norm": 3.0246028900146484, + "learning_rate": 1.218872870249017e-05, + "loss": 0.7473, + "step": 372 + }, + { + "epoch": 0.002825759992707716, + "grad_norm": 3.01147723197937, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.739, + "step": 403 + }, + { + "epoch": 0.003043126145992925, + "grad_norm": 3.566333293914795, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.7447, + "step": 434 + }, + { + "epoch": 0.003260492299278134, + "grad_norm": 3.8698947429656982, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.7447, + "step": 465 + }, + { + "epoch": 0.003477858452563343, + "grad_norm": 2.567028760910034, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.7496, + "step": 496 + }, + { + "epoch": 0.003695224605848552, + "grad_norm": 2.8029377460479736, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.7324, + "step": 527 + }, + { + "epoch": 0.003912590759133761, + "grad_norm": 2.862530469894409, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.7371, + "step": 558 + }, + { + "epoch": 0.00412995691241897, + "grad_norm": 2.9063901901245117, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.7398, + "step": 589 + }, + { + "epoch": 0.004347323065704178, + "grad_norm": 3.187814235687256, + "learning_rate": 2.031454783748362e-05, + "loss": 0.7362, + "step": 620 + }, + { + "epoch": 0.004564689218989388, + "grad_norm": 2.6158969402313232, + "learning_rate": 2.13302752293578e-05, + "loss": 0.7405, + "step": 651 + }, + { + "epoch": 0.004782055372274597, + "grad_norm": 2.702423334121704, + "learning_rate": 2.234600262123198e-05, + "loss": 0.7313, + "step": 682 + }, + { + "epoch": 0.004999421525559805, + "grad_norm": 2.88313889503479, + "learning_rate": 2.336173001310616e-05, + "loss": 0.7337, + "step": 713 + }, + { + "epoch": 0.0052167876788450146, + "grad_norm": 2.8978841304779053, + "learning_rate": 2.437745740498034e-05, + "loss": 0.73, + "step": 744 + }, + { + "epoch": 0.005434153832130223, + "grad_norm": 2.791414976119995, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.7297, + "step": 775 + }, + { + "epoch": 0.005651519985415432, + "grad_norm": 2.432821035385132, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.7326, + "step": 806 + }, + { + "epoch": 0.0058688861387006415, + "grad_norm": 2.430279493331909, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.7333, + "step": 837 + }, + { + "epoch": 0.00608625229198585, + "grad_norm": 2.5513761043548584, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.7296, + "step": 868 + }, + { + "epoch": 0.006303618445271059, + "grad_norm": 2.478562116622925, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.7303, + "step": 899 + }, + { + "epoch": 0.006520984598556268, + "grad_norm": 2.5496723651885986, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.7303, + "step": 930 + }, + { + "epoch": 0.006738350751841477, + "grad_norm": 2.114060640335083, + "learning_rate": 3.148754914809961e-05, + "loss": 0.7184, + "step": 961 + }, + { + "epoch": 0.006955716905126686, + "grad_norm": 2.325977325439453, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.7268, + "step": 992 + }, + { + "epoch": 0.0071730830584118945, + "grad_norm": 2.0549662113189697, + "learning_rate": 3.351900393184797e-05, + "loss": 0.724, + "step": 1023 + }, + { + "epoch": 0.007390449211697104, + "grad_norm": 1.913522481918335, + "learning_rate": 3.453473132372215e-05, + "loss": 0.7206, + "step": 1054 + }, + { + "epoch": 0.007607815364982313, + "grad_norm": 2.0651443004608154, + "learning_rate": 3.555045871559633e-05, + "loss": 0.7239, + "step": 1085 + }, + { + "epoch": 0.007825181518267521, + "grad_norm": 2.2482309341430664, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.728, + "step": 1116 + }, + { + "epoch": 0.00804254767155273, + "grad_norm": 2.349695920944214, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.721, + "step": 1147 + }, + { + "epoch": 0.00825991382483794, + "grad_norm": 2.218843460083008, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.7248, + "step": 1178 + }, + { + "epoch": 0.00847727997812315, + "grad_norm": 2.0839340686798096, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.7233, + "step": 1209 + }, + { + "epoch": 0.008694646131408357, + "grad_norm": 2.0757343769073486, + "learning_rate": 4.062909567496724e-05, + "loss": 0.7166, + "step": 1240 + }, + { + "epoch": 0.008912012284693566, + "grad_norm": 2.052342653274536, + "learning_rate": 4.164482306684142e-05, + "loss": 0.7256, + "step": 1271 + }, + { + "epoch": 0.009129378437978775, + "grad_norm": 2.1202704906463623, + "learning_rate": 4.26605504587156e-05, + "loss": 0.7131, + "step": 1302 + }, + { + "epoch": 0.009346744591263985, + "grad_norm": 2.1945958137512207, + "learning_rate": 4.367627785058978e-05, + "loss": 0.708, + "step": 1333 + }, + { + "epoch": 0.009564110744549194, + "grad_norm": 2.106307029724121, + "learning_rate": 4.469200524246396e-05, + "loss": 0.7139, + "step": 1364 + }, + { + "epoch": 0.009781476897834401, + "grad_norm": 2.3779594898223877, + "learning_rate": 4.570773263433814e-05, + "loss": 0.7205, + "step": 1395 + }, + { + "epoch": 0.00999884305111961, + "grad_norm": 2.001551866531372, + "learning_rate": 4.672346002621232e-05, + "loss": 0.7162, + "step": 1426 + }, + { + "epoch": 0.01021620920440482, + "grad_norm": 2.6994752883911133, + "learning_rate": 4.77391874180865e-05, + "loss": 0.7215, + "step": 1457 + }, + { + "epoch": 0.010433575357690029, + "grad_norm": 2.071122884750366, + "learning_rate": 4.875491480996068e-05, + "loss": 0.7095, + "step": 1488 + }, + { + "epoch": 0.010650941510975238, + "grad_norm": 2.3666610717773438, + "learning_rate": 4.977064220183487e-05, + "loss": 0.7089, + "step": 1519 + }, + { + "epoch": 0.010868307664260446, + "grad_norm": 2.130204677581787, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.7043, + "step": 1550 + }, + { + "epoch": 0.011085673817545655, + "grad_norm": 1.7413716316223145, + "learning_rate": 4.999955597496219e-05, + "loss": 0.7062, + "step": 1581 + }, + { + "epoch": 0.011303039970830864, + "grad_norm": 1.7087843418121338, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.7175, + "step": 1612 + }, + { + "epoch": 0.011520406124116074, + "grad_norm": 2.574871301651001, + "learning_rate": 4.999799067923527e-05, + "loss": 0.7076, + "step": 1643 + }, + { + "epoch": 0.011737772277401283, + "grad_norm": 2.0530433654785156, + "learning_rate": 4.999678487776908e-05, + "loss": 0.708, + "step": 1674 + }, + { + "epoch": 0.011955138430686492, + "grad_norm": 1.9184463024139404, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.7011, + "step": 1705 + }, + { + "epoch": 0.0121725045839717, + "grad_norm": 1.768115520477295, + "learning_rate": 4.999352703566763e-05, + "loss": 0.6995, + "step": 1736 + }, + { + "epoch": 0.012389870737256909, + "grad_norm": 1.6692063808441162, + "learning_rate": 4.999147503179668e-05, + "loss": 0.7019, + "step": 1767 + }, + { + "epoch": 0.012607236890542118, + "grad_norm": 1.803944706916809, + "learning_rate": 4.998914100252672e-05, + "loss": 0.703, + "step": 1798 + }, + { + "epoch": 0.012824603043827328, + "grad_norm": 1.8957322835922241, + "learning_rate": 4.998652497419696e-05, + "loss": 0.6979, + "step": 1829 + }, + { + "epoch": 0.013041969197112537, + "grad_norm": 1.544054627418518, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.7012, + "step": 1860 + }, + { + "epoch": 0.013259335350397744, + "grad_norm": 1.7066351175308228, + "learning_rate": 4.998044704162613e-05, + "loss": 0.6949, + "step": 1891 + }, + { + "epoch": 0.013476701503682954, + "grad_norm": 1.7045214176177979, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.6871, + "step": 1922 + }, + { + "epoch": 0.013694067656968163, + "grad_norm": 2.0414485931396484, + "learning_rate": 4.997324150843799e-05, + "loss": 0.6944, + "step": 1953 + }, + { + "epoch": 0.013911433810253372, + "grad_norm": 1.9210485219955444, + "learning_rate": 4.99692159912661e-05, + "loss": 0.7013, + "step": 1984 + }, + { + "epoch": 0.014128799963538581, + "grad_norm": 1.8733997344970703, + "learning_rate": 4.996490869988546e-05, + "loss": 0.6991, + "step": 2015 + }, + { + "epoch": 0.014346166116823789, + "grad_norm": 1.7118934392929077, + "learning_rate": 4.996031968290326e-05, + "loss": 0.6873, + "step": 2046 + }, + { + "epoch": 0.014563532270108998, + "grad_norm": 1.889208197593689, + "learning_rate": 4.995544899210594e-05, + "loss": 0.692, + "step": 2077 + }, + { + "epoch": 0.014780898423394207, + "grad_norm": 2.0081522464752197, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.6823, + "step": 2108 + }, + { + "epoch": 0.014998264576679417, + "grad_norm": 1.7950328588485718, + "learning_rate": 4.994486281210429e-05, + "loss": 0.686, + "step": 2139 + }, + { + "epoch": 0.015215630729964626, + "grad_norm": 1.758333444595337, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.6827, + "step": 2170 + }, + { + "epoch": 0.015432996883249834, + "grad_norm": 1.496063470840454, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.6896, + "step": 2201 + }, + { + "epoch": 0.015650363036535043, + "grad_norm": 1.6103190183639526, + "learning_rate": 4.992687246588743e-05, + "loss": 0.6865, + "step": 2232 + }, + { + "epoch": 0.015867729189820254, + "grad_norm": 1.7069604396820068, + "learning_rate": 4.992031299767347e-05, + "loss": 0.6836, + "step": 2263 + }, + { + "epoch": 0.01608509534310546, + "grad_norm": 1.8261148929595947, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.692, + "step": 2294 + }, + { + "epoch": 0.01630246149639067, + "grad_norm": 1.7522642612457275, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.6832, + "step": 2325 + }, + { + "epoch": 0.01651982764967588, + "grad_norm": 1.6746042966842651, + "learning_rate": 4.989894757091861e-05, + "loss": 0.6829, + "step": 2356 + }, + { + "epoch": 0.016737193802961087, + "grad_norm": 1.6785473823547363, + "learning_rate": 4.989126368919158e-05, + "loss": 0.6842, + "step": 2387 + }, + { + "epoch": 0.0169545599562463, + "grad_norm": 1.936916470527649, + "learning_rate": 4.988329891293693e-05, + "loss": 0.6798, + "step": 2418 + }, + { + "epoch": 0.017171926109531506, + "grad_norm": 1.5104129314422607, + "learning_rate": 4.987505333203608e-05, + "loss": 0.6743, + "step": 2449 + }, + { + "epoch": 0.017389292262816713, + "grad_norm": 1.3770678043365479, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.6874, + "step": 2480 + }, + { + "epoch": 0.017606658416101924, + "grad_norm": 1.4900861978530884, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.679, + "step": 2511 + }, + { + "epoch": 0.017824024569387132, + "grad_norm": 1.5600172281265259, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.6719, + "step": 2542 + }, + { + "epoch": 0.018041390722672343, + "grad_norm": 1.660237431526184, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.6773, + "step": 2573 + }, + { + "epoch": 0.01825875687595755, + "grad_norm": 1.4618791341781616, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.6767, + "step": 2604 + }, + { + "epoch": 0.018476123029242758, + "grad_norm": 1.5567563772201538, + "learning_rate": 4.981968838155888e-05, + "loss": 0.6706, + "step": 2635 + }, + { + "epoch": 0.01869348918252797, + "grad_norm": 1.4879790544509888, + "learning_rate": 4.980947995086024e-05, + "loss": 0.676, + "step": 2666 + }, + { + "epoch": 0.018910855335813177, + "grad_norm": 1.5527766942977905, + "learning_rate": 4.979899154855234e-05, + "loss": 0.6734, + "step": 2697 + }, + { + "epoch": 0.019128221489098388, + "grad_norm": 1.7922642230987549, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.6813, + "step": 2728 + }, + { + "epoch": 0.019345587642383595, + "grad_norm": 1.6453475952148438, + "learning_rate": 4.977717530570768e-05, + "loss": 0.6749, + "step": 2759 + }, + { + "epoch": 0.019562953795668803, + "grad_norm": 1.4713648557662964, + "learning_rate": 4.976584771136425e-05, + "loss": 0.6761, + "step": 2790 + }, + { + "epoch": 0.019780319948954014, + "grad_norm": 1.5513399839401245, + "learning_rate": 4.975424063779547e-05, + "loss": 0.6739, + "step": 2821 + }, + { + "epoch": 0.01999768610223922, + "grad_norm": 1.431796908378601, + "learning_rate": 4.974235421598557e-05, + "loss": 0.6639, + "step": 2852 + }, + { + "epoch": 0.020215052255524432, + "grad_norm": 1.4959752559661865, + "learning_rate": 4.973018858007122e-05, + "loss": 0.662, + "step": 2883 + }, + { + "epoch": 0.02043241840880964, + "grad_norm": 1.4675205945968628, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.6659, + "step": 2914 + }, + { + "epoch": 0.020649784562094847, + "grad_norm": 1.7111692428588867, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.6732, + "step": 2945 + }, + { + "epoch": 0.020867150715380058, + "grad_norm": 1.542748212814331, + "learning_rate": 4.969201777632205e-05, + "loss": 0.6666, + "step": 2976 + }, + { + "epoch": 0.021084516868665266, + "grad_norm": 1.5013272762298584, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.667, + "step": 3007 + }, + { + "epoch": 0.021301883021950477, + "grad_norm": 1.57589852809906, + "learning_rate": 4.966517710419033e-05, + "loss": 0.6668, + "step": 3038 + }, + { + "epoch": 0.021519249175235684, + "grad_norm": 1.612727403640747, + "learning_rate": 4.965133917685858e-05, + "loss": 0.6653, + "step": 3069 + }, + { + "epoch": 0.021736615328520892, + "grad_norm": 1.5110771656036377, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.6694, + "step": 3100 + }, + { + "epoch": 0.021953981481806103, + "grad_norm": 1.5836044549942017, + "learning_rate": 4.962282892045718e-05, + "loss": 0.6634, + "step": 3131 + }, + { + "epoch": 0.02217134763509131, + "grad_norm": 1.5767654180526733, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.6633, + "step": 3162 + }, + { + "epoch": 0.02238871378837652, + "grad_norm": 1.6058984994888306, + "learning_rate": 4.959320720608049e-05, + "loss": 0.6646, + "step": 3193 + }, + { + "epoch": 0.02260607994166173, + "grad_norm": 1.4564005136489868, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.663, + "step": 3224 + }, + { + "epoch": 0.022823446094946936, + "grad_norm": 1.5274450778961182, + "learning_rate": 4.956247537083282e-05, + "loss": 0.6554, + "step": 3255 + }, + { + "epoch": 0.023040812248232147, + "grad_norm": 1.524122953414917, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.6666, + "step": 3286 + }, + { + "epoch": 0.023258178401517355, + "grad_norm": 1.5017430782318115, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.6661, + "step": 3317 + }, + { + "epoch": 0.023475544554802566, + "grad_norm": 1.4247208833694458, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.6563, + "step": 3348 + }, + { + "epoch": 0.023692910708087773, + "grad_norm": 1.3451093435287476, + "learning_rate": 4.949768693662973e-05, + "loss": 0.6555, + "step": 3379 + }, + { + "epoch": 0.023910276861372984, + "grad_norm": 1.418442726135254, + "learning_rate": 4.948079823064559e-05, + "loss": 0.6587, + "step": 3410 + }, + { + "epoch": 0.024127643014658192, + "grad_norm": 1.5460575819015503, + "learning_rate": 4.946363326218074e-05, + "loss": 0.6602, + "step": 3441 + }, + { + "epoch": 0.0243450091679434, + "grad_norm": 1.4053966999053955, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.6578, + "step": 3472 + }, + { + "epoch": 0.02456237532122861, + "grad_norm": 1.420806646347046, + "learning_rate": 4.942847531574167e-05, + "loss": 0.6544, + "step": 3503 + }, + { + "epoch": 0.024779741474513818, + "grad_norm": 1.3220508098602295, + "learning_rate": 4.941048273452008e-05, + "loss": 0.6642, + "step": 3534 + }, + { + "epoch": 0.02499710762779903, + "grad_norm": 1.3819468021392822, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.6534, + "step": 3565 + }, + { + "epoch": 0.025214473781084237, + "grad_norm": 1.4046236276626587, + "learning_rate": 4.93736713712897e-05, + "loss": 0.6547, + "step": 3596 + }, + { + "epoch": 0.025431839934369444, + "grad_norm": 1.965153455734253, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.6582, + "step": 3627 + }, + { + "epoch": 0.025649206087654655, + "grad_norm": 1.6758291721343994, + "learning_rate": 4.93357597968886e-05, + "loss": 0.6536, + "step": 3658 + }, + { + "epoch": 0.025866572240939863, + "grad_norm": 1.4467246532440186, + "learning_rate": 4.931639196334338e-05, + "loss": 0.6564, + "step": 3689 + }, + { + "epoch": 0.026083938394225074, + "grad_norm": 1.5305490493774414, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.6626, + "step": 3720 + }, + { + "epoch": 0.02630130454751028, + "grad_norm": 1.3826133012771606, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.6624, + "step": 3751 + }, + { + "epoch": 0.02651867070079549, + "grad_norm": 1.4920278787612915, + "learning_rate": 4.925664290937196e-05, + "loss": 0.6417, + "step": 3782 + }, + { + "epoch": 0.0267360368540807, + "grad_norm": 14.533783912658691, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.647, + "step": 3813 + }, + { + "epoch": 0.026953403007365907, + "grad_norm": 1.4659409523010254, + "learning_rate": 4.921544116753962e-05, + "loss": 0.6514, + "step": 3844 + }, + { + "epoch": 0.02717076916065112, + "grad_norm": 1.4820138216018677, + "learning_rate": 4.919443027766935e-05, + "loss": 0.6477, + "step": 3875 + }, + { + "epoch": 0.027388135313936326, + "grad_norm": 1.2543880939483643, + "learning_rate": 4.91731463569418e-05, + "loss": 0.6505, + "step": 3906 + }, + { + "epoch": 0.027605501467221533, + "grad_norm": 1.640926718711853, + "learning_rate": 4.915158964554312e-05, + "loss": 0.6488, + "step": 3937 + }, + { + "epoch": 0.027822867620506744, + "grad_norm": 1.3990615606307983, + "learning_rate": 4.912976038673786e-05, + "loss": 0.658, + "step": 3968 + }, + { + "epoch": 0.028040233773791952, + "grad_norm": 1.2064067125320435, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.6487, + "step": 3999 + }, + { + "epoch": 0.028257599927077163, + "grad_norm": 1.5103769302368164, + "learning_rate": 4.908528521534139e-05, + "loss": 0.6569, + "step": 4030 + }, + { + "epoch": 0.02847496608036237, + "grad_norm": 1.5314087867736816, + "learning_rate": 4.906263980464644e-05, + "loss": 0.6638, + "step": 4061 + }, + { + "epoch": 0.028692332233647578, + "grad_norm": 1.4619168043136597, + "learning_rate": 4.903972285033178e-05, + "loss": 0.6505, + "step": 4092 + }, + { + "epoch": 0.02890969838693279, + "grad_norm": 1.3979246616363525, + "learning_rate": 4.901653461101213e-05, + "loss": 0.6505, + "step": 4123 + }, + { + "epoch": 0.029127064540217996, + "grad_norm": 1.3866580724716187, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.6461, + "step": 4154 + }, + { + "epoch": 0.029344430693503207, + "grad_norm": 1.4662801027297974, + "learning_rate": 4.896934532712084e-05, + "loss": 0.6448, + "step": 4185 + }, + { + "epoch": 0.029561796846788415, + "grad_norm": 1.3171806335449219, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.6525, + "step": 4216 + }, + { + "epoch": 0.029779163000073623, + "grad_norm": 1.3289718627929688, + "learning_rate": 4.892107408306516e-05, + "loss": 0.6445, + "step": 4247 + }, + { + "epoch": 0.029996529153358834, + "grad_norm": 1.3527250289916992, + "learning_rate": 4.889653340498669e-05, + "loss": 0.6449, + "step": 4278 + }, + { + "epoch": 0.03021389530664404, + "grad_norm": 1.314674735069275, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.6559, + "step": 4309 + }, + { + "epoch": 0.030431261459929252, + "grad_norm": 1.2237507104873657, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.6468, + "step": 4340 + }, + { + "epoch": 0.03064862761321446, + "grad_norm": 1.423965573310852, + "learning_rate": 4.882129447892753e-05, + "loss": 0.6465, + "step": 4371 + }, + { + "epoch": 0.030865993766499667, + "grad_norm": 1.271371603012085, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.6471, + "step": 4402 + }, + { + "epoch": 0.031083359919784878, + "grad_norm": 1.5492364168167114, + "learning_rate": 4.876979062282995e-05, + "loss": 0.6444, + "step": 4433 + }, + { + "epoch": 0.031300726073070086, + "grad_norm": 1.5139328241348267, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.6512, + "step": 4464 + }, + { + "epoch": 0.03151809222635529, + "grad_norm": 1.4510358572006226, + "learning_rate": 4.871721381433344e-05, + "loss": 0.6485, + "step": 4495 + }, + { + "epoch": 0.03173545837964051, + "grad_norm": 1.3228867053985596, + "learning_rate": 4.869052379269719e-05, + "loss": 0.6416, + "step": 4526 + }, + { + "epoch": 0.031952824532925715, + "grad_norm": 1.2731959819793701, + "learning_rate": 4.866356642671985e-05, + "loss": 0.6477, + "step": 4557 + }, + { + "epoch": 0.03217019068621092, + "grad_norm": 1.2540090084075928, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.6412, + "step": 4588 + }, + { + "epoch": 0.03238755683949613, + "grad_norm": 1.662154197692871, + "learning_rate": 4.860885088159626e-05, + "loss": 0.6464, + "step": 4619 + }, + { + "epoch": 0.03260492299278134, + "grad_norm": 1.5419702529907227, + "learning_rate": 4.858109331990751e-05, + "loss": 0.6421, + "step": 4650 + }, + { + "epoch": 0.03282228914606655, + "grad_norm": 1.6409112215042114, + "learning_rate": 4.855306964878567e-05, + "loss": 0.6468, + "step": 4681 + }, + { + "epoch": 0.03303965529935176, + "grad_norm": 1.1416597366333008, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.6396, + "step": 4712 + }, + { + "epoch": 0.03325702145263697, + "grad_norm": 1.215846061706543, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.6517, + "step": 4743 + }, + { + "epoch": 0.033474387605922175, + "grad_norm": 1.3075084686279297, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.6389, + "step": 4774 + }, + { + "epoch": 0.03369175375920738, + "grad_norm": 1.6089972257614136, + "learning_rate": 4.843832023980392e-05, + "loss": 0.6371, + "step": 4805 + }, + { + "epoch": 0.0339091199124926, + "grad_norm": 1.3927685022354126, + "learning_rate": 4.840897082510106e-05, + "loss": 0.6355, + "step": 4836 + }, + { + "epoch": 0.034126486065777804, + "grad_norm": 1.197952389717102, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.6415, + "step": 4867 + }, + { + "epoch": 0.03434385221906301, + "grad_norm": 1.2738877534866333, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.6442, + "step": 4898 + }, + { + "epoch": 0.03456121837234822, + "grad_norm": 1.5164271593093872, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.6406, + "step": 4929 + }, + { + "epoch": 0.03477858452563343, + "grad_norm": 1.242473840713501, + "learning_rate": 4.828893485248369e-05, + "loss": 0.6441, + "step": 4960 + }, + { + "epoch": 0.03499595067891864, + "grad_norm": 1.2919869422912598, + "learning_rate": 4.825826797411682e-05, + "loss": 0.6487, + "step": 4991 + }, + { + "epoch": 0.03521331683220385, + "grad_norm": 2.6493895053863525, + "learning_rate": 4.822733862935702e-05, + "loss": 0.6398, + "step": 5022 + }, + { + "epoch": 0.035430682985489056, + "grad_norm": 1.2134305238723755, + "learning_rate": 4.819614716723775e-05, + "loss": 0.6398, + "step": 5053 + }, + { + "epoch": 0.035648049138774264, + "grad_norm": 1.1468703746795654, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.633, + "step": 5084 + }, + { + "epoch": 0.03586541529205947, + "grad_norm": 1.3923726081848145, + "learning_rate": 4.813297930184042e-05, + "loss": 0.6295, + "step": 5115 + }, + { + "epoch": 0.036082781445344686, + "grad_norm": 1.5531644821166992, + "learning_rate": 4.810100361140314e-05, + "loss": 0.6345, + "step": 5146 + }, + { + "epoch": 0.036300147598629894, + "grad_norm": 1.201889991760254, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.6414, + "step": 5177 + }, + { + "epoch": 0.0365175137519151, + "grad_norm": 1.4365577697753906, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.6364, + "step": 5208 + }, + { + "epoch": 0.03673487990520031, + "grad_norm": 1.210980772972107, + "learning_rate": 4.8003513848046e-05, + "loss": 0.6353, + "step": 5239 + }, + { + "epoch": 0.036952246058485516, + "grad_norm": 1.379381775856018, + "learning_rate": 4.79704975853109e-05, + "loss": 0.6293, + "step": 5270 + }, + { + "epoch": 0.03716961221177073, + "grad_norm": 1.2263178825378418, + "learning_rate": 4.793722210363262e-05, + "loss": 0.629, + "step": 5301 + }, + { + "epoch": 0.03738697836505594, + "grad_norm": 1.2448405027389526, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.6474, + "step": 5332 + }, + { + "epoch": 0.037604344518341146, + "grad_norm": 1.257132887840271, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.6426, + "step": 5363 + }, + { + "epoch": 0.03782171067162635, + "grad_norm": 1.3763643503189087, + "learning_rate": 4.783584411463221e-05, + "loss": 0.6396, + "step": 5394 + }, + { + "epoch": 0.03803907682491156, + "grad_norm": 1.3164068460464478, + "learning_rate": 4.780153554146274e-05, + "loss": 0.6343, + "step": 5425 + }, + { + "epoch": 0.038256442978196775, + "grad_norm": 1.2034872770309448, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.629, + "step": 5456 + }, + { + "epoch": 0.03847380913148198, + "grad_norm": 1.3156630992889404, + "learning_rate": 4.773214684850662e-05, + "loss": 0.6403, + "step": 5487 + }, + { + "epoch": 0.03869117528476719, + "grad_norm": 1.0719150304794312, + "learning_rate": 4.769706751176193e-05, + "loss": 0.6318, + "step": 5518 + }, + { + "epoch": 0.0389085414380524, + "grad_norm": 1.3054882287979126, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.6419, + "step": 5549 + }, + { + "epoch": 0.039125907591337605, + "grad_norm": 1.2081729173660278, + "learning_rate": 4.762614083706258e-05, + "loss": 0.6461, + "step": 5580 + }, + { + "epoch": 0.03934327374462282, + "grad_norm": 1.1728904247283936, + "learning_rate": 4.759029429950581e-05, + "loss": 0.6473, + "step": 5611 + }, + { + "epoch": 0.03956063989790803, + "grad_norm": 1.2552399635314941, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.6373, + "step": 5642 + }, + { + "epoch": 0.039778006051193235, + "grad_norm": 1.299212098121643, + "learning_rate": 4.751783684659e-05, + "loss": 0.6234, + "step": 5673 + }, + { + "epoch": 0.03999537220447844, + "grad_norm": 2.0746827125549316, + "learning_rate": 4.748122674890348e-05, + "loss": 0.6264, + "step": 5704 + }, + { + "epoch": 0.04021273835776365, + "grad_norm": 1.3712407350540161, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.6252, + "step": 5735 + }, + { + "epoch": 0.040430104511048864, + "grad_norm": 1.2094186544418335, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.6265, + "step": 5766 + }, + { + "epoch": 0.04064747066433407, + "grad_norm": 1.2487757205963135, + "learning_rate": 4.736987593718397e-05, + "loss": 0.6305, + "step": 5797 + }, + { + "epoch": 0.04086483681761928, + "grad_norm": 1.5805151462554932, + "learning_rate": 4.733225355658999e-05, + "loss": 0.631, + "step": 5828 + }, + { + "epoch": 0.04108220297090449, + "grad_norm": 1.2615118026733398, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.636, + "step": 5859 + }, + { + "epoch": 0.041299569124189695, + "grad_norm": 1.6970707178115845, + "learning_rate": 4.725625317347119e-05, + "loss": 0.6271, + "step": 5890 + }, + { + "epoch": 0.04151693527747491, + "grad_norm": 1.4279624223709106, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.6267, + "step": 5921 + }, + { + "epoch": 0.041734301430760117, + "grad_norm": 1.3471580743789673, + "learning_rate": 4.717924815801832e-05, + "loss": 0.6289, + "step": 5952 + }, + { + "epoch": 0.041951667584045324, + "grad_norm": 1.5088621377944946, + "learning_rate": 4.714036999763532e-05, + "loss": 0.6263, + "step": 5983 + }, + { + "epoch": 0.04216903373733053, + "grad_norm": 1.3061436414718628, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.6266, + "step": 6014 + }, + { + "epoch": 0.04238639989061574, + "grad_norm": 1.1800014972686768, + "learning_rate": 4.7061864565225e-05, + "loss": 0.6259, + "step": 6045 + }, + { + "epoch": 0.042603766043900954, + "grad_norm": 1.195177435874939, + "learning_rate": 4.702223817912081e-05, + "loss": 0.6251, + "step": 6076 + }, + { + "epoch": 0.04282113219718616, + "grad_norm": 1.1905118227005005, + "learning_rate": 4.698236327505195e-05, + "loss": 0.6217, + "step": 6107 + }, + { + "epoch": 0.04303849835047137, + "grad_norm": 1.2270928621292114, + "learning_rate": 4.694224030300127e-05, + "loss": 0.6352, + "step": 6138 + }, + { + "epoch": 0.043255864503756576, + "grad_norm": 1.209226369857788, + "learning_rate": 4.690186971575107e-05, + "loss": 0.6249, + "step": 6169 + }, + { + "epoch": 0.043473230657041784, + "grad_norm": 1.1949187517166138, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.6371, + "step": 6200 + }, + { + "epoch": 0.043690596810327, + "grad_norm": 1.2346535921096802, + "learning_rate": 4.68203875207476e-05, + "loss": 0.6177, + "step": 6231 + }, + { + "epoch": 0.043907962963612206, + "grad_norm": 1.2187124490737915, + "learning_rate": 4.677927683250983e-05, + "loss": 0.6268, + "step": 6262 + }, + { + "epoch": 0.04412532911689741, + "grad_norm": 1.2187339067459106, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.6261, + "step": 6293 + }, + { + "epoch": 0.04434269527018262, + "grad_norm": 1.320764422416687, + "learning_rate": 4.669631859419965e-05, + "loss": 0.6212, + "step": 6324 + }, + { + "epoch": 0.04456006142346783, + "grad_norm": 1.3396878242492676, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.6241, + "step": 6355 + }, + { + "epoch": 0.04477742757675304, + "grad_norm": 1.2472412586212158, + "learning_rate": 4.661238099862658e-05, + "loss": 0.6241, + "step": 6386 + }, + { + "epoch": 0.04499479373003825, + "grad_norm": 1.3773880004882812, + "learning_rate": 4.657004612417138e-05, + "loss": 0.6245, + "step": 6417 + }, + { + "epoch": 0.04521215988332346, + "grad_norm": 1.1602933406829834, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.6221, + "step": 6448 + }, + { + "epoch": 0.045429526036608665, + "grad_norm": 1.1719809770584106, + "learning_rate": 4.648464661063478e-05, + "loss": 0.629, + "step": 6479 + }, + { + "epoch": 0.04564689218989387, + "grad_norm": 1.1973191499710083, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.6239, + "step": 6510 + }, + { + "epoch": 0.04586425834317909, + "grad_norm": 1.1805142164230347, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.6242, + "step": 6541 + }, + { + "epoch": 0.046081624496464295, + "grad_norm": 1.3194178342819214, + "learning_rate": 4.6354730177207e-05, + "loss": 0.6316, + "step": 6572 + }, + { + "epoch": 0.0462989906497495, + "grad_norm": 1.1395046710968018, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.6207, + "step": 6603 + }, + { + "epoch": 0.04651635680303471, + "grad_norm": 1.5031641721725464, + "learning_rate": 4.626691348094777e-05, + "loss": 0.6218, + "step": 6634 + }, + { + "epoch": 0.04673372295631992, + "grad_norm": 1.0985206365585327, + "learning_rate": 4.622264489304762e-05, + "loss": 0.6198, + "step": 6665 + }, + { + "epoch": 0.04695108910960513, + "grad_norm": 1.2820847034454346, + "learning_rate": 4.617813681048434e-05, + "loss": 0.626, + "step": 6696 + }, + { + "epoch": 0.04716845526289034, + "grad_norm": 1.4487061500549316, + "learning_rate": 4.61333897355256e-05, + "loss": 0.622, + "step": 6727 + }, + { + "epoch": 0.04738582141617555, + "grad_norm": 1.1577301025390625, + "learning_rate": 4.608840417313604e-05, + "loss": 0.6244, + "step": 6758 + }, + { + "epoch": 0.047603187569460755, + "grad_norm": 1.1363381147384644, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.6206, + "step": 6789 + }, + { + "epoch": 0.04782055372274597, + "grad_norm": 1.22281813621521, + "learning_rate": 4.599771961937391e-05, + "loss": 0.6231, + "step": 6820 + }, + { + "epoch": 0.048037919876031177, + "grad_norm": 1.156031847000122, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.6223, + "step": 6851 + }, + { + "epoch": 0.048255286029316384, + "grad_norm": 1.1688473224639893, + "learning_rate": 4.590608724263786e-05, + "loss": 0.6293, + "step": 6882 + }, + { + "epoch": 0.04847265218260159, + "grad_norm": 1.4265236854553223, + "learning_rate": 4.585991691155845e-05, + "loss": 0.6109, + "step": 6913 + }, + { + "epoch": 0.0486900183358868, + "grad_norm": 1.1242969036102295, + "learning_rate": 4.581351117915188e-05, + "loss": 0.6256, + "step": 6944 + }, + { + "epoch": 0.048907384489172014, + "grad_norm": 1.3943792581558228, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.6093, + "step": 6975 + }, + { + "epoch": 0.04912475064245722, + "grad_norm": 1.0761889219284058, + "learning_rate": 4.571999560773736e-05, + "loss": 0.6207, + "step": 7006 + }, + { + "epoch": 0.04934211679574243, + "grad_norm": 1.2784191370010376, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.6227, + "step": 7037 + }, + { + "epoch": 0.049559482949027636, + "grad_norm": 1.4215577840805054, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.6148, + "step": 7068 + }, + { + "epoch": 0.049776849102312844, + "grad_norm": 1.1120682954788208, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.6192, + "step": 7099 + }, + { + "epoch": 0.04999421525559806, + "grad_norm": 1.1718186140060425, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.6179, + "step": 7130 + }, + { + "epoch": 0.050211581408883266, + "grad_norm": 1.5078628063201904, + "learning_rate": 4.548212413768558e-05, + "loss": 0.6245, + "step": 7161 + }, + { + "epoch": 0.05042894756216847, + "grad_norm": 1.1740144491195679, + "learning_rate": 4.543385426912261e-05, + "loss": 0.6242, + "step": 7192 + }, + { + "epoch": 0.05064631371545368, + "grad_norm": 1.2450133562088013, + "learning_rate": 4.53853538072915e-05, + "loss": 0.6194, + "step": 7223 + }, + { + "epoch": 0.05086367986873889, + "grad_norm": 1.181887149810791, + "learning_rate": 4.533662329951336e-05, + "loss": 0.6188, + "step": 7254 + }, + { + "epoch": 0.0510810460220241, + "grad_norm": 1.172691822052002, + "learning_rate": 4.528766329570536e-05, + "loss": 0.6191, + "step": 7285 + }, + { + "epoch": 0.05129841217530931, + "grad_norm": 1.148863673210144, + "learning_rate": 4.523847434837447e-05, + "loss": 0.6234, + "step": 7316 + }, + { + "epoch": 0.05151577832859452, + "grad_norm": 1.2533507347106934, + "learning_rate": 4.518905701261128e-05, + "loss": 0.6166, + "step": 7347 + }, + { + "epoch": 0.051733144481879725, + "grad_norm": 1.1888121366500854, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.6176, + "step": 7378 + }, + { + "epoch": 0.05195051063516493, + "grad_norm": 1.0766541957855225, + "learning_rate": 4.508953940903073e-05, + "loss": 0.6137, + "step": 7409 + }, + { + "epoch": 0.05216787678845015, + "grad_norm": 1.2277822494506836, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.6231, + "step": 7440 + }, + { + "epoch": 0.052385242941735355, + "grad_norm": 1.0808931589126587, + "learning_rate": 4.498911497712155e-05, + "loss": 0.6209, + "step": 7471 + }, + { + "epoch": 0.05260260909502056, + "grad_norm": 1.0840133428573608, + "learning_rate": 4.493856411554142e-05, + "loss": 0.6144, + "step": 7502 + }, + { + "epoch": 0.05281997524830577, + "grad_norm": 1.5881050825119019, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.6114, + "step": 7533 + }, + { + "epoch": 0.05303734140159098, + "grad_norm": 1.1180490255355835, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.6134, + "step": 7564 + }, + { + "epoch": 0.05325470755487619, + "grad_norm": 1.1469063758850098, + "learning_rate": 4.478556380141218e-05, + "loss": 0.6122, + "step": 7595 + }, + { + "epoch": 0.0534720737081614, + "grad_norm": 1.2977004051208496, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.615, + "step": 7626 + }, + { + "epoch": 0.05368943986144661, + "grad_norm": 1.0556434392929077, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.6187, + "step": 7657 + }, + { + "epoch": 0.053906806014731815, + "grad_norm": 1.101298451423645, + "learning_rate": 4.463055400581586e-05, + "loss": 0.6123, + "step": 7688 + }, + { + "epoch": 0.05412417216801702, + "grad_norm": 1.262608528137207, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.6149, + "step": 7719 + }, + { + "epoch": 0.05434153832130224, + "grad_norm": 1.1554538011550903, + "learning_rate": 4.452610552959183e-05, + "loss": 0.6168, + "step": 7750 + }, + { + "epoch": 0.054558904474587444, + "grad_norm": 1.1653157472610474, + "learning_rate": 4.447355047201428e-05, + "loss": 0.615, + "step": 7781 + }, + { + "epoch": 0.05477627062787265, + "grad_norm": 1.0511231422424316, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.6215, + "step": 7812 + }, + { + "epoch": 0.05499363678115786, + "grad_norm": 1.3468266725540161, + "learning_rate": 4.436778168330484e-05, + "loss": 0.6221, + "step": 7843 + }, + { + "epoch": 0.05521100293444307, + "grad_norm": 1.3238797187805176, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.611, + "step": 7874 + }, + { + "epoch": 0.05542836908772828, + "grad_norm": 1.1107763051986694, + "learning_rate": 4.42611386459262e-05, + "loss": 0.6119, + "step": 7905 + }, + { + "epoch": 0.05564573524101349, + "grad_norm": 1.3667259216308594, + "learning_rate": 4.420749078676133e-05, + "loss": 0.6134, + "step": 7936 + }, + { + "epoch": 0.055863101394298696, + "grad_norm": 1.0336949825286865, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.613, + "step": 7967 + }, + { + "epoch": 0.056080467547583904, + "grad_norm": 1.0911316871643066, + "learning_rate": 4.409954541451762e-05, + "loss": 0.6107, + "step": 7998 + }, + { + "epoch": 0.05629783370086911, + "grad_norm": 1.2516382932662964, + "learning_rate": 4.404524911958764e-05, + "loss": 0.6138, + "step": 8029 + }, + { + "epoch": 0.056515199854154326, + "grad_norm": 1.1678277254104614, + "learning_rate": 4.399073790160989e-05, + "loss": 0.6162, + "step": 8060 + }, + { + "epoch": 0.05673256600743953, + "grad_norm": 1.4072996377944946, + "learning_rate": 4.393601237573607e-05, + "loss": 0.614, + "step": 8091 + }, + { + "epoch": 0.05694993216072474, + "grad_norm": 1.2312837839126587, + "learning_rate": 4.388107315953628e-05, + "loss": 0.6203, + "step": 8122 + }, + { + "epoch": 0.05716729831400995, + "grad_norm": 1.1720649003982544, + "learning_rate": 4.382592087299212e-05, + "loss": 0.619, + "step": 8153 + }, + { + "epoch": 0.057384664467295156, + "grad_norm": 1.0711950063705444, + "learning_rate": 4.377055613848964e-05, + "loss": 0.6124, + "step": 8184 + }, + { + "epoch": 0.05760203062058037, + "grad_norm": 1.1664263010025024, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.6073, + "step": 8215 + }, + { + "epoch": 0.05781939677386558, + "grad_norm": 0.9937831163406372, + "learning_rate": 4.365919182713416e-05, + "loss": 0.6101, + "step": 8246 + }, + { + "epoch": 0.058036762927150785, + "grad_norm": 1.0545841455459595, + "learning_rate": 4.360319350701226e-05, + "loss": 0.6089, + "step": 8277 + }, + { + "epoch": 0.05825412908043599, + "grad_norm": 1.120007038116455, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.6078, + "step": 8308 + }, + { + "epoch": 0.0584714952337212, + "grad_norm": 1.1087831258773804, + "learning_rate": 4.349056769754021e-05, + "loss": 0.601, + "step": 8339 + }, + { + "epoch": 0.058688861387006415, + "grad_norm": 1.1370675563812256, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.6103, + "step": 8370 + }, + { + "epoch": 0.05890622754029162, + "grad_norm": 1.182654857635498, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.6115, + "step": 8401 + }, + { + "epoch": 0.05912359369357683, + "grad_norm": 1.2606432437896729, + "learning_rate": 4.332006561018488e-05, + "loss": 0.6139, + "step": 8432 + }, + { + "epoch": 0.05934095984686204, + "grad_norm": 1.1618047952651978, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.6053, + "step": 8463 + }, + { + "epoch": 0.059558326000147245, + "grad_norm": 1.1281821727752686, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.6084, + "step": 8494 + }, + { + "epoch": 0.05977569215343246, + "grad_norm": 1.15248703956604, + "learning_rate": 4.314770288177384e-05, + "loss": 0.6128, + "step": 8525 + }, + { + "epoch": 0.05999305830671767, + "grad_norm": 1.2170960903167725, + "learning_rate": 4.308983818344313e-05, + "loss": 0.6134, + "step": 8556 + }, + { + "epoch": 0.060210424460002875, + "grad_norm": 1.085260033607483, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.6212, + "step": 8587 + }, + { + "epoch": 0.06042779061328808, + "grad_norm": 1.2243624925613403, + "learning_rate": 4.297349701798505e-05, + "loss": 0.605, + "step": 8618 + }, + { + "epoch": 0.06064515676657329, + "grad_norm": 1.124769687652588, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.6074, + "step": 8649 + }, + { + "epoch": 0.060862522919858504, + "grad_norm": 1.893545150756836, + "learning_rate": 4.285634454093198e-05, + "loss": 0.6055, + "step": 8680 + }, + { + "epoch": 0.06107988907314371, + "grad_norm": 1.2186870574951172, + "learning_rate": 4.279746571169086e-05, + "loss": 0.6121, + "step": 8711 + }, + { + "epoch": 0.06129725522642892, + "grad_norm": 1.0006957054138184, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.6079, + "step": 8742 + }, + { + "epoch": 0.06151462137971413, + "grad_norm": 1.0913995504379272, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.6107, + "step": 8773 + }, + { + "epoch": 0.061731987532999334, + "grad_norm": 1.4647611379623413, + "learning_rate": 4.261962684116106e-05, + "loss": 0.6188, + "step": 8804 + }, + { + "epoch": 0.06194935368628455, + "grad_norm": 1.0805052518844604, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.6137, + "step": 8835 + }, + { + "epoch": 0.062166719839569756, + "grad_norm": 1.0512675046920776, + "learning_rate": 4.250007230372134e-05, + "loss": 0.6052, + "step": 8866 + }, + { + "epoch": 0.062384085992854964, + "grad_norm": 1.1758863925933838, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.6148, + "step": 8897 + }, + { + "epoch": 0.06260145214614017, + "grad_norm": 1.0526351928710938, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.5973, + "step": 8928 + }, + { + "epoch": 0.06281881829942539, + "grad_norm": 1.077563762664795, + "learning_rate": 4.231926105340768e-05, + "loss": 0.6114, + "step": 8959 + }, + { + "epoch": 0.06303618445271059, + "grad_norm": 1.0116938352584839, + "learning_rate": 4.225859883654776e-05, + "loss": 0.6146, + "step": 8990 + }, + { + "epoch": 0.0632535506059958, + "grad_norm": 1.1967271566390991, + "learning_rate": 4.219774185874569e-05, + "loss": 0.5972, + "step": 9021 + }, + { + "epoch": 0.06347091675928102, + "grad_norm": 1.1610004901885986, + "learning_rate": 4.213669080676418e-05, + "loss": 0.6081, + "step": 9052 + }, + { + "epoch": 0.06368828291256622, + "grad_norm": 1.0253360271453857, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.6118, + "step": 9083 + }, + { + "epoch": 0.06390564906585143, + "grad_norm": 1.140599012374878, + "learning_rate": 4.201400923825648e-05, + "loss": 0.6091, + "step": 9114 + }, + { + "epoch": 0.06412301521913663, + "grad_norm": 1.0821545124053955, + "learning_rate": 4.195238010617511e-05, + "loss": 0.6028, + "step": 9145 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.754968236539773e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-9156/training_args.bin b/checkpoint-9156/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d8a57652df56990298f489cbbc4d12b0a5d66dac --- /dev/null +++ b/checkpoint-9156/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff6a84b6c2a69c365aedcd04f176d62d0993a049ac50b28b1fc7fee54f4267a8 +size 5304 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/model-00001-of-00007.safetensors b/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..97abb2f17a827f9774d5aa677414a589f23ef5be --- /dev/null +++ b/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c41e81deedcb92f32c0979819265f9e6d661b5e330e34fa8c6bef52aea07f00 +size 4886466168 diff --git a/model-00002-of-00007.safetensors b/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/model-00003-of-00007.safetensors b/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/model-00004-of-00007.safetensors b/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/model-00005-of-00007.safetensors b/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/model-00006-of-00007.safetensors b/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ea06fe8cbc640bfc1ac1a9c2602ab737f12e0995 --- /dev/null +++ b/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16591b9c3d31c4b6c7f17ad0dab0336936bc07e351f0024897753b9bf6ba44ec +size 4999813120 diff --git a/model-00007-of-00007.safetensors b/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7557ca0d07977541c067f3000ab306980af7cff4 --- /dev/null +++ b/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfa19a4c3bb04e7e6fb6b1c326503b5fdfcbd7288ffaf21fe85973e99e4141c0 +size 2571158184 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..5cc5f00a5b203e90a27a3bd60d1ec393b07971e8 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,410563 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 128000, + "content": "<|begin_of_text|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128001, + "content": "<|end_of_text|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128002, + "content": "<|reserved_special_token_0|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128003, + "content": "<|reserved_special_token_1|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128004, + "content": "<|finetune_right_pad_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128005, + "content": "<|reserved_special_token_2|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128006, + "content": "<|start_header_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128007, + "content": "<|end_header_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128008, + "content": "<|eom_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128009, + "content": "<|eot_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128010, + "content": "<|python_tag|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128011, + "content": "<|reserved_special_token_3|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128012, + "content": "<|reserved_special_token_4|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128013, + "content": "<|reserved_special_token_5|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128014, + "content": "<|reserved_special_token_6|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128015, + "content": "<|reserved_special_token_7|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128016, + "content": "<|reserved_special_token_8|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128017, + "content": "<|reserved_special_token_9|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128018, + "content": "<|reserved_special_token_10|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128019, + "content": "<|reserved_special_token_11|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128020, + "content": "<|reserved_special_token_12|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128021, + "content": "<|reserved_special_token_13|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128022, + "content": "<|reserved_special_token_14|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128023, + "content": "<|reserved_special_token_15|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128024, + "content": "<|reserved_special_token_16|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128025, + "content": "<|reserved_special_token_17|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128026, + "content": "<|reserved_special_token_18|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128027, + "content": "<|reserved_special_token_19|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128028, + "content": "<|reserved_special_token_20|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128029, + "content": "<|reserved_special_token_21|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128030, + "content": "<|reserved_special_token_22|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128031, + "content": "<|reserved_special_token_23|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128032, + "content": "<|reserved_special_token_24|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128033, + "content": "<|reserved_special_token_25|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128034, + "content": "<|reserved_special_token_26|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128035, + "content": "<|reserved_special_token_27|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128036, + "content": "<|reserved_special_token_28|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128037, + "content": "<|reserved_special_token_29|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128038, + "content": "<|reserved_special_token_30|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128039, + "content": "<|reserved_special_token_31|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128040, + "content": "<|reserved_special_token_32|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128041, + "content": "<|reserved_special_token_33|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128042, + "content": "<|reserved_special_token_34|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128043, + "content": "<|reserved_special_token_35|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128044, + "content": "<|reserved_special_token_36|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128045, + "content": "<|reserved_special_token_37|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128046, + "content": "<|reserved_special_token_38|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128047, + "content": "<|reserved_special_token_39|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128048, + "content": "<|reserved_special_token_40|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128049, + "content": "<|reserved_special_token_41|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128050, + "content": "<|reserved_special_token_42|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128051, + "content": "<|reserved_special_token_43|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128052, + "content": "<|reserved_special_token_44|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128053, + "content": "<|reserved_special_token_45|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128054, + "content": "<|reserved_special_token_46|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128055, + "content": "<|reserved_special_token_47|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128056, + "content": "<|reserved_special_token_48|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128057, + "content": "<|reserved_special_token_49|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128058, + "content": "<|reserved_special_token_50|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128059, + "content": "<|reserved_special_token_51|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128060, + "content": "<|reserved_special_token_52|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128061, + "content": "<|reserved_special_token_53|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128062, + "content": "<|reserved_special_token_54|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128063, + "content": "<|reserved_special_token_55|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128064, + "content": "<|reserved_special_token_56|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128065, + "content": "<|reserved_special_token_57|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128066, + "content": "<|reserved_special_token_58|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128067, + "content": "<|reserved_special_token_59|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128068, + "content": "<|reserved_special_token_60|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128069, + "content": "<|reserved_special_token_61|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128070, + "content": "<|reserved_special_token_62|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128071, + "content": "<|reserved_special_token_63|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128072, + "content": "<|reserved_special_token_64|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128073, + "content": "<|reserved_special_token_65|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128074, + "content": "<|reserved_special_token_66|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128075, + "content": "<|reserved_special_token_67|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128076, + "content": "<|reserved_special_token_68|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128077, + "content": "<|reserved_special_token_69|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128078, + "content": "<|reserved_special_token_70|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128079, + "content": "<|reserved_special_token_71|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128080, + "content": "<|reserved_special_token_72|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128081, + "content": "<|reserved_special_token_73|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128082, + "content": "<|reserved_special_token_74|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128083, + "content": "<|reserved_special_token_75|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128084, + "content": "<|reserved_special_token_76|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128085, + "content": "<|reserved_special_token_77|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128086, + "content": "<|reserved_special_token_78|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128087, + "content": "<|reserved_special_token_79|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128088, + "content": "<|reserved_special_token_80|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128089, + "content": "<|reserved_special_token_81|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128090, + "content": "<|reserved_special_token_82|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128091, + "content": "<|reserved_special_token_83|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128092, + "content": "<|reserved_special_token_84|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128093, + "content": "<|reserved_special_token_85|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128094, + "content": "<|reserved_special_token_86|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128095, + "content": "<|reserved_special_token_87|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128096, + "content": "<|reserved_special_token_88|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128097, + "content": "<|reserved_special_token_89|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128098, + "content": "<|reserved_special_token_90|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128099, + "content": "<|reserved_special_token_91|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128100, + "content": "<|reserved_special_token_92|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128101, + "content": "<|reserved_special_token_93|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128102, + "content": "<|reserved_special_token_94|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128103, + "content": "<|reserved_special_token_95|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128104, + "content": "<|reserved_special_token_96|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128105, + "content": "<|reserved_special_token_97|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128106, + "content": "<|reserved_special_token_98|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128107, + "content": "<|reserved_special_token_99|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128108, + "content": "<|reserved_special_token_100|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128109, + "content": "<|reserved_special_token_101|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128110, + "content": "<|reserved_special_token_102|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128111, + "content": "<|reserved_special_token_103|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128112, + "content": "<|reserved_special_token_104|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128113, + "content": "<|reserved_special_token_105|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128114, + "content": "<|reserved_special_token_106|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128115, + "content": "<|reserved_special_token_107|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128116, + "content": "<|reserved_special_token_108|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128117, + "content": "<|reserved_special_token_109|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128118, + "content": "<|reserved_special_token_110|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128119, + "content": "<|reserved_special_token_111|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128120, + "content": "<|reserved_special_token_112|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128121, + "content": "<|reserved_special_token_113|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128122, + "content": "<|reserved_special_token_114|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128123, + "content": "<|reserved_special_token_115|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128124, + "content": "<|reserved_special_token_116|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128125, + "content": "<|reserved_special_token_117|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128126, + "content": "<|reserved_special_token_118|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128127, + "content": "<|reserved_special_token_119|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128128, + "content": "<|reserved_special_token_120|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128129, + "content": "<|reserved_special_token_121|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128130, + "content": "<|reserved_special_token_122|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128131, + "content": "<|reserved_special_token_123|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128132, + "content": "<|reserved_special_token_124|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128133, + "content": "<|reserved_special_token_125|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128134, + "content": "<|reserved_special_token_126|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128135, + "content": "<|reserved_special_token_127|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128136, + "content": "<|reserved_special_token_128|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128137, + "content": "<|reserved_special_token_129|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128138, + "content": "<|reserved_special_token_130|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128139, + "content": "<|reserved_special_token_131|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128140, + "content": "<|reserved_special_token_132|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128141, + "content": "<|reserved_special_token_133|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128142, + "content": "<|reserved_special_token_134|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128143, + "content": "<|reserved_special_token_135|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128144, + "content": "<|reserved_special_token_136|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128145, + "content": "<|reserved_special_token_137|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128146, + "content": "<|reserved_special_token_138|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128147, + "content": "<|reserved_special_token_139|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128148, + "content": "<|reserved_special_token_140|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128149, + "content": "<|reserved_special_token_141|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128150, + "content": "<|reserved_special_token_142|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128151, + "content": "<|reserved_special_token_143|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128152, + "content": "<|reserved_special_token_144|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128153, + "content": "<|reserved_special_token_145|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128154, + "content": "<|reserved_special_token_146|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128155, + "content": "<|reserved_special_token_147|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128156, + "content": "<|reserved_special_token_148|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128157, + "content": "<|reserved_special_token_149|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128158, + "content": "<|reserved_special_token_150|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128159, + "content": "<|reserved_special_token_151|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128160, + "content": "<|reserved_special_token_152|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128161, + "content": "<|reserved_special_token_153|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128162, + "content": "<|reserved_special_token_154|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128163, + "content": "<|reserved_special_token_155|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128164, + "content": "<|reserved_special_token_156|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128165, + "content": "<|reserved_special_token_157|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128166, + "content": "<|reserved_special_token_158|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128167, + "content": "<|reserved_special_token_159|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128168, + "content": "<|reserved_special_token_160|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128169, + "content": "<|reserved_special_token_161|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128170, + "content": "<|reserved_special_token_162|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128171, + "content": "<|reserved_special_token_163|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128172, + "content": "<|reserved_special_token_164|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128173, + "content": "<|reserved_special_token_165|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128174, + "content": "<|reserved_special_token_166|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128175, + "content": "<|reserved_special_token_167|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128176, + "content": "<|reserved_special_token_168|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128177, + "content": "<|reserved_special_token_169|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128178, + "content": "<|reserved_special_token_170|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128179, + "content": "<|reserved_special_token_171|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128180, + "content": "<|reserved_special_token_172|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128181, + "content": "<|reserved_special_token_173|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128182, + "content": "<|reserved_special_token_174|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128183, + "content": "<|reserved_special_token_175|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128184, + "content": "<|reserved_special_token_176|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128185, + "content": "<|reserved_special_token_177|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128186, + "content": "<|reserved_special_token_178|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128187, + "content": "<|reserved_special_token_179|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128188, + "content": "<|reserved_special_token_180|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128189, + "content": "<|reserved_special_token_181|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128190, + "content": "<|reserved_special_token_182|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128191, + "content": "<|reserved_special_token_183|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128192, + "content": "<|reserved_special_token_184|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128193, + "content": "<|reserved_special_token_185|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128194, + "content": "<|reserved_special_token_186|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128195, + "content": "<|reserved_special_token_187|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128196, + "content": "<|reserved_special_token_188|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128197, + "content": "<|reserved_special_token_189|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128198, + "content": "<|reserved_special_token_190|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128199, + "content": "<|reserved_special_token_191|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128200, + "content": "<|reserved_special_token_192|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128201, + "content": "<|reserved_special_token_193|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128202, + "content": "<|reserved_special_token_194|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128203, + "content": "<|reserved_special_token_195|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128204, + "content": "<|reserved_special_token_196|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128205, + "content": "<|reserved_special_token_197|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128206, + "content": "<|reserved_special_token_198|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128207, + "content": "<|reserved_special_token_199|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128208, + "content": "<|reserved_special_token_200|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128209, + "content": "<|reserved_special_token_201|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128210, + "content": "<|reserved_special_token_202|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128211, + "content": "<|reserved_special_token_203|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128212, + "content": "<|reserved_special_token_204|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128213, + "content": "<|reserved_special_token_205|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128214, + "content": "<|reserved_special_token_206|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128215, + "content": "<|reserved_special_token_207|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128216, + "content": "<|reserved_special_token_208|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128217, + "content": "<|reserved_special_token_209|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128218, + "content": "<|reserved_special_token_210|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128219, + "content": "<|reserved_special_token_211|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128220, + "content": "<|reserved_special_token_212|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128221, + "content": "<|reserved_special_token_213|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128222, + "content": "<|reserved_special_token_214|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128223, + "content": "<|reserved_special_token_215|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128224, + "content": "<|reserved_special_token_216|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128225, + "content": "<|reserved_special_token_217|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128226, + "content": "<|reserved_special_token_218|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128227, + "content": "<|reserved_special_token_219|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128228, + "content": "<|reserved_special_token_220|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128229, + "content": "<|reserved_special_token_221|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128230, + "content": "<|reserved_special_token_222|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128231, + "content": "<|reserved_special_token_223|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128232, + "content": "<|reserved_special_token_224|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128233, + "content": "<|reserved_special_token_225|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128234, + "content": "<|reserved_special_token_226|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128235, + "content": "<|reserved_special_token_227|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128236, + "content": "<|reserved_special_token_228|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128237, + "content": "<|reserved_special_token_229|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128238, + "content": "<|reserved_special_token_230|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128239, + "content": "<|reserved_special_token_231|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128240, + "content": "<|reserved_special_token_232|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128241, + "content": "<|reserved_special_token_233|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128242, + "content": "<|reserved_special_token_234|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128243, + "content": "<|reserved_special_token_235|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128244, + "content": "<|reserved_special_token_236|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128245, + "content": "<|reserved_special_token_237|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128246, + "content": "<|reserved_special_token_238|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128247, + "content": "<|reserved_special_token_239|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128248, + "content": "<|reserved_special_token_240|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128249, + "content": "<|reserved_special_token_241|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128250, + "content": "<|reserved_special_token_242|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128251, + "content": "<|reserved_special_token_243|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128252, + "content": "<|reserved_special_token_244|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128253, + "content": "<|reserved_special_token_245|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128254, + "content": "<|reserved_special_token_246|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128255, + "content": "<|reserved_special_token_247|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "Split", + "pattern": { + "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + }, + "behavior": "Isolated", + "invert": false + }, + { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": false + } + ] + }, + "post_processor": { + "type": "Sequence", + "processors": [ + { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": false, + "use_regex": true + }, + { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 1 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "<|begin_of_text|>": { + "id": "<|begin_of_text|>", + "ids": [ + 128000 + ], + "tokens": [ + "<|begin_of_text|>" + ] + } + } + } + ] + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": true, + "vocab": { + "!": 0, + "\"": 1, + "#": 2, + "$": 3, + "%": 4, + "&": 5, + "'": 6, + "(": 7, + ")": 8, + "*": 9, + "+": 10, + ",": 11, + "-": 12, + ".": 13, + "/": 14, + "0": 15, + "1": 16, + "2": 17, + "3": 18, + "4": 19, + "5": 20, + "6": 21, + "7": 22, + "8": 23, + "9": 24, + ":": 25, + ";": 26, + "<": 27, + "=": 28, + ">": 29, + "?": 30, + "@": 31, + "A": 32, + "B": 33, + "C": 34, + "D": 35, + "E": 36, + "F": 37, + "G": 38, + "H": 39, + "I": 40, + "J": 41, + "K": 42, + "L": 43, + "M": 44, + "N": 45, + "O": 46, + "P": 47, + "Q": 48, + "R": 49, + "S": 50, + "T": 51, + "U": 52, + "V": 53, + "W": 54, + "X": 55, + "Y": 56, + "Z": 57, + "[": 58, + "\\": 59, + "]": 60, + "^": 61, + "_": 62, + "`": 63, + "a": 64, + "b": 65, + "c": 66, + "d": 67, + "e": 68, + "f": 69, + "g": 70, + "h": 71, + "i": 72, + "j": 73, + "k": 74, + "l": 75, + "m": 76, + "n": 77, + "o": 78, + "p": 79, + "q": 80, + "r": 81, + "s": 82, + "t": 83, + "u": 84, + "v": 85, + "w": 86, + "x": 87, + "y": 88, + "z": 89, + "{": 90, + "|": 91, + "}": 92, + "~": 93, + "¡": 94, + "¢": 95, + "£": 96, + "¤": 97, + "¥": 98, + "¦": 99, + "§": 100, + "¨": 101, + "©": 102, + "ª": 103, + "«": 104, + "¬": 105, + "®": 106, + "¯": 107, + "°": 108, + "±": 109, + "²": 110, + "³": 111, + "´": 112, + "µ": 113, + "¶": 114, + "·": 115, + "¸": 116, + "¹": 117, + "º": 118, + "»": 119, + "¼": 120, + "½": 121, + "¾": 122, + "¿": 123, + "À": 124, + "Á": 125, + "Â": 126, + "Ã": 127, + "Ä": 128, + "Å": 129, + "Æ": 130, + "Ç": 131, + "È": 132, + "É": 133, + "Ê": 134, + "Ë": 135, + "Ì": 136, + "Í": 137, + "Î": 138, + "Ï": 139, + "Ð": 140, + "Ñ": 141, + "Ò": 142, + "Ó": 143, + "Ô": 144, + "Õ": 145, + "Ö": 146, + "×": 147, + "Ø": 148, + "Ù": 149, + "Ú": 150, + "Û": 151, + "Ü": 152, + "Ý": 153, + "Þ": 154, + "ß": 155, + "à": 156, + "á": 157, + "â": 158, + "ã": 159, + "ä": 160, + "å": 161, + "æ": 162, + "ç": 163, + "è": 164, + "é": 165, + "ê": 166, + "ë": 167, + "ì": 168, + "í": 169, + "î": 170, + "ï": 171, + "ð": 172, + "ñ": 173, + "ò": 174, + "ó": 175, + "ô": 176, + "õ": 177, + "ö": 178, + "÷": 179, + "ø": 180, + "ù": 181, + "ú": 182, + "û": 183, + "ü": 184, + "ý": 185, + "þ": 186, + "ÿ": 187, + "Ā": 188, + "ā": 189, + "Ă": 190, + "ă": 191, + "Ą": 192, + "ą": 193, + "Ć": 194, + "ć": 195, + "Ĉ": 196, + "ĉ": 197, + "Ċ": 198, + "ċ": 199, + "Č": 200, + "č": 201, + "Ď": 202, + "ď": 203, + "Đ": 204, + "đ": 205, + "Ē": 206, + "ē": 207, + "Ĕ": 208, + "ĕ": 209, + "Ė": 210, + "ė": 211, + "Ę": 212, + "ę": 213, + "Ě": 214, + "ě": 215, + "Ĝ": 216, + "ĝ": 217, + "Ğ": 218, + "ğ": 219, + "Ġ": 220, + "ġ": 221, + "Ģ": 222, + "ģ": 223, + "Ĥ": 224, + "ĥ": 225, + "Ħ": 226, + "ħ": 227, + "Ĩ": 228, + "ĩ": 229, + "Ī": 230, + "ī": 231, + "Ĭ": 232, + "ĭ": 233, + "Į": 234, + "į": 235, + "İ": 236, + "ı": 237, + "IJ": 238, + "ij": 239, + "Ĵ": 240, + "ĵ": 241, + "Ķ": 242, + "ķ": 243, + "ĸ": 244, + "Ĺ": 245, + "ĺ": 246, + "Ļ": 247, + "ļ": 248, + "Ľ": 249, + "ľ": 250, + "Ŀ": 251, + "ŀ": 252, + "Ł": 253, + "ł": 254, + "Ń": 255, + "ĠĠ": 256, + "ĠĠĠĠ": 257, + "in": 258, + "Ġt": 259, + "ĠĠĠĠĠĠĠĠ": 260, + "er": 261, + "ĠĠĠ": 262, + "on": 263, + "Ġa": 264, + "re": 265, + "at": 266, + "st": 267, + "en": 268, + "or": 269, + "Ġth": 270, + "ĊĊ": 271, + "Ġc": 272, + "le": 273, + "Ġs": 274, + "it": 275, + "an": 276, + "ar": 277, + "al": 278, + "Ġthe": 279, + ";Ċ": 280, + "Ġp": 281, + "Ġf": 282, + "ou": 283, + "Ġ=": 284, + "is": 285, + "ĠĠĠĠĠĠĠ": 286, + "ing": 287, + "es": 288, + "Ġw": 289, + "ion": 290, + "ed": 291, + "ic": 292, + "Ġb": 293, + "Ġd": 294, + "et": 295, + "Ġm": 296, + "Ġo": 297, + "ĉĉ": 298, + "ro": 299, + "as": 300, + "el": 301, + "ct": 302, + "nd": 303, + "Ġin": 304, + "Ġh": 305, + "ent": 306, + "id": 307, + "Ġn": 308, + "am": 309, + "ĠĠĠĠĠĠĠĠĠĠĠ": 310, + "Ġto": 311, + "Ġre": 312, + "--": 313, + "Ġ{": 314, + "Ġof": 315, + "om": 316, + ");Ċ": 317, + "im": 318, + "čĊ": 319, + "Ġ(": 320, + "il": 321, + "//": 322, + "Ġand": 323, + "ur": 324, + "se": 325, + "Ġl": 326, + "ex": 327, + "ĠS": 328, + "ad": 329, + "Ġ\"": 330, + "ch": 331, + "ut": 332, + "if": 333, + "**": 334, + "Ġ}": 335, + "em": 336, + "ol": 337, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 338, + "th": 339, + ")Ċ": 340, + "Ġ{Ċ": 341, + "Ġg": 342, + "ig": 343, + "iv": 344, + ",Ċ": 345, + "ce": 346, + "od": 347, + "Ġv": 348, + "ate": 349, + "ĠT": 350, + "ag": 351, + "ay": 352, + "Ġ*": 353, + "ot": 354, + "us": 355, + "ĠC": 356, + "Ġst": 357, + "ĠI": 358, + "un": 359, + "ul": 360, + "ue": 361, + "ĠA": 362, + "ow": 363, + "Ġ'": 364, + "ew": 365, + "Ġ<": 366, + "ation": 367, + "()": 368, + "Ġfor": 369, + "ab": 370, + "ort": 371, + "um": 372, + "ame": 373, + "Ġis": 374, + "pe": 375, + "tr": 376, + "ck": 377, + "âĢ": 378, + "Ġy": 379, + "ist": 380, + "----": 381, + ".ĊĊ": 382, + "he": 383, + "Ġe": 384, + "lo": 385, + "ĠM": 386, + "Ġbe": 387, + "ers": 388, + "Ġon": 389, + "Ġcon": 390, + "ap": 391, + "ub": 392, + "ĠP": 393, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 394, + "ass": 395, + "int": 396, + ">Ċ": 397, + "ly": 398, + "urn": 399, + "Ġ$": 400, + ";ĊĊ": 401, + "av": 402, + "port": 403, + "ir": 404, + "->": 405, + "nt": 406, + "ction": 407, + "end": 408, + "Ġde": 409, + "00": 410, + "ith": 411, + "out": 412, + "turn": 413, + "our": 414, + "ĠĠĠĠĠ": 415, + "lic": 416, + "res": 417, + "pt": 418, + "==": 419, + "Ġthis": 420, + "Ġwh": 421, + "Ġif": 422, + "ĠD": 423, + "ver": 424, + "age": 425, + "ĠB": 426, + "ht": 427, + "ext": 428, + "=\"": 429, + "Ġthat": 430, + "****": 431, + "ĠR": 432, + "Ġit": 433, + "ess": 434, + "ĠF": 435, + "Ġr": 436, + "os": 437, + "and": 438, + "Ġas": 439, + "ect": 440, + "ke": 441, + "rom": 442, + "Ġ//": 443, + "con": 444, + "ĠL": 445, + "(\"": 446, + "qu": 447, + "lass": 448, + "Ġwith": 449, + "iz": 450, + "de": 451, + "ĠN": 452, + "Ġal": 453, + "op": 454, + "up": 455, + "get": 456, + "Ġ}Ċ": 457, + "ile": 458, + "Ġan": 459, + "ata": 460, + "ore": 461, + "ri": 462, + "Ġpro": 463, + ";čĊ": 464, + "ĉĉĉĉ": 465, + "ter": 466, + "ain": 467, + "ĠW": 468, + "ĠE": 469, + "Ġcom": 470, + "Ġreturn": 471, + "art": 472, + "ĠH": 473, + "ack": 474, + "import": 475, + "ublic": 476, + "Ġor": 477, + "est": 478, + "ment": 479, + "ĠG": 480, + "able": 481, + "Ġ-": 482, + "ine": 483, + "ill": 484, + "ind": 485, + "ere": 486, + "::": 487, + "ity": 488, + "Ġ+": 489, + "Ġtr": 490, + "elf": 491, + "ight": 492, + "('": 493, + "orm": 494, + "ult": 495, + "str": 496, + "..": 497, + "\",": 498, + "Ġyou": 499, + "ype": 500, + "pl": 501, + "Ġnew": 502, + "Ġj": 503, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 504, + "Ġfrom": 505, + "Ġex": 506, + "ĠO": 507, + "20": 508, + "ld": 509, + "Ġ[": 510, + "oc": 511, + ":Ċ": 512, + "Ġse": 513, + "Ġle": 514, + "--------": 515, + ".s": 516, + "{Ċ": 517, + "',": 518, + "ant": 519, + "Ġat": 520, + "ase": 521, + ".c": 522, + "Ġch": 523, + "": 524, + "ave": 525, + "ang": 526, + "Ġare": 527, + "Ġint": 528, + "âĢĻ": 529, + "_t": 530, + "ert": 531, + "ial": 532, + "act": 533, + "}Ċ": 534, + "ive": 535, + "ode": 536, + "ost": 537, + "Ġclass": 538, + "Ġnot": 539, + "og": 540, + "ord": 541, + "alue": 542, + "all": 543, + "ff": 544, + "();Ċ": 545, + "ont": 546, + "ime": 547, + "are": 548, + "ĠU": 549, + "Ġpr": 550, + "Ġ:": 551, + "ies": 552, + "ize": 553, + "ure": 554, + "Ġby": 555, + "ire": 556, + "Ġ}ĊĊ": 557, + ".p": 558, + "Ġsh": 559, + "ice": 560, + "ast": 561, + "ption": 562, + "tring": 563, + "ok": 564, + "__": 565, + "cl": 566, + "##": 567, + "Ġhe": 568, + "ard": 569, + ").": 570, + "Ġ@": 571, + "iew": 572, + "ĉĉĉ": 573, + "Ġwas": 574, + "ip": 575, + "this": 576, + "Ġu": 577, + "ĠThe": 578, + "ide": 579, + "ace": 580, + "ib": 581, + "ac": 582, + "rou": 583, + "Ġwe": 584, + "ject": 585, + "Ġpublic": 586, + "ak": 587, + "ve": 588, + "ath": 589, + "oid": 590, + "Ġ=>": 591, + "ust": 592, + "que": 593, + "Ġres": 594, + "))": 595, + "'s": 596, + "Ġk": 597, + "ans": 598, + "yst": 599, + "unction": 600, + "********": 601, + "Ġi": 602, + "Ġus": 603, + "pp": 604, + "10": 605, + "one": 606, + "ail": 607, + "====": 608, + "name": 609, + "Ġstr": 610, + "Ġ/": 611, + "Ġ&": 612, + "ach": 613, + "div": 614, + "ystem": 615, + "ell": 616, + "Ġhave": 617, + "err": 618, + "ould": 619, + "ull": 620, + "pon": 621, + "ĠJ": 622, + "_p": 623, + "Ġ==": 624, + "ign": 625, + "St": 626, + ".Ċ": 627, + "Ġpl": 628, + ");ĊĊ": 629, + "form": 630, + "put": 631, + "ount": 632, + "}ĊĊ": 633, + "dd": 634, + "ite": 635, + "Ġget": 636, + "rr": 637, + "ome": 638, + "ĠâĢ": 639, + "aram": 640, + "cc": 641, + "Ġ*/": 642, + "ER": 643, + "In": 644, + "les": 645, + "_s": 646, + "ong": 647, + "ie": 648, + "Ġcan": 649, + "ĠV": 650, + "erv": 651, + "pr": 652, + "Ġun": 653, + "row": 654, + "ber": 655, + "Ġdo": 656, + "ll": 657, + "Ġel": 658, + "Ġself": 659, + "ated": 660, + "ary": 661, + "Ġ.": 662, + "']": 663, + "ud": 664, + "Ġen": 665, + "ĠTh": 666, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 667, + "te": 668, + "_c": 669, + "uct": 670, + "Ġab": 671, + "ork": 672, + ".get": 673, + "Ġ#": 674, + "aw": 675, + "ress": 676, + "ob": 677, + "Name": 678, + "201": 679, + "app": 680, + "['": 681, + "Ġall": 682, + "ory": 683, + "ition": 684, + "ance": 685, + "ear": 686, + "Ġcont": 687, + "vent": 688, + "ia": 689, + "Ġwill": 690, + "IN": 691, + "ĠĠĠĠĠĠĠĠĠ": 692, + "return": 693, + "Ġ": 694, + "data": 695, + ")ĊĊ": 696, + "Re": 697, + "ple": 698, + "ild": 699, + "ther": 700, + "Ġyour": 701, + "\"Ċ": 702, + "($": 703, + "Ġout": 704, + "),": 705, + "Ġhas": 706, + "String": 707, + "so": 708, + "Ġup": 709, + "ax": 710, + "Ġdef": 711, + "Ġbo": 712, + "ge": 713, + "alse": 714, + "ON": 715, + "per": 716, + "12": 717, + "ich": 718, + "Ġbut": 719, + "ĠĊ": 720, + "Ġ_": 721, + "_m": 722, + "add": 723, + "quest": 724, + "odel": 725, + "self": 726, + "ery": 727, + "ft": 728, + "ens": 729, + "////": 730, + "ake": 731, + ".C": 732, + "Ġgo": 733, + "Ġfunction": 734, + "ĠK": 735, + "ivate": 736, + "Ġim": 737, + "Ġconst": 738, + ".t": 739, + "Ġ*/Ċ": 740, + ");čĊ": 741, + "Ġvoid": 742, + "Ġset": 743, + "ĠSystem": 744, + "cri": 745, + "()Ċ": 746, + "li": 747, + "ĉif": 748, + ".m": 749, + "ally": 750, + "set": 751, + "ep": 752, + "âĢĻs": 753, + "bo": 754, + "def": 755, + "',Ċ": 756, + "Ġme": 757, + "Ġ!": 758, + "atch": 759, + "\">": 760, + "\",Ċ": 761, + "ec": 762, + "ĠIn": 763, + "ph": 764, + "Ġ|": 765, + "_f": 766, + "Ġvar": 767, + "ence": 768, + "Id": 769, + "ree": 770, + "ink": 771, + "lect": 772, + "ug": 773, + "eth": 774, + "Ġelse": 775, + "----------------": 776, + "19": 777, + "cont": 778, + "Ġso": 779, + "atic": 780, + "Ġlo": 781, + "pro": 782, + "ton": 783, + "ss": 784, + "own": 785, + "abel": 786, + "oint": 787, + "ous": 788, + "eld": 789, + "ST": 790, + "The": 791, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 792, + "RE": 793, + "\":": 794, + "olor": 795, + "tp": 796, + "eg": 797, + "key": 798, + "ude": 799, + "ĠSt": 800, + "ound": 801, + "Ġar": 802, + "\");Ċ": 803, + "ener": 804, + "ser": 805, + "11": 806, + "bject": 807, + "essage": 808, + "fer": 809, + "Ġmore": 810, + "ations": 811, + "ents": 812, + "Ġhis": 813, + "Ġthey": 814, + ".S": 815, + "ĠY": 816, + "use": 817, + "ne": 818, + "ish": 819, + "old": 820, + "_d": 821, + "io": 822, + "ield": 823, + "Ġper": 824, + "Cont": 825, + "ings": 826, + "####": 827, + "Ġdata": 828, + "Ġsa": 829, + "ef": 830, + "fo": 831, + "Ġone": 832, + "eng": 833, + "Ġdis": 834, + "AT": 835, + "Ġname": 836, + "Ġtrue": 837, + "val": 838, + "led": 839, + ".f": 840, + "Ġne": 841, + "Ġend": 842, + "32": 843, + ".T": 844, + "16": 845, + "cre": 846, + "ark": 847, + "log": 848, + "Ex": 849, + "error": 850, + "_id": 851, + "urre": 852, + "ange": 853, + "Ġnull": 854, + "rray": 855, + "Ġmy": 856, + "pan": 857, + "ict": 858, + "ator": 859, + "View": 860, + "List": 861, + "ĉreturn": 862, + "âĢĿ": 863, + "Ġpre": 864, + "Ġx": 865, + "clude": 866, + "arg": 867, + "15": 868, + "ov": 869, + ".h": 870, + "Ġ>": 871, + "Ġtheir": 872, + "')": 873, + "irst": 874, + "ick": 875, + "gh": 876, + "LE": 877, + "OR": 878, + "Ġprivate": 879, + "tem": 880, + "čĊčĊ": 881, + "user": 882, + "Ġ)": 883, + "com": 884, + ".A": 885, + "\";Ċ": 886, + "Ġid": 887, + "read": 888, + "Ġwho": 889, + "_b": 890, + "\">Ċ": 891, + "Ġtime": 892, + "Ġman": 893, + "ry": 894, + "========": 895, + "roup": 896, + "rop": 897, + "public": 898, + "vel": 899, + "umber": 900, + "ble": 901, + "Ġwhich": 902, + "****************": 903, + "Ġany": 904, + "Ġfalse": 905, + "we": 906, + "Ġvalue": 907, + "Ġli": 908, + "\")": 909, + "nder": 910, + "gr": 911, + "Ġno": 912, + "param": 913, + "25": 914, + "fig": 915, + ".com": 916, + "Ġapp": 917, + "_l": 918, + "ions": 919, + ".D": 920, + "ĠCh": 921, + "Ġabout": 922, + "Ġadd": 923, + "Ġsu": 924, + "Ġstring": 925, + "ID": 926, + "Ġover": 927, + "string": 928, + ".l": 929, + "ource": 930, + "000": 931, + "_C": 932, + "]Ċ": 933, + "Ġqu": 934, + "ĠString": 935, + "ca": 936, + "SE": 937, + "Ġro": 938, + "sh": 939, + "ual": 940, + "Type": 941, + "son": 942, + "new": 943, + "ern": 944, + "Ġag": 945, + "AR": 946, + "];Ċ": 947, + "].": 948, + "Ġ?": 949, + "ical": 950, + "Ġdes": 951, + "uth": 952, + "ix": 953, + "ays": 954, + "Ġtype": 955, + "'t": 956, + "ault": 957, + "Ġinter": 958, + "var": 959, + ".b": 960, + "Ġpart": 961, + ".d": 962, + "urrent": 963, + "IT": 964, + "EN": 965, + "30": 966, + "enc": 967, + "(f": 968, + "ra": 969, + "value": 970, + "cho": 971, + "18": 972, + "utton": 973, + "ose": 974, + "14": 975, + "Ġ!=": 976, + "ater": 977, + "é": 978, + "reate": 979, + "oll": 980, + "pos": 981, + "yle": 982, + "ng": 983, + "AL": 984, + "using": 985, + "ames": 986, + "Ġ{čĊ": 987, + "ates": 988, + "ely": 989, + "Ġwork": 990, + "Ġem": 991, + "inal": 992, + "Ġsp": 993, + "Ġwhen": 994, + ".set": 995, + "ĠĠĠĠĠĠ": 996, + "):Ċ": 997, + "to": 998, + "quire": 999, + "indow": 1000, + "lement": 1001, + "pect": 1002, + "ash": 1003, + "[i": 1004, + "Ġuse": 1005, + ".F": 1006, + "pec": 1007, + "Ġad": 1008, + "ove": 1009, + "ception": 1010, + "ength": 1011, + "include": 1012, + "ader": 1013, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1014, + "atus": 1015, + "Th": 1016, + "itle": 1017, + "rit": 1018, + "void": 1019, + "().": 1020, + "(Ċ": 1021, + "Ġoff": 1022, + "Ġother": 1023, + "Ġ&&": 1024, + "';Ċ": 1025, + "ms": 1026, + "Ġbeen": 1027, + "Ġte": 1028, + "ml": 1029, + "co": 1030, + "nc": 1031, + "13": 1032, + "ervice": 1033, + "Ġ%": 1034, + "**Ċ": 1035, + "ann": 1036, + "ade": 1037, + "ĊĊĊĊ": 1038, + "lock": 1039, + "const": 1040, + "100": 1041, + "ponse": 1042, + "Ġsup": 1043, + "++": 1044, + "date": 1045, + "Ġacc": 1046, + "Ġhad": 1047, + "Ġbu": 1048, + "200": 1049, + "ĠRe": 1050, + "Ġwere": 1051, + "Ġfile": 1052, + "Ġwould": 1053, + "ĠâĢľ": 1054, + "ven": 1055, + "iss": 1056, + "Ġour": 1057, + "class": 1058, + "raw": 1059, + "Ġyear": 1060, + "Data": 1061, + "Ġval": 1062, + "Ġsome": 1063, + "fter": 1064, + "ys": 1065, + "Ġ///": 1066, + "round": 1067, + "view": 1068, + "Ġpe": 1069, + "Ġthere": 1070, + "Ġsaid": 1071, + "du": 1072, + "of": 1073, + "line": 1074, + "/*": 1075, + "duct": 1076, + "Ġher": 1077, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1078, + "Res": 1079, + "Ġco": 1080, + "Ġcomm": 1081, + "ise": 1082, + "min": 1083, + "ĠĠĠĠĊ": 1084, + "#include": 1085, + "ethod": 1086, + ".P": 1087, + "ute": 1088, + "Ġass": 1089, + "Int": 1090, + "ask": 1091, + "loc": 1092, + "Ġlike": 1093, + "ody": 1094, + "Ġlet": 1095, + "load": 1096, + "Ġam": 1097, + "rol": 1098, + "Ġgr": 1099, + "yp": 1100, + "Ġalso": 1101, + "ĠIt": 1102, + "url": 1103, + "ific": 1104, + "ors": 1105, + "_P": 1106, + "_n": 1107, + "igh": 1108, + "Ġthan": 1109, + "Com": 1110, + "AN": 1111, + "UL": 1112, + "ating": 1113, + "17": 1114, + "ĠThis": 1115, + "ref": 1116, + "_S": 1117, + "Ġstatic": 1118, + "roll": 1119, + "Ġjust": 1120, + "Ġresult": 1121, + "ian": 1122, + "idth": 1123, + "Ġthem": 1124, + "));Ċ": 1125, + "der": 1126, + "reak": 1127, + "Con": 1128, + "://": 1129, + "ule": 1130, + "...": 1131, + "arch": 1132, + "ement": 1133, + "Ġ<<": 1134, + "50": 1135, + "ush": 1136, + "ense": 1137, + "arr": 1138, + "Ġinto": 1139, + "cess": 1140, + "amp": 1141, + "ied": 1142, + "ument": 1143, + "Ġ\\": 1144, + "],": 1145, + "wo": 1146, + "als": 1147, + "Ġwhat": 1148, + "anc": 1149, + "Value": 1150, + "='": 1151, + "olum": 1152, + "Ġpos": 1153, + "ages": 1154, + "ayer": 1155, + "Ġsc": 1156, + "ues": 1157, + "\")Ċ": 1158, + "_T": 1159, + "Ġlist": 1160, + "(s": 1161, + "Ġcase": 1162, + "Ch": 1163, + "ĉĉĉĉĉ": 1164, + "////////": 1165, + "ponent": 1166, + "Ġz": 1167, + "Ġkn": 1168, + "let": 1169, + "DE": 1170, + "red": 1171, + "Ġfe": 1172, + "Ġ},Ċ": 1173, + "Ġ,": 1174, + "(t": 1175, + "Ġfirst": 1176, + "');Ċ": 1177, + "word": 1178, + "Ġimport": 1179, + "Ġact": 1180, + "Ġchar": 1181, + "CT": 1182, + "ĠTr": 1183, + "ople": 1184, + "={": 1185, + "ĉf": 1186, + "24": 1187, + "ient": 1188, + "cent": 1189, + ".j": 1190, + "lection": 1191, + "))Ċ": 1192, + "Ġonly": 1193, + "Ġprint": 1194, + "mer": 1195, + ".W": 1196, + "ock": 1197, + "Ġ--": 1198, + "Text": 1199, + "Ġop": 1200, + "ank": 1201, + "Ġits": 1202, + "Ġback": 1203, + "[\"": 1204, + "Ġneed": 1205, + "Ġcl": 1206, + "Ġsub": 1207, + "Ġla": 1208, + "((": 1209, + ".\"": 1210, + "Object": 1211, + "Ġstart": 1212, + "file": 1213, + "(self": 1214, + "ner": 1215, + "ey": 1216, + "Ġuser": 1217, + "Ġent": 1218, + "ĠCom": 1219, + "its": 1220, + "ĠCon": 1221, + "ouble": 1222, + "ower": 1223, + "item": 1224, + "very": 1225, + "ĠWe": 1226, + "64": 1227, + "lick": 1228, + "ĠQ": 1229, + "php": 1230, + "ttp": 1231, + "':": 1232, + "ics": 1233, + "Ġunder": 1234, + "Ġ*Ċ": 1235, + ".L": 1236, + ");": 1237, + "ices": 1238, + "Ġreg": 1239, + ")čĊ": 1240, + "ĉpublic": 1241, + "SS": 1242, + "Ġthen": 1243, + "reat": 1244, + "ious": 1245, + ".G": 1246, + "ek": 1247, + "irect": 1248, + "heck": 1249, + "cript": 1250, + "ning": 1251, + "ĠUn": 1252, + "Ġmay": 1253, + "ĠWh": 1254, + "Bo": 1255, + "Item": 1256, + "struct": 1257, + ".st": 1258, + "ream": 1259, + "ible": 1260, + "loat": 1261, + "Ġorg": 1262, + "und": 1263, + "sum": 1264, + "_in": 1265, + "../": 1266, + "_M": 1267, + "Ġhow": 1268, + "rite": 1269, + "'Ċ": 1270, + "To": 1271, + "40": 1272, + "ww": 1273, + "Ġpeople": 1274, + "index": 1275, + ".n": 1276, + "http": 1277, + "(m": 1278, + "ector": 1279, + "Ġind": 1280, + "Ġjav": 1281, + "],Ċ": 1282, + "ĠHe": 1283, + "_st": 1284, + "ful": 1285, + "ole": 1286, + "){Ċ": 1287, + "Ġshould": 1288, + "opy": 1289, + "elp": 1290, + "ier": 1291, + "_name": 1292, + "erson": 1293, + "ION": 1294, + "ote": 1295, + "Ġtest": 1296, + "Ġbet": 1297, + "rror": 1298, + "ular": 1299, + "ãĢ": 1300, + "ĠÐ": 1301, + "bs": 1302, + "ting": 1303, + "Ġmake": 1304, + "Tr": 1305, + "Ġafter": 1306, + "arget": 1307, + "RO": 1308, + "olumn": 1309, + "rc": 1310, + "_re": 1311, + "define": 1312, + "22": 1313, + "Ġright": 1314, + "right": 1315, + "day": 1316, + "Ġlong": 1317, + "[]": 1318, + "(p": 1319, + "td": 1320, + "cond": 1321, + "ĠPro": 1322, + "Ġrem": 1323, + "ptions": 1324, + "vid": 1325, + ".g": 1326, + "Ġext": 1327, + "Ġ__": 1328, + "')Ċ": 1329, + "pace": 1330, + "mp": 1331, + "Ġmin": 1332, + "stance": 1333, + "air": 1334, + "action": 1335, + "wh": 1336, + "type": 1337, + "util": 1338, + "ait": 1339, + "": 1340, + "IC": 1341, + "text": 1342, + "Ġph": 1343, + "Ġfl": 1344, + ".M": 1345, + "ccess": 1346, + "br": 1347, + "fore": 1348, + "ersion": 1349, + "),Ċ": 1350, + ".re": 1351, + "ateg": 1352, + "Ġloc": 1353, + "ins": 1354, + "-s": 1355, + "trib": 1356, + "ĠInt": 1357, + "Ġarray": 1358, + ",\"": 1359, + "Pro": 1360, + "(c": 1361, + "ession": 1362, + ">ĊĊ": 1363, + "Ġshe": 1364, + "\"]": 1365, + "aph": 1366, + "Ġexp": 1367, + "erty": 1368, + "ĠSe": 1369, + "Ġpar": 1370, + "unc": 1371, + "ET": 1372, + "Ġread": 1373, + "print": 1374, + "Ġrel": 1375, + "Ġform": 1376, + "Ġdr": 1377, + "Exception": 1378, + "input": 1379, + "Ġtrans": 1380, + "########": 1381, + "order": 1382, + "By": 1383, + "Ġaw": 1384, + "ities": 1385, + "uff": 1386, + "play": 1387, + ".add": 1388, + "ĠâĢĵ": 1389, + "Ġwant": 1390, + "Ġcomp": 1391, + "ments": 1392, + "Ġ||": 1393, + "az": 1394, + "be": 1395, + "Ġnumber": 1396, + "Ġrequire": 1397, + "ĠEx": 1398, + "60": 1399, + "Ġcol": 1400, + "Ġkey": 1401, + "ember": 1402, + "Ġtwo": 1403, + "Ġsize": 1404, + "Ġwhere": 1405, + "UT": 1406, + "result": 1407, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1408, + "ough": 1409, + "orld": 1410, + "ood": 1411, + "uch": 1412, + "ative": 1413, + "ger": 1414, + "arent": 1415, + "Ġ/*": 1416, + "Ġarg": 1417, + "Ġwhile": 1418, + "23": 1419, + "(this": 1420, + "Ġrec": 1421, + "Ġdif": 1422, + "State": 1423, + "Ġspec": 1424, + "ride": 1425, + "_F": 1426, + "Ġlook": 1427, + "AM": 1428, + "ility": 1429, + "eter": 1430, + "âĢĻt": 1431, + "ĊĊĊ": 1432, + "ayout": 1433, + "--------------------------------": 1434, + "ager": 1435, + "Ġcould": 1436, + "Ġbr": 1437, + "ends": 1438, + "ures": 1439, + "Ġknow": 1440, + "ets": 1441, + "ĠIf": 1442, + "ĠSh": 1443, + ".w": 1444, + "back": 1445, + "Ġser": 1446, + "Ġ+=": 1447, + "Ġfr": 1448, + "());Ċ": 1449, + "Ġhand": 1450, + "Ind": 1451, + "ULL": 1452, + "Im": 1453, + "();ĊĊ": 1454, + "Ġmost": 1455, + "Ġtry": 1456, + "Ġnow": 1457, + "rough": 1458, + ">čĊ": 1459, + "ackage": 1460, + "Ġhim": 1461, + "._": 1462, + "ify": 1463, + "Ġbreak": 1464, + "Ġ);Ċ": 1465, + "ren": 1466, + "#define": 1467, + "itt": 1468, + "Ġap": 1469, + "ĉc": 1470, + "(n": 1471, + "ĠYou": 1472, + ":ĊĊ": 1473, + "-m": 1474, + "Ġevery": 1475, + "ustom": 1476, + "lient": 1477, + "ocument": 1478, + "cription": 1479, + "Error": 1480, + "-b": 1481, + "о": 1482, + "][": 1483, + "99": 1484, + "trans": 1485, + "Ġpoint": 1486, + "Ġstd": 1487, + "Ġfil": 1488, + "Time": 1489, + "80": 1490, + "Ġmod": 1491, + "Ġ->": 1492, + "Ġerror": 1493, + "ah": 1494, + "Ġtext": 1495, + "roller": 1496, + "lose": 1497, + "ql": 1498, + "Ġpol": 1499, + ">": 1500, + "Ġshow": 1501, + "User": 1502, + "ased": 1503, + "Ġ{ĊĊ": 1504, + "Ġfind": 1505, + "а": 1506, + "ED": 1507, + "span": 1508, + "enu": 1509, + "Ġcurrent": 1510, + "Ġused": 1511, + "cept": 1512, + "clud": 1513, + "Ġplay": 1514, + "Ġlog": 1515, + "ution": 1516, + "fl": 1517, + "Ġsee": 1518, + "indows": 1519, + "Ġhelp": 1520, + "Ġthese": 1521, + "Ġpass": 1522, + "Ġdown": 1523, + "Ġeven": 1524, + "ason": 1525, + "uild": 1526, + "from": 1527, + "(d": 1528, + "Ġbl": 1529, + "label": 1530, + "else": 1531, + "е": 1532, + "Ġ(!": 1533, + "ized": 1534, + "(),": 1535, + "Ġob": 1536, + "Ġitem": 1537, + "ump": 1538, + "UR": 1539, + "orn": 1540, + "Ġdon": 1541, + "Se": 1542, + "man": 1543, + "27": 1544, + "ample": 1545, + "tn": 1546, + "================": 1547, + "He": 1548, + "gram": 1549, + "Ġdid": 1550, + "wn": 1551, + "_h": 1552, + "iver": 1553, + "Ġsm": 1554, + "Ġthrough": 1555, + "ĠAn": 1556, + "che": 1557, + "Ġinv": 1558, + "ouse": 1559, + "Ġes": 1560, + "ĠNew": 1561, + "export": 1562, + "mary": 1563, + "uto": 1564, + "ler": 1565, + "Ġlast": 1566, + "Ġevent": 1567, + "try": 1568, + "ï¼": 1569, + "ily": 1570, + "igned": 1571, + "ines": 1572, + "ollow": 1573, + "icense": 1574, + "sole": 1575, + "lear": 1576, + "(int": 1577, + "Ġagain": 1578, + "Ġhigh": 1579, + "html": 1580, + "Index": 1581, + "uthor": 1582, + "Ġ/**Ċ": 1583, + "Ġline": 1584, + "Event": 1585, + "_D": 1586, + "Ġdoes": 1587, + "itial": 1588, + "Ġcr": 1589, + "ars": 1590, + "28": 1591, + "Ġtem": 1592, + "cause": 1593, + "face": 1594, + "Ġ`": 1595, + "_A": 1596, + "Button": 1597, + "ature": 1598, + "ected": 1599, + "ES": 1600, + "ister": 1601, + "ĉĊ": 1602, + "Ġbefore": 1603, + "ale": 1604, + "other": 1605, + "Ġbecause": 1606, + "roid": 1607, + "Ġed": 1608, + "ik": 1609, + "reg": 1610, + "ĠDe": 1611, + "Ġdist": 1612, + "},Ċ": 1613, + "Ġstate": 1614, + "Ġcons": 1615, + "rint": 1616, + "att": 1617, + "Ġhere": 1618, + "ined": 1619, + "Ġfinal": 1620, + "Ġ\"\"": 1621, + "Key": 1622, + "LO": 1623, + "Ġdel": 1624, + "pty": 1625, + "thing": 1626, + "26": 1627, + "ĠAnd": 1628, + "Ġrun": 1629, + "ĠX": 1630, + "ym": 1631, + ".app": 1632, + "Ġvery": 1633, + "ces": 1634, + "_N": 1635, + "ared": 1636, + "ward": 1637, + "list": 1638, + "ited": 1639, + "olog": 1640, + "itch": 1641, + "Box": 1642, + "ife": 1643, + "33": 1644, + "Ġac": 1645, + "Ġmodel": 1646, + "Ġmon": 1647, + "Ġway": 1648, + "lete": 1649, + "Ġcall": 1650, + "Ġatt": 1651, + "Ġcal": 1652, + "vert": 1653, + "Ġdec": 1654, + "lease": 1655, + "oun": 1656, + "Ġ});Ċ": 1657, + "fr": 1658, + "formation": 1659, + "etail": 1660, + "Ġnum": 1661, + "aj": 1662, + "query": 1663, + "Ġwell": 1664, + "Ġobject": 1665, + "ĠAs": 1666, + "Ġyears": 1667, + "Color": 1668, + "IS": 1669, + "Ġdefault": 1670, + "Wh": 1671, + "Ġins": 1672, + "aint": 1673, + "Ġjava": 1674, + "Ġsim": 1675, + "ĠAr": 1676, + "mon": 1677, + "til": 1678, + "();čĊ": 1679, + "):": 1680, + "Set": 1681, + "29": 1682, + "atter": 1683, + "Ġview": 1684, + "Ġpres": 1685, + "array": 1686, + "We": 1687, + "At": 1688, + "Ġbel": 1689, + "Ġmany": 1690, + "21": 1691, + "Man": 1692, + "ender": 1693, + "Ġbeing": 1694, + "Ġgood": 1695, + "ĉĉĉĉĉĉ": 1696, + "ational": 1697, + "ware": 1698, + ".log": 1699, + "{čĊ": 1700, + "Ġusing": 1701, + "_B": 1702, + "Ġ:=": 1703, + "_w": 1704, + "ists": 1705, + "lish": 1706, + "Ġstud": 1707, + "ĠAl": 1708, + "Ġgu": 1709, + "config": 1710, + "uring": 1711, + "time": 1712, + "oken": 1713, + "amespace": 1714, + "Ġrequest": 1715, + "Ġchild": 1716, + "ĠÃ": 1717, + "lob": 1718, + "Ġparam": 1719, + "Ġ}čĊ": 1720, + "01": 1721, + "Ġecho": 1722, + "function": 1723, + "********************************": 1724, + "ps": 1725, + "Element": 1726, + "alk": 1727, + "lication": 1728, + "by": 1729, + "Size": 1730, + "rawing": 1731, + "Ġperson": 1732, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1733, + "\\n": 1734, + "object": 1735, + "ince": 1736, + "En": 1737, + "File": 1738, + "uf": 1739, + "ffect": 1740, + "AC": 1741, + "Ġstyle": 1742, + "summary": 1743, + "Ġque": 1744, + "_r": 1745, + "Ġ($": 1746, + "Model": 1747, + "ident": 1748, + "Ġmethod": 1749, + "IL": 1750, + "ott": 1751, + "less": 1752, + "ING": 1753, + "Ġ()": 1754, + "Ġexpect": 1755, + "ync": 1756, + "package": 1757, + "35": 1758, + "urs": 1759, + "Ġprot": 1760, + "./": 1761, + "pre": 1762, + "Ġ)Ċ": 1763, + "ma": 1764, + "Ġsur": 1765, + "Ġfound": 1766, + "Info": 1767, + "par": 1768, + "imes": 1769, + ".e": 1770, + "ains": 1771, + "Ġpost": 1772, + "-d": 1773, + "45": 1774, + "olean": 1775, + "Ġsl": 1776, + "PE": 1777, + "Ġsuch": 1778, + "select": 1779, + "ainer": 1780, + "Ġthink": 1781, + "Ġdiffer": 1782, + ".r": 1783, + "/**Ċ": 1784, + "FF": 1785, + "ool": 1786, + "plate": 1787, + "qual": 1788, + "ĠFor": 1789, + "Ġmuch": 1790, + "uc": 1791, + "(new": 1792, + "odule": 1793, + "Ġsom": 1794, + "Ġhttp": 1795, + "ĠList": 1796, + "Ġcount": 1797, + "Ġinst": 1798, + "char": 1799, + "mit": 1800, + ".id": 1801, + "aking": 1802, + "Ġgener": 1803, + "px": 1804, + "vice": 1805, + "37": 1806, + "_data": 1807, + "ĠNULL": 1808, + "}čĊ": 1809, + "idd": 1810, + "ãĢĤ": 1811, + "Ġmed": 1812, + "org": 1813, + "ider": 1814, + "ache": 1815, + "work": 1816, + "Ġcheck": 1817, + "ween": 1818, + "Ġ((": 1819, + "the": 1820, + "ants": 1821, + "><": 1822, + ".B": 1823, + "-c": 1824, + "Ġopen": 1825, + "Ġest": 1826, + "ĠĠĠĠĠĠĠĠĊ": 1827, + "Ġnext": 1828, + "IM": 1829, + "ÑĤ": 1830, + "OT": 1831, + "ó": 1832, + "Ġfollow": 1833, + "content": 1834, + "ĠĠĠĠĠĠĠĠĠĠĠĠ": 1835, + "Ġinclud": 1836, + "HE": 1837, + "ĠRes": 1838, + "Ġhref": 1839, + "и": 1840, + "Ġcar": 1841, + "ypes": 1842, + "image": 1843, + "Un": 1844, + "Ġbool": 1845, + "AD": 1846, + "Ġgame": 1847, + ".Form": 1848, + "rows": 1849, + "*/": 1850, + "velop": 1851, + ".Drawing": 1852, + "Ġpath": 1853, + "ision": 1854, + "Ġeach": 1855, + "ĠPl": 1856, + "_type": 1857, + "Path": 1858, + "nection": 1859, + "Ġav": 1860, + "').": 1861, + "Ġsupport": 1862, + "ENT": 1863, + "rem": 1864, + "\").": 1865, + "Ġown": 1866, + "Ġcor": 1867, + "count": 1868, + "miss": 1869, + "ually": 1870, + "Ġmem": 1871, + "std": 1872, + "ience": 1873, + "search": 1874, + "\"ĊĊ": 1875, + "Form": 1876, + "Ġsex": 1877, + "ename": 1878, + "Ġsign": 1879, + "Ġet": 1880, + "ĠĠĠĠĠĠĠĠĠĠ": 1881, + "','": 1882, + "ĠApp": 1883, + "Ġthose": 1884, + "off": 1885, + "Ġerr": 1886, + "Ġsystem": 1887, + "Ġbest": 1888, + "code": 1889, + "Ġsame": 1890, + "Ġdi": 1891, + "uss": 1892, + "Ġcreate": 1893, + "ather": 1894, + "Array": 1895, + ".in": 1896, + "fe": 1897, + "Service": 1898, + "UN": 1899, + "ats": 1900, + "ĠZ": 1901, + "alth": 1902, + "Ġmade": 1903, + "true": 1904, + "AB": 1905, + "Ġmark": 1906, + "rid": 1907, + "ified": 1908, + ",čĊ": 1909, + "yn": 1910, + "press": 1911, + "Ġgroup": 1912, + "Ġfin": 1913, + "ĠLicense": 1914, + "Field": 1915, + "eger": 1916, + "Ġworld": 1917, + "iness": 1918, + "ty": 1919, + "Ġprocess": 1920, + "(b": 1921, + "Ġcre": 1922, + "arn": 1923, + "ives": 1924, + "Ġmain": 1925, + "ideo": 1926, + "36": 1927, + "_g": 1928, + "AG": 1929, + "valid": 1930, + "img": 1931, + "PI": 1932, + "Ġcolor": 1933, + "Ġreport": 1934, + "Ġtake": 1935, + "rib": 1936, + "OM": 1937, + "Ġday": 1938, + "Request": 1939, + "Ġsk": 1940, + "bers": 1941, + "ĉs": 1942, + ".Add": 1943, + "oot": 1944, + "Image": 1945, + "Ġcomple": 1946, + "ollection": 1947, + "Ġtop": 1948, + "Ġfree": 1949, + "AS": 1950, + "De": 1951, + "ĠOn": 1952, + "IG": 1953, + "90": 1954, + "eta": 1955, + "Date": 1956, + "Ġaction": 1957, + "34": 1958, + "Over": 1959, + "itor": 1960, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1961, + "not": 1962, + "Ġindex": 1963, + "her": 1964, + "icon": 1965, + "On": 1966, + ";čĊčĊ": 1967, + "ivity": 1968, + "mand": 1969, + ".Windows": 1970, + "OL": 1971, + "Ġreal": 1972, + "Ġmax": 1973, + "land": 1974, + "....": 1975, + "raph": 1976, + "Ġbuild": 1977, + "leg": 1978, + "assword": 1979, + "?ĊĊ": 1980, + "â̦": 1981, + "ook": 1982, + "uck": 1983, + "Ġmessage": 1984, + "test": 1985, + "ivers": 1986, + "38": 1987, + "Ġinput": 1988, + "Ġart": 1989, + "Ġbetween": 1990, + "Get": 1991, + "enter": 1992, + "ground": 1993, + "ene": 1994, + "á": 1995, + ".length": 1996, + "Node": 1997, + "(i": 1998, + "Class": 1999, + "for": 2000, + "ĠâĢĶ": 2001, + "ten": 2002, + "oin": 2003, + "Ġke": 2004, + "ui": 2005, + "ĠIN": 2006, + "Ġtable": 2007, + "sub": 2008, + "ĠLe": 2009, + "Ġhead": 2010, + "Ġmust": 2011, + "////////////////": 2012, + ".util": 2013, + "Context": 2014, + "Ġorder": 2015, + "Ġmov": 2016, + "over": 2017, + "Ġcontin": 2018, + "Ġsay": 2019, + "static": 2020, + ".Text": 2021, + "ĠclassName": 2022, + "pany": 2023, + "Ġter": 2024, + "head": 2025, + "rg": 2026, + "Ġproduct": 2027, + "This": 2028, + ".âĢĿ": 2029, + "ĠBut": 2030, + "70": 2031, + "loy": 2032, + "Ġdouble": 2033, + "sg": 2034, + "Ġplace": 2035, + ".x": 2036, + "message": 2037, + "Ġinformation": 2038, + "private": 2039, + "Ġoper": 2040, + "ced": 2041, + "db": 2042, + "\">": 2043, + "Param": 2044, + "icle": 2045, + "Ġweek": 2046, + "Ġprop": 2047, + "table": 2048, + "idget": 2049, + "place": 2050, + "Prop": 2051, + "ĠAll": 2052, + "els": 2053, + "box": 2054, + ".ĊĊĊĊ": 2055, + ".R": 2056, + "ĠTo": 2057, + "iter": 2058, + "Sh": 2059, + "uration": 2060, + "older": 2061, + "_list": 2062, + "come": 2063, + "Ġsw": 2064, + "ization": 2065, + "ĉfor": 2066, + "bl": 2067, + "Ġprogram": 2068, + "(e": 2069, + "ape": 2070, + "check": 2071, + ".Forms": 2072, + "Ġund": 2073, + "ategory": 2074, + "75": 2075, + "ags": 2076, + "Ġresponse": 2077, + "US": 2078, + "request": 2079, + "Ġstruct": 2080, + "escription": 2081, + "Ġcode": 2082, + "_H": 2083, + "uffer": 2084, + "Ġwithout": 2085, + "lobal": 2086, + "Manager": 2087, + "ilter": 2088, + "PO": 2089, + "ĉthis": 2090, + "option": 2091, + "Ġsol": 2092, + "Ġ===": 2093, + "akes": 2094, + "Controller": 2095, + "44": 2096, + "Message": 2097, + "Ġref": 2098, + "ever": 2099, + "ĠSo": 2100, + "aining": 2101, + ".append": 2102, + "Ġstill": 2103, + "Ġprovid": 2104, + "Ġassert": 2105, + "med": 2106, + "Ġcap": 2107, + "usiness": 2108, + "Ġrep": 2109, + "tings": 2110, + "ved": 2111, + ".N": 2112, + "api": 2113, + "OD": 2114, + "Ġfield": 2115, + "iven": 2116, + "oto": 2117, + "âĢľ": 2118, + "col": 2119, + "(x": 2120, + "ght": 2121, + "Result": 2122, + "Code": 2123, + ".is": 2124, + "link": 2125, + "Ġcour": 2126, + "An": 2127, + "Ġteam": 2128, + "ĉint": 2129, + "ift": 2130, + "55": 2131, + "Ġsecond": 2132, + "Ġgoing": 2133, + "Ġrange": 2134, + "_E": 2135, + "ness": 2136, + "39": 2137, + "Ġfam": 2138, + "Ġnil": 2139, + "ĠCont": 2140, + "ailable": 2141, + "utes": 2142, + "atab": 2143, + "Ġfact": 2144, + "Ġvis": 2145, + "(&": 2146, + "ĠAN": 2147, + "31": 2148, + "Al": 2149, + "title": 2150, + "Ġandroid": 2151, + "CE": 2152, + "\\\"": 2153, + "irt": 2154, + "Ġwrit": 2155, + "н": 2156, + "ĉm": 2157, + "ftware": 2158, + "ond": 2159, + "Ġret": 2160, + "osition": 2161, + "Ġhome": 2162, + "Ġleft": 2163, + "args": 2164, + "meric": 2165, + "48": 2166, + "Ġdirect": 2167, + "oci": 2168, + "Pl": 2169, + "As": 2170, + "ret": 2171, + "ado": 2172, + "Of": 2173, + "chn": 2174, + "ĠGet": 2175, + "ee": 2176, + "ross": 2177, + "();": 2178, + "____": 2179, + ".ph": 2180, + "It": 2181, + "oute": 2182, + "Ġexper": 2183, + "chool": 2184, + "www": 2185, + "},": 2186, + "Ġallow": 2187, + "ĠÂ": 2188, + "())": 2189, + "size": 2190, + "ism": 2191, + "ai": 2192, + "tract": 2193, + "ane": 2194, + "...ĊĊ": 2195, + "context": 2196, + "Ġbeg": 2197, + "CH": 2198, + "Ġpage": 2199, + "hip": 2200, + "no": 2201, + "core": 2202, + "sp": 2203, + "Ġdifferent": 2204, + "iable": 2205, + "ĠMe": 2206, + "_IN": 2207, + "button": 2208, + "ĠIs": 2209, + "ervices": 2210, + "Ġca": 2211, + "Ġaround": 2212, + "App": 2213, + "ration": 2214, + "Ġrece": 2215, + "Ġreally": 2216, + "Ġimage": 2217, + "Ġtarget": 2218, + "Ġdep": 2219, + "opyright": 2220, + "tra": 2221, + "ingle": 2222, + "ital": 2223, + "Layout": 2224, + "Ġboth": 2225, + "Override": 2226, + "arm": 2227, + "=>": 2228, + "aterial": 2229, + "iled": 2230, + "Ġput": 2231, + "Qu": 2232, + "ÑĢ": 2233, + "ung": 2234, + "map": 2235, + "ĉĉĉĉĉĉĉĉ": 2236, + "Ġlevel": 2237, + "Component": 2238, + "book": 2239, + "creen": 2240, + "_RE": 2241, + "Ġconfig": 2242, + "ãģ": 2243, + "Or": 2244, + ".data": 2245, + "Ġdocument": 2246, + "\",\"": 2247, + "tribute": 2248, + "ux": 2249, + "Log": 2250, + "ference": 2251, + "post": 2252, + "_e": 2253, + "Ġlocal": 2254, + "andom": 2255, + "assert": 2256, + "Val": 2257, + "lected": 2258, + "ina": 2259, + "atabase": 2260, + "Add": 2261, + "Ġcontent": 2262, + ".print": 2263, + "signed": 2264, + "ric": 2265, + ".\"ĊĊ": 2266, + "Ġfa": 2267, + "!ĊĊ": 2268, + "-f": 2269, + "ived": 2270, + "Ġquest": 2271, + ".ex": 2272, + "Ġfloat": 2273, + "Ġdevelop": 2274, + "оÐ": 2275, + "Map": 2276, + "ading": 2277, + "Ġposs": 2278, + "UE": 2279, + "namespace": 2280, + "_O": 2281, + "ĉb": 2282, + ".Get": 2283, + ">(": 2284, + "json": 2285, + "etails": 2286, + "66": 2287, + "Ġtoo": 2288, + "Ġextends": 2289, + "ĠNone": 2290, + "Ġfore": 2291, + "(String": 2292, + "format": 2293, + "Ġgreat": 2294, + "inter": 2295, + "cale": 2296, + "Ñģ": 2297, + "ron": 2298, + "iving": 2299, + "Ent": 2300, + "ency": 2301, + "xt": 2302, + "oy": 2303, + "05": 2304, + "Ġmonth": 2305, + "Ġhapp": 2306, + "Ġsuper": 2307, + "bar": 2308, + "default": 2309, + "_de": 2310, + "ords": 2311, + "ln": 2312, + "({Ċ": 2313, + "ĠInd": 2314, + "ases": 2315, + "Ġtitle": 2316, + "Ġcontext": 2317, + "08": 2318, + "oh": 2319, + "-p": 2320, + "Em": 2321, + "Ġmet": 2322, + "Test": 2323, + "Ġlife": 2324, + "_v": 2325, + "ĠUS": 2326, + "UI": 2327, + "ocation": 2328, + "md": 2329, + "Ġ[Ċ": 2330, + "Ġ]": 2331, + "sw": 2332, + "Ġincre": 2333, + "script": 2334, + "ential": 2335, + "ways": 2336, + ".de": 2337, + "Ġsrc": 2338, + "Ġcatch": 2339, + "ĠAmeric": 2340, + "//Ċ": 2341, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 2342, + "Ġpay": 2343, + "plit": 2344, + "âĢĶ": 2345, + "Ġcoun": 2346, + "obj": 2347, + ".php": 2348, + "Ġchange": 2349, + "ething": 2350, + "'re": 2351, + "aster": 2352, + "los": 2353, + "lation": 2354, + "ĠĠĊ": 2355, + "Le": 2356, + "ä": 2357, + "({": 2358, + "ready": 2359, + "ĠNo": 2360, + "Ġposition": 2361, + "Ġold": 2362, + "Ġbook": 2363, + "abled": 2364, + "bug": 2365, + "202": 2366, + "Hand": 2367, + "};ĊĊ": 2368, + "isplay": 2369, + "aving": 2370, + "04": 2371, + "Ġgover": 2372, + "Ġversion": 2373, + "System": 2374, + "nect": 2375, + "response": 2376, + "Style": 2377, + "Up": 2378, + "angu": 2379, + "Ġthree": 2380, + "init": 2381, + "ero": 2382, + "Ġlaw": 2383, + "endif": 2384, + "Ġbase": 2385, + "email": 2386, + "(l": 2387, + "_V": 2388, + "Ġconf": 2389, + "ATE": 2390, + "Ġduring": 2391, + "tes": 2392, + "Ġconsole": 2393, + "ĠPr": 2394, + "Ġspe": 2395, + "ves": 2396, + "65": 2397, + "path": 2398, + "ialog": 2399, + "dition": 2400, + "_to": 2401, + "ards": 2402, + "Ġagainst": 2403, + "etwork": 2404, + "ĠPh": 2405, + "_L": 2406, + "cur": 2407, + "imit": 2408, + "With": 2409, + "Ġpower": 2410, + "ium": 2411, + "';ĊĊ": 2412, + "Ġwom": 2413, + "left": 2414, + "ources": 2415, + "atri": 2416, + "ĠIm": 2417, + "ĠMan": 2418, + "orth": 2419, + "${": 2420, + "88": 2421, + "quals": 2422, + "ese": 2423, + "_size": 2424, + "Ġiss": 2425, + "otal": 2426, + "-g": 2427, + "ique": 2428, + "rame": 2429, + "Ġwidth": 2430, + "erg": 2431, + ")(": 2432, + "ittle": 2433, + "TR": 2434, + "ĠThey": 2435, + "ences": 2436, + "02": 2437, + "rl": 2438, + "ons": 2439, + "Ġlabel": 2440, + ".y": 2441, + "-t": 2442, + "update": 2443, + "anel": 2444, + "sc": 2445, + ".to": 2446, + "Ġproject": 2447, + "ü": 2448, + "Ġelement": 2449, + "Ġsuccess": 2450, + "ĉĉĊ": 2451, + ".sh": 2452, + "ram": 2453, + "ched": 2454, + "())Ċ": 2455, + "Ġ(Ċ": 2456, + "Ġdate": 2457, + "Ġtot": 2458, + "_ST": 2459, + "All": 2460, + "ification": 2461, + "ĉvar": 2462, + "Ġtri": 2463, + "chem": 2464, + "my": 2465, + "Ġbig": 2466, + "ĠAd": 2467, + "ĠAt": 2468, + "ots": 2469, + "num": 2470, + "Act": 2471, + "Ġmap": 2472, + "era": 2473, + "cope": 2474, + ".$": 2475, + ",âĢĿ": 2476, + "Ġpop": 2477, + "Ġfew": 2478, + "Ġlen": 2479, + "uid": 2480, + "eters": 2481, + "ules": 2482, + "ÃŃ": 2483, + "source": 2484, + "https": 2485, + "Ġdem": 2486, + "Ġear": 2487, + "################": 2488, + "Ġmatch": 2489, + "ories": 2490, + "49": 2491, + "aces": 2492, + "ĠCl": 2493, + "Ġnode": 2494, + "78": 2495, + "irc": 2496, + "local": 2497, + "unity": 2498, + "};Ċ": 2499, + "Ġanother": 2500, + "<<": 2501, + "ogle": 2502, + "Ġsit": 2503, + "ework": 2504, + "TE": 2505, + ".I": 2506, + "NS": 2507, + "ology": 2508, + "ought": 2509, + ".Cont": 2510, + ">>": 2511, + "Ġcare": 2512, + "state": 2513, + "ĉprivate": 2514, + "Ġeffect": 2515, + "++)": 2516, + "_file": 2517, + "ending": 2518, + "Line": 2519, + "For": 2520, + "ior": 2521, + "ĠSc": 2522, + "Ġfun": 2523, + ".Size": 2524, + "ĉelse": 2525, + "])": 2526, + "start": 2527, + "vious": 2528, + "Ġ},": 2529, + "ours": 2530, + "Ġleg": 2531, + "Ġservice": 2532, + "Ġsince": 2533, + "iron": 2534, + "Label": 2535, + "Ġnon": 2536, + "Ġlos": 2537, + "iction": 2538, + "Ġfull": 2539, + "acter": 2540, + "board": 2541, + "gress": 2542, + "Ġturn": 2543, + "ither": 2544, + "09": 2545, + ".size": 2546, + "Ġbody": 2547, + "resh": 2548, + "eturn": 2549, + "199": 2550, + "(_": 2551, + "yles": 2552, + "ormal": 2553, + "pi": 2554, + "Ġsomething": 2555, + "!--": 2556, + "uint": 2557, + "Ġprodu": 2558, + "Ġstand": 2559, + "Ġproble": 2560, + "Ġavailable": 2561, + "mt": 2562, + "ĠBl": 2563, + "Ġ...": 2564, + "Ġblock": 2565, + "Input": 2566, + "Ġkeep": 2567, + "Count": 2568, + "open": 2569, + "Ġ['": 2570, + "Ġthrow": 2571, + "uilder": 2572, + "Action": 2573, + "Ġthings": 2574, + "True": 2575, + "Ġurl": 2576, + "ĠBo": 2577, + "printf": 2578, + "Ġred": 2579, + "js": 2580, + ".create": 2581, + "ĠOr": 2582, + "Status": 2583, + "Instance": 2584, + "Ġcontrol": 2585, + "Ġcome": 2586, + "Ġcustom": 2587, + "location": 2588, + "07": 2589, + "model": 2590, + "ĠčĊ": 2591, + "Ġsource": 2592, + "Ġeas": 2593, + ".out": 2594, + "]ĊĊ": 2595, + "oney": 2596, + "Ġawait": 2597, + "Ġpartic": 2598, + "AP": 2599, + "ublish": 2600, + "odes": 2601, + "_pro": 2602, + "ply": 2603, + "riter": 2604, + "Ġprov": 2605, + "Ġmill": 2606, + "HT": 2607, + "])Ċ": 2608, + "Ġchang": 2609, + "Ġask": 2610, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 2611, + "Ġoutput": 2612, + "Ġemail": 2613, + "68": 2614, + ".push": 2615, + "Ġ}čĊčĊ": 2616, + "ination": 2617, + "47": 2618, + "atrix": 2619, + "Table": 2620, + "uccess": 2621, + "]);Ċ": 2622, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 2623, + "Ġdisc": 2624, + "([": 2625, + "Ġbusiness": 2626, + "height": 2627, + ".html": 2628, + "ta": 2629, + "field": 2630, + "Ġrequired": 2631, + "_R": 2632, + "Ġgovern": 2633, + "}čĊčĊ": 2634, + "lex": 2635, + "500": 2636, + ".,": 2637, + "ĠSet": 2638, + "urch": 2639, + "///": 2640, + "ts": 2641, + "af": 2642, + "Ġmight": 2643, + "istory": 2644, + "Str": 2645, + "Ġnever": 2646, + "Response": 2647, + "arse": 2648, + "ada": 2649, + "ĠHow": 2650, + "Ġ*)": 2651, + "Ġ;": 2652, + "Ġhard": 2653, + "Ad": 2654, + "Ġintern": 2655, + "used": 2656, + "(data": 2657, + "mod": 2658, + "annel": 2659, + "Ġnp": 2660, + "ugg": 2661, + "Ġ/>Ċ": 2662, + "Ġcalled": 2663, + "body": 2664, + "Ġcho": 2665, + "(r": 2666, + "_set": 2667, + "ird": 2668, + "Ġ>=": 2669, + "Ġ};Ċ": 2670, + "Ġoptions": 2671, + "ĠGener": 2672, + "Ġheight": 2673, + "Point": 2674, + "You": 2675, + "ety": 2676, + "Click": 2677, + "Ġsmall": 2678, + "Ġide": 2679, + "Ġaccess": 2680, + "anguage": 2681, + "Ġprotected": 2682, + "Ġjob": 2683, + "ĠThere": 2684, + "Def": 2685, + "Ġaddress": 2686, + "Ġuint": 2687, + "Not": 2688, + "oo": 2689, + "aps": 2690, + "
&": 5909, + "CON": 5910, + "Ġrepl": 5911, + "Ġregular": 5912, + "Storage": 5913, + "ramework": 5914, + "Ġgoal": 5915, + "Ġtouch": 5916, + ".widget": 5917, + "Ġbuilt": 5918, + "des": 5919, + "Part": 5920, + "(re": 5921, + "Ġworth": 5922, + "hib": 5923, + "game": 5924, + "91": 5925, + "192": 5926, + "Ġв": 5927, + "acion": 5928, + "ĠWhite": 5929, + "(type": 5930, + "(`": 5931, + "81": 5932, + "Ġnatural": 5933, + "Ġinj": 5934, + "Ġcalcul": 5935, + "ĠApril": 5936, + ".List": 5937, + "Ġassociated": 5938, + "ĉSystem": 5939, + "~~": 5940, + "=[": 5941, + "Ġstorage": 5942, + "Ġbytes": 5943, + "Ġtravel": 5944, + "Ġsou": 5945, + "Ġpassed": 5946, + "!=": 5947, + "ascript": 5948, + ".open": 5949, + "Ġgrid": 5950, + "Ġbus": 5951, + "Ġrecogn": 5952, + "Ab": 5953, + "Ġhon": 5954, + "ĠCenter": 5955, + "Ġprec": 5956, + "build": 5957, + "73": 5958, + "HTML": 5959, + "ĠSan": 5960, + "Ġcountries": 5961, + "aled": 5962, + "token": 5963, + "kt": 5964, + "Ġqual": 5965, + "Last": 5966, + "adow": 5967, + "Ġmanufact": 5968, + "idad": 5969, + "jango": 5970, + "Next": 5971, + "xf": 5972, + ".a": 5973, + "Ġporno": 5974, + "ĠPM": 5975, + "erve": 5976, + "iting": 5977, + "_th": 5978, + "ci": 5979, + "=None": 5980, + "gs": 5981, + "Ġlogin": 5982, + "atives": 5983, + "']);Ċ": 5984, + "Äħ": 5985, + "Ġill": 5986, + "IA": 5987, + "children": 5988, + "DO": 5989, + "Ġlevels": 5990, + "Ġ{{": 5991, + "Ġlooks": 5992, + "Ġ\"#": 5993, + "ToString": 5994, + "Ġnecessary": 5995, + "ĠĠĠĊ": 5996, + "cell": 5997, + "Entry": 5998, + "Ġ'#": 5999, + "Ġextrem": 6000, + "Selector": 6001, + "Ġplaceholder": 6002, + "Load": 6003, + "Ġreleased": 6004, + "ORE": 6005, + "Enumer": 6006, + "ĠTV": 6007, + "SET": 6008, + "inq": 6009, + "Press": 6010, + "ĠDepartment": 6011, + "Ġproperties": 6012, + "Ġrespond": 6013, + "Search": 6014, + "ael": 6015, + "Ġrequ": 6016, + "ĠBook": 6017, + "/Ċ": 6018, + "(st": 6019, + "Ġfinancial": 6020, + "icket": 6021, + "_input": 6022, + "Ġthreat": 6023, + "(in": 6024, + "Strip": 6025, + "ìĿ": 6026, + "ção": 6027, + "71": 6028, + "Ġevidence": 6029, + "));": 6030, + "ĠBro": 6031, + "Ġ[];Ċ": 6032, + "Ġou": 6033, + "buf": 6034, + "Script": 6035, + "dat": 6036, + "Ġrule": 6037, + "#import": 6038, + "=\"/": 6039, + "Serial": 6040, + "Ġstarting": 6041, + "[index": 6042, + "ae": 6043, + "Ġcontrib": 6044, + "session": 6045, + "_new": 6046, + "utable": 6047, + "ober": 6048, + "Ġ\"./": 6049, + "Ġlogger": 6050, + "Ġrecently": 6051, + "Ġreturned": 6052, + "ččĊ": 6053, + ")))Ċ": 6054, + "itions": 6055, + "Ġseek": 6056, + "Ġcommunic": 6057, + "Ġ\".": 6058, + "Ġusername": 6059, + "ECT": 6060, + "DS": 6061, + "Ġotherwise": 6062, + "ĠGerman": 6063, + ".aw": 6064, + "Adapter": 6065, + "ixel": 6066, + "Ġsystems": 6067, + "Ġdrop": 6068, + "83": 6069, + "Ġstructure": 6070, + "Ġ$(\"#": 6071, + "encies": 6072, + "anning": 6073, + "ĠLink": 6074, + "ĠResponse": 6075, + "Ġstri": 6076, + "ż": 6077, + "ĠDB": 6078, + "æĹ": 6079, + "android": 6080, + "submit": 6081, + "otion": 6082, + "92": 6083, + "(@": 6084, + ".test": 6085, + "82": 6086, + "ĊĊĊĊĊĊĊĊ": 6087, + "];čĊ": 6088, + "Ġdirectly": 6089, + "Ġ\"%": 6090, + "ris": 6091, + "elta": 6092, + "AIL": 6093, + "){čĊ": 6094, + "mine": 6095, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 6096, + "(k": 6097, + "bon": 6098, + "asic": 6099, + "pite": 6100, + "___": 6101, + "Max": 6102, + "Ġerrors": 6103, + "ĠWhile": 6104, + "Ġarguments": 6105, + "Ġensure": 6106, + "Right": 6107, + "-based": 6108, + "Web": 6109, + "Ġ-=": 6110, + "Ġintrodu": 6111, + "ĠInst": 6112, + "ĠWash": 6113, + "ordin": 6114, + "join": 6115, + "Database": 6116, + "Ġgrad": 6117, + "Ġusually": 6118, + "ITE": 6119, + "Props": 6120, + "?>Ċ": 6121, + "ĠGo": 6122, + "@Override": 6123, + "REF": 6124, + "Ġip": 6125, + "ĠAustral": 6126, + "Ġist": 6127, + "ViewById": 6128, + "Ġserious": 6129, + "Ġcustomer": 6130, + ".prototype": 6131, + "odo": 6132, + "cor": 6133, + "Ġdoor": 6134, + "ĠWITHOUT": 6135, + "Ġplant": 6136, + "Ġbegan": 6137, + "Ġdistance": 6138, + "()).": 6139, + "Ġchance": 6140, + "Ġord": 6141, + "came": 6142, + "pragma": 6143, + "Ġprotect": 6144, + "ragment": 6145, + "ĠNode": 6146, + "ening": 6147, + "Ñĩ": 6148, + "Ġroute": 6149, + "ĠSchool": 6150, + "hi": 6151, + "Ġneighb": 6152, + "After": 6153, + "licit": 6154, + "Ġcontr": 6155, + "Ġprimary": 6156, + "AA": 6157, + ".WriteLine": 6158, + "utils": 6159, + "Ġbi": 6160, + "Red": 6161, + ".Linq": 6162, + ".object": 6163, + "Ġleaders": 6164, + "unities": 6165, + "Ġgun": 6166, + "onth": 6167, + "ĠDev": 6168, + "FILE": 6169, + "Ġcomments": 6170, + "_len": 6171, + "arrow": 6172, + "amount": 6173, + "Range": 6174, + "sert": 6175, + "GridView": 6176, + "Ġupdated": 6177, + "ĠMo": 6178, + "Ġinform": 6179, + "ociety": 6180, + "ala": 6181, + "Access": 6182, + "Ġhab": 6183, + "Ġcreat": 6184, + "_arg": 6185, + "ĠJanuary": 6186, + "ĠDay": 6187, + "\")čĊ": 6188, + "uple": 6189, + "document": 6190, + "gorith": 6191, + "menu": 6192, + "ĠOver": 6193, + "bb": 6194, + ".title": 6195, + "_out": 6196, + "Ġled": 6197, + "uri": 6198, + "Ġ?>": 6199, + "gl": 6200, + "Ġbank": 6201, + "ayment": 6202, + "ĉprintf": 6203, + "MD": 6204, + "Ġsample": 6205, + "Ġhands": 6206, + "ĠVersion": 6207, + "uario": 6208, + "Ġoffers": 6209, + "ityEngine": 6210, + "Ġshape": 6211, + "Ġsleep": 6212, + "_point": 6213, + "Settings": 6214, + "Ġachie": 6215, + "Ġsold": 6216, + "ota": 6217, + ".bind": 6218, + "Am": 6219, + "Ġsafe": 6220, + "Store": 6221, + "Ġshared": 6222, + "Ġpriv": 6223, + "_VAL": 6224, + "Ġsens": 6225, + "){": 6226, + "Ġremember": 6227, + "shared": 6228, + "element": 6229, + "Ġshoot": 6230, + "Vert": 6231, + "cout": 6232, + "Ġenv": 6233, + "_label": 6234, + "Ġ>Ċ": 6235, + "run": 6236, + "Ġscene": 6237, + "(array": 6238, + "device": 6239, + "_title": 6240, + "agon": 6241, + "]čĊ": 6242, + "aby": 6243, + "Ġbecame": 6244, + "boolean": 6245, + "Ġpark": 6246, + "ĠCode": 6247, + "upload": 6248, + "riday": 6249, + "ĠSeptember": 6250, + "Fe": 6251, + "Ġsen": 6252, + "cing": 6253, + "FL": 6254, + "Col": 6255, + "uts": 6256, + "_page": 6257, + "inn": 6258, + "Ġimplied": 6259, + "aling": 6260, + "Ġyourself": 6261, + ".Count": 6262, + "conf": 6263, + "Ġaud": 6264, + "_init": 6265, + ".)": 6266, + "Ġwrote": 6267, + "003": 6268, + "NG": 6269, + ".Error": 6270, + "ä»": 6271, + ".for": 6272, + "Ġequal": 6273, + "ĠRequest": 6274, + "Ġserial": 6275, + "Ġallows": 6276, + "XX": 6277, + "Ġmiddle": 6278, + "chor": 6279, + "195": 6280, + "94": 6281, + "ø": 6282, + "erval": 6283, + ".Column": 6284, + "reading": 6285, + "Ġescort": 6286, + "ĠAugust": 6287, + "Ġquickly": 6288, + "Ġweap": 6289, + "ĠCG": 6290, + "ropri": 6291, + "ho": 6292, + "Ġcop": 6293, + "(struct": 6294, + "ĠBig": 6295, + "Ġvs": 6296, + "Ġfrequ": 6297, + ".Value": 6298, + "Ġactions": 6299, + "Ġproper": 6300, + "Ġinn": 6301, + "Ġobjects": 6302, + "Ġmatrix": 6303, + "avascript": 6304, + "Ġones": 6305, + ".group": 6306, + "Ġgreen": 6307, + "Ġpaint": 6308, + "ools": 6309, + "ycl": 6310, + "encode": 6311, + "olt": 6312, + "comment": 6313, + ".api": 6314, + "Dir": 6315, + "Ġune": 6316, + "izont": 6317, + ".position": 6318, + "Ġdesigned": 6319, + "_val": 6320, + "avi": 6321, + "iring": 6322, + "tab": 6323, + "Ġlayer": 6324, + "Ġviews": 6325, + "Ġreve": 6326, + "rael": 6327, + "ĠON": 6328, + "rics": 6329, + "160": 6330, + "np": 6331, + "Ġcore": 6332, + "());čĊ": 6333, + "Main": 6334, + "Ġexpert": 6335, + "ĉĉčĊ": 6336, + "_en": 6337, + "Ġ/>": 6338, + "utter": 6339, + "IAL": 6340, + "ails": 6341, + "ĠKing": 6342, + "*/ĊĊ": 6343, + "ĠMet": 6344, + "_end": 6345, + "addr": 6346, + "ora": 6347, + "Ġir": 6348, + "Min": 6349, + "Ġsurpr": 6350, + "Ġrepe": 6351, + "Ġdirectory": 6352, + "PUT": 6353, + "-S": 6354, + "Ġelection": 6355, + "haps": 6356, + ".pre": 6357, + "cm": 6358, + "Values": 6359, + "Ġ\"Ċ": 6360, + "column": 6361, + "ivil": 6362, + "Login": 6363, + "inue": 6364, + "93": 6365, + "Ġbeautiful": 6366, + "Ġsecret": 6367, + "(event": 6368, + "Ġchat": 6369, + "ums": 6370, + "Ġorigin": 6371, + "Ġeffects": 6372, + "Ġmanagement": 6373, + "illa": 6374, + "tk": 6375, + "Ġsetting": 6376, + "ĠCour": 6377, + "Ġmassage": 6378, + "ĉend": 6379, + "Ġhappy": 6380, + "Ġfinish": 6381, + "Ġcamera": 6382, + "ĠVer": 6383, + "ĠDemocr": 6384, + "ĠHer": 6385, + "(Q": 6386, + "cons": 6387, + "ita": 6388, + "Ġ'.": 6389, + "{}": 6390, + "ĉC": 6391, + "Ġstuff": 6392, + "194": 6393, + "Ġ:Ċ": 6394, + "ĠAR": 6395, + "Task": 6396, + "hidden": 6397, + "eros": 6398, + "IGN": 6399, + "atio": 6400, + "ĠHealth": 6401, + "olute": 6402, + "Enter": 6403, + "'>": 6404, + "ĠTwitter": 6405, + "ĠCounty": 6406, + "scribe": 6407, + "Ġ=>Ċ": 6408, + "Ġhy": 6409, + "fit": 6410, + "Ġmilitary": 6411, + "Ġsale": 6412, + "required": 6413, + "non": 6414, + "bootstrap": 6415, + "hold": 6416, + "rim": 6417, + "-old": 6418, + "ĠDown": 6419, + "Ġmention": 6420, + "contact": 6421, + "_group": 6422, + "oday": 6423, + "Ġtown": 6424, + "Ġsolution": 6425, + "uate": 6426, + "elling": 6427, + "]->": 6428, + "otes": 6429, + "ental": 6430, + "omen": 6431, + "ospital": 6432, + "ĠSup": 6433, + "_EN": 6434, + "Ġslow": 6435, + "SESSION": 6436, + "Ġblue": 6437, + "ago": 6438, + "Ġlives": 6439, + "Ġ^": 6440, + ".un": 6441, + "inst": 6442, + "enge": 6443, + "Ġcustomers": 6444, + "Ġcast": 6445, + "udget": 6446, + "ï¼ģ": 6447, + "icens": 6448, + "Ġdetermin": 6449, + "Selected": 6450, + "_pl": 6451, + "ueue": 6452, + "Ġdark": 6453, + "//ĊĊ": 6454, + "si": 6455, + "thern": 6456, + "ĠJapan": 6457, + "/w": 6458, + "PU": 6459, + "ĠEast": 6460, + "ovie": 6461, + "Ġpackage": 6462, + "Ġnor": 6463, + "Ġapi": 6464, + "bot": 6465, + "\"];Ċ": 6466, + "_post": 6467, + "ulate": 6468, + "Ġclub": 6469, + "'));Ċ": 6470, + "Ġloop": 6471, + "PIO": 6472, + "ione": 6473, + "shot": 6474, + "Initial": 6475, + "Ġplayed": 6476, + "register": 6477, + "rought": 6478, + "_max": 6479, + "acement": 6480, + "match": 6481, + "raphics": 6482, + "AST": 6483, + "Ġexisting": 6484, + "Ġcomplex": 6485, + "DA": 6486, + ".Ch": 6487, + ".common": 6488, + "mo": 6489, + "Ġ'../../": 6490, + "ito": 6491, + "Ġanalysis": 6492, + "Ġdeliver": 6493, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĊ": 6494, + "idx": 6495, + "Ãł": 6496, + "ongo": 6497, + "ĠEnglish": 6498, + "Ċ": 10197, + "_default": 10198, + "ĠDatabase": 10199, + "rep": 10200, + "ESS": 10201, + "nergy": 10202, + ".Find": 10203, + "_mask": 10204, + "Ġrise": 10205, + "Ġkernel": 10206, + "::$": 10207, + ".Q": 10208, + "Ġoffering": 10209, + "decl": 10210, + "ĠCS": 10211, + "Ġlisted": 10212, + "Ġmostly": 10213, + "enger": 10214, + "Ġblocks": 10215, + "olo": 10216, + "Ġgoverning": 10217, + "\\F": 10218, + "Ġconcent": 10219, + ".getText": 10220, + "Ġmb": 10221, + "Ġoccurred": 10222, + "Ġchanging": 10223, + "Scene": 10224, + "_CODE": 10225, + "Beh": 10226, + "\"The": 10227, + "Ġtile": 10228, + "ĠAssociation": 10229, + "ĉP": 10230, + "alty": 10231, + "_ad": 10232, + "odies": 10233, + "iated": 10234, + "Ġprepared": 10235, + "possible": 10236, + "Ġmort": 10237, + "TEST": 10238, + "142": 10239, + "Ġignore": 10240, + "Ġcalc": 10241, + "Ġrs": 10242, + "ĠassertEquals": 10243, + "Ġsz": 10244, + "ĠTHIS": 10245, + ".\"Ċ": 10246, + "Ġcanvas": 10247, + "java": 10248, + "Ġdut": 10249, + "VALID": 10250, + ".sql": 10251, + ".input": 10252, + "Ġaux": 10253, + "Sup": 10254, + "Ġartist": 10255, + "Vec": 10256, + "_TIME": 10257, + ".stringify": 10258, + "etween": 10259, + "ĠCategory": 10260, + "Ġ[-": 10261, + "ĠDevExpress": 10262, + "ĠJul": 10263, + "Ġring": 10264, + ".ed": 10265, + "YY": 10266, + "Let": 10267, + "TextField": 10268, + "Ġflat": 10269, + "_print": 10270, + "ĠOTHER": 10271, + "adian": 10272, + "Ġchecked": 10273, + "ele": 10274, + "Align": 10275, + "standing": 10276, + "Ġ[],": 10277, + "Ġlab": 10278, + "ucky": 10279, + "ĠChristmas": 10280, + "(image": 10281, + ".module": 10282, + "Ġlots": 10283, + "Ġslightly": 10284, + "(final": 10285, + "erge": 10286, + "è¿": 10287, + "147": 10288, + "ĠPolice": 10289, + "143": 10290, + "ĠRight": 10291, + "Ġaward": 10292, + "ĠOS": 10293, + "Ġ{}ĊĊ": 10294, + "Ġptr": 10295, + "oves": 10296, + "icated": 10297, + "ем": 10298, + "Ġmanage": 10299, + "oliday": 10300, + "Amount": 10301, + "oolStrip": 10302, + "tbody": 10303, + "Nav": 10304, + "wrap": 10305, + "BB": 10306, + "Ġwatching": 10307, + "arios": 10308, + "Ġoptional": 10309, + "_K": 10310, + "ĠLicensed": 10311, + ".Map": 10312, + "Timer": 10313, + "ĠAP": 10314, + "ĠRev": 10315, + "(o": 10316, + ",c": 10317, + "umin": 10318, + "etailed": 10319, + "ĠHy": 10320, + "Ġblank": 10321, + "agger": 10322, + "ĠSelf": 10323, + "()[": 10324, + ".make": 10325, + "earn": 10326, + "channel": 10327, + ";Ċ": 10342, + "World": 10343, + "Ġpython": 10344, + "Ġlif": 10345, + "Ġtrav": 10346, + "Ġconven": 10347, + "company": 10348, + "ĠClub": 10349, + "138": 10350, + "Ver": 10351, + "Btn": 10352, + "Ġzone": 10353, + "products": 10354, + "ĠEduc": 10355, + "Ġverify": 10356, + "ĠMil": 10357, + "ono": 10358, + "]);ĊĊ": 10359, + "ENCE": 10360, + "Ġpacket": 10361, + "Ġcer": 10362, + "Ġenumer": 10363, + "Ġpars": 10364, + "formed": 10365, + "Ġoccup": 10366, + "tre": 10367, + "Ġexercise": 10368, + "Day": 10369, + "_sum": 10370, + "Ġasking": 10371, + "aption": 10372, + "Ġorders": 10373, + "Ġspending": 10374, + "ĠERR": 10375, + ".Dis": 10376, + "ĠUtil": 10377, + "âĢľI": 10378, + "\\'": 10379, + "?)": 10380, + "/>Ċ": 10381, + "Ġemot": 10382, + "Ġinfluence": 10383, + "ĠAfrica": 10384, + "atters": 10385, + "Ùħ": 10386, + ".session": 10387, + "Ġchief": 10388, + "ĉĉĉĉĉĉĉĉĉĉĉ": 10389, + "Ġtom": 10390, + "cluded": 10391, + "serial": 10392, + "_handler": 10393, + ".Type": 10394, + "aped": 10395, + "Ġpolicies": 10396, + "-ex": 10397, + "-tr": 10398, + "blank": 10399, + "merce": 10400, + "Ġcoverage": 10401, + "Ġrc": 10402, + "_matrix": 10403, + "_box": 10404, + "Ġcharges": 10405, + "ĠBoston": 10406, + "Pe": 10407, + "Ġcircum": 10408, + "Ġfilled": 10409, + "148": 10410, + "Ġnorth": 10411, + "ictureBox": 10412, + "ĉres": 10413, + "è®": 10414, + "Ġtermin": 10415, + "Ġ[â̦": 10416, + "IRECT": 10417, + "Ġber": 10418, + "Ġ\"../../": 10419, + "retch": 10420, + ".code": 10421, + "_col": 10422, + "ĠGovernment": 10423, + "Ġargv": 10424, + "ĠLord": 10425, + "asi": 10426, + "Exec": 10427, + "ĉlet": 10428, + "vertis": 10429, + "Ġdiscussion": 10430, + "enance": 10431, + "outube": 10432, + "typeof": 10433, + "Ġserved": 10434, + "ĠPut": 10435, + "ĉx": 10436, + "Ġsweet": 10437, + "Before": 10438, + "ategy": 10439, + ".of": 10440, + "ĠMaterial": 10441, + "Sort": 10442, + "ONT": 10443, + "igital": 10444, + "Why": 10445, + "Ġsust": 10446, + "Ġç": 10447, + "abet": 10448, + "Ġsegment": 10449, + "Ġ[],Ċ": 10450, + "ĠMuslim": 10451, + "ĠfindViewById": 10452, + "cut": 10453, + "_TEXT": 10454, + "ĠMary": 10455, + "Ġloved": 10456, + "Ġlie": 10457, + "ĠJO": 10458, + "Ġisset": 10459, + "month": 10460, + "Ġprime": 10461, + "ti": 10462, + "ĠCarol": 10463, + "Use": 10464, + "146": 10465, + "ĠPop": 10466, + "ĠSave": 10467, + "Interval": 10468, + "execute": 10469, + "dy": 10470, + "ĠIran": 10471, + "_cont": 10472, + "ĉT": 10473, + "Ġphase": 10474, + "checkbox": 10475, + "week": 10476, + "Ġhide": 10477, + "Ġtil": 10478, + "Ġju": 10479, + "Custom": 10480, + "burg": 10481, + "/M": 10482, + "TON": 10483, + "Ġquant": 10484, + "Ġrub": 10485, + "ixels": 10486, + "Ġinstalled": 10487, + "Ġdump": 10488, + "Ġproperly": 10489, + "(List": 10490, + "Ġdecide": 10491, + "apply": 10492, + "Has": 10493, + "Ġkeeping": 10494, + "Ġcitizens": 10495, + "Ġjoint": 10496, + "pool": 10497, + "Socket": 10498, + "_op": 10499, + "Ġweapon": 10500, + "gnore": 10501, + "ĠExec": 10502, + "otten": 10503, + "ĠMS": 10504, + "Ġ(-": 10505, + "ĠReview": 10506, + "Ġexamples": 10507, + "Ġtight": 10508, + "!(": 10509, + "DP": 10510, + "ĠMessageBox": 10511, + "Ġphotograph": 10512, + "164": 10513, + "URI": 10514, + "ét": 10515, + "low": 10516, + "ĠGrand": 10517, + ".persistence": 10518, + "Ġmaintain": 10519, + "Ġnums": 10520, + "Ġzip": 10521, + "ials": 10522, + "ĠGets": 10523, + "peg": 10524, + "ĠBuffer": 10525, + "~~~~": 10526, + "rastructure": 10527, + "ĠPL": 10528, + "uen": 10529, + "obby": 10530, + "sizeof": 10531, + "Ġpic": 10532, + "Ġseed": 10533, + "Ġexperienced": 10534, + "Ġodd": 10535, + "Ġkick": 10536, + "Ġprocedure": 10537, + "avigator": 10538, + "-on": 10539, + ",j": 10540, + "ĠAlthough": 10541, + "ĠuserId": 10542, + "accept": 10543, + "Blue": 10544, + "IColor": 10545, + "layer": 10546, + "available": 10547, + "Ġends": 10548, + ".table": 10549, + "Ġdataset": 10550, + "bus": 10551, + "Ġexplain": 10552, + "(pro": 10553, + "ĠCommittee": 10554, + "Ġnoted": 10555, + "]:Ċ": 10556, + "Dim": 10557, + "stdio": 10558, + "154": 10559, + ".\",Ċ": 10560, + "_source": 10561, + "181": 10562, + "ĠWeek": 10563, + "ĠEdge": 10564, + "Ġoperating": 10565, + "Ġeste": 10566, + "ipl": 10567, + "330": 10568, + "agination": 10569, + "Ġproceed": 10570, + "Ġanimation": 10571, + ".Models": 10572, + "ĠWatch": 10573, + "iat": 10574, + "Ġoppon": 10575, + "/A": 10576, + "Report": 10577, + "Ġsounds": 10578, + "_buf": 10579, + "IELD": 10580, + "Ġbund": 10581, + "ĉget": 10582, + ".pr": 10583, + "(tmp": 10584, + "Ġkid": 10585, + ">ĊĊĊ": 10586, + "Ġyang": 10587, + "NotFound": 10588, + "ÑĨ": 10589, + "math": 10590, + "@gmail": 10591, + "ĠLIMIT": 10592, + "redients": 10593, + "Ġvent": 10594, + "avigate": 10595, + "Look": 10596, + "Ġreligious": 10597, + "Ġrand": 10598, + "rio": 10599, + "(GL": 10600, + "_ip": 10601, + "uan": 10602, + "iciency": 10603, + "ĠChange": 10604, + ">čĊčĊ": 10605, + "ĠEntity": 10606, + "Ġrencontre": 10607, + "ĠRet": 10608, + "plan": 10609, + "én": 10610, + "BOOL": 10611, + "uries": 10612, + "train": 10613, + "Definition": 10614, + "============": 10615, + "zz": 10616, + "450": 10617, + "Animation": 10618, + "ĠOK": 10619, + "_menu": 10620, + ".bl": 10621, + "_score": 10622, + "Ġacad": 10623, + "(System": 10624, + "Ġrefresh": 10625, + "'=>$": 10626, + ".Graphics": 10627, + "amento": 10628, + "pid": 10629, + "tc": 10630, + "Ġtips": 10631, + "Ġhomes": 10632, + "Ġfuel": 10633, + "âĸ": 10634, + "_helper": 10635, + "ĠĠčĊ": 10636, + "ĠRoom": 10637, + ".Close": 10638, + "_attr": 10639, + "ĠMount": 10640, + "ĠEv": 10641, + "arser": 10642, + "_top": 10643, + "eah": 10644, + "ĠDelete": 10645, + "ãĢį": 10646, + "uke": 10647, + "Ġusage": 10648, + "aria": 10649, + "_dev": 10650, + "Ġtexture": 10651, + "Ġconversation": 10652, + "eper": 10653, + "Bean": 10654, + "done": 10655, + "nonatomic": 10656, + "ĠSecond": 10657, + "Ġshooting": 10658, + "_pre": 10659, + "Components": 10660, + "Ġ]ĊĊ": 10661, + "__,": 10662, + "stitution": 10663, + ".Char": 10664, + ">();ĊĊ": 10665, + "Ġpresented": 10666, + "Ġwa": 10667, + "oker": 10668, + "-ĊĊ": 10669, + "iner": 10670, + "Ġbecoming": 10671, + "Ġincident": 10672, + "Att": 10673, + "162": 10674, + "Ġrevealed": 10675, + "forc": 10676, + "Ġboot": 10677, + ".page": 10678, + "Enumerator": 10679, + "165": 10680, + "_->": 10681, + "Photo": 10682, + "Ġspring": 10683, + ".\",": 10684, + "ĠDictionary": 10685, + "BJECT": 10686, + "Ġlocations": 10687, + "Ġsamples": 10688, + "InputStream": 10689, + "ĠBrown": 10690, + "Ġstats": 10691, + "quality": 10692, + "Ñħ": 10693, + "-dis": 10694, + "Ġhelping": 10695, + "Ġped": 10696, + "224": 10697, + "(se": 10698, + "ĠWho": 10699, + "alian": 10700, + "internal": 10701, + "Ġft": 10702, + ">().": 10703, + "->{": 10704, + "Ġmine": 10705, + "Ġsector": 10706, + "Ġgro": 10707, + "Ġopportunities": 10708, + "Ġü": 10709, + "Ġmp": 10710, + "Ġalleged": 10711, + "Ġdoubt": 10712, + "Mouse": 10713, + "About": 10714, + "_part": 10715, + "Ġchair": 10716, + "Ġstopped": 10717, + "161": 10718, + "loop": 10719, + "entities": 10720, + "Ġapps": 10721, + "ansion": 10722, + "Ġmental": 10723, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 10724, + "FR": 10725, + "Ġdefend": 10726, + "care": 10727, + "Ġideal": 10728, + "/api": 10729, + "urface": 10730, + "011": 10731, + "Ġele": 10732, + "ulator": 10733, + "ĠRights": 10734, + "anguages": 10735, + "Ġfunds": 10736, + "Ġadapt": 10737, + "Attributes": 10738, + "Ġdeploy": 10739, + "opts": 10740, + "Ġvalidation": 10741, + "Ġconcerns": 10742, + "uce": 10743, + ".num": 10744, + "ulture": 10745, + "ila": 10746, + "Ġcup": 10747, + "Ġpure": 10748, + ".Fore": 10749, + "183": 10750, + "ĠHashMap": 10751, + ".valueOf": 10752, + "asm": 10753, + "MO": 10754, + "Ġcs": 10755, + "Ġstores": 10756, + "Ġ************************************************************************": 10757, + "Ġcommunication": 10758, + "mem": 10759, + ".EventHandler": 10760, + ".Status": 10761, + "_right": 10762, + ".setOn": 10763, + "Sheet": 10764, + "Ġidentify": 10765, + "enerated": 10766, + "ordered": 10767, + "Ġ\"[": 10768, + "Ġswe": 10769, + "Condition": 10770, + "ĠAccording": 10771, + "Ġprepare": 10772, + "Ġrob": 10773, + "Pool": 10774, + "Ġsport": 10775, + "rv": 10776, + "ĠRouter": 10777, + "Ġalternative": 10778, + "([]": 10779, + "ĠChicago": 10780, + "ipher": 10781, + "ische": 10782, + "ĠDirector": 10783, + "kl": 10784, + "ĠWil": 10785, + "keys": 10786, + "Ġmysql": 10787, + "Ġwelcome": 10788, + "king": 10789, + "ĠManager": 10790, + "Ġcaught": 10791, + ")}Ċ": 10792, + "Score": 10793, + "_PR": 10794, + "Ġsurvey": 10795, + "hab": 10796, + "Headers": 10797, + "ADER": 10798, + "Ġdecor": 10799, + "Ġturns": 10800, + "Ġradius": 10801, + "errupt": 10802, + "Cor": 10803, + "Ġmel": 10804, + "Ġintr": 10805, + "(q": 10806, + "ĠAC": 10807, + "amos": 10808, + "MAX": 10809, + "ĠGrid": 10810, + "ĠJesus": 10811, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 10812, + ".DE": 10813, + "Ġts": 10814, + "Ġlinked": 10815, + "free": 10816, + "ĠQt": 10817, + "Ġ/**čĊ": 10818, + "Ġfaster": 10819, + "ctr": 10820, + "_J": 10821, + "DT": 10822, + ".Check": 10823, + "Ġcombination": 10824, + "Ġintended": 10825, + "-the": 10826, + "-type": 10827, + "182": 10828, + "ectors": 10829, + "ami": 10830, + "uting": 10831, + "Ġuma": 10832, + "XML": 10833, + "UCT": 10834, + "Ap": 10835, + "ĠRandom": 10836, + "Ġran": 10837, + ".sort": 10838, + "Ġsorted": 10839, + ".Un": 10840, + "401": 10841, + "_PER": 10842, + "itory": 10843, + "Ġpriority": 10844, + "ĠGal": 10845, + "ĠOld": 10846, + "hot": 10847, + "ĠDisplay": 10848, + "(sub": 10849, + "_TH": 10850, + "_Y": 10851, + "ĠCare": 10852, + "loading": 10853, + "Kind": 10854, + "_handle": 10855, + ",,": 10856, + "rase": 10857, + "_replace": 10858, + ".addEventListener": 10859, + "ĠRT": 10860, + "172": 10861, + "Ġentered": 10862, + "gers": 10863, + "Ġich": 10864, + "(start": 10865, + "205": 10866, + "/app": 10867, + "Ġbrother": 10868, + "Memory": 10869, + "Outlet": 10870, + "Ġutf": 10871, + "prec": 10872, + "Ġnavigation": 10873, + "ORK": 10874, + "Ġdst": 10875, + "Detail": 10876, + "Ġaudience": 10877, + "Ġdur": 10878, + "Ġcluster": 10879, + "unched": 10880, + "Ġ],": 10881, + "Ġcomfortable": 10882, + ".values": 10883, + "ĠTotal": 10884, + "Ġsnap": 10885, + "Ġstandards": 10886, + "Ġperformed": 10887, + "hand": 10888, + "(\"@": 10889, + "åŃ": 10890, + "Ġphil": 10891, + "ibr": 10892, + "trim": 10893, + "Ġforget": 10894, + "157": 10895, + "Ġdoctor": 10896, + ".TextBox": 10897, + "377": 10898, + "icons": 10899, + ",s": 10900, + "ĠOp": 10901, + "Sm": 10902, + "Stop": 10903, + "ĉList": 10904, + "ĉu": 10905, + "Comment": 10906, + "_VERSION": 10907, + ".Xtra": 10908, + "Person": 10909, + "rb": 10910, + "LOB": 10911, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĊ": 10912, + "ĠCentral": 10913, + "270": 10914, + "ICK": 10915, + "raq": 10916, + "Ġputting": 10917, + "Ġmd": 10918, + "ĠLove": 10919, + "Program": 10920, + "Border": 10921, + "oor": 10922, + "Ġallowing": 10923, + "after": 10924, + "Ġentries": 10925, + "ĠMaybe": 10926, + "]).": 10927, + "ĠShort": 10928, + ")\\": 10929, + ".now": 10930, + "friend": 10931, + "Ġprefer": 10932, + "ĠGPIO": 10933, + "osis": 10934, + "ĠGameObject": 10935, + "Ġskip": 10936, + "Ġcompetition": 10937, + "_match": 10938, + "lications": 10939, + "_CONT": 10940, + ".groupBox": 10941, + "Ġals": 10942, + "666": 10943, + "\"We": 10944, + "_eq": 10945, + "lan": 10946, + "_search": 10947, + "ĠMusic": 10948, + "asis": 10949, + "Ġbind": 10950, + "ĠIsland": 10951, + "rum": 10952, + "(E": 10953, + "Ġseat": 10954, + "Video": 10955, + "Ġack": 10956, + "reek": 10957, + "={()": 10958, + "Ġrating": 10959, + "Ġrestaurant": 10960, + "456": 10961, + "DEX": 10962, + "(buf": 10963, + "pping": 10964, + "uality": 10965, + "Ġleague": 10966, + "176": 10967, + "Ġfocused": 10968, + "apon": 10969, + "$data": 10970, + "CLUD": 10971, + "CLUDING": 10972, + "Ġabsolute": 10973, + "(query": 10974, + "Ġtells": 10975, + "Ang": 10976, + "Ġcommunities": 10977, + "Ġhonest": 10978, + "oking": 10979, + "Ġapart": 10980, + "arity": 10981, + "/$": 10982, + "_module": 10983, + "ĠEnc": 10984, + ".an": 10985, + ".Config": 10986, + "Cre": 10987, + "Ġshock": 10988, + "ĠArab": 10989, + "IENT": 10990, + "/re": 10991, + "Ġretrie": 10992, + "ycler": 10993, + "isa": 10994, + "ĠOrgan": 10995, + ".graph": 10996, + "Ġí": 10997, + "ĠBAS": 10998, + "Enum": 10999, + "Ġpossibly": 11000, + "ÑĢаÐ": 11001, + "ĠJapanese": 11002, + "Ġcraft": 11003, + "ĠPlace": 11004, + "Ġtalent": 11005, + "Ġfunding": 11006, + "Ġconfirmed": 11007, + "Ġcycle": 11008, + "/x": 11009, + "GE": 11010, + "Ġhearing": 11011, + "Ġplants": 11012, + "Ġmouth": 11013, + "pages": 11014, + "oria": 11015, + "ĠRemove": 11016, + "_total": 11017, + "Ġod": 11018, + "ollapse": 11019, + "door": 11020, + "Ġbought": 11021, + "Ġaddr": 11022, + "ARCH": 11023, + "_dim": 11024, + "dden": 11025, + "Ġdecades": 11026, + "REQUEST": 11027, + "Ġversions": 11028, + "fire": 11029, + "006": 11030, + "Ġmoves": 11031, + "fb": 11032, + "Ġcoffee": 11033, + ".connect": 11034, + "ĠRow": 11035, + "Ġschema": 11036, + "Scope": 11037, + "-Type": 11038, + "Ġfighting": 11039, + "Ġretail": 11040, + "Ġmodified": 11041, + "TF": 11042, + "Files": 11043, + "nie": 11044, + "_command": 11045, + "stone": 11046, + "ĠÑĤ": 11047, + "_thread": 11048, + "Ġbond": 11049, + "ĠDevelopment": 11050, + "Ġpt": 11051, + "FORM": 11052, + "plet": 11053, + "Ġidentified": 11054, + "cpp": 11055, + "206": 11056, + "225": 11057, + "Ġcoding": 11058, + "oked": 11059, + "ĠMaster": 11060, + "IDTH": 11061, + "Ġresidents": 11062, + "redit": 11063, + "ĠPhoto": 11064, + "=-": 11065, + "unte": 11066, + "ateur": 11067, + "159": 11068, + "_STATE": 11069, + "ĠSing": 11070, + "Ġsheet": 11071, + ".val": 11072, + "orse": 11073, + "Ġhers": 11074, + "Ġdetermined": 11075, + "Common": 11076, + "Ġwed": 11077, + "_queue": 11078, + "PH": 11079, + "ĠAtl": 11080, + "cred": 11081, + "/LICENSE": 11082, + "Ġmes": 11083, + "Ġadvanced": 11084, + ".java": 11085, + ".Sh": 11086, + "Go": 11087, + "kill": 11088, + "fp": 11089, + "_settings": 11090, + "Ġpal": 11091, + "Ġtruck": 11092, + "Ġcombined": 11093, + "Ġ\"${": 11094, + "ĠCorpor": 11095, + "Ġjoined": 11096, + "ĠJose": 11097, + "ĠCup": 11098, + "uns": 11099, + "estival": 11100, + "levision": 11101, + "Ġbroken": 11102, + "Ġmarriage": 11103, + "ĠWestern": 11104, + "Ġrepresents": 11105, + "ĠTitle": 11106, + "Ġss": 11107, + ".Ass": 11108, + "ongoose": 11109, + "iento": 11110, + "<>();Ċ": 11111, + "Ġabsolutely": 11112, + "Ġsmooth": 11113, + "TERN": 11114, + "ĠUnless": 11115, + "Word": 11116, + "Ġmerge": 11117, + "igan": 11118, + "ĠVol": 11119, + "Ġnn": 11120, + ".getId": 11121, + "Ġз": 11122, + "171": 11123, + "Ġsexy": 11124, + "Ġseeking": 11125, + "Single": 11126, + ".this": 11127, + "179": 11128, + "Ġkom": 11129, + "bound": 11130, + ";\"": 11131, + "ĠfontSize": 11132, + "_df": 11133, + "Ġinjury": 11134, + "(H": 11135, + "Ġissued": 11136, + "_END": 11137, + ":self": 11138, + "020": 11139, + "Ġpatch": 11140, + "Ġleaves": 11141, + "Ġadopt": 11142, + "FileName": 11143, + "ãĢIJ": 11144, + "Ġexecutive": 11145, + "ĠByte": 11146, + "]))Ċ": 11147, + "Ġnu": 11148, + "outing": 11149, + "cluding": 11150, + "-R": 11151, + ".options": 11152, + "Ġsubstant": 11153, + "avax": 11154, + "ĠBUT": 11155, + "Ġtechnical": 11156, + "Ġtwice": 11157, + "Ġmás": 11158, + "Ġunivers": 11159, + "yr": 11160, + "Ġdrag": 11161, + "ĠDC": 11162, + "Ġsed": 11163, + "Ġbot": 11164, + "ĠPal": 11165, + "ĠHall": 11166, + "forcement": 11167, + "Ġauch": 11168, + ".mod": 11169, + "notation": 11170, + "_files": 11171, + ".line": 11172, + "_flag": 11173, + "[name": 11174, + "Ġresolution": 11175, + "Ġbott": 11176, + "(\"[": 11177, + "ende": 11178, + "(arr": 11179, + "Free": 11180, + "(@\"": 11181, + "ĠDistrict": 11182, + "PEC": 11183, + ":-": 11184, + "Picker": 11185, + "ĠJo": 11186, + "ĠĠĠĠĠĊ": 11187, + "ĠRiver": 11188, + "_rows": 11189, + "Ġhelpful": 11190, + "Ġmassive": 11191, + "---Ċ": 11192, + "Ġmeasures": 11193, + "007": 11194, + "ĠRuntime": 11195, + "Ġworry": 11196, + "ĠSpec": 11197, + "ĉD": 11198, + "ãĢij": 11199, + "Ġ){Ċ": 11200, + "Ġworse": 11201, + "(filename": 11202, + "Ġlay": 11203, + "Ġmagic": 11204, + "ĠTheir": 11205, + "oul": 11206, + "stroy": 11207, + "ĠWhere": 11208, + "280": 11209, + "Ġsudden": 11210, + "Ġdefe": 11211, + "Ġbinding": 11212, + "Ġflight": 11213, + "ĠOnInit": 11214, + "ĠWomen": 11215, + "ĠPolicy": 11216, + "Ġdrugs": 11217, + "ishing": 11218, + "('../": 11219, + "ĠMel": 11220, + "peat": 11221, + "tor": 11222, + "Ġproposed": 11223, + "Ġstated": 11224, + "_RES": 11225, + "Ġeast": 11226, + "212": 11227, + "ĠCONDITION": 11228, + "_desc": 11229, + "Ġwinning": 11230, + "folio": 11231, + "Mapper": 11232, + "ĠPan": 11233, + "ĠAnge": 11234, + ".servlet": 11235, + "Ġcopies": 11236, + "LM": 11237, + "Ġvm": 11238, + "åį": 11239, + "Ġdictionary": 11240, + "Seg": 11241, + "177": 11242, + "elines": 11243, + "ĠSend": 11244, + "Ġiron": 11245, + "ĠFort": 11246, + "166": 11247, + ".domain": 11248, + "Ġdebate": 11249, + "NotNull": 11250, + "eq": 11251, + "acher": 11252, + "lf": 11253, + "ĉfmt": 11254, + "Ġlawy": 11255, + "178": 11256, + "ÄŁ": 11257, + "ĠMen": 11258, + "Ġtrim": 11259, + "(NULL": 11260, + "Ġ!!": 11261, + "Ġpad": 11262, + "Ġfollows": 11263, + "\"][\"": 11264, + "requ": 11265, + "ĠEp": 11266, + ".github": 11267, + "(img": 11268, + "eto": 11269, + "('\\": 11270, + "Services": 11271, + "umbnail": 11272, + "_main": 11273, + "pleted": 11274, + "fortunately": 11275, + "Ġwindows": 11276, + "Ġplane": 11277, + "ĠConnection": 11278, + ".local": 11279, + "uard": 11280, + "}\\": 11281, + "==\"": 11282, + "andon": 11283, + "ĠRoy": 11284, + "west": 11285, + "158": 11286, + "iginal": 11287, + "emies": 11288, + "itz": 11289, + "'):Ċ": 11290, + "ĠPeter": 11291, + "Ġtough": 11292, + "Ġreduced": 11293, + "Ġcalculate": 11294, + "Ġrapid": 11295, + "customer": 11296, + "Ġefficient": 11297, + "Ġmedium": 11298, + "Ġfell": 11299, + ".ref": 11300, + "ĠCas": 11301, + "Ġfeedback": 11302, + "Speed": 11303, + "(output": 11304, + "aje": 11305, + "Ġcategories": 11306, + "Ġfee": 11307, + "};": 11308, + "Ġdeleted": 11309, + "reh": 11310, + "Ġproof": 11311, + "Desc": 11312, + "Build": 11313, + "Ġsides": 11314, + ".ArrayList": 11315, + "-%": 11316, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 11317, + "ر": 11318, + ".match": 11319, + "ли": 11320, + "Ġfeels": 11321, + "Ġachieve": 11322, + "Ġclim": 11323, + "_ON": 11324, + "ĠCD": 11325, + "Ġteacher": 11326, + "_current": 11327, + "bn": 11328, + "_PL": 11329, + "isting": 11330, + "Enable": 11331, + "GEN": 11332, + "Ġtv": 11333, + "Ġsock": 11334, + "Ġplays": 11335, + "Ġdiscount": 11336, + "ĠKE": 11337, + "ĠDebug": 11338, + "Fore": 11339, + "ĠIraq": 11340, + "Ġappearance": 11341, + "Mon": 11342, + "Ġstyled": 11343, + "ĠHuman": 11344, + "iot": 11345, + "ĠHistory": 11346, + "Ġsac": 11347, + "ĠCollection": 11348, + "Ġrecommended": 11349, + ".Selected": 11350, + "Ġorganizations": 11351, + "Ġdiscovered": 11352, + "cohol": 11353, + "adas": 11354, + "ĠThomas": 11355, + "May": 11356, + "Ġconserv": 11357, + "Ġdomin": 11358, + "ĠFollow": 11359, + "ĠSection": 11360, + "ĠThanks": 11361, + "Username": 11362, + "Ġrecipe": 11363, + "Ġwonderful": 11364, + ".sleep": 11365, + "_if": 11366, + "ĉĊĉĊ": 11367, + "orno": 11368, + "Ġru": 11369, + "_target": 11370, + ".\"\"": 11371, + "à¦": 11372, + "EventArgs": 11373, + "Ġinputs": 11374, + "Ġfif": 11375, + "Ġvision": 11376, + "cy": 11377, + "ĠSeries": 11378, + ")(((": 11379, + "Ġtrading": 11380, + "Ġmarker": 11381, + "Begin": 11382, + "Ġtypically": 11383, + "Ġcauses": 11384, + "dropdown": 11385, + "_DEBUG": 11386, + "260": 11387, + "Ġdetect": 11388, + "country": 11389, + "!\");Ċ": 11390, + "ĉR": 11391, + "appy": 11392, + "Ġcref": 11393, + "('<": 11394, + "\"=>": 11395, + "ĠLE": 11396, + "reader": 11397, + "Ġadministr": 11398, + "õ": 11399, + "ucket": 11400, + "Ġfashion": 11401, + ".char": 11402, + "izar": 11403, + "Ġdisable": 11404, + "Ġsuc": 11405, + "ĠLive": 11406, + "issue": 11407, + "Ġmetadata": 11408, + "flags": 11409, + "ĠðŁ": 11410, + "Ġcommitted": 11411, + "Ġva": 11412, + "Ġrough": 11413, + "Ġ'''Ċ": 11414, + "Ġhighlight": 11415, + "_vars": 11416, + "VO": 11417, + "Ġencoding": 11418, + "-Z": 11419, + "_sign": 11420, + "$(\"#": 11421, + "Ġrain": 11422, + "reatest": 11423, + "ĠEND": 11424, + "Selection": 11425, + "Ġcandidates": 11426, + "Ġsav": 11427, + ".Empty": 11428, + "Ġdecisions": 11429, + "Ġcollabor": 11430, + "ridge": 11431, + "feed": 11432, + "ression": 11433, + "Ġpersons": 11434, + "VM": 11435, + "008": 11436, + "ega": 11437, + "_BIT": 11438, + "According": 11439, + "acked": 11440, + "Ġdollars": 11441, + "_loss": 11442, + "ĠCost": 11443, + "}\"Ċ": 11444, + "Notification": 11445, + "Ġprostit": 11446, + "Ġauthority": 11447, + ".rec": 11448, + "Ġspokes": 11449, + "ĠToday": 11450, + "istant": 11451, + "ĠHead": 11452, + "âĢĿ.": 11453, + "ertainment": 11454, + "cean": 11455, + "culate": 11456, + "Ġven": 11457, + "However": 11458, + "_arr": 11459, + "Ġtokens": 11460, + "Graph": 11461, + "ĠJud": 11462, + "ĠVirgin": 11463, + "ĠSerial": 11464, + "unning": 11465, + "Mutable": 11466, + "agers": 11467, + ".csv": 11468, + "Ġdeveloping": 11469, + "Ġinstructions": 11470, + "Ġpromise": 11471, + "Ġrequested": 11472, + "_encode": 11473, + "/\"": 11474, + "ĠIcon": 11475, + "uilt": 11476, + "-day": 11477, + "Ġintelligence": 11478, + ".IS": 11479, + "ĠObservable": 11480, + "ĠHard": 11481, + "Bool": 11482, + "211": 11483, + "idential": 11484, + ".Anchor": 11485, + "Ġselling": 11486, + "CI": 11487, + "AGES": 11488, + "tle": 11489, + "bur": 11490, + "UFFER": 11491, + "RY": 11492, + "Ġbigger": 11493, + "Ġrat": 11494, + "Ġfamous": 11495, + "Ġtypename": 11496, + "Ġexplained": 11497, + "}}Ċ": 11498, + "Ġnuclear": 11499, + "-N": 11500, + "Ġcrisis": 11501, + "ĠEnter": 11502, + "Ġanswers": 11503, + "/${": 11504, + "/pl": 11505, + "Ġsequ": 11506, + "_next": 11507, + "mask": 11508, + "Ġstanding": 11509, + "Ġplenty": 11510, + "ĠCross": 11511, + "ĉret": 11512, + "dro": 11513, + "ĠCast": 11514, + "167": 11515, + "=true": 11516, + "ĠChris": 11517, + "icio": 11518, + "ĠMike": 11519, + "Decimal": 11520, + "addComponent": 11521, + "Len": 11522, + "Ġcock": 11523, + "Ġ#{": 11524, + "URN": 11525, + "": 11657, + "Ġ*=": 11658, + "ĠPS": 11659, + "Ġdangerous": 11660, + "[p": 11661, + "OME": 11662, + "Other": 11663, + "ĠStringBuilder": 11664, + "Points": 11665, + "heading": 11666, + "Ġcurrency": 11667, + "Ġpercentage": 11668, + "_API": 11669, + "Ġclassic": 11670, + "thead": 11671, + "ĠMO": 11672, + "FE": 11673, + "Idx": 11674, + "await": 11675, + "Ġè": 11676, + "Ġaccident": 11677, + "Ġvariant": 11678, + "Ġmyst": 11679, + "ĠLand": 11680, + "ĠBre": 11681, + "Ġharm": 11682, + "ĠAcc": 11683, + "Ġcharged": 11684, + "iones": 11685, + "Visibility": 11686, + "arry": 11687, + "ĠLanguage": 11688, + "Ġwalking": 11689, + "\".ĊĊ": 11690, + "ifer": 11691, + "Ġleadership": 11692, + ".From": 11693, + "ynam": 11694, + "Ġtimestamp": 11695, + "ipt": 11696, + "ĠHas": 11697, + "REFER": 11698, + "ĠIts": 11699, + "Ġlistener": 11700, + "UTE": 11701, + "213": 11702, + "_description": 11703, + "Ġexperiences": 11704, + "Ġcreates": 11705, + "RS": 11706, + "cart": 11707, + "black": 11708, + "Ġchoices": 11709, + "war": 11710, + "750": 11711, + "Ġ'''": 11712, + "Ġordered": 11713, + "Ġevening": 11714, + "Ġpil": 11715, + "Ġtun": 11716, + "ĠBad": 11717, + "(app": 11718, + "random": 11719, + "Ġexplicit": 11720, + "Ġarrived": 11721, + "Ġfly": 11722, + "Ġeconom": 11723, + "-mail": 11724, + "Ġlists": 11725, + "Ġarchitect": 11726, + "234": 11727, + "ĠPay": 11728, + "Ġds": 11729, + "ĠSol": 11730, + "Ġvehicles": 11731, + "Hz": 11732, + "-com": 11733, + "Ġking": 11734, + "_equal": 11735, + "ĠHelp": 11736, + "Ġabuse": 11737, + "480": 11738, + "169": 11739, + "--;Ċ": 11740, + "Ġextr": 11741, + "Ġchemical": 11742, + "ä¿": 11743, + "Ġorient": 11744, + "Ġbreath": 11745, + "ĠSpace": 11746, + "(element": 11747, + "wait": 11748, + "DED": 11749, + "igma": 11750, + "Ġentr": 11751, + "Ġsob": 11752, + "-name": 11753, + "Ġaffected": 11754, + "ika": 11755, + "Ġcoal": 11756, + "_work": 11757, + "Ġhundreds": 11758, + "Ġpolitics": 11759, + "subject": 11760, + "Ġconsumer": 11761, + "ANGE": 11762, + "Ġrepeated": 11763, + "Send": 11764, + "Ġ#[": 11765, + "Ġprotocol": 11766, + "Ġleads": 11767, + "useum": 11768, + "Every": 11769, + "808": 11770, + "174": 11771, + "Import": 11772, + "(count": 11773, + "Ġchallenges": 11774, + "Ġnovel": 11775, + "Ġdepart": 11776, + "bits": 11777, + ".Current": 11778, + "Ġ`${": 11779, + "oting": 11780, + "(\\": 11781, + "Ġcreative": 11782, + "Ġbuff": 11783, + "Ġintroduced": 11784, + "usic": 11785, + "modules": 11786, + "Are": 11787, + "-doc": 11788, + "language": 11789, + "_cache": 11790, + "Ġtod": 11791, + "?>": 11792, + "omething": 11793, + "Ġhun": 11794, + "åº": 11795, + "aters": 11796, + "Intent": 11797, + "Ġimplemented": 11798, + "ĠCase": 11799, + "Children": 11800, + "Ġnotification": 11801, + "Renderer": 11802, + "Wrapper": 11803, + "Objects": 11804, + "tl": 11805, + ".Contains": 11806, + "Plugin": 11807, + ".row": 11808, + "Ġforg": 11809, + "Ġpermit": 11810, + "Ġtargets": 11811, + "ĠIF": 11812, + "Ġtip": 11813, + "sex": 11814, + "Ġsupports": 11815, + "Ġfold": 11816, + "photo": 11817, + "},čĊ": 11818, + "Ġgoogle": 11819, + "$('#": 11820, + "Ġsharing": 11821, + "Ġgoods": 11822, + "vs": 11823, + "ĠDan": 11824, + "Rate": 11825, + "ĠMartin": 11826, + "Ġmanner": 11827, + "lie": 11828, + ".The": 11829, + "Internal": 11830, + "ĠCONTR": 11831, + "Mock": 11832, + "RIGHT": 11833, + "Ġ'{": 11834, + "Ġcontrols": 11835, + "Mat": 11836, + "Ġmand": 11837, + "Ġextended": 11838, + "Ok": 11839, + "Ġembed": 11840, + "Ġplanet": 11841, + "ĠNon": 11842, + "-ch": 11843, + ")\",": 11844, + "epar": 11845, + "Ġbelieved": 11846, + "ĠEnvironment": 11847, + "ĠFriend": 11848, + "-res": 11849, + "Ġhandling": 11850, + "nic": 11851, + "-level": 11852, + "scri": 11853, + "Xml": 11854, + "BE": 11855, + "ungen": 11856, + "Ġalter": 11857, + "[idx": 11858, + "Pop": 11859, + "cam": 11860, + "Ġ(((": 11861, + "Ġshipping": 11862, + "Ġbattery": 11863, + "iddleware": 11864, + "MC": 11865, + "Ġimpl": 11866, + "otation": 11867, + "ĠLab": 11868, + "